In [5]:
# this is an example of iterating over all zst files in a single folder,
# decompressing them and reading the created_utc field to make sure the files
# are intact. It has no output other than the number of lines

import zstandard
import os
import json
import sys
from datetime import datetime
import logging.handlers


log = logging.getLogger("bot")
log.setLevel(logging.DEBUG)
log.addHandler(logging.StreamHandler())


def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
	chunk = reader.read(chunk_size)
	bytes_read += chunk_size
	if previous_chunk is not None:
		chunk = previous_chunk + chunk
	try:
		return chunk.decode()
	except UnicodeDecodeError:
		if bytes_read > max_window_size:
			raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes")
		log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk")
		return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read)


def read_lines_zst(file_name):
	with open(file_name, 'rb') as file_handle:
		buffer = ''
		reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle)
		while True:
			chunk = read_and_decode(reader, 2**27, (2**29) * 2)

			if not chunk:
				break
			lines = (buffer + chunk).split("\n")

			for line in lines[:-1]:
				yield line.strip(), file_handle.tell()

			buffer = lines[-1]

		reader.close()


input_folder = r'C:\Users\ninuy\Downloads\reddit\subreddits24'
input_files = []
total_size = 0
for subdir, dirs, files in os.walk(input_folder):
	for filename in files:
		input_path = os.path.join(subdir, filename)
		if input_path.endswith(".zst"):
			file_size = os.stat(input_path).st_size
			total_size += file_size
			input_files.append([input_path, file_size])

log.info(f"Processing {len(input_files)} files of {(total_size / (2**30)):.2f} gigabytes")

total_lines = 0
total_bytes_processed = 0
for input_file in input_files:
	file_lines = 0
	file_bytes_processed = 0
	created = None
	for line, file_bytes_processed in read_lines_zst(input_file[0]):
		obj = json.loads(line)
		created = datetime.utcfromtimestamp(int(obj['created_utc']))
		file_lines += 1
		if file_lines == 1:
			log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines + total_lines:,} : 0% : {(total_bytes_processed / total_size) * 100:.0f}%")
		if file_lines % 100000 == 0:
			log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines + total_lines:,} : {(file_bytes_processed / input_file[1]) * 100:.0f}% : {(total_bytes_processed / total_size) * 100:.0f}%")
	total_lines += file_lines
	total_bytes_processed += input_file[1]
	log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {total_lines:,} : 100% : {(total_bytes_processed / total_size) * 100:.0f}%")

log.info(f"Total: {total_lines}")

Processing 5 files of 0.92 gigabytes
2012-01-14 04:02:34 : 1 : 0% : 0%
2022-08-13 19:03:25 : 100,000 : 47% : 0%
2024-03-03 01:10:33 : 200,000 : 78% : 0%
2024-12-31 23:20:44 : 246,857 : 100% : 10%
2016-03-06 17:13:12 : 246,858 : 0% : 10%
2024-12-31 23:13:14 : 285,697 : 100% : 12%
2016-02-18 15:54:05 : 285,698 : 0% : 12%
2024-12-31 20:06:21 : 346,033 : 100% : 13%
2008-12-06 20:38:10 : 346,034 : 0% : 13%
2013-05-21 17:21:30 : 446,033 : 7% : 13%
2014-06-03 18:16:13 : 546,033 : 10% : 13%
2015-03-28 01:07:16 : 646,033 : 14% : 13%
2015-12-16 05:44:11 : 746,033 : 17% : 13%
2016-07-07 03:42:36 : 846,033 : 21% : 13%
2017-03-12 21:58:40 : 946,033 : 28% : 13%
2017-07-23 05:17:49 : 1,046,033 : 30% : 13%
2018-01-01 04:59:36 : 1,146,033 : 33% : 13%
2018-05-07 04:52:02 : 1,246,033 : 37% : 13%
2018-08-07 01:12:57 : 1,346,033 : 41% : 13%
2019-01-17 04:10:55 : 1,446,033 : 45% : 13%
2019-05-11 03:35:39 : 1,546,033 : 48% : 13%
2019-07-19 18:15:53 : 1,646,033 : 52% : 13%
2020-01-14 22:46:38 : 1,746,033 : 58

In [None]:
import zstandard as zstd
import json

file_path = "RC_2023-12.zst"  # Replace with your file name

with open(file_path, 'rb') as compressed_file:
    dctx = zstd.ZstdDecompressor()
    with dctx.stream_reader(compressed_file) as reader:
        for line in reader:
            try:
                record = json.loads(line)
                # Example: print the comment body and subreddit
                print(record.get("subreddit"), record.get("body"))
            except json.JSONDecodeError:
                continue  # Skip bad lines


In [6]:
import os
import json
import zstandard as zstd
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# === SETTINGS ===
input_folder = r"C:\Users\ninuy\DSC 672\Data\reddit\subreddits24"      # <-- Change this to your input folder
output_folder = r"C:\Users\ninuy\DSC 672\Data"  # <-- Change this to your output folder
os.makedirs(output_folder, exist_ok=True)

analyzer = SentimentIntensityAnalyzer()

# Full list of NBA teams
nba_teams = [
    "Hawks", "Celtics", "Nets", "Hornets", "Bulls",
    "Cavaliers", "Mavericks", "Nuggets", "Pistons", "Warriors",
    "Rockets", "Pacers", "Clippers", "Lakers", "Grizzlies",
    "Heat", "Bucks", "Timberwolves", "Pelicans", "Knicks",
    "Thunder", "Magic", "76ers", "Suns", "Trail Blazers",
    "Kings", "Spurs", "Raptors", "Jazz", "Wizards"
]

def is_relevant(text):
    return any(team.lower() in text.lower() for team in nba_teams)

# Unix timestamp for Jan 1, 2017
min_timestamp = 1483228800

# === PROCESS EACH FILE ===
for filename in os.listdir(input_folder):
    if filename.endswith(".zst"):
        input_path = os.path.join(input_folder, filename)
        print(f"Processing: {filename}")

        results = []
        with open(input_path, 'rb') as compressed:
            dctx = zstd.ZstdDecompressor()
            with dctx.stream_reader(compressed) as reader:
                for line in reader:
                    try:
                        record = json.loads(line)

                        # Skip if too old
                        created_utc = record.get("created_utc", 0)
                        if created_utc < min_timestamp:
                            continue

                        # Only for submissions (title + selftext)
                        title = record.get("title", "")
                        selftext = record.get("selftext", "")
                        text = f"{title} {selftext}".strip()

                        # Skip removed or deleted content
                        if not text or text in ["[removed]", "[deleted]"]:
                            continue

                        # Skip if not NBA-related
                        if not is_relevant(text):
                            continue

                        sentiment = analyzer.polarity_scores(text)

                        results.append({
                            "title": title,
                            "selftext": selftext,
                            "subreddit": record.get("subreddit", ""),
                            "created_utc": created_utc,
                            "score": record.get("score", 0),
                            "compound": sentiment["compound"],
                            "positive": sentiment["pos"],
                            "neutral": sentiment["neu"],
                            "negative": sentiment["neg"]
                        })

                    except (json.JSONDecodeError, KeyError):
                        continue

        # Save results
        if results:
            df = pd.DataFrame(results)
            base_filename = os.path.splitext(filename)[0]
            output_path = os.path.join(output_folder, f"{base_filename}_sentiment.csv")
            df.to_csv(output_path, index=False)
            print(f"Saved: {output_path}")
        else:
            print("No relevant data found in:", filename)


Processing: nbacirclejerk_submissions.zst


UnsupportedOperation: 

In [23]:
import zstandard as zstd
import json
import io

file_path = r"C:\Users\ninuy\DSC 672\Data\reddit\subreddits24\nba_submissions.zst"  # Your file path
with open(file_path, "rb") as f:
    dctx = zstd.ZstdDecompressor()
    with dctx.stream_reader(f) as reader:
        text_stream = io.TextIOWrapper(reader, encoding='utf-8')
        for line in text_stream:
            record = json.loads(line)
            # Now you can process each record individually
            print(record.get('selftext', ))  # Example: print first 100 chars of title
                




SyntaxError: incomplete input (2021494379.py, line 18)

In [27]:
import os
import zstandard as zstd
import json
import io
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from datetime import datetime

# Path to your folder containing .zst files
folder_path = r"C:\Users\ninuy\DSC 672\Data\reddit\subreddits24"

# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# NBA teams list (complete)
nba_teams = [
    "Lakers", "Bucks", "Warriors", "Nets", "Heat", "76ers", "Celtics", "Nuggets",
    "Suns", "Mavericks", "Clippers", "Raptors", "Knicks", "Pelicans", "Pacers", "Kings",
    "Bulls", "Magic", "Spurs", "Hornets", "Wizards", "Hawks", "Grizzlies", "Rockets",
    "Pistons", "Cavaliers", "Timberwolves", "Trail Blazers", "Jazz", "Thunder",
]

# Minimum timestamp: Jan 1, 2017 UTC
min_timestamp = int(datetime(2017, 1, 1).timestamp())

# List to hold sentiment data
all_team_data = []

# Loop through all .zst files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".zst"):
        file_path = os.path.join(folder_path, filename)
        print(f"Processing file: {file_path}")
        
        with open(file_path, "rb") as f:
            dctx = zstd.ZstdDecompressor()
            with dctx.stream_reader(f) as reader:
                text_stream = io.TextIOWrapper(reader, encoding='utf-8')

                for line in text_stream:
                    try:
                        submission = json.loads(line)
                    except json.JSONDecodeError:
                        continue  # Skip malformed lines

                    created_utc = submission.get("created_utc", 0)
                    try:
                        created_utc = int(created_utc)
                    except (ValueError, TypeError):
                        continue
                    
                    if created_utc < min_timestamp:
                        continue

                    title = submission.get("title", "")
                    selftext = submission.get("selftext", "")

                    combined_text = (title + " " + selftext).lower()

                    matched_teams = [team for team in nba_teams if team.lower() in combined_text]
                    if not matched_teams:
                        continue

                    sentiment_score = analyzer.polarity_scores(title + " " + selftext)

                    for team in matched_teams:
                        all_team_data.append({
                            "team": team,
                            "subreddit": submission.get("subreddit", ""),
                            "post_title": title,
                            "post_text": selftext,
                            "positive": sentiment_score["pos"],
                            "neutral": sentiment_score["neu"],
                            "negative": sentiment_score["neg"],
                            "compound": sentiment_score["compound"],
                            "created_at": created_utc,
                            "url": submission.get("url", ""),
                        })

# Save to CSV
df = pd.DataFrame(all_team_data)
df.to_csv(r"C:\Users\ninuy\DSC 672\data\sentiment\nba_teams_sentiment_post2017.csv", index=False)

print("Sentiment data for NBA teams (post-2017) collected and saved.")


Processing file: C:\Users\ninuy\DSC 672\Data\reddit\subreddits24\nbacirclejerk_submissions.zst
Processing file: C:\Users\ninuy\DSC 672\Data\reddit\subreddits24\nbadiscussion_submissions.zst
Processing file: C:\Users\ninuy\DSC 672\Data\reddit\subreddits24\NBAForums_submissions.zst
Processing file: C:\Users\ninuy\DSC 672\Data\reddit\subreddits24\nba_submissions.zst
Processing file: C:\Users\ninuy\DSC 672\Data\reddit\subreddits24\sportsbetting_submissions.zst
Sentiment data for NBA teams (post-2017) collected and saved.
