In [5]:
import pandas as pd
import zstandard as zstd
import json

In [6]:
paths = 'data/UCSD_submissions.zst'

In [7]:
data = []
with open("data/UCSD_submissions.zst", 'rb') as fh:
    dctx = zstd.ZstdDecompressor(max_window_size=2147483648)
    with dctx.stream_reader(fh) as reader:
        previous_line = ""
        while True:
            chunk = reader.read(2**24)  # 16mb chunks
            if not chunk:
                break

            string_data = chunk.decode('utf-8')
            lines = string_data.split("\n")
            for i, line in enumerate(lines[:-1]):
                if i == 0:
                    line = previous_line + line
                object = json.loads(line)
                data.append(object)
                # do something with the object here
            previous_line = lines[-1]
with open("data/UCSD_comments.zst", 'rb') as fh:
    dctx = zstd.ZstdDecompressor(max_window_size=2147483648)
    with dctx.stream_reader(fh) as reader:
        previous_line = ""
        while True:
            chunk = reader.read(2**24)  # 16mb chunks
            if not chunk:
                break

            string_data = chunk.decode('utf-8')
            lines = string_data.split("\n")
            for i, line in enumerate(lines[:-1]):
                if i == 0:
                    line = previous_line + line
                object = json.loads(line)
                data.append(object)
                # do something with the object here
            previous_line = lines[-1]
df = pd.DataFrame(data)

In [8]:
df.head()

Unnamed: 0,archived,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_text_color,author_flair_type,brand_safe,can_gild,...,is_submitter,quarantined,steward_reports,associated_award,collapsed_because_crowd_control,comment_type,collapsed_reason_code,author_is_blocked,unrepliable_reason,editable
0,True,spleeyah,,,"[{'e': 'text', 't': 'Class of '12 | Q29tcHV0ZX...",Class of '12 | Q29tcHV0ZXIgU2NpZW5jZQo= (B.S.),,richtext,True,True,...,,,,,,,,,,
1,True,spleeyah,,,"[{'e': 'text', 't': 'Class of '12 | Q29tcHV0ZX...",Class of '12 | Q29tcHV0ZXIgU2NpZW5jZQo= (B.S.),,richtext,True,True,...,,,,,,,,,,
2,True,[deleted],,,,,dark,,True,False,...,,,,,,,,,,
3,True,xxbondsxx,,,[],,,text,True,True,...,,,,,,,,,,
4,True,SnowdensOfYesteryear,,,[],,,text,True,True,...,,,,,,,,,,


In [9]:
# see all the columns
for i in range(0, len(df.columns), 10):
    print(df.columns[i:i+10])

Index(['archived', 'author', 'author_flair_background_color',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_text_color', 'author_flair_type', 'brand_safe',
       'can_gild'],
      dtype='object')
Index(['contest_mode', 'created_utc', 'distinguished', 'domain', 'edited',
       'gilded', 'hidden', 'hide_score', 'id', 'is_crosspostable'],
      dtype='object')
Index(['is_reddit_media_domain', 'is_self', 'is_video', 'link_flair_css_class',
       'link_flair_richtext', 'link_flair_text', 'link_flair_text_color',
       'link_flair_type', 'locked', 'media'],
      dtype='object')
Index(['media_embed', 'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'retrieved_on', 'rte_mode',
       'score'],
      dtype='object')
Index(['secure_media', 'secure_media_embed', 'selftext', 'send_replies',
       'spoiler', 'stickied', 'subreddit', 'subreddit_id',
       'subreddit_name_prefixed', '

In [11]:
df.shape

(814431, 146)

## Data Wrangling

To reduce the dataset and keep it relevant, I will only use the posts from beginning of 2017 to end of 2022.

In [12]:
dates = pd.to_datetime(df['created_utc'], unit = 's')

In [13]:
# to reduce the dataset, find entries that date from the beginning of 2017
df['created_date'] = dates
new_df = df[df['created_date'] >= '2017-1-01']

In [14]:
new_df.shape

(706829, 147)

In [15]:
# get necessary text from selftext (the content of the original submissions)
# and the bodies of comments of the submissions
new_df = new_df[['selftext','body']]

In [16]:
# we can see that one of the two, selftext and body columns, are nan as they only can have one or the other.
new_df.head()

Unnamed: 0,selftext,body
12183,Happy New Year UCSD! Hope everyone reflected o...,
12184,Friday so you can have the weekend to chill? T...,
12185,Since the S. Quarterly parking permit isn't av...,
12186,I'm at a loss nowadays. I'm a third year NanoE...,
12187,"Hey guys, \n\nI'm wondering what the best way ...",


In [17]:
# set dataframe to contain both selftext and body columns into a combined column called text
text = new_df['selftext'].fillna(new_df['body'])
combined = pd.DataFrame(text)
combined.columns = ['text']
combined.head()

Unnamed: 0,text
12183,Happy New Year UCSD! Hope everyone reflected o...
12184,Friday so you can have the weekend to chill? T...
12185,Since the S. Quarterly parking permit isn't av...
12186,I'm at a loss nowadays. I'm a third year NanoE...
12187,"Hey guys, \n\nI'm wondering what the best way ..."


In [18]:
# remove all the '[removed]', '[deleted]', and empty entries for text
cond1 = ~(combined['text'] == '')
cond2 = ~(combined['text'] == '[removed]')
cond3 = ~(combined['text'] == '[deleted]')
combined = combined[cond1 & cond2 & cond3]
combined.shape

(624471, 1)

In [19]:
# save combined
combined.head()

Unnamed: 0,text
12183,Happy New Year UCSD! Hope everyone reflected o...
12184,Friday so you can have the weekend to chill? T...
12185,Since the S. Quarterly parking permit isn't av...
12186,I'm at a loss nowadays. I'm a third year NanoE...
12187,"Hey guys, \n\nI'm wondering what the best way ..."


In [20]:
combined.iloc[0:10]['text']

12183    Happy New Year UCSD! Hope everyone reflected o...
12184    Friday so you can have the weekend to chill? T...
12185    Since the S. Quarterly parking permit isn't av...
12186    I'm at a loss nowadays. I'm a third year NanoE...
12187    Hey guys, \n\nI'm wondering what the best way ...
12188    my friend is switching out of an impacted majo...
12189    If I ask for a grade change for last quarter n...
12190    I forgot my ID and room key at home so I need ...
12191    I'm a high school junior in the SoCal area int...
12192          Sick of dining hall food so help me out fam
Name: text, dtype: object

In [24]:
import torch
from collections import Counter
from torch.utils.data import Dataset, DataLoader

In [25]:
print(torch.cuda.is_available())  # Check if CUDA is available
print(torch.cuda.device_count())  # Check the number of GPUs available
print(torch.cuda.get_device_name(0))  # Get the name of the available GPU

False
0


AssertionError: Torch not compiled with CUDA enabled