In [33]:
#!pip install --upgrade networkx==2.6
#!pip install --upgrade scipy==1.8.0
#!pip install praw
#!pip install stanza
#!pip install --upgrade scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.2.1-cp39-cp39-win_amd64.whl (8.4 MB)
Collecting joblib>=1.1.1
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
Installing collected packages: joblib, scikit-learn
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.0
    Uninstalling joblib-1.1.0:
      Successfully uninstalled joblib-1.1.0
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.24.2
    Uninstalling scikit-learn-0.24.2:
      Successfully uninstalled scikit-learn-0.24.2
Successfully installed joblib-1.2.0 scikit-learn-1.2.1


In [1]:
import re
import os
import io
import praw
import time
import math
import string

import numpy as np
import pandas as pd
import datetime as dt

from wordcloud import WordCloud
from string import punctuation

<h3>1. Getting text data from Reddit with PRAW</h3>

In [None]:
#authentication information for PRAW
#credentials are read from the environment so they never live in the notebook:
#set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before starting the kernel
#(a missing variable raises KeyError naming it)
#NOTE(review): any id/secret that was ever saved in this notebook should be
#treated as leaked and regenerated in the Reddit app settings
client_id = os.environ['REDDIT_CLIENT_ID']
secret = os.environ['REDDIT_CLIENT_SECRET']

#adjacent string literals are joined into one single-line value; a raw string
#with backslash-newline continuations keeps both the backslash and the newline
#inside the header value
user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/109.0.0.0 Safari/537.36"
)

#initialize connection
reddit = praw.Reddit(
    client_id = client_id,
    client_secret = secret,
    user_agent = user_agent,
)

In [None]:
#column layout of the stats dataframe: one row per post or comment
df_columns = [
    'created', 'id',
    'content', 'num_comments',
    'score', 'type',
    'post_affiliation'
]

#rows are collected in a plain list and turned into a dataframe once at the end;
#inserting with df.loc[len(df)] gets slower as the frame grows
records = []

#get top 1000 most popular posts over the last year
subreddit = reddit.subreddit('HarryPotterGame')
for submission in subreddit.top(limit = 1000, time_filter = 'year'):
    #print and save post info
    print(f'{dt.datetime.fromtimestamp(submission.created_utc)} |\
    post_id: {submission}\
    upvotes: {submission.score} |\
    comment count: {submission.num_comments}')
    print(f'{submission.title}\n')
    print("*"*100)

    #store the id string rather than the PRAW object so the column holds plain text
    records.append([
        dt.datetime.fromtimestamp(submission.created_utc),
        submission.id,
        submission.title,
        submission.num_comments,
        submission.score,
        'post',
        np.nan
    ])

    #the comment forest can contain MoreComments placeholders, which have no
    #body/score/created_utc; limit = 0 removes them without extra requests
    submission.comments.replace_more(limit = 0)

    #print and save comment info (first 40 top-level comments at most)
    for comment in submission.comments[:40]:
        print(f"created: {dt.datetime.fromtimestamp(comment.created_utc)} |\
        upvotes: {comment.score} |\
        comment_id: {comment}")
        print(f'{comment.body}\n')

        records.append([
            dt.datetime.fromtimestamp(comment.created_utc),
            comment.id,
            comment.body,
            np.nan,
            comment.score,
            'comment',
            submission.id
        ])
    print('='*100)

#build the dataframe holding submission stats in one go
df = pd.DataFrame(records, columns = df_columns)
print('completed')

In [2]:
#read the previously saved posts and comments back in, then preview the first rows
csv_path = 'top_posts_year.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,created,id,content,num_comments,score,type,post_affiliation
0,10/2/2023 5:01,10y62p3,It be crucial,298.0,6642,post,
1,10/2/2023 7:13,j7wremr,I hate casting Revelio just slightly too far a...,,714,comment,10y62p3
2,10/2/2023 6:33,j7wlltr,There’s a talent to upgrade the distance. Anyo...,,114,comment,10y62p3
3,10/2/2023 9:03,j7x6q78,Quality of life change......keep Lumos on when...,,100,comment,10y62p3
4,10/2/2023 5:14,j7w97d8,I just hate that it took me a solid 5 hours be...,,336,comment,10y62p3


In [3]:
#get month and year of creation for splitting of data later
#the csv stores day-first timestamps ("10/2/2023 5:01" is 10 Feb 2023); without
#dayfirst pandas reads ambiguous dates month-first, which would put that row in
#October and corrupt both the month column and the early-access split
df['created'] = pd.to_datetime(df['created'], dayfirst = True)
df['month'] = df['created'].dt.month
df['year'] = df['created'].dt.year

In [8]:
#condense into function requiring only main dataframe, post dataframe, number of posts to grab
def get_sample(df, post_df, n_posts, rs, n_comments = 20):
    """Draw a sample of posts, stratified by (month, year), plus comments for each.

    Parameters
    ----------
    df : DataFrame holding posts and comments; must contain the columns
        'id', 'post_affiliation', 'month' and 'year'.
    post_df : DataFrame with the candidate posts (rows of df), same columns.
    n_posts : minimum number of posts wanted. Per-period counts are rounded up,
        so slightly more can be returned; fewer are returned only when there
        are not enough eligible posts.
    rs : random_state forwarded to every DataFrame.sample call.
    n_comments : comments sampled per selected post; posts with fewer comments
        than this in df are not eligible. Defaults to 20.

    Returns
    -------
    DataFrame with, for each period in sorted order, the selected posts followed
    by the sampled comments of each of those posts. Empty (with df's columns)
    when no post is eligible.
    """
    #keep only posts that have enough comments to sample from
    comment_counts = df['post_affiliation'].value_counts()
    eligible_ids = comment_counts.index[comment_counts >= n_comments]
    post_pool = post_df.loc[post_df['id'].isin(eligible_ids)]
    if post_pool.empty:
        return pd.DataFrame(columns = df.columns)

    #proportion of posts/comments to take from each time period
    #(relative to all candidate posts, so the values need not sum to 1)
    proportions = post_pool.groupby(['month', 'year']).size() / len(post_df.index)

    #smallest multiplier whose rounded-up per-period counts reach n_posts;
    #every proportion is positive, so the search always terminates
    multiplier = 0
    while sum(math.ceil(p * multiplier) for p in proportions) < n_posts:
        multiplier += 1

    #posts and comments are gathered here and concatenated once at the end
    pieces = []

    #for posts from each time period
    for (month, year), prop in proportions.items():
        in_period = post_pool.loc[(post_pool['month'] == month) & (post_pool['year'] == year)]
        #never ask for more posts than the period actually has
        n_take = min(math.ceil(prop * multiplier), len(in_period.index))
        selected_posts = in_period.sample(n = n_take, random_state = rs)
        pieces.append(selected_posts)

        #randomly sample n_comments comments for every selected post
        for pid in selected_posts['id']:
            comment_pool = df.loc[df['post_affiliation'] == pid]
            pieces.append(comment_pool.sample(n = n_comments, random_state = rs))

    return pd.concat(pieces)

In [10]:
#early access date of 7 Feb 2023 6pm GMT, i.e. hour 18 on the 24-hour clock
#(hour 6 would be 6am)
#NOTE(review): if 'created' was written with datetime.fromtimestamp it is in the
#scraping machine's local time, not GMT - confirm the cutoff is in the same zone
early_access = dt.datetime(2023, 2, 7, 18, 0, 0)
created = pd.to_datetime(df['created'])
is_post = df['type'] == 'post'

#posts before early access date
posts_bef = df.loc[(created < early_access) & is_post]

#posts from the early access date onwards (>= so a post created exactly at the
#cutoff is not dropped from both halves)
posts_aft = df.loc[(created >= early_access) & is_post]

In [27]:
#pd.concat([sample_bef, sample_aft]).to_csv('to_label.csv', index = False)