# Table of Contents

1. [Imports](#Imports)
2. [Data Cleaning](#Data-Cleaning)

# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import time
from datetime import datetime

In [2]:
def get_reddit(subreddit, count_size, batch_size, pause=2):
    """Download submissions from one subreddit through the Pushshift API.

    Posts are requested in batches. The first request has no time bound; each
    later request passes ``before=<created_utc of the last post collected>``
    so the next batch continues further back in time (this presumes the API
    returns posts newest-first -- confirm against the Pushshift docs).

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to query.
    count_size : int
        Number of batches to request. At least one batch is always requested,
        even if this is 0 or negative.
    batch_size : int
        Value sent as the ``size`` query parameter (posts asked for per batch).
        The API may return fewer posts than asked for; whatever comes back is
        kept.
    pause : float, optional
        Seconds to sleep between consecutive requests (default 2).

    Returns
    -------
    pandas.DataFrame
        One row per post with the columns ``subreddit``, ``author``, ``title``,
        ``selftext``, ``score`` and ``created_utc``. A column that is absent
        from every returned post is filled with NaN instead of raising.

    Raises
    ------
    requests.HTTPError
        If any request comes back with an error status code.
    """
    url = "https://api.pushshift.io/reddit/search/submission"
    columns = ['subreddit', 'author', 'title', 'selftext', 'score', 'created_utc']
    total_rounds = max(count_size, 1)  # keep the "always fetch one batch" behaviour
    reddit_posts = []

    for round_number in range(1, total_rounds + 1):
        params = {
            'subreddit': subreddit,
            'size': batch_size}

        if reddit_posts:
            # Continue from the timestamp of the last post already collected,
            # and wait between requests so the API is not hammered.
            params['before'] = int(reddit_posts[-1]['created_utc'])
            time.sleep(pause)

        res = requests.get(url, params=params, timeout=30)
        res.raise_for_status()  # fail loudly rather than parsing an error page
        posts = res.json()['data']

        if not posts:
            # Nothing older is available; further requests would repeat forever.
            print(f'No posts returned in round {round_number}; stopping early')
            break

        # Keep everything that came back, even if it is fewer than batch_size.
        reddit_posts.extend(posts)

        #progress printouts
        #Code adapted from global lect: NLP I
        if round_number % 20 == 0:
            print(f'Round {round_number} of {count_size}') #printing checks to help visualize that the fxn is working

    print("Creating dataframe...")
    # reindex (rather than df[[...]]) so a field missing from every post
    # becomes an all-NaN column instead of a KeyError.
    df = pd.DataFrame(data = reddit_posts).reindex(columns = columns)

    print("Done")
    return df

# get_reddit returns a DataFrame; assign the result to a variable when calling it.

In [3]:
%%time
df_oni = get_reddit('Oxygennotincluded', 150, 100) # 150 batches x 100 posts = 15_000 posts

Round 20 of 150
Round 40 of 150
Round 60 of 150
Round 80 of 150
Round 100 of 150
Round 120 of 150
Round 140 of 150
Creating dataframe...
Done
CPU times: user 4.36 s, sys: 362 ms, total: 4.72 s
Wall time: 9min 29s


In [4]:
%%time
df_rw = get_reddit('RimWorld', 150, 100) # 150 batches x 100 posts = 15_000 posts


Round 20 of 150
Round 40 of 150
Round 60 of 150
Round 80 of 150
Round 100 of 150
Round 120 of 150
Round 140 of 150
Creating dataframe...
Done
CPU times: user 4.51 s, sys: 322 ms, total: 4.84 s
Wall time: 9min 7s


In [5]:
# Sanity check: 150 batches x 100 posts should give 15,000 rows and the 6 selected columns.
print(df_oni.shape)
df_oni.head()

(15000, 6)


Unnamed: 0,subreddit,author,title,selftext,score,created_utc
0,Oxygennotincluded,TrumpByPublicOpinion,Mod Idea - In game Programmable Logic Controllers,"Hey everyone,\n\nBeen playing the game some ti...",1,1599762597
1,Oxygennotincluded,pannaEmilka,"It's ugly, but it's mine ❤️ and it's working!",,1,1599762459
2,Oxygennotincluded,The_Mr_Tact,Get to work bum! :D,,1,1599752105
3,Oxygennotincluded,DaHedgehog27,Mod Request - Removing Construction Supply fro...,I have zero clue when it comes to modding this...,1,1599752065
4,Oxygennotincluded,spiralmadness,Cycle 1337: Solo dupe all (possible) achievements,,1,1599750841


In [6]:
# Sanity check: 150 batches x 100 posts should give 15,000 rows and the 6 selected columns.
print(df_rw.shape)
df_rw.head()

(15000, 6)


Unnamed: 0,subreddit,author,title,selftext,score,created_utc
0,RimWorld,Liftedsafe01,Patricia Hall vjacheslavgerasimov9,[removed],1,1599778395
1,RimWorld,_SnooPeppers_,They start so young these days...,,1,1599777668
2,RimWorld,Tango-454,hey i found your rimworld tribal gameplay,,1,1599776037
3,RimWorld,HDnfbp,"Just came back to the game, randy send me a go...",&amp;#x200B;\n\nhttps://preview.redd.it/1ce5za...,1,1599775881
4,RimWorld,Ecco_Shack,"Help, I'm scared....",,1,1599775728


In [7]:
# Stack both subreddits into one frame. pd.concat already builds a new object,
# so the inputs need no defensive copy; ignore_index=True gives the combined
# frame unique row labels instead of repeating 0..n-1 once per subreddit.
df = pd.concat([df_oni, df_rw], ignore_index=True)

In [8]:
# Both subreddits combined: expect 15,000 + 15,000 rows and the same 6 columns.
df.shape

(30000, 6)

# Data Cleaning

In [9]:
def clean_selftext(cell):
    """Turn placeholder post bodies into NaN and pass everything else through.

    A cell equal to the empty string, '[removed]' or '[deleted]' carries no
    usable text, so it is replaced with ``np.nan``. Any other value is
    returned unchanged.
    """
    placeholders = ("", '[removed]', '[deleted]')
    for marker in placeholders:
        if cell == marker:
            return np.nan
    return cell

In [10]:
# Replace empty / '[removed]' / '[deleted]' post bodies with NaN so they count as missing.
df['selftext'] = df['selftext'].map(clean_selftext)

In [11]:
def get_time(utc_cell):
    """Convert a Unix timestamp into a 'YYYY-MM-DD HH:MM:SS' string.

    The conversion uses ``datetime.fromtimestamp`` without a timezone, so the
    result is expressed in the local timezone of the machine running the code.

    Parameters
    ----------
    utc_cell : int or float
        Seconds since the Unix epoch; may be missing (NaN / None).

    Returns
    -------
    str or float
        The formatted local time, or ``np.nan`` when the input is missing.
    """
    # NaN never compares equal to anything (including itself), so a check like
    # `utc_cell != np.nan` is always True; pd.isna is the reliable missing test.
    if pd.isna(utc_cell):
        return np.nan
    #code adapted from https://www.programiz.com/python-programming/datetime/timestamp-datetime
    return str(datetime.fromtimestamp(utc_cell))

In [12]:
# Human-readable version of created_utc. get_time uses datetime.fromtimestamp, which
# converts to the local timezone of the machine running the notebook, so these values
# are Eastern time only when the kernel runs in the Eastern timezone.
df['eastern_time'] = df['created_utc'].map(get_time)

In [13]:
# Missing values per column; selftext is the only column that was blanked out above.
df.isna().sum()

subreddit           0
author              0
title               0
selftext        13423
score               0
created_utc         0
eastern_time        0
dtype: int64

In [14]:
# Drop every row that has a missing value in any column, modifying df in place.
# Per the null counts above, this removes the posts without a usable selftext.
df.dropna(inplace = True)

In [15]:
# Confirm the drop: remaining row count, and zero missing values in every column.
print(df.shape)
df.isna().sum()

(16577, 7)


subreddit       0
author          0
title           0
selftext        0
score           0
created_utc     0
eastern_time    0
dtype: int64

In [16]:
# Preview the cleaned rows; the index now has gaps where rows were dropped.
df.head()

Unnamed: 0,subreddit,author,title,selftext,score,created_utc,eastern_time
0,Oxygennotincluded,TrumpByPublicOpinion,Mod Idea - In game Programmable Logic Controllers,"Hey everyone,\n\nBeen playing the game some ti...",1,1599762597,2020-09-10 14:29:57
3,Oxygennotincluded,DaHedgehog27,Mod Request - Removing Construction Supply fro...,I have zero clue when it comes to modding this...,1,1599752065,2020-09-10 11:34:25
5,Oxygennotincluded,KittehNevynette,Fertiliser flatulence power question. How much?,I saw a YT video where fertiliser synthesisers...,1,1599745896,2020-09-10 09:51:36
7,Oxygennotincluded,Turalyon135,Automation question,Is there a way using a gate or any kind to hav...,1,1599737435,2020-09-10 07:30:35
8,Oxygennotincluded,FoxRealistic3370,Perseverance or Experience,SOOOOO im on about cycle 1500 of my first base...,1,1599730988,2020-09-10 05:43:08


In [17]:
# Renumber the rows 0..n-1 in place; drop=True discards the old, gappy index
# instead of keeping it as a new column.
df.reset_index(drop=True, inplace=True)

In [18]:
# Words per post body: split each text on whitespace and count the pieces.
# The vectorized .str accessor avoids a Python-level loop and does not depend
# on the index being exactly 0..n-1.
df['post_word_count'] = df['selftext'].str.split().str.len()

In [19]:
# Words per title: split each title on whitespace and count the pieces.
# The vectorized .str accessor avoids a Python-level loop and does not depend
# on the index being exactly 0..n-1.
df['title_word_count'] = df['title'].str.split().str.len()

In [20]:
# Preview the frame with the two new word-count columns.
df.head()

Unnamed: 0,subreddit,author,title,selftext,score,created_utc,eastern_time,post_word_count,title_word_count
0,Oxygennotincluded,TrumpByPublicOpinion,Mod Idea - In game Programmable Logic Controllers,"Hey everyone,\n\nBeen playing the game some ti...",1,1599762597,2020-09-10 14:29:57,108,8
1,Oxygennotincluded,DaHedgehog27,Mod Request - Removing Construction Supply fro...,I have zero clue when it comes to modding this...,1,1599752065,2020-09-10 11:34:25,104,8
2,Oxygennotincluded,KittehNevynette,Fertiliser flatulence power question. How much?,I saw a YT video where fertiliser synthesisers...,1,1599745896,2020-09-10 09:51:36,159,6
3,Oxygennotincluded,Turalyon135,Automation question,Is there a way using a gate or any kind to hav...,1,1599737435,2020-09-10 07:30:35,148,2
4,Oxygennotincluded,FoxRealistic3370,Perseverance or Experience,SOOOOO im on about cycle 1500 of my first base...,1,1599730988,2020-09-10 05:43:08,294,3


In [22]:
# df.to_csv('./data/subreddits.csv', index=False)