## PushShift.io to Request Subreddits

In [None]:
import requests
import pandas as pd
import time
import datetime as dt
import pickle
pd.set_option('display.max_columns', None)

#### Create function to request 10,000 subreddits for each class.

In [2]:
def get_subreddit(subreddit, n_samples):
    url = f'https://api.pushshift.io/reddit/submission/search'
    last_time = round(time.time())
    post_list = []
    while len(post_list) < n_samples:
        params = {
              'subreddit':subreddit,
              'sort':'desc',
              'size':n_samples,
              'before':last_time,
             }
        res = requests.get(url, params = params)
        posts = res.json()['data']
        if len(posts) == 0:
            last_time = last_time
            post_list.extend(posts)
        else:
            last_time = dt.datetime.fromtimestamp(posts[-1]["created_utc"])
            post_list.extend(posts)
            time.sleep(1) 
    return pd.DataFrame(post_list)

### Get Subreddit 1

In [4]:
%%time
df1 = get_subreddit('nosleep', n_samples=10_000)

CPU times: user 2.12 s, sys: 277 ms, total: 2.4 s
Wall time: 36.8 s


In [5]:
df1['created_utc'].nunique()

9994

In [6]:
df1.shape

(10000, 72)

In [7]:
df1.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,awarders,can_mod_post,content_categories,contest_mode,created_utc,domain,full_link,gildings,id,is_crosspostable,is_meta,is_original_content,is_reddit_media_domain,is_robot_indexable,is_self,is_video,link_flair_background_color,link_flair_richtext,link_flair_text,link_flair_text_color,link_flair_type,locked,media_only,no_follow,num_comments,num_crossposts,over_18,parent_whitelist_status,permalink,pinned,pwls,removed_by_category,retrieved_on,score,selftext,send_replies,spoiler,stickied,subreddit,subreddit_id,subreddit_subscribers,subreddit_type,thumbnail,title,total_awards_received,url,whitelist_status,wls,link_flair_css_class,link_flair_template_id,author_flair_background_color,author_flair_text_color,banned_by,edited,post_hint,preview,steward_reports,author_cakeday,gilded,distinguished,removed_by,updated_utc
0,[],False,rhonnie14,,[],,text,t2_f9qc2,False,False,[],False,[writing],False,1580239970,self.nosleep,https://www.reddit.com/r/nosleep/comments/eva9...,{},eva9b6,False,False,False,False,False,True,False,,[],Removed | Non-horror,dark,text,False,False,True,0,0,False,all_ads,/r/nosleep/comments/eva9b6/late_night_laundry/,False,6,moderator,1580246003,1,[removed],True,False,False,nosleep,t5_2rm4d,13739499,public,self,Late Night Laundry,0,https://www.reddit.com/r/nosleep/comments/eva9...,all_ads,6,,,,,,,,,,,,,,
1,[],False,AdonisAndOmega,,[],,text,t2_8h7vtwy,False,False,[],False,[writing],False,1580239486,self.nosleep,https://www.reddit.com/r/nosleep/comments/eva5...,{},eva589,False,False,False,False,False,True,False,,[],Series,dark,text,False,False,True,1,0,False,all_ads,/r/nosleep/comments/eva589/why_did_the_moon_di...,False,6,moderator,1580245582,2,[removed],True,False,False,nosleep,t5_2rm4d,13739485,public,self,Why Did The Moon Disappear? [Part One],0,https://www.reddit.com/r/nosleep/comments/eva5...,all_ads,6,flair-series,8beec82a-dcc1-11e8-a09f-0e09eae1a1c0,,,,,,,,,,,,
2,[],False,Izukumidoriya123,,[],,text,t2_2zcuit8m,False,False,[],False,[writing],False,1580239395,self.nosleep,https://www.reddit.com/r/nosleep/comments/eva4...,{},eva4fv,False,False,False,False,False,True,False,,[],,dark,text,False,False,True,1,0,False,all_ads,/r/nosleep/comments/eva4fv/you_dont_die_of_old...,False,6,moderator,1580245489,1,[removed],True,False,False,nosleep,t5_2rm4d,13739485,public,self,You don't die of old age (Part 1),0,https://www.reddit.com/r/nosleep/comments/eva4...,all_ads,6,,,,,,,,,,,,,,
3,[],False,AdonisAndOmega,,[],,text,t2_8h7vtwy,False,False,[],False,[writing],False,1580238771,self.nosleep,https://www.reddit.com/r/nosleep/comments/ev9z...,{},ev9z79,False,False,False,False,False,True,False,,[],Series,dark,text,False,False,True,1,0,False,all_ads,/r/nosleep/comments/ev9z79/why_did_the_moon_di...,False,6,moderator,1580243755,1,[removed],True,False,False,nosleep,t5_2rm4d,13739437,public,self,Why Did The Moon Disappear? [True Story] (part...,0,https://www.reddit.com/r/nosleep/comments/ev9z...,all_ads,6,flair-series,,,,,,,,,,,,,
4,[],False,[deleted],,,,,,,,[],False,[writing],False,1580237605,self.nosleep,https://www.reddit.com/r/nosleep/comments/ev9p...,{},ev9p6f,False,False,False,False,False,True,False,,[],,dark,text,False,False,True,2,0,False,all_ads,/r/nosleep/comments/ev9p6f/the_door_to_hell/,False,6,deleted,1580240898,1,,True,False,False,nosleep,t5_2rm4d,13739354,public,default,The Door To Hell,0,https://www.reddit.com/r/nosleep/comments/ev9p...,all_ads,6,,,,dark,moderators,,,,,,,,,


In [8]:
df1.isnull().sum()

all_awardings                0
allow_live_comments          0
author                       0
author_flair_css_class    9942
author_flair_richtext      114
                          ... 
author_cakeday            9975
gilded                    9977
distinguished             9999
removed_by                9545
updated_utc               9256
Length: 72, dtype: int64

In [9]:
subreddit_cols = list(df1.columns)
subreddit_cols;

In [10]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 72 columns):
all_awardings                    10000 non-null object
allow_live_comments              10000 non-null bool
author                           10000 non-null object
author_flair_css_class           58 non-null object
author_flair_richtext            9886 non-null object
author_flair_text                87 non-null object
author_flair_type                9886 non-null object
author_fullname                  9886 non-null object
author_patreon_flair             9886 non-null object
author_premium                   6086 non-null object
awarders                         10000 non-null object
can_mod_post                     10000 non-null bool
content_categories               10000 non-null object
contest_mode                     10000 non-null bool
created_utc                      10000 non-null int64
domain                           10000 non-null object
full_link                     

In [11]:
df1['selftext'].isnull().sum()

93

In [12]:
df1['title_self_text'] = df1['title'] + df1['selftext']

In [13]:
df1 = df1[['title', 'selftext', 'title_self_text', 'created_utc', 'subreddit']]
df1

Unnamed: 0,title,selftext,title_self_text,created_utc,subreddit
0,Late Night Laundry,[removed],Late Night Laundry[removed],1580239970,nosleep
1,Why Did The Moon Disappear? [Part One],[removed],Why Did The Moon Disappear? [Part One][removed],1580239486,nosleep
2,You don't die of old age (Part 1),[removed],You don't die of old age (Part 1)[removed],1580239395,nosleep
3,Why Did The Moon Disappear? [True Story] (part...,[removed],Why Did The Moon Disappear? [True Story] (part...,1580238771,nosleep
4,The Door To Hell,,,1580237605,nosleep
...,...,...,...,...,...
9995,Do not revisit childhood memories.,[Part 1](https://www.reddit.com/r/nosleep/comm...,Do not revisit childhood memories.[Part 1](htt...,1572582723,nosleep
9996,The Spider Bite,Halloween has always been my favorite holiday....,The Spider BiteHalloween has always been my fa...,1572582465,nosleep
9997,My brother collected eyes.,My brother collected eyes. Our mother said tha...,My brother collected eyes.My brother collected...,1572579807,nosleep
9998,I transcribe videos for the police - this is t...,"“That morning, my fiancé was unusually cold t...",I transcribe videos for the police - this is t...,1572579067,nosleep


### Get Subreddit 2

In [14]:
%%time
df2 = get_subreddit('creepypasta', n_samples=10_000)

CPU times: user 1.37 s, sys: 176 ms, total: 1.55 s
Wall time: 28.5 s


In [15]:
df2.shape

(10000, 82)

In [16]:
# df2.drop_duplicates(inplace=True)

In [17]:
df2['created_utc'].nunique()

9995

In [18]:
df2['title_self_text'] = df2['title'] + df2['selftext']

In [19]:
df2 = df2[['title', 'selftext', 'title_self_text', 'created_utc', 'subreddit']]
df2

Unnamed: 0,title,selftext,title_self_text,created_utc,subreddit
0,"""Accounts from a Lonely Broadcast Station"" [SE...",,"""Accounts from a Lonely Broadcast Station"" [SE...",1580241297,creepypasta
1,Best Series or Long creepypasta,I’m looking for a long series or creepypasta s...,Best Series or Long creepypastaI’m looking for...,1580238886,creepypasta
2,Schrödinger's Paranoia,,Schrödinger's Paranoia,1580234558,creepypasta
3,School shooting youtube video,its about a guy that was getting bullied in sc...,School shooting youtube videoits about a guy t...,1580233347,creepypasta
4,Only two of those are actually creepy. If you ...,,Only two of those are actually creepy. If you ...,1580227521,creepypasta
...,...,...,...,...,...
9995,"24/7 Scary Stories, Nosleep Horror Stories &am...",,"24/7 Scary Stories, Nosleep Horror Stories &am...",1550213213,creepypasta
9996,They Never Learn Creepypasta by IPostAtMidnight,,They Never Learn Creepypasta by IPostAtMidnight,1550211532,creepypasta
9997,Do i pay for Reddit creepypasta?,"Hi, im starting a Polish language creepypasta ...","Do i pay for Reddit creepypasta?Hi, im startin...",1550209406,creepypasta
9998,DDD,"Now this beast has been let loose, and nothing...","DDDNow this beast has been let loose, and noth...",1550208465,creepypasta


### Combine Subreddit DataFrames

In [20]:
df = pd.concat([df1, df2])

In [21]:
df.head()

Unnamed: 0,title,selftext,title_self_text,created_utc,subreddit
0,Late Night Laundry,[removed],Late Night Laundry[removed],1580239970,nosleep
1,Why Did The Moon Disappear? [Part One],[removed],Why Did The Moon Disappear? [Part One][removed],1580239486,nosleep
2,You don't die of old age (Part 1),[removed],You don't die of old age (Part 1)[removed],1580239395,nosleep
3,Why Did The Moon Disappear? [True Story] (part...,[removed],Why Did The Moon Disappear? [True Story] (part...,1580238771,nosleep
4,The Door To Hell,,,1580237605,nosleep


In [22]:
df.shape

(20000, 5)

#### Save to `.csv`

In [29]:
df.to_csv('../../../combined_subreddits.csv', index=False)

### Pickle

In [28]:
pickle.dump(df, open("../assets/combined_subreddits.pkl", "wb"))