In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import datetime as dt
import time
import requests
import json

In [3]:
# Collect one sample page of submissions to see which columns the API returns
url = 'https://api.pushshift.io/reddit/search/submission/?subreddit=unresolvedmysteries&before=1642778603&size=100'
# A timeout keeps the cell from hanging forever if the API never answers
res = requests.get(url, timeout=30)
# Surface HTTP errors (4xx/5xx) here rather than letting them reach the parsing below
res.raise_for_status()
data = pd.DataFrame(res.json()['data'])
data.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_is_blocked',
       'author_patreon_flair', 'author_premium', 'awarders', 'can_mod_post',
       'contest_mode', 'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_created_from_ads_ui', 'is_crosspostable', 'is_meta',
       'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable',
       'is_self', 'is_video', 'link_flair_background_color',
       'link_flair_css_class', 'link_flair_richtext', 'link_flair_template_id',
       'link_flair_text', 'link_flair_text_color', 'link_flair_type', 'locked',
       'media_only', 'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'pwls',
       'retrieved_on', 'score', 'selftext', 'send_replies', 'spoiler',
       'stickied', 'subreddit', 'subreddit_id', 'subreddit_sub

In [4]:
# Show the dtype pandas assigned to each column of the sample page
data.dtypes

all_awardings                    object
allow_live_comments                bool
author                           object
author_flair_css_class           object
author_flair_richtext            object
                                  ...  
preview                          object
removed_by_category              object
author_flair_background_color    object
author_flair_text_color          object
banned_by                        object
Length: 69, dtype: object

In [6]:
# Look at the sample posts whose body text was removed.
# 'selftext' carries the bulk of the text, so the goal is to collect only
# posts whose selftext is neither '[removed]' nor missing.
data.loc[data['selftext'].eq('[removed]')]

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,...,upvote_ratio,url,whitelist_status,wls,post_hint,preview,removed_by_category,author_flair_background_color,author_flair_text_color,banned_by
2,[],False,Starasolum,,[],,text,t2_agtepedt,False,False,...,1.0,https://www.reddit.com/r/UnresolvedMysteries/c...,all_ads,6,,,moderator,,,
5,[],False,oopadoops,,[],,text,t2_9lk1g536,False,False,...,1.0,https://www.reddit.com/r/UnresolvedMysteries/c...,all_ads,6,,,moderator,,,
6,[],False,DE4D2000,,[],,text,t2_6a84398y,False,False,...,1.0,https://www.reddit.com/r/UnresolvedMysteries/c...,all_ads,6,,,moderator,,,
9,[],False,BitterRecover3379,,[],,text,t2_cn4mwr9s,False,False,...,1.0,https://www.reddit.com/r/UnresolvedMysteries/c...,all_ads,6,self,"{'enabled': False, 'images': [{'id': 'MDOgL7IH...",moderator,,,
10,[],False,winnievelvet98,,[],,text,t2_88c2fvkm,False,False,...,1.0,https://www.reddit.com/r/UnresolvedMysteries/c...,all_ads,6,,,moderator,,,
14,[],False,heytherefakenerds,,[],,text,t2_1qapb1wk,False,False,...,1.0,https://www.reddit.com/r/UnresolvedMysteries/c...,all_ads,6,,,moderator,,,
17,[],False,prajitoruldinoz,,[],,text,t2_11ta0u,False,False,...,1.0,https://www.reddit.com/r/UnresolvedMysteries/c...,all_ads,6,self,"{'enabled': False, 'images': [{'id': 'GG8oNekR...",moderator,,,
18,[],False,prajitoruldinoz,,[],,text,t2_11ta0u,False,False,...,1.0,https://www.reddit.com/r/UnresolvedMysteries/c...,all_ads,6,self,"{'enabled': False, 'images': [{'id': 'GG8oNekR...",moderator,,,
19,[],False,j0nd0,,[],,text,t2_12961x,False,False,...,1.0,https://www.reddit.com/r/UnresolvedMysteries/c...,all_ads,6,self,"{'enabled': False, 'images': [{'id': 'trgr9WRW...",moderator,,,
20,[],False,j0nd0,,[],,text,t2_12961x,False,False,...,1.0,https://www.reddit.com/r/UnresolvedMysteries/c...,all_ads,6,self,"{'enabled': False, 'images': [{'id': 'dfrGj5kp...",moderator,,,


It appears that the string '[removed]' in 'selftext' is the key to filtering out empty posts.

I also want to make sure the 'subreddit' feature correctly identifies where each post comes from. Could it include crossposts?

In [8]:
# Submission fields worth keeping.
# Only a small subset is collected, to keep the later data cleaning manageable.
features = [
    'author', 'created_utc', 'selftext',
    'subreddit', 'title', 'total_awards_received',
]

In [9]:
# This function is built on Chuck's Breakfast Hour example
def pushshift_query(subreddit='unresolvedmysteries', features=features, num_loops=10,
                    before=1642778603, pause=5, timeout=30):
    """Page backwards through a subreddit's submissions via the Pushshift API.

    Each loop requests one page (``size=100``) of submissions using the current
    cursor as the ``before`` query parameter, keeps only the requested fields,
    and then moves the cursor to the smallest ``created_utc`` collected so far
    so that the next request asks for older posts.

    Parameters
    ----------
    subreddit : str
        Subreddit name placed in the request URL.
    features : list of str
        Submission fields to keep. A field missing from a post is stored as
        ``np.nan``.
    num_loops : int
        Maximum number of requests to make.
    before : int
        Starting cursor. The default is the value the notebook has always used.
    pause : float
        Seconds to sleep between consecutive requests.
    timeout : float
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per collected post, one column per requested feature.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    current_time = before
    oldest_seen = None
    posts = []
    for loop_number in range(num_loops):
        url = f'https://api.pushshift.io/reddit/search/submission/?subreddit={subreddit}&before={current_time}&size=100'
        res = requests.get(url, timeout=timeout)
        res.raise_for_status()
        batch = res.json()['data']
        if not batch:
            # Nothing older is available; further requests would return the same empty page
            print(f'No posts returned before {current_time}. Stopping early.')
            break

        posts.extend(
            {feature: post.get(feature, np.nan) for feature in features}
            for post in batch
        )

        # Read the timestamp from the raw post so the cursor advances even when
        # 'created_utc' is not one of the requested features.
        batch_oldest = min(
            (post['created_utc'] for post in batch if 'created_utc' in post),
            default=None,
        )
        if batch_oldest is None:
            # Without a timestamp the cursor cannot move; stop instead of refetching this page
            print(f'No created_utc in posts before {current_time}. Stopping early.')
            break
        if oldest_seen is None or batch_oldest < oldest_seen:
            oldest_seen = batch_oldest
        current_time = oldest_seen

        print(f'Posts before {current_time}. Current data frame has {len(posts)} rows')
        # Wait between requests only; there is nothing to wait for after the last one
        if loop_number < num_loops - 1:
            time.sleep(pause)
    return pd.DataFrame(posts)

In [10]:
# Collect r/UnresolvedMysteries submissions, leaving num_loops at the function's default
data_unre = pushshift_query(subreddit='unresolvedmysteries', features=features)

Posts before 1642229447. Current data frame has 100 rows
Posts before 1641917400. Current data frame has 200 rows
Posts before 1641402915. Current data frame has 300 rows
Posts before 1640904628. Current data frame has 400 rows
Posts before 1640206221. Current data frame has 500 rows
Posts before 1639654755. Current data frame has 600 rows
Posts before 1639054652. Current data frame has 700 rows
Posts before 1638531315. Current data frame has 800 rows
Posts before 1638137112. Current data frame has 900 rows
Posts before 1637584252. Current data frame has 1000 rows


In [11]:
# Check for duplicates: count rows that exactly repeat an earlier row in every column
data_unre.duplicated().sum()

1

In [12]:
# Check for NaNs: number of missing values in each collected column
data_unre.isna().sum()

author                    0
created_utc               0
selftext                 12
subreddit                 0
title                     0
total_awards_received     0
dtype: int64

In [13]:
# Confirm the size of the collected frame as (rows, columns)
data_unre.shape

(1000, 6)

In [15]:
# Save the collected r/UnresolvedMysteries posts to CSV.
# NOTE(review): the row index is written too (index=False is not passed), so reading the
# file back will presumably add an 'Unnamed: 0' column; confirm downstream loaders expect it.
data_unre.to_csv('../data/unresolved.csv')

In [16]:
# Repeat the collection for r/UnsolvedMysteries with the same fields
data_unsol = pushshift_query(subreddit='unsolvedmysteries', features=features)

Posts before 1640932960. Current data frame has 100 rows
Posts before 1638513082. Current data frame has 200 rows
Posts before 1636704913. Current data frame has 300 rows
Posts before 1634841846. Current data frame has 400 rows
Posts before 1633270156. Current data frame has 499 rows
Posts before 1631883926. Current data frame has 599 rows
Posts before 1629965327. Current data frame has 699 rows
Posts before 1627594091. Current data frame has 799 rows
Posts before 1625425551. Current data frame has 899 rows
Posts before 1623457357. Current data frame has 999 rows


In [17]:
# Save the collected r/UnsolvedMysteries posts to CSV.
# NOTE(review): the row index is written too (index=False is not passed), so reading the
# file back will presumably add an 'Unnamed: 0' column; confirm downstream loaders expect it.
data_unsol.to_csv('../data/unsolved.csv')

In [18]:
# Size of the r/UnresolvedMysteries subset whose body text is exactly '[removed]'
data_unre.loc[data_unre['selftext'].eq('[removed]')].shape

(525, 6)

In [19]:
# Size of the r/UnsolvedMysteries subset whose body text is exactly '[removed]'
data_unsol.loc[data_unsol['selftext'].eq('[removed]')].shape

(0, 6)