In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import datetime as dt
import time
import requests
import json

In [88]:
# Pull a single page of submissions so the available columns can be inspected
url = (
    'https://api.pushshift.io/reddit/search/submission/'
    '?subreddit=unresolvedmysteries&before=1642778603&size=100'
)
res = requests.get(url)
data = pd.DataFrame(res.json()['data'])
data.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_is_blocked',
       'author_patreon_flair', 'author_premium', 'awarders', 'can_mod_post',
       'contest_mode', 'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_created_from_ads_ui', 'is_crosspostable', 'is_meta',
       'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable',
       'is_self', 'is_video', 'link_flair_background_color',
       'link_flair_css_class', 'link_flair_richtext', 'link_flair_template_id',
       'link_flair_text', 'link_flair_text_color', 'link_flair_type', 'locked',
       'media_only', 'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'pwls',
       'retrieved_on', 'score', 'selftext', 'send_replies', 'spoiler',
       'stickied', 'subreddit', 'subreddit_id', 'subreddit_sub

In [89]:
data['created_utc'].min()

1642229447

In [24]:
# Look at one submission whose text was removed.
# 'selftext' will make up the bulk of the data, so only posts whose selftext
# is neither '[removed]' nor NA should be collected.
data.loc[data['selftext'].eq('[removed]')].head(1)

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,...,whitelist_status,wls,link_flair_css_class,link_flair_template_id,link_flair_text,post_hint,preview,author_flair_background_color,author_flair_text_color,banned_by
0,[],False,Krak_Fox,,[],,text,t2_su2vm,False,False,...,all_ads,6,,,,,,,,


It appears that the string '[removed]' in 'selftext' is the key to filtering out empty posts.

I also want to make sure the 'subreddit' feature correctly identifies where each post comes from; it might also include crossposts.

In [25]:
data['subreddit'].value_counts()

UnresolvedMysteries    25
Name: subreddit, dtype: int64

In [104]:
# Columns worth keeping from each submission.
# Only a subset of the API fields is kept, to reduce the data-cleaning workload.
features = [
    'author', 'created_utc', 'selftext',
    'subreddit', 'title', 'total_awards_received',
]

In [106]:
# This function builds on Chuck's Breakfast Hour example
def pushshift_query(subreddit='unresolvedmysteries', features=features, num_loops=10,
                    before=1642778603, pause=5):
    """Page backwards through a subreddit's submissions on the Pushshift API.

    Each request asks for up to 100 submissions created before the current
    cursor; the cursor is then moved to the oldest ``created_utc`` seen so the
    next request continues further back in time.

    Parameters
    ----------
    subreddit : str
        Subreddit name placed in the ``subreddit`` query parameter.
    features : iterable of str
        Keys to copy out of every returned submission. A key missing from a
        submission is stored as ``np.nan``. Note that the default is the
        module-level ``features`` list as it existed when this ``def`` was
        executed; re-run this cell after editing that list, or pass the list
        explicitly.
    num_loops : int
        Maximum number of requests (pages) to make.
    before : int
        Epoch timestamp used as the initial ``before`` cursor.
    pause : int or float
        Seconds to sleep after each request.

    Returns
    -------
    pandas.DataFrame
        One row per collected submission, one column per requested feature.
        Empty if the API returned no submissions.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    current_time = before
    posts = []
    for _ in range(num_loops):
        res = requests.get(
            'https://api.pushshift.io/reddit/search/submission/',
            params={'subreddit': subreddit, 'before': current_time, 'size': 100},
            timeout=30,
        )
        # Fail loudly on an error response instead of on a confusing JSON/key error.
        res.raise_for_status()
        batch = res.json()['data']
        if not batch:
            # Nothing older is available; further requests would repeat this one.
            print(f'No posts before {current_time}. Stopping early with {len(posts)} rows')
            break

        # dict.get supplies NaN for absent keys without hiding unrelated errors.
        posts.extend(
            {feature: post.get(feature, np.nan) for feature in features}
            for post in batch
        )

        # Advance the cursor from the raw response, once per page, so paging
        # works even when 'created_utc' is not one of the requested features.
        stamps = [post['created_utc'] for post in batch if 'created_utc' in post]
        if stamps:
            current_time = min(current_time, min(stamps))

        print(f'Posts before {current_time}. Current data frame has {len(posts)} rows')
        time.sleep(pause)
    return pd.DataFrame(posts)

In [95]:
data_unre = pushshift_query(subreddit='unresolvedmysteries', features=features)

Posts before 1642229447. Current data frame has 100 rows
Posts before 1641917400. Current data frame has 200 rows
Posts before 1641402915. Current data frame has 300 rows
Posts before 1640904628. Current data frame has 400 rows
Posts before 1640206221. Current data frame has 500 rows
Posts before 1639654755. Current data frame has 600 rows
Posts before 1639054652. Current data frame has 700 rows
Posts before 1638531315. Current data frame has 800 rows
Posts before 1638137112. Current data frame has 900 rows
Posts before 1637584252. Current data frame has 1000 rows


In [102]:
# Check for duplicates
data_unre.duplicated().sum()

1

In [103]:
# Check for NaNs
data_unre.isna().sum()

author                     0
author_flair_text        988
created_utc                0
selftext                  12
subreddit                  0
title                      0
total_awards_received      0
dtype: int64

In [107]:
data_unre.shape

(1000, 7)

In [None]:
# Write the collected UnresolvedMysteries posts to a CSV file.
# NOTE(review): to_csv writes the row index by default, which shows up as an
# 'Unnamed: 0' column when the file is read back with a plain pd.read_csv —
# consider index=False if no downstream reader relies on that column.
# NOTE(review): the duplicate check in this notebook reported 1 duplicated row;
# it is not dropped before saving — confirm whether that is intended.
data_unre.to_csv('../data/unresolved.csv')

In [None]:
data_unsol = pushshift_query(subreddit='unsolvedmysteries', features=features)