# Data Collector
This notebook collects data from my selected subreddits and saves the data as CSV files.

## Imports

In [9]:
# Import necessary libraries
import pandas as pd
import numpy as np
import datetime as dt
import time
import requests
import json

## Pushshift API

Collect some data to test API.

In [10]:
# Pull data from the Pushshift API using requests.get()
# The URL asks for 100 submissions from the UnresolvedMysteries subreddit posted before a fixed epoch time
url = 'https://api.pushshift.io/reddit/search/submission/?subreddit=unresolvedmysteries&before=1642778603&size=100'
res = requests.get(url)

# The submissions live under the 'data' key of the JSON response;
# load them into a DataFrame and preview the first 5 rows
submissions = res.json()['data']
data = pd.DataFrame(submissions)
data.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,...,upvote_ratio,url,whitelist_status,wls,post_hint,preview,removed_by_category,author_flair_background_color,author_flair_text_color,banned_by
0,[],False,Ok-Development-5805,,[],,text,t2_9womdgl4,False,False,...,1.0,https://www.reddit.com/r/UnresolvedMysteries/c...,all_ads,6,,,,,,
1,[],False,Odd_Ad1962,,[],,text,t2_5zvncy2o,False,False,...,1.0,https://www.reddit.com/r/UnresolvedMysteries/c...,all_ads,6,self,"{'enabled': False, 'images': [{'id': 'hpInOvKp...",,,,
2,[],False,Starasolum,,[],,text,t2_agtepedt,False,False,...,1.0,https://www.reddit.com/r/UnresolvedMysteries/c...,all_ads,6,,,moderator,,,
3,[],False,SwissChocolate1024,,[],,text,t2_303cypb,False,False,...,1.0,https://www.reddit.com/r/UnresolvedMysteries/c...,all_ads,6,self,"{'enabled': False, 'images': [{'id': 'fzN5-DY-...",,,,
4,[],False,HoloGalaxy,,[],,text,t2_2jhn6qgk,False,False,...,1.0,https://www.reddit.com/r/UnresolvedMysteries/c...,all_ads,6,self,"{'enabled': False, 'images': [{'id': 'qioXaA1R...",,,,


In [11]:
# Pull data from the Pushshift API using requests.get()
# The URL asks for 100 submissions from the UnsolvedMysteries subreddit posted before a fixed epoch time
url = 'https://api.pushshift.io/reddit/search/submission/?subreddit=unsolvedmysteries&before=1642778603&size=100'
res = requests.get(url)

# The submissions live under the 'data' key of the JSON response;
# load them into a DataFrame and preview the first 5 rows
submissions2 = res.json()['data']
data2 = pd.DataFrame(submissions2)
data2.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,...,wls,media,media_embed,post_hint,preview,removed_by_category,secure_media,secure_media_embed,thumbnail_height,thumbnail_width
0,[],False,HauntedSpy,,[],,text,t2_1pugixbf,False,False,...,7,,,,,,,,,
1,[],False,seaweed571,,[],,text,t2_ha2holzz,False,False,...,7,"{'oembed': {'author_name': 'RetroNewsNow', 'au...","{'content': '&lt;blockquote class=""twitter-vid...",link,"{'enabled': False, 'images': [{'id': 'IkKlZyaE...",reddit,"{'oembed': {'author_name': 'RetroNewsNow', 'au...","{'content': '&lt;blockquote class=""twitter-vid...",140.0,140.0
2,[],False,amkakis,,[],,text,t2_1ytaicce,False,False,...,7,,,link,"{'enabled': False, 'images': [{'id': 'GtlBDf86...",,,,140.0,140.0
3,[],False,HauntedSpy,,[],,text,t2_1pugixbf,False,False,...,7,,,link,"{'enabled': False, 'images': [{'id': '9T3ZMQEK...",,,,78.0,140.0
4,[],False,aka_pebbles,,[],,text,t2_iu73k6kk,False,False,...,7,,,link,"{'enabled': False, 'images': [{'id': 'F8-MuBbY...",,,,78.0,140.0


Got data OK.

## Examine data to determine interesting features
In this section I review a row of data to see what features can be relevant to the model.

In [12]:
# Print each column name next to its value in the row labelled 0,
# so every available feature of one submission can be read at a glance
for name in data.columns:
    first_value = data.loc[0, name]
    print(name, str(first_value))

all_awardings []
allow_live_comments False
author Ok-Development-5805
author_flair_css_class None
author_flair_richtext []
author_flair_text None
author_flair_type text
author_fullname t2_9womdgl4
author_is_blocked False
author_patreon_flair False
author_premium False
awarders []
can_mod_post False
contest_mode False
created_utc 1642776199
domain self.UnresolvedMysteries
full_link https://www.reddit.com/r/UnresolvedMysteries/comments/s9ca4j/tell_me_how_do_you_sleep_at_night_knowing_what/
gildings {}
id s9ca4j
is_created_from_ads_ui False
is_crosspostable True
is_meta False
is_original_content False
is_reddit_media_domain False
is_robot_indexable True
is_self True
is_video False
link_flair_background_color #f1e8df
link_flair_css_class murder
link_flair_richtext []
link_flair_template_id 39fd698e-a96f-11e3-96ce-12313b0cf528
link_flair_text Murder
link_flair_text_color dark
link_flair_type text
locked False
media_only False
no_follow True
num_comments 0
num_crossposts 0
over_18 False
pare

In [13]:
# Print each column name next to its value in the row labelled 0,
# so every available feature of one submission can be read at a glance
for name in data2.columns:
    first_value = data2.loc[0, name]
    print(name, str(first_value))

all_awardings []
allow_live_comments False
author HauntedSpy
author_flair_css_class None
author_flair_richtext []
author_flair_text None
author_flair_type text
author_fullname t2_1pugixbf
author_is_blocked False
author_patreon_flair False
author_premium False
awarders []
can_mod_post False
contest_mode False
created_utc 1642748513
domain eu.hollandsentinel.com
full_link https://www.reddit.com/r/UnsolvedMysteries/comments/s94u4r/dna_doe_project_ids_a_jane_doe_found_murdered_in/
gildings {}
id s94u4r
is_created_from_ads_ui False
is_crosspostable True
is_meta False
is_original_content False
is_reddit_media_domain False
is_robot_indexable True
is_self False
is_video False
link_flair_background_color 
link_flair_richtext []
link_flair_template_id 6bcac5b6-5775-11e8-975b-0e42b97c280c
link_flair_text UPDATE
link_flair_text_color dark
link_flair_type text
locked False
media_only False
no_follow False
num_comments 0
num_crossposts 0
over_18 False
parent_whitelist_status some_ads
permalink /r/Un

- Author: Subreddits tend to have regular posters/participators. This can be important in predicting what subreddit a submission was posted to.
- Awarders: Appears to be a list of users who have given the post an "award". Same as author: people tend to concentrate on subreddits they like.
- Created UTC: Necessary for the API request function, which uses the post timestamps to request progressively older posts.
- Self Text: Main text content of the post. Probably most important feature.
- Subreddit: TARGET
- Title: Good source of relevant text.

## Feature selection

In [15]:
# Submission fields to keep in the collected data, in output column order
# ('subreddit' is the target; the rest are candidate predictors)
features = ['author', 'awarders', 'created_utc', 'selftext', 'subreddit', 'title']

## API request function
This function pulls data from the API and keeps the features listed in [Feature selection](#feature-selection).

Function improvements to-do:
- ~~Get initial current time automatically~~
- ~~Add DOCSTRING or similar~~
- ~~Keep function from adding removed posts to DataFrame~~
- ~~Add while loop so function runs until there are enough (500?) valid posts (not 'removed')~~

The function is a little slow but produces relatively clean and complete data.

In [16]:
# This function built on Chuck's Breakfast Hour example
def pushshift_query(subreddit='unresolvedmysteries', features=features, n_posts=1000, delay=5):
    '''
    Generate a DataFrame of posts (submissions) to a particular subreddit, including only a list of specified features from the submission.

    The Pushshift API is queried page by page (100 submissions per request),
    walking backwards in time from "now". Posts whose selftext is '[removed]'
    are skipped. Whole pages are kept, so the result can contain somewhat more
    than `n_posts` rows. Collection also stops early when the API returns an
    empty page (no older posts are available).

    Args:
        subreddit (str): subreddit from which to retrieve posts
        features (list): a list of valid features to include in resulting DataFrame;
            a feature missing from a post is filled with NaN
        n_posts (int): minimum number of valid posts to collect before stopping
        delay (int or float): seconds to wait between consecutive API requests

    Returns:
        posts (DF): DataFrame containing valid posts and features

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status code
    '''
    # Get current time to pass to API request
    current_time = int(time.time())

    # Empty list to append valid posts pulled from API
    posts = []

    # Run API requests until enough valid posts have been collected
    while len(posts) < n_posts:

        # Pushshift API
        url = f'https://api.pushshift.io/reddit/search/submission/?subreddit={subreddit}&before={current_time}&size=100'
        res = requests.get(url)

        # Sanity check
        print(res.status_code)
        # Fail with a clear HTTP error instead of an obscure JSON/KeyError below
        res.raise_for_status()

        batch = res.json()['data']

        # An empty page means there is nothing older to fetch; without this
        # guard the same request would be repeated forever
        if not batch:
            print(f'No posts returned before {current_time}. Stopping with {len(posts)} rows')
            break

        # Cycle through response posts
        for post in batch:
            # Skip removed posts (posts without a selftext field are kept)
            if post.get('selftext') == '[removed]':
                continue

            # Keep only the relevant features, NaN where a feature is absent
            posts.append({feature: post.get(feature, np.nan) for feature in features})

        # Move the cursor to the oldest post of the whole page, including
        # skipped posts, so a page made only of removed posts still advances
        # the query and the cursor does not depend on `features`
        timestamps = [post['created_utc'] for post in batch if 'created_utc' in post]
        if not timestamps:
            # No usable timestamp: the cursor cannot advance, so stop here
            break
        current_time = min(timestamps)

        # Sanity check
        print(f'Posts before {current_time}. Current data frame has {len(posts)} rows')

        # Time delay
        time.sleep(delay)

    # Passing the columns keeps the expected schema even when nothing was collected
    return pd.DataFrame(posts, columns=list(features))

## r/UnresolvedMysteries Data

Using the function defined in [API request function](#api-request-function) we collect data from the r/UnresolvedMysteries subreddit, and save the data to a .csv file.

In [17]:
# Collect posts from r/UnresolvedMysteries, keeping only the selected features
data_unre = pushshift_query(
    subreddit='unresolvedmysteries',
    features=features,
)

200
Posts before 1642624516. Current data frame has 45 rows
200
Posts before 1642145243. Current data frame has 91 rows
200
Posts before 1641850357. Current data frame has 135 rows
200
Posts before 1641248913. Current data frame has 186 rows
200
Posts before 1640735538. Current data frame has 223 rows
200
Posts before 1640103281. Current data frame has 275 rows
200
Posts before 1639550426. Current data frame has 314 rows
200
Posts before 1638931619. Current data frame has 371 rows
200
Posts before 1638445664. Current data frame has 425 rows
200
Posts before 1637982245. Current data frame has 473 rows
200
Posts before 1637454153. Current data frame has 518 rows
200
Posts before 1637093682. Current data frame has 570 rows
200
Posts before 1636605978. Current data frame has 613 rows
200
Posts before 1636241666. Current data frame has 657 rows
200
Posts before 1635790968. Current data frame has 707 rows
200
Posts before 1635305063. Current data frame has 755 rows
200
Posts before 163482714

In [18]:
# Preview the first two collected r/UnresolvedMysteries posts
data_unre.head(2)

Unnamed: 0,author,awarders,created_utc,selftext,subreddit,title
0,TheBonesOfAutumn,[],1643236773,"On April 8th, 1981, 19-year-old David Huff dec...",UnresolvedMysteries,"In April of 1981, the body of 23-year-old Shar..."
1,Skoodilypoop666,[],1643233541,"In the fall of 2021, The small town of London ...",UnresolvedMysteries,“Whodunnit” the murder of 62 year old Bryan Mc...


In [19]:
# Count missing (NaN) values in each column
data_unre.isna().sum()

author          0
awarders        0
created_utc     0
selftext       26
subreddit       0
title           0
dtype: int64

In [20]:
# Verify the number of posts collected: shape is (rows, columns)
data_unre.shape

(1009, 6)

### Save to CSV file.

In [21]:
# Write the collected r/UnresolvedMysteries posts to a CSV file in ../data.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is
# written as well — confirm that downstream readers expect that extra column.
data_unre.to_csv('../data/unresolved.csv')

## r/UnsolvedMysteries Data

Using the function defined in [API request function](#api-request-function) we collect data from the r/UnsolvedMysteries subreddit, and save the data to a .csv file.

In [22]:
# Collect posts from r/UnsolvedMysteries, keeping only the selected features
data_unsol = pushshift_query(
    subreddit='unsolvedmysteries',
    features=features,
)

200
Posts before 1641614869. Current data frame has 100 rows
200
Posts before 1639025576. Current data frame has 200 rows
200
Posts before 1637214157. Current data frame has 300 rows
200
Posts before 1635298897. Current data frame has 400 rows
200
Posts before 1633840506. Current data frame has 500 rows
200
Posts before 1632218355. Current data frame has 599 rows
200
Posts before 1630570622. Current data frame has 699 rows
200
Posts before 1628351150. Current data frame has 799 rows
200
Posts before 1626159090. Current data frame has 899 rows
200
Posts before 1623929172. Current data frame has 999 rows
200
Posts before 1622509600. Current data frame has 1099 rows


In [23]:
# Preview the first two collected r/UnsolvedMysteries posts
data_unsol.head(2)

Unnamed: 0,author,awarders,created_utc,selftext,subreddit,title
0,amkakis,[],1643239264,,UnsolvedMysteries,An 18 year old leaves home to retrieve a purse...
1,Once_a_TQ,[],1643227462,,UnsolvedMysteries,Search continues for retired Cape Breton veter...


In [24]:
# Count missing (NaN) values in each column.
# NOTE(review): this only counts NaN; a selftext that is an empty string would
# not be counted here — check for empty strings separately.
data_unsol.isna().sum()

author         0
awarders       0
created_utc    0
selftext       0
subreddit      0
title          0
dtype: int64

### Save to CSV file.

In [25]:
# Write the collected r/UnsolvedMysteries posts to a CSV file in ../data.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is
# written as well — confirm that downstream readers expect that extra column.
data_unsol.to_csv('../data/unsolved.csv')

## Check data for 'removed' posts

In [26]:
# Select rows whose selftext is exactly '[removed]'; the first element of the
# resulting shape is the number of such rows
data_unre[data_unre['selftext'] == '[removed]'].shape

(0, 6)

In [27]:
# Select rows whose selftext is exactly '[removed]'; the first element of the
# resulting shape is the number of such rows
data_unsol[data_unsol['selftext'] == '[removed]'].shape

(0, 6)

## Learned during EDA that UnsolvedMysteries posts do not have selftext content
Will have to choose different subreddits? Let's try to model on title only.