# Data Collection, Cleaning & Feature Engineering

---

***Data Science Problem:***
> Using comments and submissions from the `r/history` and `r/Futurology` subreddits, can we build a model that can tell the difference between a conversation about the future and a conversation about the past?

---

## Notebook Contents:
- [Imports](#imports)
- [Using the Pushshift API](#api)
- [Cleaning & Feature Engineering](#cleaning-and-fe)

<a id='imports'></a>

## Imports

---

In [1]:
import requests

import numpy as np
import pandas as pd

import time

<a id='api'></a>

## Using the Pushshift API

---

The `get_reddit_data` function below takes in three arguments:
- `subreddit`: The subreddit from which the data is to be scraped. We will be using the `'history'` and `'Futurology'` subreddits.
- `endpoint`: The type of data to scrape; either `'comment'` or `'submission'`.
- `n_iter`: The number of requests to make to the API. Because each request returns at most 1,000 posts, `n_iter` requests let us scrape up to `n_iter * 1,000` posts per call.

In [2]:
# Adapted from code written by Tim Book with inspiration courtesy of Matt Burke

def get_reddit_data(subreddit, endpoint, n_iter, before=1587500752, pause=1.0):
    """Scrape up to ``n_iter * 1000`` posts from a subreddit via Pushshift.

    The function pages backwards in time: every request asks for posts
    created before the oldest post returned by the previous request.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to scrape (e.g. ``'history'``).
    endpoint : str
        Kind of post to scrape; either ``'comment'`` or ``'submission'``.
    n_iter : int
        Maximum number of requests to make. Each request asks for up to
        1,000 posts.
    before : int, optional
        Unix timestamp; only posts older than this are scraped. Defaults to
        1587500752 (Tuesday, April 21, 2020 8:25:52 PM UTC).
    pause : float, optional
        Seconds to wait between consecutive requests. A falsy value
        (``0`` or ``None``) disables the wait.

    Returns
    -------
    pandas.DataFrame
        All scraped pages stacked row-wise, each page keeping its own index.
        An empty DataFrame is returned when nothing was scraped.

    Raises
    ------
    ValueError
        If ``endpoint`` is neither ``'comment'`` nor ``'submission'``.
    requests.HTTPError
        If the API answers with an error status code.
    """
    endpoint = str(endpoint)

    # fail early with a clear message instead of hitting an unbound `df` later
    if endpoint not in ('comment', 'submission'):
        raise ValueError(
            "endpoint must be 'comment' or 'submission', got {!r}".format(endpoint)
        )

    # both endpoints share the same base URL and query parameters
    url = 'https://api.pushshift.io/reddit/search/' + endpoint

    # list to store one DataFrame per scraped page
    df_list = []

    # upper time bound for the next request; moves back after every page
    current_time = before

    for page in range(n_iter):

        # be polite to the API: wait between requests, but not before the first
        if page and pause:
            time.sleep(pause)

        res = requests.get(
            url,
            params={
                'subreddit': subreddit,
                'size': 1000,
                # NOTE(review): 'lang' is kept exactly as in the original
                # request; confirm its effect against the Pushshift API docs
                'lang': True,
                'before': current_time,
            },
        )

        # surface HTTP errors (rate limiting, outages) instead of failing
        # later on a missing 'data' key
        res.raise_for_status()

        # put the scraped posts into a DataFrame
        df = pd.DataFrame(res.json()['data'])

        # an empty page means there is nothing older left to scrape
        if df.empty:
            break

        df_list.append(df)

        # reset current_time to the time of the oldest scraped post
        current_time = int(df['created_utc'].min())

    # pd.concat raises on an empty list, so handle "nothing scraped" explicitly
    if not df_list:
        return pd.DataFrame()

    # return full DataFrame of all scraped posts
    return pd.concat(df_list, axis=0)

In [3]:
%%time

# 10 requests of up to 1,000 posts for every subreddit/endpoint pair:
# 4 pairs x 10,000 posts = 40,000 comments and submissions in total

futurology_comments = get_reddit_data(
    subreddit='futurology', endpoint='comment', n_iter=10
)
futurology_submissions = get_reddit_data(
    subreddit='futurology', endpoint='submission', n_iter=10
)
history_comments = get_reddit_data(
    subreddit='history', endpoint='comment', n_iter=10
)
history_submissions = get_reddit_data(
    subreddit='history', endpoint='submission', n_iter=10
)

CPU times: user 4.14 s, sys: 500 ms, total: 4.64 s
Wall time: 2min 46s


In [4]:
# preview the first two rows of the scraped Futurology comments
futurology_comments.head(2)

Unnamed: 0,all_awardings,associated_award,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,author_flair_type,...,retrieved_on,score,send_replies,stickied,subreddit,subreddit_id,total_awards_received,treatment_tags,edited,author_cakeday
0,[],,AutoModerator,,bot red,[],,Putting mods out of work,dark,text,...,1587505576,1,False,False,Futurology,t5_2t7no,0,[],,
1,[],,user1688,,,[],,,,text,...,1587505565,1,True,False,Futurology,t5_2t7no,0,[],,


In [5]:
# preview the first two rows of the scraped Futurology submissions
futurology_submissions.head(2)

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_background_color,author_flair_css_class,author_flair_text,author_flair_text_color,awarders,can_mod_post,contest_mode,...,preview,crosspost_parent,crosspost_parent_list,author_flair_template_id,edited,author_cakeday,banned_by,gilded,poll_data,steward_reports
0,[],False,[deleted],,,,dark,[],False,False,...,,,,,,,,,,
1,[],False,Alan_Hill,,,,,[],False,False,...,"{'enabled': False, 'images': [{'id': 'IoN4chmr...",,,,,,,,,


In [6]:
# preview the first two rows of the scraped history comments
history_comments.head(2)

Unnamed: 0,all_awardings,associated_award,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,author_flair_type,...,score,send_replies,stickied,subreddit,subreddit_id,total_awards_received,treatment_tags,distinguished,edited,author_cakeday
0,[],,legatii,,,[],,,,text,...,1,True,False,history,t5_2qh53,0,[],,,
1,[],,PlebasRorken,,,[],,,,text,...,1,True,False,history,t5_2qh53,0,[],,,


In [7]:
# preview the first two rows of the scraped history submissions
history_submissions.head(2)

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_background_color,author_flair_css_class,author_flair_text,author_flair_text_color,awarders,banned_by,can_mod_post,...,link_flair_template_id,link_flair_text,edited,author_cakeday,gilded,distinguished,author_flair_template_id,crosspost_parent,crosspost_parent_list,steward_reports
0,[],False,[deleted],,,,dark,[],moderators,False,...,,,,,,,,,,
1,[],False,robocroc,,,,,[],,False,...,,,,,,,,,,


<a id='cleaning-and-fe'></a>

## Cleaning & Feature Engineering

---


In [8]:
# trim every DataFrame down to the columns we actually need:
#   comments    -> 'author', 'body', 'subreddit'
#   submissions -> 'author', 'title', 'subreddit'
# .copy() detaches each result from the wide scraped DataFrame

futurology_comments = futurology_comments.loc[:, ['author', 'body', 'subreddit']].copy()
futurology_submissions = futurology_submissions.loc[:, ['author', 'title', 'subreddit']].copy()
history_comments = history_comments.loc[:, ['author', 'body', 'subreddit']].copy()
history_submissions = history_submissions.loc[:, ['author', 'title', 'subreddit']].copy()

In [9]:
# comments keep their text in 'body' and submissions in 'title';
# give both the shared name 'text' so the DataFrames line up when combined

text_column = dict(body='text', title='text')

futurology_comments = futurology_comments.rename(columns=text_column)
futurology_submissions = futurology_submissions.rename(columns=text_column)
history_comments = history_comments.rename(columns=text_column)
history_submissions = history_submissions.rename(columns=text_column)

In [10]:
# stack the four 10,000 row DataFrames on top of each other (40,000 rows total)

reddit_data = pd.concat(
    [
        futurology_comments,
        futurology_submissions,
        history_comments,
        history_submissions,
    ],
    axis=0,
)

In [11]:
# confirm the dimensions (rows, columns) of the combined DataFrame
reddit_data.shape

(40000, 3)

In [12]:
# first rows of the combined DataFrame come from the Futurology comments
reddit_data.head(2)

Unnamed: 0,author,text,subreddit
0,AutoModerator,"Hello, /u/Woofislove! Thank you for your parti...",Futurology
1,user1688,Taking away freedom after a crisis. This is tr...,Futurology


In [13]:
# last rows of the combined DataFrame come from the history submissions
reddit_data.tail(2)

Unnamed: 0,author,text,subreddit
998,aaHBN,Walls of the Sasanian Persian Empire - whateve...,history
999,Tyson2BaldFury,What important political repercussions did the...,history


In [14]:
# encode the target 'subreddit' column as numbers our models can predict:
#   Futurology -> 1
#   history    -> 0

reddit_data = reddit_data.assign(
    subreddit=reddit_data['subreddit'].map({'Futurology': 1, 'history': 0})
)

In [15]:
# check that the Futurology rows at the top are now labelled 1
reddit_data.head(2)

Unnamed: 0,author,text,subreddit
0,AutoModerator,"Hello, /u/Woofislove! Thank you for your parti...",1
1,user1688,Taking away freedom after a crisis. This is tr...,1


In [16]:
# check that the history rows at the bottom are now labelled 0
reddit_data.tail(2)

Unnamed: 0,author,text,subreddit
998,aaHBN,Walls of the Sasanian Persian Empire - whateve...,0
999,Tyson2BaldFury,What important political repercussions did the...,0


In [17]:
# keep only the rows whose 'author' has not been deleted

reddit_data = reddit_data[reddit_data['author'].ne('[deleted]')].copy()

In [18]:
# sanity check: dropping deleted authors should also have removed every
# post whose text was removed, so this selection is expected to be empty

reddit_data[reddit_data['text'].eq('[removed]')]

Unnamed: 0,author,text,subreddit


In [21]:
# save the cleaned data to CSV; index=False leaves the row index out of the file
reddit_data.to_csv('./data/reddit_data.csv', index=False)