### Notebook 1: API calls, Data Cleaning & EDA
We are classifying posts from the 2 subreddits of Books and Fantasy Books.

[Loading Data](#load_data)


[Data Cleaning & EDA](#data_cleaning_eda)

### Importing Libraries


In [1]:
import requests
from bs4 import BeautifulSoup
import re
import random
import time

import pandas as pd
import numpy as np

<a id='load_data'></a>

### Load Data

In [2]:
url = 'https://www.reddit.com/r/Fantasy.json'
url_sf = 'https://www.reddit.com/r/printSF.json'

In [None]:
# check that requests are valid
print(res.status_code)
print(res_sf.status_code)

In [4]:
# call the reddit API for ~1250 posts in the Fantasy subreddit
posts = []
after = None         

for a in range(5):
    if after == None:
        current_url = url
    else:
        current_url = url + '?after=' + after
    print(current_url)
    res = requests.get(current_url, headers={'User-agent': '12'})
    
    if res.status_code != 200:
        print('Status error', res.status_code)
        break
    
    current_dict = res.json()
    current_posts = [p['data'] for p in current_dict['data']['children']]
    posts.extend(current_posts)
    after = current_dict['data']['after']
    
    if a > 0:
        prev_posts = pd.read_csv('train_fantasy.csv')
        current_df = pd.DataFrame()
        
    else:
        pd.DataFrame(posts).to_csv('train_fantasy.csv', index = False)

    # generate a random sleep duration to look more 'natural'
    sleep_duration = random.randint(2,6)
    print(sleep_duration)
    time.sleep(sleep_duration)
    
pd.DataFrame(posts).to_csv('train_f.csv')

https://www.reddit.com/r/Fantasy.json
3
https://www.reddit.com/r/Fantasy.json?after=t3_evutip
6
https://www.reddit.com/r/Fantasy.json?after=t3_eveacd
4
https://www.reddit.com/r/Fantasy.json?after=t3_evbhy3
3
https://www.reddit.com/r/Fantasy.json?after=t3_ev7fnc
5


In [None]:
# call the reddit API for ~1250 posts in the Books subreddit
posts = []
after = None         

for a in range(40):
    if after == None:
        current_url = url_sf
    else:
        current_url = url_sf + '?after=' + after
    print(current_url)
    res = requests.get(current_url, headers={'User-agent': '18'})
    
    if res.status_code != 200:
        print('Status error', res.status_code)
        break
    
    current_dict = res.json()
    current_posts = [p['data'] for p in current_dict['data']['children']]
    posts.extend(current_posts)
    after = current_dict['data']['after']
    
    if a > 0:
        prev_posts = pd.read_csv('train_sf.csv')
        current_df = pd.DataFrame()
        
    else:
        pd.DataFrame(posts).to_csv('train_sf.csv', index = False)

    # generate a random sleep duration to look more 'natural'
    sleep_duration = random.randint(2,6)
    print(sleep_duration)
    time.sleep(sleep_duration)
    
pd.DataFrame(posts).to_csv('train_sf.csv')

In [5]:
# read in the fantasy and books.csv files
train_f = pd.read_csv('train_fantasy.csv')
train_sf = pd.read_csv('train_sf.csv')

<a id='data_cleaning_eda'></a>

# Data cleaning & EDA: 
train_sf has 990 posts and 104 columns, train_fantasy.csv has 1244 posts with 110 columns.
All the columns were examined and only 3 columns were kept: 'title' and 'selftext' columns which is the content of the post, and the 'subreddit' column to identify which post the subreddit is from. 

The 'stickied' column was used to identify and remove duplicate posts. Sticky posts are basically permanent posts left on the front page of the subreddit - they could be ongoing polls, or announcements.

The remaining columns was deemed not useful as they contained information like thumbnails/handle of the redditor which is not relevant to classifying whether a post belongs to Fantasy Books or Books.

Posts with empty 'selftext' columns were dropped as they do not contain enough text to perform the classifier on.

In [6]:
# EDA
print(train_f.shape)
train_sf.shape

(1244, 110)


(990, 104)

In [7]:
train_sf = train_sf[train_sf['stickied'] == False] # dropping 4 rows of 'stickied' posts
train_f = train_f[train_f['stickied'] == False] # dropping 2 rows of 'stickied' posts
print(train_f.shape)
train_sf.shape


(1242, 110)


(986, 104)

In [8]:
#remove posts with empty selftext, duplicate posts 

train_f = train_f[['title', 'selftext', 'subreddit']]
train_sf = train_sf[['title', 'selftext', 'subreddit']]

train_sf.dropna(axis = 0, inplace = True)
train_sf.info()

train_f.dropna(axis = 0, inplace = True)
train_f.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 853 entries, 2 to 989
Data columns (total 3 columns):
title        853 non-null object
selftext     853 non-null object
subreddit    853 non-null object
dtypes: object(3)
memory usage: 26.7+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1048 entries, 2 to 1243
Data columns (total 3 columns):
title        1048 non-null object
selftext     1048 non-null object
subreddit    1048 non-null object
dtypes: object(3)
memory usage: 32.8+ KB


In [12]:
# Removing repeated posts where content is the same. the titles of repeated posts start with /r/Fantasy
repeats = train_f[train_f['title'].str.contains('/r/Fantasy')]
repeats
train_f.drop(index = repeats.index, inplace = True)


Unnamed: 0,title,selftext,subreddit
2,/r/Fantasy Review Tuesday - Review what you're...,The weekly Tuesday Review Thread is a great pl...,Fantasy
3,/r/Fantasy - Daily Recommendation Requests and...,This thread is to be used for recommendation r...,Fantasy
28,/r/Fantasy - Daily Recommendation Requests and...,This thread is to be used for recommendation r...,Fantasy
56,/r/Fantasy Self-Promotion Thread,This biweekly self-promotion is the place for ...,Fantasy
86,/r/Fantasy - Daily Recommendation Requests and...,This thread is to be used for recommendation r...,Fantasy
117,/r/Fantasy - Daily Recommendation Requests and...,This thread is to be used for recommendation r...,Fantasy
149,/r/Fantasy - Daily Recommendation Requests and...,This thread is to be used for recommendation r...,Fantasy
153,"/r/Fantasy General Discussion January 24, 2020","Come tell the community what you're reading, h...",Fantasy
195,/r/Fantasy - Daily Recommendation Requests and...,This thread is to be used for recommendation r...,Fantasy
223,/r/Fantasy - Daily Recommendation Requests and...,This thread is to be used for recommendation r...,Fantasy


In [45]:
# Scifi Subreddit: checking for duplicate posts by title, e.g. automated posting everyday asking for recommendations. 21 duplicates
duplicates_sf = train_sf[train_sf.duplicated()].sort_values('title')
duplicates_sf.shape
train_sf = train_sf.drop_duplicates(subset = 'title')
train_sf.shape

(832, 3)

In [46]:
# joining the Fantasy and Scifi dataframes together, and merging the title with the selftext column, converting the Fantasy subreddit to target variable
combined = train_f.append(train_sf)
combined['content'] = combined['title'] + combined['selftext']
combined = combined[['content', 'subreddit']]
combined.reset_index(inplace = True)

combined['subreddit_Fantasy'] = combined['subreddit'].map({'Fantasy': 1, 'printSF': 0}) 
combined.drop(columns = ['subreddit', 'index'], inplace = True)
combined.head()

Unnamed: 0,content,subreddit_Fantasy
0,Are there more books planned for the Licanius ...,1
1,Grimdark coming of age booksI have read many b...,1
2,Fantasy tropes from another POVReading Expanse...,1
3,How long does it take to get into the Wanderin...,1
4,What's some dark/gothic fantasy that is *not* ...,1


In [47]:
# saving out the combined dataframe.
pd.DataFrame(combined).to_csv('cleaned_data.csv')