# General Assembly DSI 13 EC 
# Project 3 - Web APIs & NLP
## Mike Bell 
### October 23, 2020

## Notebook 2: Data Preprocessing

In this notebook, we aggregate, clean, and process the subreddit text post scrapes obtained in Notebook 1. 

In particular, we delete any posts that are empty, removed, or stickied. Most Reddit posts seem to have no body text (`selftext`), so we replace these missing values with empty strings.

We also convert the text to lowercase and remove HTML entities (e.g. `&amp;`), hyperlinks, common stopwords, etc.

Finally, NLP packages (NLTK) are used to lemmatize the combined title/body text, and the data is saved to file.

In [264]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# For NLP operations such as tokenization, lemmatization, and stopword removal
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Raise pandas' display limits so long frames and full post text are shown untruncated.
pd.options.display.max_rows = 10000
pd.options.display.max_columns = 10000
pd.set_option('display.max_colwidth', None)

### Aggregate scraped dataframes into two large dataframes based on subreddit

In [265]:
# The two subreddits being compared; the position in this list becomes the class label later on.
subreddits = ['math', 'physics']

# Directory holding the per-file scrapes and every csv this notebook writes.
subreddit_dir = f'../data/{subreddits[0]}_{subreddits[1]}_data/'


In [315]:
num_files = 12  # number of scrape files to aggregate for each subreddit

# Spam/removed posts are far more common in some subreddits, which leaves the
# cleaned classes imbalanced. The two settings below read additional files for
# one subreddit to even things out.
extra_files = 6
unbalanced_indx = 0   # position in `subreddits` (0 or 1) that receives the extra files

# subreddit_dfs[k] collects one dataframe per csv read for subreddits[k]
subreddit_dfs = [[], []]

# base files 0 .. num_files-1, read for both subreddits
for i in range(num_files):
    for idx, sub in enumerate(subreddits):
        subreddit_dfs[idx].append(pd.read_csv(f'{subreddit_dir}{sub}_{i}.csv'))

# extra files num_files .. num_files+extra_files-1, read only for the chosen subreddit
subreddit_dfs[unbalanced_indx].extend(
    pd.read_csv(f'{subreddit_dir}{subreddits[unbalanced_indx]}_{i}.csv')
    for i in range(num_files, num_files + extra_files)
)

# one dataframe per subreddit, re-indexed 0..n-1
df_0 = pd.concat(subreddit_dfs[0], axis=0, ignore_index=True)
df_1 = pd.concat(subreddit_dfs[1], axis=0, ignore_index=True)

# write aggregated dataframes to files
df_0.to_csv(f'{subreddit_dir}{subreddits[0]}_agg.csv', index=False)
df_1.to_csv(f'{subreddit_dir}{subreddits[1]}_agg.csv', index=False)

### Clean the data: Remove spam/deleted/stickied posts, fill NaNs

In [316]:
def clean_reddit_data(df):
    """Drop unusable posts and return only the text and subreddit columns.

    A row is kept only when all of the following hold:
      * 'removed_by_category' is null (the post was not flagged as removed),
      * 'title' is not null,
      * 'stickied' equals False.

    Missing 'selftext' values are replaced with empty strings, and exact
    duplicate rows are dropped. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'removed_by_category', 'title', 'stickied',
        'selftext' and 'subreddit'.

    Returns
    -------
    pandas.DataFrame
        Columns ['title', 'selftext', 'subreddit'], keeping the original index
        labels of the surviving rows.
    """
    keep = (
        df['removed_by_category'].isnull()
        & df['title'].notnull()
        # explicit comparison (rather than ~) so a missing flag is treated as "not False" and dropped
        & (df['stickied'] == False)  # noqa: E712
    )

    # .copy() gives an independent frame, so the fill below never writes into a
    # slice of the caller's data (avoids SettingWithCopyWarning from inplace fillna)
    cleaned = df.loc[keep, ['title', 'selftext', 'subreddit']].copy()
    cleaned['selftext'] = cleaned['selftext'].fillna('')

    return cleaned.drop_duplicates()

In [317]:
# Words to strip from every post: NLTK's English stopwords plus the same words
# with their apostrophes removed, since the apostrophe-free spellings show up often.
remove_words = list(
    set(stopwords.words('english')).union(
        word.replace("'", "") for word in stopwords.words('english')
    )
)

# Tokens are the matches of \w+ (runs of word characters).
tokenizer = RegexpTokenizer(r"\w+")

# Frozen copy of `remove_words` so each token check is a hash lookup instead of a list scan.
_REMOVE_WORD_SET = frozenset(remove_words)


def clean_text(text):
    """Tokenize `text` and drop stopwords and non-alphabetic tokens.

    The text is split with the module-level `tokenizer`. A token is kept only
    if it consists solely of alphabetic characters and is not one of
    `remove_words`. The comparison is case-sensitive, so callers are expected
    to lowercase the text first.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    """
    return ' '.join(
        token
        for token in tokenizer.tokenize(text)
        if token.isalpha() and token not in _REMOVE_WORD_SET
    )


def process_text(df):
    """Normalize the post text and build a combined 'text' column.

    For both 'title' and 'selftext' the following steps run in order:
      1. lowercase,
      2. strip HTML entities, hyperlinks, '#...' tokens and the
         blank-line pattern (see the note in the code),
      3. tokenize and drop stopwords / non-alphabetic tokens via `clean_text`,
      4. remove any remaining digits.

    'subreddit' is lowercased as well. 'text' is the cleaned title and
    selftext joined by a single space; rows whose 'text' is blank are dropped,
    as are exact duplicate rows. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain string columns 'title', 'selftext' and 'subreddit' with
        no missing values (`clean_text` is applied to every entry).

    Returns
    -------
    pandas.DataFrame
        Columns ['title', 'selftext', 'text', 'subreddit'].
    """
    # Regexes removed before tokenizing, applied in this order.
    noise_patterns = (
        r'&\w*;',     # HTML entities such as &amp;
        r'http\S+',   # hyperlinks
        r'www\S+',    # scheme-less links
        r'#\S+',      # '#'-prefixed tokens
        # NOTE(review): this also deletes the first word that follows a blank
        # line, not just the line break itself. Kept unchanged so the output
        # matches the existing dataset; confirm whether this is intended.
        r'\n\n\S+',
    )

    df = df.copy()
    df['subreddit'] = df['subreddit'].str.lower()

    for col in ('title', 'selftext'):
        cleaned = df[col].str.lower()

        # regex=True must be explicit: since pandas 2.0 Series.str.replace
        # treats the pattern as a literal string by default.
        for pattern in noise_patterns:
            cleaned = cleaned.str.replace(pattern, '', regex=True)

        # apply stopword/symbol remover
        cleaned = cleaned.map(clean_text)

        # clean_text already drops tokens containing digits; kept as a safeguard
        df[col] = cleaned.str.replace(r'\d+', '', regex=True)

    # join title and selftext into a single feature
    df['text'] = df['title'] + ' ' + df['selftext']
    df = df[df['text'].str.strip() != '']  # drop posts with no text left after cleaning

    return df[['title', 'selftext', 'text', 'subreddit']].drop_duplicates()

In [318]:
# Filter out removed/untitled/stickied posts for each subreddit.
# NOTE: df_0/df_1 are overwritten with frames that no longer have the
# 'removed_by_category' column, so re-running this cell on its own raises a
# KeyError; re-run the aggregation cell first.
df_0, df_1 = clean_reddit_data(df_0), clean_reddit_data(df_1)

In [319]:
# Count missing values per column after cleaning.
df_1.isna().sum()

title        0
selftext     0
subreddit    0
dtype: int64

In [320]:
# Normalize the text of each subreddit and add the combined 'text' column.
df_0, df_1 = process_text(df_0), process_text(df_1)

In [321]:
# Stack both subreddits into one dataframe with a fresh 0..n-1 index.
df = pd.concat([df_0, df_1], axis=0, ignore_index=True)

# Encode the subreddit as a binary target: subreddits[0] -> 0, subreddits[1] -> 1.
# process_text lowercases the 'subreddit' column, so the lookup keys are
# lowercased too; otherwise a capitalized name in `subreddits` would never match.
label_by_subreddit = {name.lower(): label for label, name in enumerate(subreddits)}
df['subreddit'] = df['subreddit'].map(label_by_subreddit)

# Series.map turns every unmatched value into NaN; stop here rather than
# carrying unlabeled rows into the saved dataset.
assert df['subreddit'].notnull().all(), 'found subreddit values that are not in `subreddits`'

In [322]:
# Class balance check: if one label dominates, go back to the aggregation cell,
# adjust `extra_files` / `unbalanced_indx`, and re-run from there.
df['subreddit'].value_counts()

1    1083
0    1057
Name: subreddit, dtype: int64

In [323]:
# Write the combined, cleaned dataset (before lemmatization) to csv
df.to_csv(f'{subreddit_dir}{subreddits[0]}_{subreddits[1]}_combined.csv', index = False)

In [324]:
# Single lemmatizer instance, used by lem_text below
lemmatizer = WordNetLemmatizer()

In [325]:
def lem_text(text):
    """Lemmatize each whitespace-separated word of `text` and rejoin them with single spaces."""
    return ' '.join(map(lemmatizer.lemmatize, text.split()))


In [326]:
# Lemmatize the combined 'text' column; this is the column the models will be trained on.
df['lem_text'] = df['text'].map(lem_text)

In [327]:
# Put the text columns first and the target column last.
df = df.loc[:, ['title', 'selftext', 'text', 'lem_text', 'subreddit']]

In [328]:
# Count missing values per column as a final check before writing to file.
df.isna().sum()

title        0
selftext     0
text         0
lem_text     0
subreddit    0
dtype: int64

In [329]:
# Write the final dataset, including the lemmatized 'lem_text' column, to file
df.to_csv(f'{subreddit_dir}{subreddits[0]}_{subreddits[1]}_combined_lem.csv', index = False)

In [330]:
# (rows, columns) of the final dataset
df.shape

(2140, 5)

In [331]:
# Share of each class (0 = subreddits[0], 1 = subreddits[1]) in the final dataset
df['subreddit'].value_counts(normalize=True)

1    0.506075
0    0.493925
Name: subreddit, dtype: float64