# Data Cleaning

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import plotly.express as px

import warnings
warnings.filterwarnings("ignore")

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cynthiaowens/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cynthiaowens/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Import reddit dataset as csv:

#### (from 'Salvaging the Internet Hate Machine: Using the discourse of extremist online subcultures to identify emergent extreme speech'; 2-20-2020; Peeters, Stijn; Hagen, Sal; Das, Partha; University of Amsterdam)

# Note-- the reddit csv I read in, below, is too large to upload to github, so I am including a link to the site from which I found the data. If you have pulled this notebook down from github and plan to run it, please go to this link: https://zenodo.org/record/3676483#.YbBO6PHMLfF and download the reddit-dataset.csv. The below command will not work until you have downloaded that csv into the data folder. 

# All the other csvs created in the course of these notebooks (with the exception of adjectives.csv, which is only 7KB) have not been included in the data folder because they were also too large to be accepted by github -- they need to be created dynamically by running the cells of the notebook.

In [2]:
subs = pd.read_csv('data/reddit-dataset.csv')

## Initial check of size and content:

In [3]:
subs.shape

(3618557, 2)

In [4]:
subs.head()

Unnamed: 0,body,subreddit
0,die hard trump supporter but i dont think thos...,The_Donald
1,His family begged them to intervene.,The_Donald
2,,ChapoTrapHouse
3,my man sitting next to trumps right is so on p...,ChapoTrapHouse
4,Those are good reasons. Canada is a great coun...,politics


In [8]:
subs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3618557 entries, 0 to 3618556
Data columns (total 2 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   body       object
 1   subreddit  object
dtypes: object(2)
memory usage: 55.2+ MB


## The 'subreddit' column will be the y value-- get a sense of the breakdown in the dataset of the various subreddits each post (row) comes from:

In [5]:
subs['subreddit'].value_counts()

politics          2379546
The_Donald         878217
ChapoTrapHouse     348552
TheRedPill          12242
Name: subreddit, dtype: int64

In [6]:
subs['subreddit'].value_counts(normalize=True)

politics          0.657595
The_Donald        0.242698
ChapoTrapHouse    0.096323
TheRedPill        0.003383
Name: subreddit, dtype: float64

## This is a very large dataset (more than 3.5 million posts)

## Get a sample of the dataframe, download to csv in order to get an idea of the content:

In [23]:
sample = subs.sample(500)

In [24]:
sample.to_csv('./data/sample.csv', index=False)

## Delete any rows in the dataset with '[removed]' or '[deleted]' tags:

In [7]:
subs.loc[subs['body'] == '[removed]'].count()

body         175131
subreddit    175131
dtype: int64

In [8]:
subs.loc[subs['body'] == '[deleted]'].count()

body         23503
subreddit    23503
dtype: int64

## Create a sub-dataframe with the above types of rows removed:

In [9]:
clean = subs.loc[subs['body'] != '[removed]']

In [10]:
clean.shape

(3443426, 2)

In [11]:
clean = clean.loc[clean['body'] != '[deleted]']

In [12]:
clean.shape

(3419923, 2)

## Drop all rows with empty/NAN values:

In [13]:
clean.isna().sum()

body         118824
subreddit         0
dtype: int64

In [14]:
clean.dropna(inplace=True)

In [15]:
clean.isna().sum()

body         0
subreddit    0
dtype: int64

In [16]:
clean.shape

(3301099, 2)

In [17]:
clean = clean.reset_index(drop=True)

## After dropping all observations with empty or uninformative content, the cleaned dataset still has more than three million values.

## Delete any rows in the dataset that were created by bots:

In [18]:
discard = ["I am a bot, and this action was performed automatically"]
clean = clean[~clean.body.str.contains('|'.join(discard))]

# from:
# https://www.statology.org/pandas-drop-rows-that-contain-string/

In [19]:
clean.shape

# lost about 200_000 rows with this, still over three million left in the dataset

(3196580, 2)

## Create a new column with the word count of the associated post content for each row:

In [20]:
clean['word_count'] = clean['body'].map(lambda x: len(x.split(' ')))

# from:
# Katie Sylvia Breakfast Hour week6 NLP Practice

In [21]:
clean.head()

Unnamed: 0,body,subreddit,word_count
0,die hard trump supporter but i dont think thos...,The_Donald,13
1,His family begged them to intervene.,The_Donald,6
2,my man sitting next to trumps right is so on p...,ChapoTrapHouse,11
3,Those are good reasons. Canada is a great coun...,politics,9
4,Oy! The Father of Lies title is taken.\nour lo...,politics,22


## Create another sub-dataframe from the cleaned dataset with just the 'ChapoTrapHouse', 'The_Donald' and 'TheRedPill' subreddits, as these are the ones most likely with extremist language. (The 'politics' subreddit likely has extreme language as well but is interspersed with normal language, and after looking it over I would not classify it as an extremist subreddit.)

In [22]:
extrem = clean[clean['subreddit'] != 'politics']

In [23]:
extrem.shape

(1050497, 3)

In [24]:
extrem['subreddit'].value_counts()

The_Donald        720476
ChapoTrapHouse    319872
TheRedPill         10149
Name: subreddit, dtype: int64

In [25]:
extrem = extrem.reset_index(drop=True)

## Create a sub-dataframe from the cleaned dataset with just the 'politics' subreddit content, potentially to be combined into the model's X as a non-extremist category.

In [26]:
non_extrem = clean[clean['subreddit'] == 'politics']

In [27]:
non_extrem.shape

(2146083, 3)

In [28]:
non_extrem = non_extrem.reset_index(drop=True)

## Create functions to pre-process the content of the 'body' column-- the actual subreddit posts:

In [65]:
# create a function to tokenize, remove stop words, punctuation and special chars-

def nlp_tokenize(content):
    # instantiate tokenizer and stemmer-
    p_stem = PorterStemmer()
    my_tokenizer = RegexpTokenizer("[\w']+|\$[\d\.]+") # Katie Sylvia Breakfast Hour week6 NLP Practice
    
    # tokenize words and make lower case-
    words = my_tokenizer.tokenize(content.lower())

    # remove stop words-
    sw_list = stopwords.words('english')
    non_stop = [word for word in words if word not in sw_list] 
    
    return ' '.join(non_stop)
    
# from:
# Katie Sylvia Breakfast Hour week6 NLP Practice

In [68]:
# create a function to stem the words in addition to all the other processing, above-

def nlp_stem(content):
    # instantiate tokenizer and stemmer-
    p_stem = PorterStemmer()
    my_tokenizer = RegexpTokenizer("[\w']+|\$[\d\.]+") # Katie Sylvia Breakfast Hour week6 NLP Practice
    
    # tokenize words and make lower case-
    words = my_tokenizer.tokenize(content.lower())

    # remove stop words-
    sw_list = stopwords.words('english')
    non_stop = [word for word in words if word not in sw_list] 
    
    # stem-
    stemmed_list = [p_stem.stem(word) for word in non_stop]

    # stitch the list back into a string and return-
    return ' '.join(stemmed_list)
    
# from:
# Katie Sylvia Breakfast Hour week6 NLP Practice

In [73]:
# run extrem df thru nlp preprocessing function, creating new tokenized column-

extrem['tokenized'] = extrem['body'].map(nlp_tokenize)

In [74]:
# run extrem df thru the stemming function, creating new clean_content column-

extrem['clean_content'] = extrem['body'].map(nlp_stem)

# these both took a really long time so I'm going to hold off on running it across the entire non_extrem dataset for now

In [76]:
extrem.head()

Unnamed: 0,body,subreddit,word_count,tokenized,clean_content
0,die hard trump supporter but i dont think thos...,The_Donald,13,die hard trump supporter dont think videos amo...,die hard trump support dont think video amount...
1,His family begged them to intervene.,The_Donald,6,family begged intervene,famili beg interven
2,my man sitting next to trumps right is so on p...,ChapoTrapHouse,11,man sitting next trumps right point,man sit next trump right point
3,What a bitch,ChapoTrapHouse,3,bitch,bitch
4,We'll have advance notice of them planning to ...,The_Donald,28,we'll advance notice planning attack children ...,we'll advanc notic plan attack children wealth...


## Find the 50 observations in the extrem dataset with the highest word counts, check for junk rows that you can remove:

In [81]:
extrem.sort_values(['word_count'], ascending=False)[['subreddit', 'clean_content', 'word_count']].head(50)

Unnamed: 0,subreddit,clean_content,word_count
32554,ChapoTrapHouse,ok boomer ok boomer ok boomer ok boomer ok boo...,8004
1011207,ChapoTrapHouse,john galt speech ayn rand atla shrug twelv yea...,7070
409483,ChapoTrapHouse,ii izw wx ix wmnz zznxmw w w mn zm w nw x w ii...,6643
903805,ChapoTrapHouse,neoliber exist concept within academ circl nea...,4270
536470,The_Donald,trump' accomplish creat histor econom boom due...,4186
78533,ChapoTrapHouse,1929 cultur lost secretari state henri stimson...,4177
76049,ChapoTrapHouse,metal gear solid militari masculin homoerotica...,3700
1043375,The_Donald,perpetu polit institut address young men' lyce...,3526
700621,TheRedPill,mayb bit like social event parti gather wed et...,3477
884341,TheRedPill,decid pull nbsp talk girl attract determin ide...,2923


In [82]:
# delete any junk rows with large outlying word counts-

rows = extrem.index[[32554, 409483, 256400, 179602, 891236, 289727]]

extrem = extrem.drop(rows)

# from:
# https://www.kite.com/python/answers/how-to-drop-a-list-of-rows-from-a-pandas-dataframe-by-index-in-python

In [83]:
extrem = extrem.reset_index(drop=True)

In [84]:
extrem.shape

(1050491, 6)

## Do the same for the non_extrem dataset:

In [85]:
non_extrem.sort_values(['word_count'], ascending=False)[['subreddit', 'body', 'word_count']].head(50)

# no junk rows to remove in this list

Unnamed: 0,subreddit,body,word_count
1938809,politics,RE: polito’s rightward beltway tilt\nI hear yo...,1817
1516401,politics,",\nThailand is no longer a country of rice far...",1707
1721010,politics,>There is not a very large financial barrier t...,1651
1798499,politics,"**Trump's record on the military, veterans, an...",1644
464098,politics,> Who gets to design the protocol and platform...,1643
1912624,politics,"**Trump's record on the military, veterans, an...",1642
794030,politics,https://www.nytimes.com/2000/08/09/nyregion/br...,1637
1524827,politics,Feb. 22 2017: \nProPublica’s Raymond Bonner r...,1635
2092632,politics,For when they talk about how civies would get ...,1631
1672685,politics,1). They seize the lands of sovereign nations ...,1620


## Since this is such a large dataset, create a subset of just 40_000 observations, split evenly between non-extremist and extremist subreddits:

In [86]:
extrem_subset = extrem.sample(20000)

In [87]:
non_extrem_subset = non_extrem.sample(20000)

In [88]:
# run the non_extrem_subset through the two NLP cleaning/stemming functions so that its columns match
# the extrem subset-

non_extrem_subset['tokenized'] = non_extrem_subset['body'].map(nlp_tokenize)

In [89]:
non_extrem_subset['clean_content'] = non_extrem_subset['body'].map(nlp_stem)

## Add a column marking whether the row in question is from an extremist subreddit (1 for any from 'ChapoTrapHouse', 'The_Donald' and 'TheRedPill' subreddits, 0 for any from the 'politics' subreddit)

In [98]:
extrem_subset['extreme'] = 1

In [99]:
non_extrem_subset['extreme'] = 0

In [100]:
non_extrem_subset.columns

Index(['body', 'subreddit', 'word_count', 'tokenized', 'clean_content',
       'extreme'],
      dtype='object')

In [101]:
extrem_subset.columns

Index(['body', 'subreddit', 'word_count', 'tokenized', 'clean_content',
       'extreme'],
      dtype='object')

In [102]:
# concatenate the two subsets into one df-

subreddits_full = pd.concat([extrem_subset, non_extrem_subset])

In [104]:
subreddits_full.shape

(40000, 6)

In [103]:
# shuffle-

subreddits = subreddits_full.sample(frac=1).reset_index(drop=True)

# from:
# https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows

In [105]:
subreddits.shape

(40000, 6)

In [107]:
subreddits.head(10)

Unnamed: 0,body,subreddit,word_count,tokenized,clean_content,extreme
0,- Official slogan of Pepsi,politics,5,official slogan pepsi,offici slogan pepsi,0
1,"""took"" should be ""freed""",ChapoTrapHouse,4,took freed,took freed,1
2,Will tribes be completely free speech?,TheRedPill,6,tribes completely free speech,tribe complet free speech,1
3,Not an accident. They have crowned their queen...,The_Donald,25,accident crowned queen already warren others g...,accid crown queen alreadi warren other get pla...,1
4,Fucking love this man,The_Donald,4,fucking love man,fuck love man,1
5,We're well past the point of embarassment.\nAt...,The_Donald,25,we're well past point embarassment point kind ...,we'r well past point embarass point kind perve...,1
6,I absolutely cannot wait till the lasers get f...,The_Donald,17,absolutely cannot wait till lasers get focused...,absolut cannot wait till laser get focus warre...,1
7,She offered CTU a contract that would put a nu...,politics,43,offered ctu contract would put nurse social wo...,offer ctu contract would put nurs social worke...,0
8,Ha i get it now...it is funny.,The_Donald,7,ha get funny,ha get funni,1
9,> RICO \nRepublicans In Cells Only?,politics,6,rico republicans cells,rico republican cell,0


## Export the datasets as csvs for further processing in subsequent notebooks:

In [108]:
subreddits.to_csv('./data/subreddits.csv', index=False)

In [109]:
extrem.to_csv('./data/extreme.csv', index=False)

In [110]:
# non_extrem.to_csv('./data/nonextreme.csv', index=False)

In [111]:
clean.to_csv('./data/reddit_clean.csv', index=False)