# ASOIAF/GoT Reddit Posts - Pt. 2

># EDA

### Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline

### Load Data

In [2]:
## loading in all of our data
day_1 = pd.read_csv('./data/day_1_reddit_scraping_indexed.csv')
day_2 = pd.read_csv('./data/day_2_reddit_scrapping.csv')
day_3 = pd.read_csv('./data/day_3_reddit_scraping.csv')
day_4 = pd.read_csv('./data/day_4_reddit_scraping.csv')
day_5 = pd.read_csv('./data/day_5_reddit_scraping.csv')
day_6 = pd.read_csv('./data/day_6_reddit_scraping.csv')
day_7 = pd.read_csv('./data/day_7_reddit_scraping.csv')

In [3]:
## concatenating all our datasets
data = pd.concat([day_1, day_2, day_3, day_4, day_5, day_6, day_7], ignore_index=True)

In [4]:
## viewing the shape of our data, we'll need to drop our indexed column
data.shape

(23771, 3)

In [5]:
## looking at our initial data
data.head()

Unnamed: 0.1,Unnamed: 0,post_text,subreddit
0,0,Welcome to the Weekly Q &amp; A! Feel free to ...,asoiaf
1,1,It's happened to all of us.\n\nYou come across...,asoiaf
2,2,Something thats always bothered me is Tywin's ...,asoiaf
3,3,Apologies if this has been posted before.\n\nI...,asoiaf
4,4,One of the things I was sorry not to get more ...,asoiaf


In [6]:
## getting rid of the "unnamed" column
data.drop("Unnamed: 0", axis=1, inplace = True)

In [7]:
## locating the duplicate text in our posts
data = data.loc[pd.DataFrame.duplicated(data) == False]

In [8]:
## verifying that there is no more duplicates
pd.DataFrame.duplicated(data).value_counts()

False    1542
dtype: int64

In [9]:
## showing corrected header
data.head()

Unnamed: 0,post_text,subreddit
0,Welcome to the Weekly Q &amp; A! Feel free to ...,asoiaf
1,It's happened to all of us.\n\nYou come across...,asoiaf
2,Something thats always bothered me is Tywin's ...,asoiaf
3,Apologies if this has been posted before.\n\nI...,asoiaf
4,One of the things I was sorry not to get more ...,asoiaf


In [10]:
## checking our value counts
data.subreddit.value_counts()

asoiaf           939
gameofthrones    603
Name: subreddit, dtype: int64

### Pre-processing

In [11]:
## no null values
data.isnull().sum().sum()

0

In [12]:
## viewing the shape of our data
data.shape

(1542, 2)

In [13]:
data.describe()

Unnamed: 0,post_text,subreddit
count,1542,1542
unique,1534,2
top,"Remember one year ago, back in May 2018, [Emil...",asoiaf
freq,2,939


### Attempt at Visualizing with Spacy / Scattertext

In [14]:
import scattertext as st
import re, io
from pprint import pprint
from scipy.stats import rankdata, hmean, norm
import spacy
import os, pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
display(HTML("&lt;style>.container { width:98% !important; }&lt;/style>"))

from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

from scattertext import CorpusFromPandas, produce_scattertext_explorer

In [15]:
## loading in spaCy
nlp = spacy.load('en_core_web_sm')

In [16]:
## creating our parsed/tokenized text
data['parsed_post'] = data['post_text'].apply(nlp)

In [17]:
## showing new column
data.head()

Unnamed: 0,post_text,subreddit,parsed_post
0,Welcome to the Weekly Q &amp; A! Feel free to ...,asoiaf,"(Welcome, to, the, Weekly, Q, &, amp, ;, A, !,..."
1,It's happened to all of us.\n\nYou come across...,asoiaf,"(It, 's, happened, to, all, of, us, ., \n\n, Y..."
2,Something thats always bothered me is Tywin's ...,asoiaf,"(Something, that, s, always, bothered, me, is,..."
3,Apologies if this has been posted before.\n\nI...,asoiaf,"(Apologies, if, this, has, been, posted, befor..."
4,One of the things I was sorry not to get more ...,asoiaf,"(One, of, the, things, I, was, sorry, not, to,..."


In [18]:
## loading spacy stop_words
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
print('Number of stop words: %d' % len(spacy_stopwords))
print('First ten stop words: %s' % list(spacy_stopwords)[:10])

Number of stop words: 305
First ten stop words: ['again', 'itself', 'forty', 'never', 'mostly', 'becomes', 'whither', 'does', 'must', 'above']


In [20]:
## creating a corpus to use for our scattertext plot
corpus = st.CorpusFromPandas(data, category_col='subreddit', text_col='post_text', nlp=nlp).build()

In [21]:
## The most frequent terms in our corpus
list(corpus.get_scaled_f_scores_vs_background().index[:10])

['daenerys',
 'tyrion',
 'dany',
 'cersei',
 'westeros',
 'grrm',
 'sansa',
 'arya',
 'stannis',
 'targaryen']

In [22]:
## turning our scattertext corpus into a dataframe
term_freq_df = corpus.get_term_freq_df() 

## creating scaled scores for 'asoiaf'
term_freq_df['asoiaf_scaled'] = corpus.get_scaled_f_scores(category='asoiaf') 

## creating scaled scores for 'got'
term_freq_df['got_scaled'] = corpus.get_scaled_f_scores(category='gameofthrones') 

## showing the top scaled 'asoiaf' values
term_freq_df.sort_values(by='asoiaf_scaled', ascending=False).head(10)

Unnamed: 0_level_0,asoiaf freq,gameofthrones freq,asoiaf_scaled,got_scaled
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gt &,248,0,1.0,0.0
gt,512,1,0.999463,0.000537
& gt,509,1,0.99946,0.00054
_ _,2765,112,0.987778,0.012222
_,2952,122,0.987524,0.012476
aegon,199,8,0.987126,0.012874
asoiaf,208,9,0.986467,0.013533
adwd,139,1,0.985323,0.014677
%,160,7,0.981706,0.018294
stannis,262,17,0.980129,0.019871


In [23]:
## creating a column of scaled value differences
term_freq_df['scaled_diff'] = abs(term_freq_df["asoiaf_scaled"].add(-term_freq_df["got_scaled"]))

In [24]:
## separating out our values that occur frequently in both subreddit
stop_phrases = term_freq_df[term_freq_df['scaled_diff'] <= 0.1]
stop_phrases.shape

(153198, 5)

In [25]:
stop_phrases.head()

Unnamed: 0_level_0,asoiaf freq,gameofthrones freq,asoiaf_scaled,got_scaled,scaled_diff
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
welcome,10,3,0.545105,0.454895,0.090211
q,4,0,0.544458,0.455542,0.088916
welcome to,3,0,0.537795,0.462205,0.07559
the weekly,2,0,0.531108,0.468892,0.062217
weekly q,2,0,0.531108,0.468892,0.062217


In [26]:
## making the dataframe only the index (i.e. just the words)
stop_phrases2 = stop_phrases.drop(labels=['asoiaf freq', 
                                          'gameofthrones freq', 
                                          'asoiaf_scaled', 
                                          'got_scaled', 
                                          'scaled_diff'], 
                                 axis=1)
stop_phrases2.head()

welcome
q
welcome to
the weekly
weekly q


In [27]:
## setting our list of stop_phrases to a csv, so we can turn them into a list
## we will later use this list in our model hypertuning
stop_phrases2.to_csv('stop_list.csv', index=True) 

In [29]:
## creating an html 'scattertext' based on scaled data
## code is from scattertext's creator, Jason Kessler
html = produce_scattertext_explorer(corpus,
                                    category='asoiaf',
                                    category_name='ASOIAF',
                                    not_category_name='Game of Thrones',
                                    width_in_pixels=1200,
                                    show_characteristic = False,
                                    minimum_term_frequency=5,
                                   )
file_name = './data/ASOIAF_GoT_Scattertext_Scale.html'  ## saving html file
open(file_name, 'wb').write(html.encode('utf-8'))  ## pulling file from scattertext library
IFrame(src=file_name, width = 1400, height=1000)  ## using Iframe to display our file

In [30]:
## creating an html 'scattertext' based on scaled data
## code is from scattertext's creator, Jason Kessler
html = produce_scattertext_explorer(corpus,
                                    category='asoiaf',
                                    category_name='ASOIAF',
                                    not_category_name='Game of Thrones',
                                    width_in_pixels=1200,
                                    minimum_term_frequency=5,
                                    show_characteristic = False,
                                    transform=st.Scalers.log_scale_standardize
                                   )
file_name = './data/ASOIAF_GoT_Scattertext_Log.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1400, height=1000)

###### Export to CSV

In [31]:
data.to_csv('unique_data.csv', index=False)