#Tableau Prep

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [2]:
## MLJ: Additional Extras
import os
import time
import itertools
import json
import pickle

In [3]:
# root in
root_in = "../../data/conditioned/corpus_vocabs/"
# root out
root_out = "../../viz/data/"

In [4]:
# adapted from https://justgagan.wordpress.com/2010/09/22/python-create-path-or-directories-if-not-exist/
def assureDirExists(path):
    d = os.path.dirname(path)
    if not os.path.exists(d):
        os.makedirs(d)

In [5]:
# make sure key directories exist
assureDirExists(root_in)
assureDirExists(root_out)

In [6]:
# function to ensure elements in list are ascii
def listAsAscii(lst):
    return [x.encode('ascii','ignore') if isinstance(x, unicode) else x for x in lst]

In [7]:
# function to sort dataframe decsending is the default
def sortDataframe(df,sort_col,ascending=False):
    return df.sort(columns=sort_col, ascending=ascending)

In [8]:
# function for loading dictionary json to columnar dataframe
def jsonDictToDataframe(json_name, key_col_label="key", val_col_label="value", root_in=root_in):
    # read to json
    with open(root_in + json_name, 'r') as fp:
        j = json.load(fp)
    
    d = {key_col_label: listAsAscii(j.keys()), val_col_label: listAsAscii(j.values())}
    return pd.DataFrame(data=d)    

In [9]:
# function for loading list of list pairs json to columnar dataframe
def jsonListOfPairListsToDataframe(json_name, key_col_label="key", val_col_label="value", root_in=root_in):
    # read to json
    with open(root_in + json_name, 'r') as fp:
        j = json.load(fp)
    
    keys = []
    values = []
    for x in j:
        keys.append(x[0])
        values.append(x[1])
        
    d = {key_col_label: listAsAscii(keys), val_col_label: listAsAscii(values)}
    return pd.DataFrame(data=d)    

In [10]:
# function for saving dataframe to csv
def dataframeToCsv(df, csv_name, root_out=root_out, index=False):
    df.to_csv(root_out+csv_name,index=index)     

In [73]:
# function for json dict to csv
def jsonDictToCsv(json_name, csv_name, key_col_label="key", val_col_label="value",
                  root_in=root_in, root_out=root_out, index=False, sort_col=None):
    # json to df
    df = jsonDictToDataframe(json_name, key_col_label=key_col_label, val_col_label=val_col_label,
                             root_in=root_in)
    # handle sort
    if sort_col:
        df = sortDataframe(df,sort_col)
    
    # df to csv
    dataframeToCsv(df, csv_name, root_out=root_out, index=index)
    return df

In [74]:
# function for json list of lists containing 2 entries to csv
def jsonListOfPairListsToCsv(json_name, csv_name, key_col_label="key", val_col_label="value",
                  root_in=root_in, root_out=root_out, index=False, sort_col=None):
    # json to df
    df = jsonListOfPairListsToDataframe(json_name, key_col_label=key_col_label, val_col_label=val_col_label,
                                        root_in=root_in)
    # handle sort
    if sort_col:
        df = sortDataframe(df,sort_col)
    
    # df to csv
    dataframeToCsv(df, csv_name, root_out=root_out, index=index)
    return df

##N-Gram (Normal)

In [13]:
name=None #this will get set for each conversion
key_col_label = "word" #this will not change for n-gram
val_col_label = "count" #this will not change for n-gram

###Noun

In [75]:
name="noun-n-gram"
nngramdf = jsonDictToCsv(name+".json",name+".csv", key_col_label=key_col_label, val_col_label=val_col_label, sort_col=val_col_label)

![noun n-gram](../../viz/noun_n-gram.jpg)

###Adjective

In [76]:
name="adj-n-gram"
angramdf = jsonDictToCsv(name+".json",name+".csv", key_col_label=key_col_label, val_col_label=val_col_label, sort_col=val_col_label)

![adjective n-gram](../../viz/adj_n-gram.jpg)

##N-Gram (Reduced)

###Noun

In [77]:
name="noun_n-gram_reduced"
nngramreducedf = jsonListOfPairListsToCsv(name+".json",name+".csv", key_col_label=key_col_label, val_col_label=val_col_label, sort_col=val_col_label)

![noun n-gram reduced](../../viz/noun_n-gram_reduced.jpg)

###Adjective

In [78]:
name="adj_n-gram_reduced"
angramreducedf = jsonListOfPairListsToCsv(name+".json",name+".csv", key_col_label=key_col_label, val_col_label=val_col_label, sort_col=val_col_label)

![adjective n-gram reduced](../../viz/noun_n-gram_reduced.jpg)

##Combine n-gram with n-gram reduced

###Nouns

In [80]:
nngramdf.shape

(5144, 2)

In [82]:
nngramreducedf.shape

(5144, 2)

In [84]:
nngramjoindf = nngramdf.copy(deep=True)

In [86]:
nngramjoindf['rcount'] = nngramreducedf['count'].values

In [88]:
nngramjoindf.head()

Unnamed: 0,count,word,rcount
1389,2390,love,788
4006,1665,baby,731
1649,1583,girl,629
1218,1544,time,567
1709,1097,thing,533


In [89]:
dataframeToCsv(nngramjoindf,'noun_n-grams_combined.csv')

###Adjectives

In [81]:
angramdf.shape

(3379, 2)

In [83]:
angramreducedf.shape

(3379, 2)

In [90]:
angramjoindf = angramdf.copy(deep=True)

In [91]:
angramjoindf['rcount'] = angramreducedf['count'].values

In [92]:
nngramjoindf.head()

Unnamed: 0,count,word,rcount
1389,2390,love,788
4006,1665,baby,731
1649,1583,girl,629
1218,1544,time,567
1709,1097,thing,533


In [93]:
dataframeToCsv(angramjoindf,'adj_n-grams_combined.csv')

##Decades (CSV Generation only)

In [18]:
decades = [1970,1980,1990,2000,2010]

def makeDecadeCsvs(decade):
    # change root in for decade
    drootin = "../../data/conditioned/decades/"+str(decade)+"/"
    drootout = root_out+"decades/"+str(decade)+"/"
    
    assureDirExists(drootout)
    
    for name in ["noun-n-gram","adj-n-gram"]:
        jsonDictToCsv(name+".json",name+".csv", key_col_label=key_col_label, val_col_label=val_col_label, 
                      sort_col=val_col_label, root_in=drootin, root_out=drootout)
    
    for name in ["noun_n-gram_reduced","adj_n-gram_reduced"]:
        jsonListOfPairListsToCsv(name+".json",name+".csv", key_col_label=key_col_label, val_col_label=val_col_label,
                      sort_col=val_col_label, root_in=drootin, root_out=drootout)

In [19]:
for d in decades:
    makeDecadeCsvs(d)

##Comparison across Decades
Compare n-grams over decades by counting appearance of words over each decade

In [20]:
# load master noun and adj dict
with open(root_in + 'noun_n-gram_reduced.json', 'r') as fp:
    noun_ngram_reduced = json.load(fp)
    
with open(root_in + 'adj_n-gram_reduced.json', 'r') as fp:
    adj_ngram_reduced = json.load(fp)

In [21]:
len(noun_ngram_reduced)

5144

In [24]:
noun_ngram_reduced[0]

[u'jockin', 1]

In [22]:
len(adj_ngram_reduced)

3379

In [26]:
adj_ngram_reduced[0]

[u'suicidal', 2]

In [25]:
# set up a structure for each 

ncomp = {}
acomp = {}

# initialize ncomp to hold all words with 0 value for each decade
for x in noun_ngram_reduced:    
    ncomp[x[0]]=[0,0,0,0,0]

# initialize acomp to hold all words with 0 value for each decade
for x in adj_ngram_reduced:
    acomp[x[0]]=[0,0,0,0,0]    

In [33]:
# get the count for each word

def populateDecadeWords(comp,decade,json_name):
    
    # set decade
    didx = decades.index(decade)
    
    # change root in for decade
    drootin = "../../data/conditioned/decades/"+str(decade)+"/"
    
    # read to json
    with open(drootin + json_name, 'r') as fp:
        j = json.load(fp)
    
    #set decade value for each in j
    for x in j:
        comp[x[0]][didx] = x[1]
    
    return comp
    

In [34]:
# ncomp for nouns
for d in decades:
    ncomp = populateDecadeWords(ncomp,d,'noun_n-gram_reduced.json')

In [35]:
#verify ncomp
print ncomp.keys()[0]
print ncomp[ncomp.keys()[0]]

jockin
[0, 0, 1, 0, 0]


In [36]:
# acomp for adjs
for d in decades:
    acomp = populateDecadeWords(acomp,d,'adj_n-gram_reduced.json')

In [37]:
#verify acomp
print acomp.keys()[0]
print acomp[acomp.keys()[0]]

limited
[0, 0, 0, 1, 1]


In [44]:
# populate a column full of a given decades values from a comp
def compCol(comp,decade):
    didx = decades.index(decade)
    vs = []
    for k,v in comp.iteritems():
        vs.append(v[didx])
        
    return vs

# function to convert comp to dataframe and save
def compToDataframe(comp):
    d = {'word': listAsAscii(comp.keys())}
    
    for decade in decades:
        d[str(decade)] = compCol(comp,decade)
    
    return pd.DataFrame(data=d)

In [45]:
ncompdf = compToDataframe(ncomp)

In [46]:
ncompdf.head()

Unnamed: 0,1970,1980,1990,2000,2010,word
0,0,0,1,0,0,jockin
1,0,0,0,1,0,inning
2,0,0,1,0,0,girl(oh
3,1,1,1,2,1,yellow
4,1,0,0,0,0,sleet


In [47]:
acompdf = compToDataframe(acomp)

In [48]:
acompdf.head()

Unnamed: 0,1970,1980,1990,2000,2010,word
0,0,0,0,1,1,limited
1,0,0,0,0,1,our-our-our-ou-ou-ours
2,0,0,1,1,0,suicidal
3,0,0,0,1,0,ri-dic-dic-dic-ulous
4,0,0,1,2,0,dynamic


###Save Comp to CSV

In [49]:
# ncompdf
dataframeToCsv(ncompdf,'noun_decade_comp_reduced.csv')

In [50]:
# acompdf
dataframeToCsv(acompdf,'adj_decade_comp_reduced.csv')

##Word-Counts for Appearances in 1 or more decade

In [54]:
def countAppearances(comp):
    c1 = []
    c2 = []
    c3 = []
    c4 = []
    c5 = []
    
    for k,v in comp.iteritems():
        c = 0
        for x in v:
            if x:
                c += 1
        if x == 5:
            c5.append(k)
        elif x == 4:
            c4.append(k)
        elif x == 3:
            c3.append(k)
        elif x == 2:
            c2.append(k)
        elif x == 1:
            c1.append(k)
    return c1,c2,c3,c4,c5

In [55]:
ncs = countAppearances(ncomp)

In [56]:
ncs[4]

[u'half',
 u'end',
 u'beauty',
 u'wine',
 u'ball',
 u'gon',
 u'sippin',
 u'mouth',
 u'flow',
 u'team',
 u'ice',
 u'trouble',
 u'water',
 u'memory',
 u'ooh',
 u'feelin',
 u'work',
 u'suit',
 u'college',
 u'morning',
 u'sound',
 u'pop',
 u'start',
 u'building',
 u'trippin',
 u'kick',
 u'dress',
 u'beer',
 u'plan',
 u'buzz',
 u'pound',
 u'livin',
 u'hater',
 u'phone',
 u'jump',
 u'cloud',
 u'high',
 u'skirt',
 u'animal',
 u'bag',
 u'type',
 u'drunk',
 u'brain',
 u'record',
 u'tattoo',
 u'crack',
 u'wish']

In [57]:
acs = countAppearances(acomp)

In [58]:
acs[4]

[u'worth',
 u'dry',
 u'golden',
 u'fresher',
 u'break',
 u'twisted',
 u'rich',
 u'hotter',
 u'welcome',
 u'baddest',
 u'american',
 u'clean',
 u'nervous',
 u'movin',
 u'wait',
 u'fake',
 u'animal',
 u'light',
 u'wet',
 u'lean']

###Save a histogram of  results

In [66]:
def appearanceCountsToHistogram(ctuple):
    d = {}
    idx = 0 #want to effectively start with 1
    for c in ctuple:
        idx += 1
        d['{}-decade'.format(idx)] = [len(c)]
    
    return pd.DataFrame(data=d)

In [67]:
ncshistdf = appearanceCountsToHistogram(ncs)
ncshistdf.head()

Unnamed: 0,1-decade,2-decade,3-decade,4-decade,5-decade
0,944,264,121,68,47


In [68]:
dataframeToCsv(ncshistdf,'noun_decade_count.csv')

In [69]:
acshistdf = appearanceCountsToHistogram(acs)
acshistdf.head()

Unnamed: 0,1-decade,2-decade,3-decade,4-decade,5-decade
0,557,136,58,36,20


In [70]:
dataframeToCsv(acshistdf,'adj_decade_count.csv')

###Dump the 5-decade words

In [71]:
dataframeToCsv(pd.DataFrame(data={'5-decade':ncs[4]}),'nouns_5-decade_spanners.csv')

In [72]:
dataframeToCsv(pd.DataFrame(data={'5-decade':acs[4]}),'adjs_5-decade_spanners.csv')

##Offensive Word-Counts for Appearances in 1 or more decade
**These were prepped in [Profanity-Extraction Notebook](Profanity-Extraction.ipynb)**