#Tableau Prep

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [2]:
## MLJ: Additional Extras
import os
import time
import itertools
import json
import pickle

In [3]:
# root in
root_in = "../../data/conditioned/corpus_vocabs/"
# root out
root_out = "../../viz/data/"

In [4]:
# adapted from https://justgagan.wordpress.com/2010/09/22/python-create-path-or-directories-if-not-exist/
def assureDirExists(path):
    d = os.path.dirname(path)
    if not os.path.exists(d):
        os.makedirs(d)

In [5]:
# make sure key directories exist
assureDirExists(root_in)
assureDirExists(root_out)

In [6]:
# function to ensure elements in list are ascii
def listAsAscii(lst):
    return [x.encode('ascii','ignore') if isinstance(x, unicode) else x for x in lst]

In [7]:
# function to sort dataframe decsending is the default
def sortDataframe(df,sort_col,ascending=False):
    return df.sort(columns=sort_col, ascending=ascending)

In [8]:
# function for loading dictionary json to columnar dataframe
def jsonDictToDataframe(json_name, key_col_label="key", val_col_label="value", root_in=root_in):
    # read to json
    with open(root_in + json_name, 'r') as fp:
        j = json.load(fp)
    
    d = {key_col_label: listAsAscii(j.keys()), val_col_label: listAsAscii(j.values())}
    return pd.DataFrame(data=d)    

In [9]:
# function for loading list of list pairs json to columnar dataframe
def jsonListOfPairListsToDataframe(json_name, key_col_label="key", val_col_label="value", root_in=root_in):
    # read to json
    with open(root_in + json_name, 'r') as fp:
        j = json.load(fp)
    
    keys = []
    values = []
    for x in j:
        keys.append(x[0])
        values.append(x[1])
        
    d = {key_col_label: listAsAscii(keys), val_col_label: listAsAscii(values)}
    return pd.DataFrame(data=d)    

In [10]:
# function for saving dataframe to csv
def dataframeToCsv(df, csv_name, root_out=root_out, index=False):
    df.to_csv(root_out+csv_name,index=index)     

In [11]:
# function for json dict to csv
def jsonDictToCsv(json_name, csv_name, key_col_label="key", val_col_label="value",
                  root_in=root_in, root_out=root_out, index=False, sort_col=None):
    # json to df
    df = jsonDictToDataframe(json_name, key_col_label=key_col_label, val_col_label=val_col_label,
                             root_in=root_in)
    # handle sort
    if sort_col:
        df = sortDataframe(df,sort_col)
    
    # df to csv
    dataframeToCsv(df, csv_name, root_out=root_out, index=index)
    return df

In [12]:
# function for json list of lists containing 2 entries to csv
def jsonListOfPairListsToCsv(json_name, csv_name, key_col_label="key", val_col_label="value",
                  root_in=root_in, root_out=root_out, index=False, sort_col=None):
    # json to df
    df = jsonListOfPairListsToDataframe(json_name, key_col_label=key_col_label, val_col_label=val_col_label,
                                        root_in=root_in)
    # handle sort
    if sort_col:
        df = sortDataframe(df,sort_col)
    
    # df to csv
    dataframeToCsv(df, csv_name, root_out=root_out, index=index)
    return df

##N-Gram (Normal)

In [13]:
name=None #this will get set for each conversion
key_col_label = "word" #this will not change for n-gram
val_col_label = "count" #this will not change for n-gram

###Noun

In [14]:
name="noun-n-gram"
nngramdf = jsonDictToCsv(name+".json",name+".csv", key_col_label=key_col_label, val_col_label=val_col_label, sort_col=val_col_label)

![noun n-gram](../../viz/noun_n-gram.jpg)

###Adjective

In [15]:
name="adj-n-gram"
angramdf = jsonDictToCsv(name+".json",name+".csv", key_col_label=key_col_label, val_col_label=val_col_label, sort_col=val_col_label)

![adjective n-gram](../../viz/adj_n-gram.jpg)

##N-Gram (Reduced)

###Noun

In [16]:
name="noun_n-gram_reduced"
nngramreducedf = jsonListOfPairListsToCsv(name+".json",name+".csv", key_col_label=key_col_label, val_col_label=val_col_label, sort_col=val_col_label)

![noun n-gram reduced](../../viz/noun_n-gram_reduced.jpg)

###Adjective

In [17]:
name="adj_n-gram_reduced"
angramreducedf = jsonListOfPairListsToCsv(name+".json",name+".csv", key_col_label=key_col_label, val_col_label=val_col_label, sort_col=val_col_label)

![adjective n-gram reduced](../../viz/noun_n-gram_reduced.jpg)

##Combine n-gram with n-gram reduced

###Nouns

In [18]:
nngramdf.shape

(5144, 2)

In [19]:
nngramreducedf.shape

(5144, 2)

In [20]:
nngramjoindf = nngramdf.copy(deep=True)

In [21]:
nngramjoindf['rcount'] = nngramreducedf['count'].values

In [22]:
nngramjoindf.head()

Unnamed: 0,count,word,rcount
1389,2390,love,788
4006,1665,baby,731
1649,1583,girl,629
1218,1544,time,567
1709,1097,thing,533


In [23]:
dataframeToCsv(nngramjoindf,'noun_n-grams_combined.csv')

###Adjectives

In [24]:
angramdf.shape

(3379, 2)

In [25]:
angramreducedf.shape

(3379, 2)

In [26]:
angramjoindf = angramdf.copy(deep=True)

In [27]:
angramjoindf['rcount'] = angramreducedf['count'].values

In [28]:
nngramjoindf.head()

Unnamed: 0,count,word,rcount
1389,2390,love,788
4006,1665,baby,731
1649,1583,girl,629
1218,1544,time,567
1709,1097,thing,533


In [29]:
dataframeToCsv(angramjoindf,'adj_n-grams_combined.csv')

##Decades (CSV Generation only)

In [30]:
decades = [1970,1980,1990,2000,2010]

def makeDecadeCsvs(decade):
    # change root in for decade
    drootin = "../../data/conditioned/decades/"+str(decade)+"/"
    drootout = root_out+"decades/"+str(decade)+"/"
    
    assureDirExists(drootout)
    
    for name in ["noun-n-gram","adj-n-gram"]:
        jsonDictToCsv(name+".json",name+".csv", key_col_label=key_col_label, val_col_label=val_col_label, 
                      sort_col=val_col_label, root_in=drootin, root_out=drootout)
    
    for name in ["noun_n-gram_reduced","adj_n-gram_reduced"]:
        jsonListOfPairListsToCsv(name+".json",name+".csv", key_col_label=key_col_label, val_col_label=val_col_label,
                      sort_col=val_col_label, root_in=drootin, root_out=drootout)

In [31]:
for d in decades:
    makeDecadeCsvs(d)

##Comparison across Decades
Compare n-grams over decades by counting appearance of words over each decade

In [32]:
# load master noun and adj dict
with open(root_in + 'noun_n-gram_reduced.json', 'r') as fp:
    noun_ngram_reduced = json.load(fp)
    
with open(root_in + 'adj_n-gram_reduced.json', 'r') as fp:
    adj_ngram_reduced = json.load(fp)

In [33]:
len(noun_ngram_reduced)

5144

In [34]:
noun_ngram_reduced[0]

[u'jockin', 1]

In [35]:
len(adj_ngram_reduced)

3379

In [36]:
adj_ngram_reduced[0]

[u'suicidal', 2]

In [37]:
# set up a structure for each 

ncomp = {}
acomp = {}

# initialize ncomp to hold all words with 0 value for each decade
for x in noun_ngram_reduced:    
    ncomp[x[0]]=[0,0,0,0,0]

# initialize acomp to hold all words with 0 value for each decade
for x in adj_ngram_reduced:
    acomp[x[0]]=[0,0,0,0,0]    

In [38]:
# get the count for each word

def populateDecadeWords(comp,decade,json_name):
    
    # set decade
    didx = decades.index(decade)
    
    # change root in for decade
    drootin = "../../data/conditioned/decades/"+str(decade)+"/"
    
    # read to json
    with open(drootin + json_name, 'r') as fp:
        j = json.load(fp)
    
    #set decade value for each in j
    for x in j:
        comp[x[0]][didx] = x[1]
    
    return comp
    

In [39]:
# ncomp for nouns
for d in decades:
    ncomp = populateDecadeWords(ncomp,d,'noun_n-gram_reduced.json')

In [40]:
#verify ncomp
print ncomp.keys()[0]
print ncomp[ncomp.keys()[0]]

jockin
[0, 0, 1, 0, 0]


In [41]:
# acomp for adjs
for d in decades:
    acomp = populateDecadeWords(acomp,d,'adj_n-gram_reduced.json')

In [42]:
#verify acomp
print acomp.keys()[0]
print acomp[acomp.keys()[0]]

limited
[0, 0, 0, 1, 1]


In [43]:
# populate a column full of a given decades values from a comp
def compCol(comp,decade):
    didx = decades.index(decade)
    vs = []
    for k,v in comp.iteritems():
        vs.append(v[didx])
        
    return vs

# function to convert comp to dataframe and save
def compToDataframe(comp):
    d = {'word': listAsAscii(comp.keys())}
    
    for decade in decades:
        d[str(decade)] = compCol(comp,decade)
    
    return pd.DataFrame(data=d)

In [44]:
ncompdf = compToDataframe(ncomp)

In [45]:
ncompdf.head()

Unnamed: 0,1970,1980,1990,2000,2010,word
0,0,0,1,0,0,jockin
1,0,0,0,1,0,inning
2,0,0,1,0,0,girl(oh
3,1,1,1,2,1,yellow
4,1,0,0,0,0,sleet


In [46]:
acompdf = compToDataframe(acomp)

In [47]:
acompdf.head()

Unnamed: 0,1970,1980,1990,2000,2010,word
0,0,0,0,1,1,limited
1,0,0,0,0,1,our-our-our-ou-ou-ours
2,0,0,1,1,0,suicidal
3,0,0,0,1,0,ri-dic-dic-dic-ulous
4,0,0,1,2,0,dynamic


###Save Comp to CSV

In [48]:
# ncompdf
dataframeToCsv(ncompdf,'noun_decade_comp_reduced.csv')

In [49]:
# acompdf
dataframeToCsv(acompdf,'adj_decade_comp_reduced.csv')

##Word-Counts for Appearances in 1 or more decade

In [71]:
def countAppearances(comp):
    
    spanning_dict = {}
    spanning_count_dict = {}
    
    #init
    for x in range(0,6):
        spanning_dict['spanning-'+str(x)] = []
        spanning_count_dict['spanning-'+str(x)] = {}
    
    print "len spanning and spanning count dicts --> ", len(spanning_dict)    
    
    word_count_dict = {} # hold raw word counts
    
    # build up decade spanning words (1-5)
    # keept track of counts
    for word,decades in comp.iteritems():
        c = 0
        for dc in decades:
            if dc: #meaning, if the value is > 0
                c += 1
                # keep track of actual total use of word
                if word in word_count_dict:
                    word_count_dict[word] += dc
                else:
                    word_count_dict[word] = dc
                    
        spanning_dict['spanning-'+str(c)].append(word)
            
    # get the 5 decade counts right for sorting
    for sk,sv in spanning_dict.iteritems():
        for v in sv:
            spanning_count_dict[sk][v] = word_count_dict[v]
        
    return spanning_dict, spanning_count_dict, word_count_dict 

In [72]:
ncs = countAppearances(ncomp)

len spanning and spanning count dicts -->  6


In [76]:
ncs[1]['spanning-5']

{u'air': 55,
 u'angel': 25,
 u'answer': 18,
 u'anybody': 18,
 u'arm': 90,
 u'babe': 54,
 u'baby': 629,
 u'ball': 27,
 u'band': 22,
 u'bar': 44,
 u'beach': 16,
 u'beat': 52,
 u'beating': 13,
 u'beauty': 18,
 u'bed': 47,
 u'beer': 18,
 u'bird': 25,
 u'bit': 124,
 u'blast': 10,
 u'block': 30,
 u'blood': 26,
 u'blow': 33,
 u'blue': 41,
 u'board': 9,
 u'body': 145,
 u'bone': 24,
 u'book': 18,
 u'boot': 16,
 u'bottle': 35,
 u'bout': 93,
 u'boy': 283,
 u'brain': 29,
 u'brand': 80,
 u'bread': 13,
 u'break': 27,
 u'breath': 26,
 u'breeze': 19,
 u'brother': 47,
 u'bunch': 10,
 u'burning': 10,
 u'bus': 10,
 u'business': 15,
 u'cab': 5,
 u'candle': 9,
 u'car': 104,
 u'card': 11,
 u'care': 16,
 u'case': 31,
 u'cat': 41,
 u'cause': 285,
 u'chance': 66,
 u'change': 46,
 u'chick': 65,
 u'child': 73,
 u'circle': 10,
 u'city': 54,
 u'clock': 11,
 u'clothe': 40,
 u'cloud': 20,
 u'clown': 7,
 u'coffee': 8,
 u'cold': 32,
 u'color': 23,
 u'come': 15,
 u'control': 33,
 u'conversation': 21,
 u'cool': 25,
 u'c

In [79]:
acs = countAppearances(acomp)

len spanning and spanning count dicts -->  6


In [81]:
acs[1]['spanning-5']

{u'able': 14,
 u'afraid': 62,
 u'alive': 107,
 u'answer': 12,
 u'ashamed': 17,
 u'average': 11,
 u'bad': 286,
 u'beat': 50,
 u'beautiful': 99,
 u'best': 228,
 u'better': 233,
 u'big': 339,
 u'bigger': 29,
 u'biggest': 14,
 u'bitter': 15,
 u'black': 135,
 u'blind': 75,
 u'blue': 136,
 u'break': 32,
 u'bright': 79,
 u'brighter': 21,
 u'broken': 85,
 u'brown': 46,
 u'busy': 31,
 u'certain': 17,
 u'cheap': 19,
 u'cherry': 14,
 u'clean': 42,
 u'clear': 73,
 u'clearer': 9,
 u'close': 29,
 u'closed': 12,
 u'closer': 47,
 u'closest': 5,
 u'cold': 167,
 u'colder': 14,
 u'come': 45,
 u'common': 17,
 u'complete': 22,
 u'confused': 16,
 u'cool': 82,
 u'corner': 11,
 u'crazy': 226,
 u'cruel': 17,
 u'cute': 35,
 u'damn': 33,
 u'damned': 9,
 u'dancin': 10,
 u'dark': 133,
 u'darkest': 17,
 u'daytime': 7,
 u'dead': 89,
 u'dear': 24,
 u'deep': 168,
 u'deeper': 23,
 u'different': 106,
 u'dirty': 76,
 u'divine': 14,
 u'double': 31,
 u'drunk': 36,
 u'dry': 25,
 u'dumb': 22,
 u'early': 17,
 u'east': 22,
 u'

###Save a histogram of  results

In [55]:
def appearanceCountsToHistogram(ctuple):
    d = {}
    idx = 0 #want to effectively start with 1
    for c in ctuple:
        idx += 1
        d['{}-decade'.format(idx)] = [len(c)]
    
    return pd.DataFrame(data=d)

In [56]:
ncshistdf = appearanceCountsToHistogram(ncs)
ncshistdf.head()

Unnamed: 0,1-decade,2-decade,3-decade,4-decade,5-decade
0,2994,913,490,328,419


In [57]:
dataframeToCsv(ncshistdf,'noun_decade_count.csv')

In [58]:
acshistdf = appearanceCountsToHistogram(acs)
acshistdf.head()

Unnamed: 0,1-decade,2-decade,3-decade,4-decade,5-decade
0,2127,547,273,172,260


In [59]:
dataframeToCsv(acshistdf,'adj_decade_count.csv')

###Dump the 5-decade words

In [82]:
ns5s = ncs[1]['spanning-5']
dataframeToCsv(pd.DataFrame(data={'5-decade':ns5s.keys(), 'count':ns5s.values()}),'nouns_5-decade_spanners.csv')

In [83]:
as5s = acs[1]['spanning-5']
dataframeToCsv(pd.DataFrame(data={'5-decade':as5s.keys(), 'count':as5s.values()}),'adjs_5-decade_spanners.csv')

##Offensive Word-Counts for Appearances in 1 or more decade
**These were prepped in [Profanity-Extraction Notebook](Profanity-Extraction.ipynb)**

##Hypernym and Synonym Prep
**These were prepped in [Vocab-Shrunk Notebook](Vocab-Shrunk.ipynb)**

##Vector Ensemble Prep
**These were prepped in [Vector-Ensemble Notebook](Vector-Ensemble.ipynb)**