# Vocab Shrunk Notebook
This notebook works through a series of vocabulary-shrinking steps, starting from the noun- and adjective-reduced vocabs. It first considers synonyms and how much they shrink the vocabulary. It then takes that initial shrunken result as the starting point for considering hypernyms.

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
# pandas display options: allow wide output (500 characters, up to 100 columns)
# before pandas wraps or elides columns.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
# Render DataFrames as HTML tables when displayed in the notebook.
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# seaborn defaults for every plot below: "whitegrid" style, "poster" scaling context.
sns.set_style("whitegrid")
sns.set_context("poster")

In [2]:
## MLJ: Additional Extras
import os
import time
import itertools
import json
import pickle

In [3]:
# Decades covered by the analysis. jsonLoadVocabs iterates over this list and
# uses each value both as a directory name and as a key of the per-decade dict.
decades = [1970,1980,1990,2000,2010]

In [4]:
# Input root: default directory prefix that jsonLoad reads JSON files from.
# The trailing slash is required -- file names are appended by plain string
# concatenation, not os.path.join.
root_in = "../../data/conditioned/corpus_vocabs/"
# Output root: default directory prefix that dataframeToCsv writes CSV files to
# (same trailing-slash requirement).
root_out = "../../viz/data/"

In [5]:
# adapted from https://justgagan.wordpress.com/2010/09/22/python-create-path-or-directories-if-not-exist/
def assureDirExists(path):
    """Create the directory portion of ``path`` if it does not exist yet.

    Only ``os.path.dirname(path)`` is created, so a directory path must end
    with a separator (e.g. ``"out/data/"``) for its last component to be
    included; for ``"out/data/file.csv"`` the directory ``out/data`` is made.

    path -- a file path, or a directory path with a trailing separator.

    A path with no directory component (e.g. ``"file.csv"``) refers to the
    current directory, so nothing is created. Returns None.
    """
    d = os.path.dirname(path)
    # dirname is '' for a bare file name and os.makedirs('') would raise.
    if not d or os.path.exists(d):
        return
    try:
        os.makedirs(d)
    except OSError:
        # Something else may have created the directory between the check and
        # the call; only propagate the error if it is really still missing.
        if not os.path.isdir(d):
            raise

In [6]:
# Make sure the input and output roots exist before anything is read or written.
# assureDirExists creates os.path.dirname(path); both roots end with "/", so the
# full directory path is what gets created.
assureDirExists(root_in)
assureDirExists(root_out)

In [7]:
# function to ensure elements in list are ascii
def listAsAscii(lst):
    """Return a new list in which every text element is reduced to ASCII.

    Text elements (``unicode`` on Python 2, ``str`` on Python 3) are encoded
    with ``'ascii'`` / ``'ignore'``, i.e. non-ASCII characters are dropped.
    Every other element (numbers, None, byte strings, ...) is passed through
    unchanged. The result always holds native ``str`` objects for the
    converted elements, on either Python version.

    lst -- any iterable of values.
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3 has no separate unicode type
    out = []
    for x in lst:
        if isinstance(x, text_type):
            x = x.encode('ascii', 'ignore')
            # On Python 3 encode() yields bytes; convert back to a native str
            # so the values stay usable as ordinary strings downstream.
            if not isinstance(x, str):
                x = x.decode('ascii')
        out.append(x)
    return out

In [8]:
# function to sort a dataframe by a column; descending is the default
def sortDataframe(df,sort_col,ascending=False):
    """Return a copy of ``df`` sorted by ``sort_col``.

    df        -- pandas DataFrame (not modified)
    sort_col  -- column label, or list of labels, to sort by
    ascending -- sort direction; the default (False) sorts descending
    """
    # DataFrame.sort(columns=...) was deprecated in pandas 0.17 and later
    # removed; sort_values is its replacement.
    if hasattr(df, 'sort_values'):
        return df.sort_values(by=sort_col, ascending=ascending)
    # pandas older than 0.17 has no sort_values; keep the legacy call working.
    return df.sort(columns=sort_col, ascending=ascending)

In [9]:
def jsonLoad(json_name,root_in=root_in):
    """Parse the JSON file at ``root_in + json_name`` and return the result.

    json_name -- file name; appended verbatim to root_in
    root_in   -- directory prefix (must already end with a path separator);
                 defaults to the module-level root_in
    """
    json_path = root_in + json_name
    with open(json_path, 'r') as handle:
        return json.load(handle)

In [10]:
# function for loading dictionary json to columnar dataframe
def jsonDictToDataframe(json_name, key_col_label="key", val_col_label="value", root_in=root_in):
    """Load a JSON dictionary and return it as a two-column DataFrame.

    One row is produced per dictionary entry: the keys go into the column
    named ``key_col_label`` and the values into ``val_col_label``. Both
    columns are passed through listAsAscii first.

    json_name -- file name handed to jsonLoad
    root_in   -- directory prefix handed to jsonLoad
    """
    mapping = jsonLoad(json_name, root_in)

    key_column = listAsAscii(mapping.keys())
    value_column = listAsAscii(mapping.values())

    columns = {key_col_label: key_column, val_col_label: value_column}
    return pd.DataFrame(data=columns)

In [11]:
# function for loading list of list pairs json to columnar dataframe
def jsonListOfPairListsToDataframe(json_name, key_col_label="key", val_col_label="value", root_in=root_in):
    """Load a JSON list of ``[first, second]`` pairs as a two-column DataFrame.

    Element 0 of every pair goes into the column named ``key_col_label`` and
    element 1 into ``val_col_label``; both columns are passed through
    listAsAscii first.

    json_name -- file name handed to jsonLoad
    root_in   -- directory prefix handed to jsonLoad
    """
    pairs = jsonLoad(json_name, root_in)

    firsts = listAsAscii([pair[0] for pair in pairs])
    seconds = listAsAscii([pair[1] for pair in pairs])

    return pd.DataFrame(data={key_col_label: firsts, val_col_label: seconds})

In [12]:
# function for saving dataframe to csv
def dataframeToCsv(df, csv_name, root_out=root_out, index=False):
    """Write ``df`` as CSV to ``root_out + csv_name``.

    df       -- DataFrame to save
    csv_name -- file name; appended verbatim to root_out
    root_out -- directory prefix (must already end with a path separator)
    index    -- forwarded to DataFrame.to_csv; False omits the index column
    """
    csv_path = root_out + csv_name
    df.to_csv(csv_path, index=index)

In [13]:
# function for json dict to csv
def jsonDictToCsv(json_name, csv_name, key_col_label="key", val_col_label="value",
                  root_in=root_in, root_out=root_out, index=False, sort_col=None):
    """Convert a JSON dictionary file to CSV and return the DataFrame written.

    json_name / root_in  -- input file name and directory prefix
    csv_name / root_out  -- output file name and directory prefix
    key_col_label, val_col_label -- names of the two output columns
    index    -- whether the DataFrame index is written to the CSV
    sort_col -- if truthy, sort by this column via sortDataframe (its default
                order, descending, is used) before writing
    """
    frame = jsonDictToDataframe(json_name, key_col_label=key_col_label,
                                val_col_label=val_col_label, root_in=root_in)

    # sorting is optional; without sort_col the frame is written as loaded
    result = sortDataframe(frame, sort_col) if sort_col else frame

    dataframeToCsv(result, csv_name, root_out=root_out, index=index)
    return result

In [14]:
# function for json list of lists containing 2 entries to csv
def jsonListOfPairListsToCsv(json_name, csv_name, key_col_label="key", val_col_label="value",
                  root_in=root_in, root_out=root_out, index=False, sort_col=None):
    """Convert a JSON list-of-pairs file to CSV and return the DataFrame written.

    json_name / root_in  -- input file name and directory prefix
    csv_name / root_out  -- output file name and directory prefix
    key_col_label, val_col_label -- names of the two output columns
    index    -- whether the DataFrame index is written to the CSV
    sort_col -- if truthy, sort by this column via sortDataframe (its default
                order, descending, is used) before writing
    """
    frame = jsonListOfPairListsToDataframe(json_name, key_col_label=key_col_label,
                                           val_col_label=val_col_label, root_in=root_in)

    # sorting is optional; without sort_col the frame is written as loaded
    result = sortDataframe(frame, sort_col) if sort_col else frame

    dataframeToCsv(result, csv_name, root_out=root_out, index=index)
    return result

In [19]:
def jsonLoadVocabs(json_name):
    """Load the corpus-level file plus one same-named file per decade.

    json_name -- file name looked up under the default root_in (corpus level)
                 and under "../../data/conditioned/decades/<decade>/" for
                 every entry of the module-level ``decades`` list.

    Returns (corpus_data, per_decade) where per_decade maps each decade value
    to the object loaded from that decade's directory.
    """
    corpus_data = jsonLoad(json_name)

    # same file name, but read from each decade's own directory
    per_decade = {
        year: jsonLoad(json_name, root_in="../../data/conditioned/decades/"+str(year)+"/")
        for year in decades
    }

    return corpus_data, per_decade

## Load Vocabs

In [20]:
# LOAD VOCABS
# jsonLoadVocabs returns a pair: the corpus-level object (c*) and a dict keyed
# by decade (d*), both loaded from files with the given name.
cnvocab, dnvocabs = jsonLoadVocabs("nounvocab.json")
cavocab, davocabs = jsonLoadVocabs("adjvocab.json")    

In [29]:
# Sanity check on what was loaded: corpus-level sizes and the per-decade dicts.
# Each print() gets a single pre-formatted string, so the cell runs on both
# Python 2 and Python 3 and emits exactly the same text as before.
print("How big in corpus noun vocab?  %d" % len(cnvocab))
print("len decade keys (expect 5) -->  %d" % len(dnvocabs.keys()))
# list(...) because dict.keys() cannot be indexed on Python 3; which key comes
# first depends on the dict's iteration order.
print("decade keys()[0] -->  %s" % (list(dnvocabs.keys())[0],))
print("")
print("How big in corpus adj vocab?  %d" % len(cavocab))
print("len decade keys (expect 5) -->  %d" % len(davocabs.keys()))
print("decade keys()[0] -->  %s" % (list(davocabs.keys())[0],))

How big in corpus noun vocab?  5144
len decade keys (expect 5) -->  5
decade keys()[0] -->  2000

How big in corpus adj vocab?  3379
len decade keys (expect 5) -->  5
decade keys()[0] -->  2000


## Load Synonyms

In [30]:
# LOAD SYNONYMS
# Same loader as for the vocabs: corpus-level object (c*) plus a dict keyed by
# decade (d*). synEval below reads only the values of the corpus-level dicts.
cnsyn, dnsyns = jsonLoadVocabs("nsyns.json")
casyn, dasyns = jsonLoadVocabs("asyns.json")   

In [31]:
# Sanity check on the synonym maps: corpus-level sizes and the per-decade dicts.
# Each print() gets a single pre-formatted string, so the cell runs on both
# Python 2 and Python 3 and emits exactly the same text as before.
print("How big in corpus noun syns?  %d" % len(cnsyn))
print("len decade keys (expect 5) -->  %d" % len(dnsyns.keys()))
# list(...) because dict.keys() cannot be indexed on Python 3; which key comes
# first depends on the dict's iteration order.
print("decade keys()[0] -->  %s" % (list(dnsyns.keys())[0],))
print("")
print("How big in corpus adj syns?  %d" % len(casyn))
print("len decade keys (expect 5) -->  %d" % len(dasyns.keys()))
print("decade keys()[0] -->  %s" % (list(dasyns.keys())[0],))

How big in corpus noun syns?  3580
len decade keys (expect 5) -->  5
decade keys()[0] -->  2000

How big in corpus adj syns?  1707
len decade keys (expect 5) -->  5
decade keys()[0] -->  2000


## Shrunken-1: From Vocab Down to Synonyms

In [47]:
def synEval(csyn):
    """Count how many entries of ``csyn`` share each value.

    csyn -- dict; only its values are inspected, and they must be hashable.
            (Presumably word -> synonym -- confirm against the JSON files.)

    Returns a pair ``(u, m)``:
      u -- dict mapping each distinct value to the number of keys carrying it
      m -- the entries of ``u`` whose count is 2 or more ("multiples")
    """
    u = {}
    # values() works on Python 2 and 3 alike (iteritems() is Python 2 only),
    # and the keys were never used by the count.
    for v in csyn.values():
        u[v] = u.get(v, 0) + 1
    # a value is a multiple exactly when it was seen more than once, and its
    # multiple count equals its total count
    m = {v: n for v, n in u.items() if n > 1}
    return u,m

In [51]:
# how many unique noun synonyms
# n_s: distinct noun synonyms with their counts; n_ms: those shared by 2+ words
n_s, n_ms = synEval(cnsyn)

# single pre-formatted strings keep the output identical on Python 2 and 3
print("How many unique nouns (when using synonyms)?  %d" % len(n_s))
print("How many multiples?  %d" % len(n_ms))

print(n_ms)


How many unique nouns (when using synonyms)?  3230
How many multiples?  292
{u'shop': 2, u'impression': 2, u'bait': 2, u'summer': 2, u'bull': 2, u'urine': 3, u'intuition': 2, u'aroma': 2, u'chink': 2, u'catch': 2, u'fink': 2, u'sleep': 3, u'fillet': 3, u'battle': 2, u'defender': 2, u'speed': 2, u'wage': 3, u'buddy': 2, u'head': 2, u'vibration': 2, u'filth': 2, u'drive': 2, u'pickup': 2, u'pile': 2, u'fad': 2, u'daze': 2, u'crack': 2, u'tune': 2, u'smile': 2, u'criminal': 3, u'hate': 2, u'lookout': 3, u'good': 2, u'hang-up': 2, u'couple': 2, u'material': 2, u'kind': 2, u'clang': 2, u'choice': 3, u'dark': 2, u'lunch': 2, u'spoon': 2, u'buttocks': 4, u'fan': 2, u'breast': 3, u'basement': 2, u'bartender': 2, u'bit': 2, u'jesus': 2, u'twilight': 2, u'day': 2, u'rumor': 2, u'knock': 3, u'die': 2, u'bulge': 2, u'sofa': 3, u'cry': 2, u'freshness': 2, u'morning': 2, u'bag': 3, u'nigger': 2, u'phase': 2, u'macintosh': 2, u'rock': 2, u'guy': 2, u'rear': 2, u'inside': 2, u'draw': 2, u'sweetheart':

In [50]:
# how many unique adj synonyms
# a_s: distinct adjective synonyms with their counts; a_ms: those shared by 2+ words
a_s, a_ms = synEval(casyn)

# The label previously said "nouns" (copied from the noun cell) although this
# cell reports on the adjective synonyms.
# Single pre-formatted strings keep the cell runnable on Python 2 and 3.
print("How many unique adjectives (when using synonyms)?  %d" % len(a_s))
print("How many multiples?  %d" % len(a_ms))

print(a_ms)

How many unique nouns (when using synonyms)?  1502
How many multiples?  167
{u'exclusive': 2, u'brumous': 2, u'diffident': 2, u'bum': 2, u'domestic': 2, u'lavish': 2, u'distant': 2, u'grateful': 2, u'rough': 2, u'religious': 2, u'fifth': 2, u'fit': 2, u'dramatic': 2, u'fitting': 2, u'besotted': 2, u'permanent': 2, u'black': 2, u'bushy': 2, u'bang-up': 4, u'deadly': 2, u'bigheaded': 2, u'cutting': 2, u'dreamy': 2, u'frigid': 2, u'awful': 2, u'farthermost': 2, u'grim': 2, u'bigger': 2, u'entire': 2, u'colored': 3, u'crisp': 4, u'lost': 2, u'large': 2, u'common': 2, u'double': 2, u'popular': 2, u'obscure': 2, u'ignored': 2, u'small': 2, u'colossal': 2, u'eighteenth': 2, u'dead': 2, u'extremist': 2, u'fabulous': 2, u'bare': 5, u'corrupt': 2, u'ablaze': 3, u'divine': 2, u'aroused': 4, u'casual': 2, u'blue': 4, u'bantam': 2, u'ill-famed': 2, u'instantaneous': 2, u'critical': 2, u'bogus': 3, u'crude': 2, u'burned': 2, u'red': 5, u'hairy': 2, u'ferocious': 3, u'sixth': 2, u'seventh': 2, u'brok

In [None]:
# WHAT ELSE?

## Load Hypernyms

In [None]:
#LOAD HYPERNYMS
#TODO

In [None]:
# Disabled scaffold: the code is kept inside an unassigned string so that it
# does not execute. The second loop originally re-initialised ncomp; it now
# fills acomp, as its comment says it should.
# NOTE(review): `offensives` is not defined in the cells visible here -- it has
# to be defined or loaded before this scaffold can be enabled.
"""
# set up a structure for each
ncomp = {}
acomp = {}

# initialize ncomp to hold all words with 0 value for each decade
for x in offensives:    
    ncomp[x]=[0,0,0,0,0] 
    
# initialize acomp to hold all words with 0 value for each decade
for x in offensives:    
    acomp[x]=[0,0,0,0,0] 
"""