# Vocab Shrunk Notebook
This notebook goes through a series of vocabulary-shrinking steps, beginning with the reduced noun and adjective vocabs. It first considers synonyms and their shrinkage effect. It then works from that initial shrunken result to consider hypernyms.

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [2]:
## MLJ: Additional Extras
import os
import time
import itertools
import json
import pickle

In [3]:
# Decades analysed; helpers in this notebook use a decade's position in this list
# (via `decades.index`) as its slot in the per-decade count lists.
decades = [1970,1980,1990,2000,2010]

In [4]:
# root in: default directory prefix the corpus-level vocab files are read from
# (string-concatenated with the file name, so it must end with "/")
root_in = "../../data/conditioned/corpus_vocabs/"
# root out: default directory prefix the CSV exports are written to
# (string-concatenated with the file name, so it must end with "/")
root_out = "../../viz/data/"

In [5]:
# adapted from https://justgagan.wordpress.com/2010/09/22/python-create-path-or-directories-if-not-exist/
def assureDirExists(path):
    """Create the directory portion of `path` (with any missing parents) if absent.

    Only `os.path.dirname(path)` is created, so a directory path must end with
    a separator (e.g. "data/out/") for its last component to be included.
    A path without any directory component (e.g. "file.txt") is a no-op.
    """
    d = os.path.dirname(path)
    # dirname is "" for a bare file name; os.makedirs("") would raise
    if d and not os.path.exists(d):
        os.makedirs(d)

In [6]:
# make sure key directories exist
# (both roots end with "/", so dirname() yields the full directory to create)
assureDirExists(root_in)
assureDirExists(root_out)

In [7]:
# function to ensure elements in list are ascii
def listAsAscii(lst):
    """Return a new list in which every `unicode` element is ASCII-encoded.

    Characters with no ASCII representation are dropped (the 'ignore' error
    handler); elements that are not `unicode` are passed through unchanged.
    """
    cleaned = []
    for item in lst:
        if isinstance(item, unicode):
            item = item.encode('ascii','ignore')
        cleaned.append(item)
    return cleaned

In [8]:
# function to sort dataframe; descending is the default
def sortDataframe(df,sort_col,ascending=False):
    """Return a copy of `df` sorted by `sort_col`.

    Parameters:
        df: the dataframe to sort (not modified).
        sort_col: column label (or list of labels) to sort by.
        ascending: sort direction; defaults to False (descending).
    """
    # DataFrame.sort was deprecated in pandas 0.17 and later removed;
    # prefer sort_values and fall back only on installs that predate it.
    if hasattr(df, 'sort_values'):
        return df.sort_values(by=sort_col, ascending=ascending)
    return df.sort(columns=sort_col, ascending=ascending)

In [9]:
def jsonLoad(json_name,root_in=root_in):
    """Parse and return the JSON document stored at `root_in + json_name`.

    `root_in` is prepended by plain string concatenation, so it is expected to
    end with a path separator.
    """
    source_path = root_in + json_name
    with open(source_path, 'r') as handle:
        return json.load(handle)

In [10]:
# function for loading dictionary json to columnar dataframe
def jsonDictToDataframe(json_name, key_col_label="key", val_col_label="value", root_in=root_in):
    """Load a JSON object and return it as a two-column dataframe.

    The object's keys go into the `key_col_label` column and its values into
    the `val_col_label` column; both are passed through `listAsAscii` first.
    """
    loaded = jsonLoad(json_name,root_in)
    columns = {
        key_col_label: listAsAscii(loaded.keys()),
        val_col_label: listAsAscii(loaded.values()),
    }
    return pd.DataFrame(data=columns)

In [11]:
# function for loading list of list pairs json to columnar dataframe
def jsonListOfPairListsToDataframe(json_name, key_col_label="key", val_col_label="value", root_in=root_in):
    """Load a JSON list of [key, value] entries and return it as a two-column dataframe.

    Element 0 of each entry goes into the `key_col_label` column and element 1
    into the `val_col_label` column; both columns are passed through
    `listAsAscii` first.
    """
    entries = jsonLoad(json_name,root_in)
    firsts = [entry[0] for entry in entries]
    seconds = [entry[1] for entry in entries]
    columns = {key_col_label: listAsAscii(firsts), val_col_label: listAsAscii(seconds)}
    return pd.DataFrame(data=columns)

In [12]:
# function for saving dataframe to csv
def dataframeToCsv(df, csv_name, root_out=root_out, index=False):
    """Write `df` as CSV to `root_out + csv_name`.

    `index` is forwarded to `DataFrame.to_csv` and controls whether the row
    index is written (off by default).
    """
    destination = root_out + csv_name
    df.to_csv(destination, index=index)

In [13]:
# function for json dict to csv
def jsonDictToCsv(json_name, csv_name, key_col_label="key", val_col_label="value",
                  root_in=root_in, root_out=root_out, index=False, sort_col=None):
    """Convert a JSON dict file to CSV and return the intermediate dataframe.

    The JSON is loaded with `jsonDictToDataframe`, sorted with `sortDataframe`
    (its default order) when `sort_col` is truthy, and saved with
    `dataframeToCsv`.
    """
    frame = jsonDictToDataframe(json_name, key_col_label, val_col_label, root_in)
    # no sort column given -> keep the frame in load order
    frame = sortDataframe(frame, sort_col) if sort_col else frame
    dataframeToCsv(frame, csv_name, root_out, index)
    return frame

In [14]:
# function for json list of lists containing 2 entries to csv
def jsonListOfPairListsToCsv(json_name, csv_name, key_col_label="key", val_col_label="value",
                  root_in=root_in, root_out=root_out, index=False, sort_col=None):
    """Convert a JSON list-of-pairs file to CSV and return the intermediate dataframe.

    Loads with `jsonListOfPairListsToDataframe`, sorts with `sortDataframe`
    (its default order) when `sort_col` is truthy, then saves with
    `dataframeToCsv`.
    """
    load_options = dict(key_col_label=key_col_label, val_col_label=val_col_label,
                        root_in=root_in)
    result = jsonListOfPairListsToDataframe(json_name, **load_options)

    if sort_col:
        result = sortDataframe(result, sort_col)

    dataframeToCsv(result, csv_name, root_out=root_out, index=index)
    return result

In [15]:
def jsonLoadVocabs(json_name):
    """Load `json_name` for the whole corpus and for each decade.

    Returns a `(corpus, per_decade)` pair: `corpus` is read from the default
    `root_in` of `jsonLoad`; `per_decade` maps every entry of `decades` to the
    same-named file under "../../data/conditioned/decades/<decade>/".
    """
    corpus = jsonLoad(json_name)
    per_decade = dict(
        (decade, jsonLoad(json_name, root_in="../../data/conditioned/decades/"+str(decade)+"/"))
        for decade in decades
    )
    return corpus, per_decade

In [16]:
def pickleLoad(pickle_name,root_in=root_in):
    """Unpickle and return the object stored at `root_in + pickle_name`.

    NOTE(review): `pickle.load` can execute arbitrary code while loading, so
    this should only be pointed at trusted files.
    """
    # the context manager closes the handle even if unpickling fails
    with open(root_in + pickle_name, "rb") as fp:
        return pickle.load(fp)

In [17]:
def pickleLoadVocabs(pickle_name):
    """Load `pickle_name` for the whole corpus and for each decade.

    Returns a `(corpus, per_decade)` pair: `corpus` is read from the default
    `root_in` of `pickleLoad`; `per_decade` maps every entry of `decades` to
    the same-named file under "../../data/conditioned/decades/<decade>/".
    """
    corpus = pickleLoad(pickle_name)
    per_decade = {}
    for decade in decades:
        decade_root = "../../data/conditioned/decades/"+str(decade)+"/"
        per_decade[decade] = pickleLoad(pickle_name,root_in=decade_root)
    return corpus, per_decade

In [18]:
# a critical function to translate from corpus index to decade index!!!
def findIdForWord(word,vocab):
    """Return the id whose word equals `word`, or -1 when there is none.

    `vocab` may be a mapping of id -> word (such as an id2word dict) or any
    iterable of (id, word) pairs. The first match in iteration order wins.
    """
    # iterating a dict directly yields only its keys, so walk its items instead
    pairs = vocab.items() if hasattr(vocab, 'items') else vocab
    for k,v in pairs:
        if v == word:
            return k
    return -1 # note want to distinguish 0 from None, so using -1 for no results.

In [19]:
# populate a column full of a given decades values from a comp
def compCol(comp,decade):
    """Collect, for every entry of `comp`, the value stored in `decade`'s slot.

    Each value of `comp` is indexed at `decades.index(decade)`; the returned
    list follows `comp.iteritems()` order.
    """
    slot = decades.index(decade)
    return [per_decade[slot] for _, per_decade in comp.iteritems()]

# function to convert comp to dataframe
def compToDataframe(comp):
    """Build a dataframe from `comp` with one row per key.

    Columns are 'word' (the keys of `comp`, passed through `listAsAscii`) plus
    one column per entry of `decades`, named `str(decade)` and filled by
    `compCol`.
    """
    columns = dict((str(decade), compCol(comp,decade)) for decade in decades)
    columns['word'] = listAsAscii(comp.keys())
    return pd.DataFrame(data=columns)

## Load Vocabs

In [20]:
#LOAD VOCABS
# each call returns (corpus-level vocab, dict of per-decade vocabs keyed by decade)
cnvocab, dnvocabs = jsonLoadVocabs("nounvocab.json")
cavocab, davocabs = jsonLoadVocabs("adjvocab.json")    

In [21]:
# sanity check the vocab loads: corpus size, number of decade entries, and one decade key
print "How big in corpus noun vocab? ", len(cnvocab)
print "len decade keys (expect 5) --> ", len(dnvocabs.keys())
print "decade keys()[0] --> ",dnvocabs.keys()[0]
print
print "How big in corpus adj vocab? ", len(cavocab)
print "len decade keys (expect 5) --> ", len(davocabs.keys())
print "decade keys()[0] --> ",davocabs.keys()[0]

How big in corpus noun vocab?  5144
len decade keys (expect 5) -->  5
decade keys()[0] -->  2000

How big in corpus adj vocab?  3379
len decade keys (expect 5) -->  5
decade keys()[0] -->  2000


In [22]:
#LOAD ID2WORDS
# each call returns (corpus-level id2word, dict of per-decade id2words keyed by decade)
cnid2word, dnid2words = jsonLoadVocabs("nounid2word.json")
caid2word, daid2words = jsonLoadVocabs("adjid2word.json")

In [23]:
# sanity check the id2word loads: corpus size, number of decade entries, and one decade key
print "How big in corpus noun id2word? ", len(cnid2word)
print "len decade keys (expect 5) --> ", len(dnid2words.keys())
print "decade keys()[0] --> ",dnid2words.keys()[0]
print
print "How big in corpus adj id2word? ", len(caid2word)
print "len decade keys (expect 5) --> ", len(daid2words.keys())
print "decade keys()[0] --> ",daid2words.keys()[0]

How big in corpus noun id2word?  5144
len decade keys (expect 5) -->  5
decade keys()[0] -->  2000

How big in corpus adj id2word?  3379
len decade keys (expect 5) -->  5
decade keys()[0] -->  2000


## Load Synonyms
**REMEMBER, the individual decade indexing will be different from the master corpus indexing**

In [24]:
#sanity check
# round trip: word -> id through cnvocab, then id -> word through cnid2word (which is keyed by string ids)
print "cnvocab ['sycamore']? ",cnvocab['sycamore']
print "cnid2word ['4446']? ",cnid2word['4446']

cnvocab ['sycamore']?  4446
cnid2word ['4446']?  sycamore


In [25]:
#LOAD SYNONYMS
# each call returns (corpus-level synonyms, dict of per-decade synonyms keyed by decade)
cnsyn, dnsyns = jsonLoadVocabs("nsyns.json")
casyn, dasyns = jsonLoadVocabs("asyns.json")   

In [26]:
# sanity check the synonym loads: corpus size, number of decade entries, and one decade key
print "How big in corpus noun syns? ", len(cnsyn)
print "len decade keys (expect 5) --> ", len(dnsyns.keys())
print "decade keys()[0] --> ",dnsyns.keys()[0]
print
print "How big in corpus adj syns? ", len(casyn)
print "len decade keys (expect 5) --> ", len(dasyns.keys())
print "decade keys()[0] --> ",dasyns.keys()[0]

How big in corpus noun syns?  3580
len decade keys (expect 5) -->  5
decade keys()[0] -->  2000

How big in corpus adj syns?  1707
len decade keys (expect 5) -->  5
decade keys()[0] -->  2000


## Shrunken-1: From Vocab Down to Synonyms

In [27]:
def synEval(csyn):
    """Tally how often each synonym occurs among the values of `csyn`.

    Returns `(unique, multiples)`: `unique` maps every synonym to its number
    of occurrences; `multiples` holds only the synonyms occurring two or more
    times, with the same counts.
    """
    unique = {}
    multiples = {}
    for _, synonym in csyn.iteritems():
        occurrences = unique.get(synonym, 0) + 1
        unique[synonym] = occurrences
        # from the second sighting on, the synonym is shared by several ids
        if occurrences > 1:
            multiples[synonym] = occurrences
    return unique, multiples

In [28]:
# how many unique noun synonyms
# n_s: occurrence count per synonym; n_ms: only the synonyms shared by 2+ ids
n_s, n_ms = synEval(cnsyn)       
        
print "How many unique nouns (when using synonyms)? ", len(n_s)
print "How many multiples? ", len(n_ms)

# NOTE: dumps the whole dict of shared synonyms
print n_ms


How many unique nouns (when using synonyms)?  3230
How many multiples?  292
{u'shop': 2, u'impression': 2, u'bait': 2, u'summer': 2, u'bull': 2, u'urine': 3, u'intuition': 2, u'aroma': 2, u'chink': 2, u'catch': 2, u'fink': 2, u'sleep': 3, u'fillet': 3, u'battle': 2, u'defender': 2, u'speed': 2, u'wage': 3, u'buddy': 2, u'head': 2, u'vibration': 2, u'filth': 2, u'drive': 2, u'pickup': 2, u'pile': 2, u'fad': 2, u'daze': 2, u'crack': 2, u'tune': 2, u'smile': 2, u'criminal': 3, u'hate': 2, u'lookout': 3, u'good': 2, u'hang-up': 2, u'couple': 2, u'material': 2, u'kind': 2, u'clang': 2, u'choice': 3, u'dark': 2, u'lunch': 2, u'spoon': 2, u'buttocks': 4, u'fan': 2, u'breast': 3, u'basement': 2, u'bartender': 2, u'bit': 2, u'jesus': 2, u'twilight': 2, u'day': 2, u'rumor': 2, u'knock': 3, u'die': 2, u'bulge': 2, u'sofa': 3, u'cry': 2, u'freshness': 2, u'morning': 2, u'bag': 3, u'nigger': 2, u'phase': 2, u'macintosh': 2, u'rock': 2, u'guy': 2, u'rear': 2, u'inside': 2, u'draw': 2, u'sweetheart':

**This means:**
* 3,230 nouns are found within the synset, having valid synonyms, each of which we standardized to its first result
* 292 synonyms within that result have common_support or shared synonym use. 
* The remainder of the total 5,144 in the noun vocab set (1,914) are not found in the synset and may potentially be ignored to strengthen subsequent vector analysis.

In [29]:
# how many unique adj synonyms
a_s, a_ms = synEval(casyn)       
        
print "How many unique nouns (when using synonyms)? ", len(a_s)
print "How many multiples? ", len(a_ms)

print a_ms

How many unique nouns (when using synonyms)?  1502
How many multiples?  167
{u'exclusive': 2, u'brumous': 2, u'diffident': 2, u'bum': 2, u'domestic': 2, u'lavish': 2, u'distant': 2, u'grateful': 2, u'rough': 2, u'religious': 2, u'fifth': 2, u'fit': 2, u'dramatic': 2, u'fitting': 2, u'besotted': 2, u'permanent': 2, u'black': 2, u'bushy': 2, u'bang-up': 4, u'deadly': 2, u'bigheaded': 2, u'cutting': 2, u'dreamy': 2, u'frigid': 2, u'awful': 2, u'farthermost': 2, u'grim': 2, u'bigger': 2, u'entire': 2, u'colored': 3, u'crisp': 4, u'lost': 2, u'large': 2, u'common': 2, u'double': 2, u'popular': 2, u'obscure': 2, u'ignored': 2, u'small': 2, u'colossal': 2, u'eighteenth': 2, u'dead': 2, u'extremist': 2, u'fabulous': 2, u'bare': 5, u'corrupt': 2, u'ablaze': 3, u'divine': 2, u'aroused': 4, u'casual': 2, u'blue': 4, u'bantam': 2, u'ill-famed': 2, u'instantaneous': 2, u'critical': 2, u'bogus': 3, u'crude': 2, u'burned': 2, u'red': 5, u'hairy': 2, u'ferocious': 3, u'sixth': 2, u'seventh': 2, u'brok

**This means:**
* 1,502 adjectives are found within the synset, having valid synonyms, each of which we standardized to its first result
* 167 synonyms within that result have common_support or shared synonym use. 
* The remainder of the total 1,707 in the adjective set (205) are not found in the synset and may potentially be ignored to strengthen subsequent vector analysis.

### Viz prep for synonym use
* `ascomp` and `nscomp` variables below will hold the presence of synonyms per decade.
* will not use `dnsyns` and `dasyns` to avoid indexing confusion.

In [30]:
# get the count for each synonym
def populateDecadeSyns(comp,decade,tsyn,tid2word):
    """Record in `comp` how many ids of `decade` map to each corpus synonym.

    Parameters:
        comp: dict of synonym -> per-decade count list; updated in place.
        decade: an entry of `decades`; selects which slot of each list is set.
        tsyn: pair `(corpus_syns, {decade: decade_syns})`, each a dict of
            id -> synonym.
        tid2word: accepted for call compatibility; not used.

    Returns `comp`. Synonyms absent from the decade keep their existing
    value. Synonyms are matched by value because ids are not shared between
    corpus-level and per-decade processing.
    """
    # set decade
    didx = decades.index(decade)

    # set syns which have k=id, v=synonym
    csyn = tsyn[0]
    dsyn = tsyn[1][decade]

    # count each synonym of the decade once up front, instead of rescanning
    # the whole decade dict for every corpus synonym
    decade_counts = {}
    for _, s in dsyn.iteritems():
        decade_counts[s] = decade_counts.get(s, 0) + 1

    # loop over corpus syns k=id, v=synonym
    for _, v in csyn.iteritems():
        found = decade_counts.get(v, 0)
        # only synonyms present at least once in the decade are written
        if found:
            comp[v][didx] = found

    return comp
    

In [31]:
# set up a structure for each
# initialize nscomp to hold every unique noun synonym with a 0 count per decade
# (one fresh list per key, sized from `decades` rather than hard-coded to five)
nscomp = dict((k, [0]*len(decades)) for k in n_s)

# initialize ascomp the same way for the unique adjective synonyms
ascomp = dict((k, [0]*len(decades)) for k in a_s)

In [32]:
# comp for nouns
# fill one decade slot per pass; populateDecadeSyns updates and returns the same dict
for d in decades:
    nscomp = populateDecadeSyns(nscomp,d,(cnsyn,dnsyns),(cnid2word,dnid2words))

In [33]:
#verify nscomp
# spot check: one key and its per-decade counts
print nscomp.keys()[0]
print nscomp[nscomp.keys()[0]]

inning
[0, 0, 0, 1, 0]


In [34]:
# comp for adjs
# fill one decade slot per pass; populateDecadeSyns updates and returns the same dict
for d in decades:
    ascomp = populateDecadeSyns(ascomp,d,(casyn,dasyns),(caid2word,daid2words))

In [35]:
#verify ascomp
# spot check: one key and its per-decade counts
print ascomp.keys()[0]
print ascomp[ascomp.keys()[0]]

limited
[0, 0, 0, 1, 1]


### Synonym Comps to Dataframe

In [36]:
# noun synonym counts as a dataframe: a 'word' column plus one column per decade
nscompdf = compToDataframe(nscomp)

In [37]:
# preview the first ten rows of the noun synonym frame
nscompdf.head(10)

Unnamed: 0,1970,1980,1990,2000,2010,word
0,0,0,0,1,0,inning
1,1,1,1,1,1,yellow
2,0,0,1,0,0,hitch
3,1,0,0,0,0,sleet
4,0,0,0,1,0,obstruction
5,0,0,0,1,0,nursery
6,1,1,1,3,1,sleep
7,1,0,0,1,0,railing
8,1,0,1,1,1,appetite
9,0,0,0,1,1,captain


In [38]:
# adjective synonym counts as a dataframe: a 'word' column plus one column per decade
ascompdf = compToDataframe(ascomp)

In [39]:
# preview the first ten rows of the adjective synonym frame
ascompdf.head(10)

Unnamed: 0,1970,1980,1990,2000,2010,word
0,0,0,0,1,1,limited
1,0,0,1,1,0,dynamic
2,1,1,1,1,1,yellow
3,0,0,1,1,0,sleek
4,1,1,1,1,1,huffy
5,0,0,0,0,1,asian
6,0,0,1,2,0,ill-famed
7,0,0,1,1,0,undisputed
8,0,0,0,1,0,eligible
9,0,0,1,0,0,unanswered


### Save Synonym Comps

In [40]:
# nscompdf: written under the default root_out, without the row index
dataframeToCsv(nscompdf,'noun_decade_comp_synonyms.csv')

In [41]:
# ascompdf: written under the default root_out, without the row index
dataframeToCsv(ascompdf,'adj_decade_comp_synonyms.csv')

### Add synonym columns to master Dataframe
**This follows from the work in [Vector-Ensemble Notebook](Vector-Ensemble.ipynb). It uses the word vector columns to shorten to synonyms.**

In [42]:
# load the latest master lyricsdf
# (the noun_vector / adj_vector columns hold space-separated words and are used below)
vectordf = pd.read_csv("../../data/conditioned/master-lyricsdf-word_vectors.csv")  

In [43]:
# preview the first row to confirm the columns that were loaded
vectordf.head(1)

Unnamed: 0,index,position,year,title.href,title,artist,lyrics,decade,song_key,lyrics_url,lyrics_abstract,noun_vector,adj_vector
0,0,1,1970,https://en.wikipedia.org/wiki/Bridge_over_Trou...,Bridge over Troubled Water,Simon and Garfunkel,When you're weary. Feeling small. When tears a...,1970,1970-1,http://lyrics.wikia.com/Simon_And_Garfunkel:Br...,When you're weary. Feeling small. When tears a...,time bridge water,rough troubled


In [44]:
def vectorToStr(vector):
    """Join the elements of `vector` into one space-separated string.

    `unicode` elements are ASCII-encoded first (non-ASCII characters are
    dropped); all other elements are joined as they are.
    """
    pieces = []
    for token in vector:
        if isinstance(token,unicode):
            token = token.encode('ascii','ignore')
        pieces.append(token)
    return ' '.join(pieces)

In [45]:
# get a synonym vector from a given word vector
def wordVectorToSynVector(wvector,csyn,cid2word):
    """Return the synonyms of the words in `wvector`.

    Parameters:
        wvector: list of words to translate.
        csyn: dict of id -> synonym.
        cid2word: dict of id -> word, covering every id of `csyn`.

    A synonym is emitted for every id of `csyn` whose (non-empty) word occurs
    in `wvector`, so the result follows `csyn` iteration order rather than
    word order, and a synonym shared by several matching words is repeated.
    """
    # a set gives constant-time membership; the test below runs once per
    # csyn entry, so scanning the word list each time adds up
    wanted = set(wvector)

    svector = []

    #loop over corpus syns k=index in id2word, v=synonym
    for k,v in csyn.iteritems():

        #figure out the normal word use for the index
        word = cid2word[k]

        #apply the synonym if present in wordvector
        if word and word in wanted:
            svector.append(v)

    return svector

In [46]:
# work for the noun_vector and adj_vector columns to build synonyms.
def synonymsFromVectorCol(vectordf,vector_col,syn_col,csyn,cid2word):
    """Return `vectordf` joined with a new column of synonym sentences.

    Parameters:
        vectordf: source dataframe (not modified).
        vector_col: column holding space-separated words; a float value
            (i.e. NaN) is treated as "no words".
        syn_col: name of the column to add.
        csyn, cid2word: forwarded to `wordVectorToSynVector`.

    Rows with no words, or whose words yield no synonyms, get NaN in
    `syn_col`; otherwise the synonyms are joined with `vectorToStr`.
    """
    syns = []

    # build the synonyms
    for words in vectordf[vector_col]:
        # a float cell means the value is missing; anything else is text
        wvector = [] if isinstance(words,float) else words.split()

        # find synonyms
        svector = wordVectorToSynVector(wvector,csyn,cid2word) if len(wvector) else []

        # syn vector to sentence (NaN when nothing was found)
        syns.append(vectorToStr(svector) if len(svector) else np.nan)

    # join() aligns on index labels, so the new column must carry vectordf's
    # own index; a default 0..n-1 index would misalign any other frame
    vdf = pd.DataFrame({syn_col: syns}, index=vectordf.index)

    return vectordf.join(vdf)

In [47]:
%%time
#handle noun synonyms
# adds the 'noun_syn_vector' column derived from 'noun_vector'
snvdf = synonymsFromVectorCol(vectordf,'noun_vector','noun_syn_vector',cnsyn,cnid2word)

CPU times: user 17.4 s, sys: 152 ms, total: 17.6 s
Wall time: 17.7 s


In [48]:
# preview the first row to confirm 'noun_syn_vector' was appended
snvdf.head(1)

Unnamed: 0,index,position,year,title.href,title,artist,lyrics,decade,song_key,lyrics_url,lyrics_abstract,noun_vector,adj_vector,noun_syn_vector
0,0,1,1970,https://en.wikipedia.org/wiki/Bridge_over_Trou...,Bridge over Troubled Water,Simon and Garfunkel,When you're weary. Feeling small. When tears a...,1970,1970-1,http://lyrics.wikia.com/Simon_And_Garfunkel:Br...,When you're weary. Feeling small. When tears a...,time bridge water,rough troubled,time bridge water


In [49]:
%%time
#handle adj synonyms
# starts from snvdf so the result carries both synonym columns
sanvdf = synonymsFromVectorCol(snvdf,'adj_vector','adj_syn_vector',casyn,caid2word)

CPU times: user 5.99 s, sys: 27.2 ms, total: 6.01 s
Wall time: 6.06 s


In [50]:
# preview the first row to confirm 'adj_syn_vector' was appended
sanvdf.head(1)

Unnamed: 0,index,position,year,title.href,title,artist,lyrics,decade,song_key,lyrics_url,lyrics_abstract,noun_vector,adj_vector,noun_syn_vector,adj_syn_vector
0,0,1,1970,https://en.wikipedia.org/wiki/Bridge_over_Trou...,Bridge over Troubled Water,Simon and Garfunkel,When you're weary. Feeling small. When tears a...,1970,1970-1,http://lyrics.wikia.com/Simon_And_Garfunkel:Br...,When you're weary. Feeling small. When tears a...,time bridge water,rough troubled,time bridge water,troubled rough


### Save Dataframe Augmented with Synonym Vectors

In [51]:
# written next to the source master file rather than under the default root_out
dataframeToCsv(sanvdf,"master-lyricsdf-word_syn_vectors.csv",root_out="../../data/conditioned/")

## Load Hypernyms

In [52]:
# load noun hypernyms from file
# each call returns (corpus-level hypernyms, dict of per-decade hypernyms keyed by decade)
cnhype, dnhypes = pickleLoadVocabs("nhypes.p")

# load adj hypernyms from file
cahype, dahypes = pickleLoadVocabs("ahypes.p")

In [53]:
# sanity check the hypernym loads: corpus size, number of decade entries, and one decade key
print "How big in corpus noun hypes? ", len(cnhype)
print "len decade keys (expect 5) --> ", len(dnhypes.keys())
print "decade keys()[0] --> ",dnhypes.keys()[0]
print
print "How big in corpus adj hypes? ", len(cahype)
print "len decade keys (expect 5) --> ", len(dahypes.keys())
print "decade keys()[0] --> ",dahypes.keys()[0]

How big in corpus noun hypes?  293
len decade keys (expect 5) -->  5
decade keys()[0] -->  2000

How big in corpus adj hypes?  167
len decade keys (expect 5) -->  5
decade keys()[0] -->  2000


**Hypernym len is sized to the number of shared synonyms (2 or more words collapsing to a synonym)**

In [88]:
# inspect one noun hypernym entry: its key is a pair of ids, looked up in
# cnid2word via str() because id2word is keyed by string ids
print "What is the tuple pair from corpus noun hypernym key[0]? ", cnhype.keys()[0]
print "What is the tuple pair synonyms collapsing down from corpus noun hypernym key[0]? ", cnid2word[str(cnhype.keys()[0][0])], cnid2word[str(cnhype.keys()[0][1])]
print "What is the hypernym value for corpus noun hype at key[0]? ", cnhype[cnhype.keys()[0]]

What is the tuple pair from corpus noun hypernym key[0]?  (4412, 4421)
What is the tuple pair synonyms collapsing down from corpus noun hypernym key[0]?  strategy scheme
What is the hypernym value for corpus noun hype at key[0]?  scheme


In [89]:
print "What is the tuple pair from corpus adj hypernym key[0]? ", cahype.keys()[0]
print "What is the tuple pair synonyms collapsing down from corpus adj hypernym key[0]? ", caid2word[str(cahype.keys()[0][0])], cnid2word[str(cahype.keys()[0][1])]
print "What is the hypernym value for corpus adj hype at key[0]? ", cahype[cahype.keys()[0]]

What is the tuple pair from corpus adj hypernym key[0]?  (1581, 1687)
What is the tuple pair synonyms collapsing down from corpus adj hypernym key[0]?  thankful bouncin
What is the hypernym value for corpus adj hype at key[0]?  grateful


### Shrunken-2: From Synonyms Down to Hypernyms

In [83]:
def hyperEval(chype):
    """Tally how often each hypernym occurs among the values of `chype`.

    Returns `(totals, repeated)`: `totals` maps every hypernym to its number
    of occurrences; `repeated` holds only the hypernyms occurring two or more
    times, with the same counts.
    """
    totals = {}
    repeated = {}
    for _, hypernym in chype.iteritems():
        if hypernym in totals:
            # second or later sighting: bump the total and mirror it
            totals[hypernym] += 1
            repeated[hypernym] = totals[hypernym]
        else:
            totals[hypernym] = 1
    return totals, repeated

In [84]:
# how many unique noun hypernyms
# n_h: occurrence count per hypernym; n_mh: only the hypernyms occurring 2+ times
n_h, n_mh = hyperEval(cnhype)       
        
print "How many unique nouns (when using hypernyms)? ", len(n_h)
print "How many multiples? ", len(n_mh)

print n_mh

How many unique nouns (when using hypernyms)?  201
How many multiples?  27
{u'lookout': 3, u'urine': 3, u'chap': 6, u'manner': 10, u'girl': 3, u'hood': 3, u'wage': 3, u'crap': 3, u'movie': 3, u'rotter': 3, u'boom': 6, u'animal': 3, u'asshole': 6, u'criminal': 3, u'dad': 15, u'adieu': 3, u'grief': 3, u'purpose': 3, u'attempt': 3, u'clasp': 3, u'buttocks': 6, u'ma': 10, u'ace': 3, u'sofa': 3, u'drip': 3, u'person': 3, u'calamity': 3}


In [85]:
# how many unique adj hypernyms
# a_h: occurrence count per hypernym; a_mh: only the hypernyms occurring 2+ times
a_h, a_mh = hyperEval(cahype)       
        
print "How many unique adjs (when using hypernyms)? ", len(a_h)
print "How many multiples? ", len(a_mh)

print a_mh

How many unique adjs (when using hypernyms)?  118
How many multiples?  16
{u'cockamamie': 3, u'bitty': 3, u'ferocious': 3, u'all_right': 3, u'barbarous': 3, u'apparent': 3, u'ageless': 6, u'dizzy': 3, u'bare': 4, u'bogus': 3, u'icky': 6, u'cheery': 3, u'conceited': 3, u'everyday': 3, u'red': 10, u'bang-up': 6}


### Viz prep for hypernym use

In [59]:
# get the count for each hypernym
def populateDecadeHypes(comp,decade,thype,tsyn):
    """Record in `comp` how many entries of `decade` map to each corpus hypernym.

    Parameters:
        comp: dict of hypernym -> per-decade count list; updated in place.
        decade: an entry of `decades`; selects which slot of each list is set.
        thype: pair `(corpus_hypes, {decade: decade_hypes})`, each a dict of
            (id1, id2) -> hypernym.
        tsyn: accepted for call compatibility; not used.

    Returns `comp`. Hypernyms absent from the decade keep their existing
    value. Hypernyms are matched by value because ids are not shared between
    corpus-level and per-decade processing.
    """
    # set decade
    didx = decades.index(decade)

    # set hypes which have k=id1,id2, v=hypernym
    chype = thype[0]
    dhype = thype[1][decade]

    # count each hypernym of the decade once up front, instead of rescanning
    # the whole decade dict for every corpus hypernym
    decade_counts = {}
    for _, h in dhype.iteritems():
        decade_counts[h] = decade_counts.get(h, 0) + 1

    # loop over corpus hype
    for _, v in chype.iteritems():
        found = decade_counts.get(v, 0)
        # only hypernyms present at least once in the decade are written
        if found:
            comp[v][didx] = found

    return comp

In [60]:
# set up a structure for each
# initialize nhcomp to hold every unique noun hypernym with a 0 count per decade
# (one fresh list per key, sized from `decades` rather than hard-coded to five)
nhcomp = dict((k, [0]*len(decades)) for k in n_h)

# initialize ahcomp the same way for the unique adjective hypernyms
ahcomp = dict((k, [0]*len(decades)) for k in a_h)

In [61]:
# comp for nouns
# fill one decade slot per pass; populateDecadeHypes updates and returns the same dict
for d in decades:
    nhcomp = populateDecadeHypes(nhcomp,d,(cnhype,dnhypes),(cnsyn,dnsyns))

In [62]:
#verify nhcomp
# spot check: one key and its per-decade counts
print nhcomp.keys()[0]
print nhcomp[nhcomp.keys()[0]]

shop
[1, 1, 1, 1, 0]


In [63]:
# comp for adjs
# fill one decade slot per pass; populateDecadeHypes updates and returns the same dict
for d in decades:
    ahcomp = populateDecadeHypes(ahcomp,d,(cahype,dahypes),(casyn,dasyns))

In [64]:
#verify ahcomp
# spot check: one key and its per-decade counts
print ahcomp.keys()[0]
print ahcomp[ahcomp.keys()[0]]

exclusive
[0, 0, 0, 1, 0]


### Hypernym Comps to Dataframe

In [65]:
# noun hypernym counts as a dataframe: a 'word' column plus one column per decade
nhcompdf = compToDataframe(nhcomp)

In [66]:
# preview the first rows of the noun hypernym frame
nhcompdf.head()

Unnamed: 0,1970,1980,1990,2000,2010,word
0,1,1,1,1,0,shop
1,0,0,0,0,0,impression
2,0,0,0,0,0,bait
3,1,1,1,1,1,summer
4,0,1,1,1,0,flicker


In [67]:
# adjective hypernym counts as a dataframe: a 'word' column plus one column per decade
ahcompdf = compToDataframe(ahcomp)

In [68]:
# preview the first rows of the adjective hypernym frame
ahcompdf.head()

Unnamed: 0,1970,1980,1990,2000,2010,word
0,0,0,0,1,0,exclusive
1,0,0,1,1,1,brumous
2,0,1,1,0,0,diffident
3,0,1,1,0,0,particular
4,0,0,1,1,0,grateful


### Save Hypernym Comps

In [69]:
# nhcompdf: written under the default root_out, without the row index
dataframeToCsv(nhcompdf,'noun_decade_comp_hypernyms.csv')

# ahcompdf: written under the default root_out, without the row index
dataframeToCsv(ahcompdf,'adj_decade_comp_hypernyms.csv')

### Add Hypernym columns to master Dataframe

In [95]:
# build hypernym replacement dict
def synToHypeDict(syn,hype,debug=False):
    """Build a dict mapping synonyms to their hypernym.

    Parameters:
        syn: dict of string id -> synonym.
        hype: dict of (id1, id2) tuple -> hypernym.
        debug: when True, print each mapping made and each id of a hypernym
            key that has no entry in `syn`.

    For every hypernym entry, both ids of its key are looked up in `syn`
    (after `str()`, since `syn` uses string keys) and each synonym found is
    mapped to the hypernym. Later entries overwrite earlier ones for the same
    synonym.
    """
    d = {}

    #loop over hypes k=id 2tuple found in syns, v=hypernym
    for k,v in hype.iteritems():

        # both ids of the tuple key are handled identically, so a missing id
        # is reported whichever position it is in
        for raw_id in (k[0], k[1]):
            #syns uses a string key!!!
            sid = str(raw_id)

            if sid in syn:
                word = syn[sid]
                d[word] = v
                if debug:
                    print("syn: {}, hype: {}".format(word,v))
            elif debug:
                print("hypernym not in syns...  %s" % (v,))

    return d

In [96]:
# establish the master dicts
# noun synonym -> hypernym; debug=True prints every mapping as it is made
cnsh_dict = synToHypeDict(cnsyn,cnhype,debug=True)

syn: scheme, hype: scheme
syn: scheme, hype: scheme
syn: waist, hype: waist
syn: waist, hype: waist
syn: matter, hype: matter
syn: matter, hype: matter
syn: boom, hype: boom
syn: boom, hype: boom
syn: tune, hype: tune
syn: tune, hype: tune
syn: eden, hype: eden
syn: eden, hype: eden
syn: ma, hype: ma
syn: ma, hype: ma
syn: measure, hype: measure
syn: measure, hype: measure
syn: menace, hype: menace
syn: menace, hype: menace
syn: laugh, hype: laugh
syn: laugh, hype: laugh
syn: bulge, hype: bulge
syn: bulge, hype: bulge
syn: dad, hype: dad
syn: dad, hype: dad
syn: doctor, hype: doctor
syn: doctor, hype: doctor
syn: palette, hype: palette
syn: palette, hype: palette
syn: manner, hype: manner
syn: manner, hype: manner
syn: center, hype: center
syn: center, hype: center
syn: filth, hype: filth
syn: filth, hype: filth
syn: buttocks, hype: buttocks
syn: buttocks, hype: buttocks
syn: baby, hype: baby
syn: baby, hype: baby
syn: rumor, hype: rumor
syn: rumor, hype: rumor
syn: wage, hype: wage
sy

In [97]:
# adjective synonym -> hypernym; debug=True prints every mapping as it is made
cash_dict = synToHypeDict(casyn,cahype,debug=True)

syn: grateful, hype: grateful
syn: grateful, hype: grateful
syn: casual, hype: casual
syn: casual, hype: casual
syn: barbarous, hype: barbarous
syn: barbarous, hype: barbarous
syn: bare, hype: bare
syn: bare, hype: bare
syn: firm, hype: firm
syn: firm, hype: firm
syn: cryptic, hype: cryptic
syn: cryptic, hype: cryptic
syn: particular, hype: particular
syn: particular, hype: particular
syn: amusing, hype: amusing
syn: amusing, hype: amusing
syn: aroused, hype: aroused
syn: aroused, hype: aroused
syn: nightlong, hype: nightlong
syn: nightlong, hype: nightlong
syn: besotted, hype: besotted
syn: besotted, hype: besotted
syn: bare, hype: bare
syn: bare, hype: bare
syn: barbarous, hype: barbarous
syn: barbarous, hype: barbarous
syn: colored, hype: colored
syn: colored, hype: colored
syn: apparent, hype: apparent
syn: apparent, hype: apparent
syn: icky, hype: icky
syn: icky, hype: icky
syn: all_right, hype: all_right
syn: all_right, hype: all_right
syn: honest, hype: honest
syn: honest, hype:

In [98]:
# number of synonyms that have a hypernym replacement, for nouns and adjectives
print "How big is cnsh_dict? ", len(cnsh_dict)
print "How big is cash_dict? ", len(cash_dict)

How big is cnsh_dict?  201
How big is cash_dict?  118


In [72]:
# get a combined synonym + hypernym vector from a given synonym vector
def synVectorToHypeVector(svector,cnsh_dict):
    """Replace synonyms by their hypernyms and return the sorted unique result.

    Each element of `svector` found in `cnsh_dict` is swapped for its mapped
    hypernym; other elements are kept. Duplicates are then dropped and the
    remaining terms are returned in sorted order.
    """
    # swap in the hypernym wherever one is known
    swapped = [cnsh_dict[term] if term in cnsh_dict else term for term in svector]

    # keep the first occurrence of each term only
    distinct = []
    for term in swapped:
        if term not in distinct:
            distinct.append(term)

    distinct.sort()
    return distinct

In [73]:
# work for the noun and adj hypernym vector columns from the corresponding synonym column.
def hypernymsFromSynonymVectorCol(syndf,syn_col,hype_col,cnsh_dict):
    """Return `syndf` joined with a new column of hypernym sentences.

    Parameters:
        syndf: source dataframe (not modified).
        syn_col: column holding space-separated synonyms; a float value
            (i.e. NaN) is treated as "no synonyms".
        hype_col: name of the column to add.
        cnsh_dict: synonym -> hypernym dict, forwarded to
            `synVectorToHypeVector`.

    Rows with no synonyms get NaN in `hype_col`; otherwise the converted
    terms are joined with `vectorToStr`.
    """
    hypes = []

    # build the hypernyms
    for syns in syndf[syn_col]:
        # a float cell means the value is missing; anything else is text
        svector = [] if isinstance(syns,float) else syns.split()

        # find hypernyms
        hvector = synVectorToHypeVector(svector,cnsh_dict) if len(svector) else []

        # hype vector to sentence (NaN when there is nothing to write)
        hypes.append(vectorToStr(hvector) if len(hvector) else np.nan)

    # join() aligns on index labels, so the new column must carry syndf's
    # own index; a default 0..n-1 index would misalign any other frame
    vdf = pd.DataFrame({hype_col: hypes}, index=syndf.index)

    return syndf.join(vdf)

### Quick Test

In [74]:
# trial run on the first 50 rows only, before the full run
test_vdf = hypernymsFromSynonymVectorCol(sanvdf.head(50),'noun_syn_vector','noun_syn_hype_vector',cnsh_dict)

### Full Run

In [75]:
%%time
#handle noun hypernyms (starting from synonym df above)
# adds the 'noun_syn_hype_vector' column derived from 'noun_syn_vector'
hnvdf = hypernymsFromSynonymVectorCol(sanvdf,'noun_syn_vector','noun_syn_hype_vector',cnsh_dict)

CPU times: user 252 ms, sys: 12.1 ms, total: 265 ms
Wall time: 264 ms


In [76]:
# preview the first row to confirm 'noun_syn_hype_vector' was appended
hnvdf.head(1)

Unnamed: 0,index,position,year,title.href,title,artist,lyrics,decade,song_key,lyrics_url,lyrics_abstract,noun_vector,adj_vector,noun_syn_vector,adj_syn_vector,noun_syn_hype_vector
0,0,1,1970,https://en.wikipedia.org/wiki/Bridge_over_Trou...,Bridge over Troubled Water,Simon and Garfunkel,When you're weary. Feeling small. When tears a...,1970,1970-1,http://lyrics.wikia.com/Simon_And_Garfunkel:Br...,When you're weary. Feeling small. When tears a...,time bridge water,rough troubled,time bridge water,troubled rough,bridge time water


In [77]:
%%time
#handle adj hypernyms (picking up from hypernym noun df)
# adds the 'adj_syn_hype_vector' column derived from 'adj_syn_vector'
hanvdf = hypernymsFromSynonymVectorCol(hnvdf,'adj_syn_vector','adj_syn_hype_vector',cash_dict)

CPU times: user 241 ms, sys: 4.02 ms, total: 245 ms
Wall time: 244 ms


In [78]:
# preview the first row to confirm 'adj_syn_hype_vector' was appended
hanvdf.head(1)

Unnamed: 0,index,position,year,title.href,title,artist,lyrics,decade,song_key,lyrics_url,lyrics_abstract,noun_vector,adj_vector,noun_syn_vector,adj_syn_vector,noun_syn_hype_vector,adj_syn_hype_vector
0,0,1,1970,https://en.wikipedia.org/wiki/Bridge_over_Trou...,Bridge over Troubled Water,Simon and Garfunkel,When you're weary. Feeling small. When tears a...,1970,1970-1,http://lyrics.wikia.com/Simon_And_Garfunkel:Br...,When you're weary. Feeling small. When tears a...,time bridge water,rough troubled,time bridge water,troubled rough,bridge time water,rough troubled


### Save Dataframe Augmented with Hypernym Vectors

In [79]:
# written next to the source master file rather than under the default root_out
dataframeToCsv(hanvdf,"master-lyricsdf-word_syn_hype_vectors.csv",root_out="../../data/conditioned/")