#Vocab Consolidation
### Adapted concepts from [HW1](https://github.com/cs109-students/michaeljohns-2015hw/blob/hw1/hw1.ipynb) and [HW5 Part1](https://github.com/cs109-students/michaeljohns-2015hw/blob/hw5/hw5part1.ipynb)

**Run this notebook locally by issuing `vagrant up` from the project root, then opening the notebook at "http://localhost:4545". You may also need to issue `vagrant provision` to update any required resources.**

The following artifacts will be established by manipulating the output of the processing pipeline for harvesting data, file [use-this-master-lyricsdf-extracted.csv](../../data/conditioned/use-this-master-lyricsdf-extracted.csv):
* vocabs for noun and adj
* n-gram for noun and adj
* synonyms for noun and adj
* hypernyms for noun and adj

Other notes:
* this notebook leverages and finalizes exploratory work in [Data-Exploration Notebook](Data-Exploration.ipynb).
* outputs are anticipated to be combined in follow-on work for better latent factors, prediction, and recommendation processing (not reflected here)
* in other notebooks that use the exact same contents as here, we will establish n-gram and vocab per decade.



In [None]:
## SET THE DECADE FOR PROCESS FILTERING
## THIS WILL ALLOW SPECIAL PROCESSING
decade = None # for no decade filtering, i.e. corpus-wide
# decade = 1970
# decade = 1980
# decade = 1990
# decade = 2000
# decade = 2010

##Imports

In [None]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [None]:
## MLJ: Additional Extras
import os
import time
import itertools
import json
import pickle

##Handle Directory for Output

In [None]:
# adapted from https://justgagan.wordpress.com/2010/09/22/python-create-path-or-directories-if-not-exist/
def assureDirExists(path):
    """
    Make sure the directory portion of `path` exists, creating it if needed.

    Only `os.path.dirname(path)` is created, so a directory path must end with
    a separator (e.g. "out/dir/") for its last component to be created.

    --- Input ---
    path: file path, or directory path with a trailing separator

    --- Return ---
    None
    """
    d = os.path.dirname(path)
    # a bare file name has no directory part; os.makedirs("") would raise
    if not d:
        return
    try:
        os.makedirs(d)
    except OSError:
        # "already exists" (possibly created concurrently) is fine; anything
        # else -- permissions, a regular file in the way -- is a real error
        if not os.path.isdir(d):
            raise

In [None]:
# choose the output directory -- a per-decade folder when `decade` is set,
# the corpus-wide folder otherwise -- and make sure it exists before writing
if decade:
    root_out = "../../data/conditioned/decades/" + str(decade) + "/"  # single decade
else:
    root_out = "../../data/conditioned/corpus_vocabs/"  # entire corpus

assureDirExists(root_out)

##Spark Setup

In [None]:
import os
# os.environ['PYSPARK_PYTHON'] = '/anaconda/bin/python'

In [None]:
import findspark
findspark.init()
print findspark.find()
# Depending on your setup you might have to change this line of code
#findspark makes sure I dont need the below on homebrew.
#os.environ['SPARK_HOME']="/usr/local/Cellar/apache-spark/1.5.1/libexec/"
#the below actually broke my spark, so I removed it. 
#Depending on how you started the notebook, you might need it.
# os.environ['PYSPARK_SUBMIT_ARGS']="--master local pyspark --executor-memory 4g"

In [None]:
import pyspark
conf = (pyspark.SparkConf()
    .setMaster('local[4]')
    .setAppName('pyspark')
    .set("spark.executor.memory", "2g"))
sc = pyspark.SparkContext(conf=conf)

In [None]:
sc._conf.getAll()

In [None]:
import sys
rdd = sc.parallelize(xrange(10),10)
rdd.map(lambda x: sys.version).collect()

In [None]:
sys.version

In [None]:
from pyspark.sql import SQLContext
sqlsc=SQLContext(sc)

#Load Finalized Conditioned Data Into Pandas Dataframe

In [None]:
# load the lyrics from the approved "master" dataframe
lyrics_pd_df = pd.read_csv("../../data/conditioned/use-this-master-lyricsdf-extracted.csv")  

In [None]:
#FILTER BY DECADE IF SET
if decade:
    lyrics_pd_df = lyrics_pd_df[lyrics_pd_df['decade'] == decade]

In [None]:
lyrics_pd_df.shape

In [None]:
lyrics_pd_df.head()

##Manipulate With Spark

In [None]:
# convert from pandas to spark dataframe
lyricsdf = sqlsc.createDataFrame(lyrics_pd_df)

In [None]:
# view output
lyricsdf.show(3)

In [None]:
#view output
lyricsdf.show(3)

In [None]:
#We cache the data to make sure it is only read once from disk
lyricsdf.cache()
print "How many songs do we have?", lyricsdf.count()

In [None]:
print "What is the schema?", lyricsdf.printSchema()

##Sample Lyrics (or Not)

Some initial sampling to take from each year.

In [None]:
# whether or not to sample lyrics, and how many to sample per year
sample_lyrics = False
PER_YEAR_SAMPLES=10

In [None]:
#(your code here)
def randomSubSampleLyrics(sparkdf,take=PER_YEAR_SAMPLES):
    """
    Randomly pick up to `take` song keys from every year.

    --- Input ---
    sparkdf: Spark dataframe whose rows expose `year` and `song_key`
    take: maximum number of songs to draw per year, default=PER_YEAR_SAMPLES

    --- Return ---
    flat list of sampled song keys, drawn without replacement within a year
    """
    # (year, song_key) pairs
    year_key_pairs = sparkdf.map(lambda r: (r.year, r.song_key))

    # gather the song keys of each year on the driver
    keys_by_year = year_key_pairs.groupByKey().mapValues(list).collect()

    # sample after collect; cap the sample size at the number of songs
    # available, because np.random.choice(..., replace=False) raises ValueError
    # when asked for more items than the population holds
    sampled = [np.random.choice(keys, size=min(take, len(keys)), replace=False)
               for _, keys in keys_by_year]

    # flatten into a single list
    return list(itertools.chain.from_iterable(sampled))
    
# only pay for the group-and-collect when sampling is actually requested;
# every later use of small_song_keys is guarded by `sample_lyrics`
small_song_keys = randomSubSampleLyrics(lyricsdf) if sample_lyrics else []

In [None]:
if sample_lyrics:
    print "How many small_song_keys? ", len(small_song_keys)
    small_song_keys[:5]
else:
    print "No lyric sampling, full processing (change `sample_lyrics` value to `True` to sample)"

In [None]:
print "execution start --> {}".format(time.strftime('%a, %d %b %Y %H:%M:%S', time.localtime()))

In [None]:
%%time
#(your code here)
if sample_lyrics:
    ldf=lyricsdf[lyricsdf.song_key.isin(small_song_keys)]#creates new dataframe
else:
    ldf=lyricsdf

In [None]:
# cache results
ldf.cache()

In [None]:
print "How many lyrics are in ldf? ", ldf.count()

##NLP

In [None]:
from pattern.en import parse
from pattern.en import pprint
from pattern.vector import stem, PORTER, LEMMA
# characters used by get_parts to drop tokens that start or end with one of them
# NOTE(review): the `''` in the middle closes one string literal and opens the
# next (adjacent literals are concatenated), so no apostrophe ends up in this
# list -- confirm whether an escaped `\'` was intended
punctuation = list('.,;:!?()[]{}`''\"@#$^&*+-|=~_')

In [None]:
from sklearn.feature_extraction import text 
stopwords=text.ENGLISH_STOP_WORDS

In [None]:
import re
regex1=re.compile(r"\.{2,}")
regex2=re.compile(r"\-{2,}")

In [None]:
print "Quick Test of parse..."
parse("The world is the craziest place. I am working hard.", tokenize=True, lemmata=True)

In [None]:
def get_parts(thetext):
    """
    Extract nouns and adjectives, sentence by sentence, from a piece of text.

    Runs of two or more dots or dashes are replaced by a space before the text
    is handed to `parse`. For every parsed sentence, field 4 of each token
    (presumably the lemma, given `lemmata=True`) is kept when the tag in field 1
    marks a noun (NN, NNS) or an adjective (JJ, JJR, JJS) and the value is not a
    stopword, is longer than one character, and neither starts nor ends with
    punctuation. Sentences lacking either a noun or an adjective are dropped.

    --- Input ---
    thetext: String to analyze

    --- Return ---
    tuple (nouns, descriptives): two parallel lists holding one word list per
    retained sentence
    """
    cleaned = regex2.sub(' ', regex1.sub(' ', thetext))
    adjective_tags = ['JJ', 'JJR', 'JJS']
    noun_tags = ['NN', 'NNS']

    kept_nouns = []
    kept_descriptives = []
    for sentence in parse(cleaned, tokenize=True, lemmata=True).split():
        sentence_nouns = []
        sentence_descriptives = []
        for token in sentence:
            lemma = token[4]
            if len(lemma) == 0:
                continue

            # pick the bucket for this part of speech; other tags are ignored
            tag = token[1]
            if tag in adjective_tags:
                bucket = sentence_descriptives
            elif tag in noun_tags:
                bucket = sentence_nouns
            else:
                continue

            rejected = (lemma in stopwords
                        or lemma[0] in punctuation
                        or lemma[-1] in punctuation
                        or len(lemma) == 1)
            if not rejected:
                bucket.append(lemma)

        # only sentences that contribute both nouns and adjectives survive
        if len(sentence_nouns) != 0 and len(sentence_descriptives) != 0:
            kept_nouns.append(sentence_nouns)
            kept_descriptives.append(sentence_descriptives)
    return kept_nouns, kept_descriptives

In [None]:
print "Quick check of get_parts ..."
get_parts("Have had many other items and just love the food. The patio...job was and...perfect. Lunch is good, and the only egg is great")

###Run Get Parts on Provided Data

In [None]:
#(your code here)
lyric_parts = ldf.map(lambda r : get_parts(r.lyrics))

In [None]:
# view output
lyric_parts.take(2)

In [None]:
print "execution start --> {}".format(time.strftime('%a, %d %b %Y %H:%M:%S', time.localtime()))

In [None]:
%%time
parseout=lyric_parts.collect()

##Vocab
###Nouns

In [None]:
print "How many parseout entries? ", len(parseout)

In [None]:
# flatten parseout to create initial noun rdd
nounrdd=sc.parallelize([ele[0] for ele in parseout]).flatMap(lambda l: l)

In [None]:
# view output
nounrdd.take(5)

In [None]:
# cache results
nounrdd.cache()

In [None]:
# straight reduce for overall word counts
nwordsrdd = (nounrdd.flatMap(lambda word: word)
             .map(lambda word: (word, 1))
             .reduceByKey(lambda a, b: a + b)
)

In [None]:
# view output
nwordsrdd.take(5)

In [None]:
# top n, based on values, sorted descending
nwordsrdd.takeOrdered(10, key = lambda x: -x[1])

In [None]:
nwordsrdd.cache()

In [None]:
# collect all the words and cache
nounvocabtups = (nwordsrdd
             .map(lambda (x,y): x)
             .zipWithIndex()
)

In [None]:
# view output
nounvocabtups.take(3)

In [None]:
# cache results
nounvocabtups.cache()

In [None]:
# collect results
nounvocab=nounvocabtups.collectAsMap()
nounid2word=nounvocabtups.map(lambda (x,y): (y,x)).collectAsMap()

In [None]:
# since sampling may be used, avoiding more common usage, e.g. `nounvocab['dance']`
nounid2word[0], nounvocab.keys()[5], nounvocab[nounvocab.keys()[5]]

In [None]:
print "How big is the noun vocabulary? ", len(nounvocab.keys())

###Adjectives

In [None]:
# create initial adj rdd from parseout
adjrdd=sc.parallelize([ele[1] for ele in parseout])

In [None]:
# view output
adjrdd.take(3)

In [None]:
# cache results
adjrdd.cache()

In [None]:
# straight reduce for overall word counts
awordsrdd = (adjrdd
             .flatMap(lambda l: l)
             .flatMap(lambda word: word)
             .map(lambda word: (word, 1))
             .reduceByKey(lambda a, b: a + b)
)

In [None]:
# view output
awordsrdd.take(5)

In [None]:
# top n, based on values, sorted descending
awordsrdd.takeOrdered(10, key = lambda x: -x[1])

In [None]:
# cache results
awordsrdd.cache()

In [None]:
#(your code here)
adjvocabtups = (awordsrdd
              .map(lambda (x,y): x)
              .zipWithIndex()
)

In [None]:
# view output
adjvocabtups.take(3)

In [None]:
# cache results
adjvocabtups.cache()

In [None]:
# collect results
adjvocab=adjvocabtups.collectAsMap()
adjid2word=adjvocabtups.map(lambda (x,y): (y,x)).collectAsMap()

In [None]:
# since sampling may be used, avoiding more common usage, e.g. `adjvocab['exotic']`
adjid2word[0], adjvocab.keys()[5], adjvocab[adjvocab.keys()[5]]

In [None]:
print "How big is the adjective vocabulary? ", len(adjvocab)

##Document Corpus

In [None]:
##################################################################################################
# CITATION - Use of counter for reduce within each word list from:
# http://stackoverflow.com/questions/2600191/how-can-i-count-the-occurrences-of-a-list-item-in-python
##################################################################################################
from collections import Counter

# for each sentence, reduce into a list of (k, v) tuples where k=vocab index and v=count;
# each word list is sorted by occurrence count, most common first
documents = nounrdd.map(lambda words: Counter([nounvocab[word] for word in words]).most_common())

In [None]:
# verify output
documents.take(1)

In [None]:
# gather spark results
corpus=documents.collect()

##Save Spark Conditioning

###Part of Speech Nouns / Adjectives (Original Lyrics Array)

In [None]:
# parseout already lives on the driver, so split it into its noun and adjective
# halves directly; shipping it to Spark only to collect() it straight back
# returns the same lists at extra cost
ncollect = [ele[0] for ele in parseout]
acollect = [ele[1] for ele in parseout]

In [None]:
print "How many noun rows? ", len(ncollect)
print "How many adjective rows? ", len(acollect)

In [None]:
print ncollect[:3]

In [None]:
print acollect[:3]

In [None]:
# save ncollect
with open(root_out+'noun_collect.json', 'w') as fp:
    json.dump(ncollect, fp)

In [None]:
# save acollect
with open(root_out+'adj_collect.json', 'w') as fp:
    json.dump(acollect, fp)

###Unique words per lyric

In [None]:
# Word Reduction per document
def buildWordReduction(collected):
    """
    Reduce every document to its unique words, keeping first-seen order.

    --- Input ---
    collected: list of documents, each a list of sentences, each a list of words

    --- Return ---
    list with one entry per document: the distinct words of that document,
    in order of first occurrence
    """
    ngram_reduced = []
    for document in collected:
        seen = set()  # O(1) membership instead of rescanning the output list
        unique_words = []
        for sentence in document:
            for word in sentence:
                if word not in seen:
                    seen.add(word)
                    unique_words.append(word)
        ngram_reduced.append(unique_words)
    return ngram_reduced

In [None]:
nreduction = buildWordReduction(ncollect)
areduction = buildWordReduction(acollect)

In [None]:
nreduction[2]

In [None]:
# save noun word reduction
with open(root_out+'noun-word-reduction.json', 'w') as fp:
    json.dump(nreduction, fp)

In [None]:
# save adj word reduction
with open(root_out+'adj-word-reduction.json', 'w') as fp:
    json.dump(areduction, fp)

###N-Gram Specific
**Want Raw n-gram for total words, then reduced n-gram for 1x per document max**

In [None]:
# save noun n-gram (raw)
with open(root_out+'noun-n-gram.json', 'w') as fp:
    json.dump(dict(nwordsrdd.collect()), fp)

In [None]:
# save adjective n-gram (raw)
with open(root_out+'adj-n-gram.json', 'w') as fp:
    json.dump(dict(awordsrdd.collect()), fp)

In [None]:
# build from nreduction and areduction to get actual counts.
def buildNgramReduced(reduction):
    """
    Sum the occurrences of each word across the given word lists using Spark.

    When every list holds unique words (as produced by buildWordReduction),
    the count of a word is the number of lists it appears in.

    --- Input ---
    reduction: list of word lists

    --- Return ---
    collected list of (word, count) tuples
    """
    word_lists = sc.parallelize(reduction)
    # flatten the lists into single words, each paired with a count of one
    ones = word_lists.flatMap(lambda words: words).map(lambda w: (w, 1))
    totals = ones.reduceByKey(lambda left, right: left + right)
    return totals.collect()

In [None]:
n_ngram_reduced = buildNgramReduced(nreduction)
a_ngram_reduced = buildNgramReduced(areduction)

In [None]:
# save reduced noun n-gram
with open(root_out+'noun_n-gram_reduced.json', 'w') as fp:
    json.dump(n_ngram_reduced, fp)

In [None]:
# save reduced adj n-gram
with open(root_out+'adj_n-gram_reduced.json', 'w') as fp:
    json.dump(a_ngram_reduced, fp)

###Vocab, id2word

In [None]:
# save noun vocab and id2word
with open(root_out+'nounvocab.json', 'w') as fp:
    json.dump(nounvocab, fp)
    
with open(root_out+'nounid2word.json', 'w') as fp:
    json.dump(nounid2word, fp)    

In [None]:
# save adj vocab and id2word
with open(root_out+'adjvocab.json', 'w') as fp:
    json.dump(adjvocab, fp)
    
with open(root_out+'adjid2word.json', 'w') as fp:
    json.dump(adjid2word, fp) 

###Corpus

In [None]:
# save corpus
# context manager so the file is flushed and closed once the dump finishes
with open(root_out+'corpus.p', "wb") as fp:
    pickle.dump(corpus, fp)

##Synonyms

###Synonym Lookups
Focus on the WordNet Python package within [nltk](http://www.nltk.org) via [textblob](https://textblob.readthedocs.org/en/dev/).
The main idea is to look up all words in the noun and adj vocab dictionaries and attempt to collapse them down -- where possible -- to synonyms. The synonyms can also be used for common_support.

In [None]:
from textblob.wordnet import Synset
from textblob.wordnet import NOUN
from textblob.wordnet import ADJ

SIM_THRESHOLD = 1.0 # Only act on values at/above threshold

In [None]:
## COMMON METHODS FOR SYNSETS
def synsetStr(syn):
    """
    Extract the leading word of a Synset name, e.g. Synset('dog.n.01') gives 'dog'.
    Any failure while reading or splitting the name yields None instead of raising.
    return String or None
    """
    try:
        full_name = syn.name()
        word = full_name.split('.')[0]
    except Exception:
        word = None
    return word
    
def flattenSynsetValues(syn_dict, skip_invalid=True, replace_invalid=None):
    """
    Build a new dictionary whose values are the synsetStr() form of the input values.

    --- Input ---
    syn_dict: dictionary with v: Synset or a falsy placeholder such as None
    skip_invalid: when True, entries with a falsy value are left out, default = True
    replace_invalid: value stored for falsy entries when they are not skipped,
                     default = None

    --- Return ---
    new dictionary with the same keys (minus skipped ones) and flattened values
    """
    flattened = {}
    for key, syn in syn_dict.iteritems():
        if syn:
            flattened[key] = synsetStr(syn)
            continue
        if skip_invalid:
            continue
        flattened[key] = replace_invalid
    return flattened

In [None]:
## CORE FUNCTIONS FOR BUILDING SIMILARITY MATRIX

def posToSingle(pos):
    """
    Map an implemented pos value (NOUN or ADJ) to its single-letter code.
    Returns "n" for NOUN, "a" for ADJ, and None for anything else.
    """
    implemented = ((NOUN, "n"), (ADJ, "a"))
    for known_pos, single in implemented:
        if pos == known_pos:
            return single
    return None  # pos is not implemented


def cachedSynsetOrBuild(idx, syns, p, id_lookup):
    """
    Return the Synset for `idx`, building it at most once.

    The outcome of the first build -- including a failed one, stored as None --
    is kept in `syns`, so each id triggers a single Synset construction no
    matter how often it is requested.

    --- Input ---
    idx: id to build and cache
    syns: cache dictionary with k: id, v: Synset or None; updated in place
    p: String pos value in the form needed for Synset generation, see `posToSingle`
    id_lookup: dictionary with k: id, v: word

    --- Return ---
    Synset or None
    """
    if idx in syns:
        return syns[idx]

    # only the first sense (`.01`) of the word is considered
    try:
        built = Synset("{}.{}.01".format(id_lookup[idx], p))
    except Exception:
        built = None
    syns[idx] = built
    return built

def similarityMatrix(id2word, pos, take_n=None):
    """
    Build the pairwise path-similarity matrix for the words of `id2word`.

    Synsets are built once per id through `cachedSynsetOrBuild`; the pairwise
    comparison itself still visits every (i, j) combination.

    --- Input ---
    id2word: dictionary for noun / adj with k: id, v: word
    pos: WordNet position, `NOUN` or `ADJ` imported based on needs
    take_n: only cover the first n ids (useful for testing), default=None

    --- Return ---
    return a tuple, t where
    t[0]: n x n matrix with the raw path_similarity score, or zero where a
          Synset is missing or no score is returned
    t[1]: dictionary of synsets with k: id, v: Synset or None
    """
    synset_cache = {}
    pos_letter = posToSingle(pos)

    # matrix dimension: the whole vocabulary unless take_n restricts it
    size = len(id2word)
    if take_n:
        size = take_n

    matrix = np.zeros((size, size))

    indices = range(size)
    for row in indices:
        row_syn = cachedSynsetOrBuild(row, synset_cache, pos_letter, id2word)
        if not row_syn:
            continue  # nothing to compare against; the row stays all zeros
        for col in indices:
            col_syn = cachedSynsetOrBuild(col, synset_cache, pos_letter, id2word)
            if not col_syn:
                continue
            score = row_syn.path_similarity(col_syn)
            if score:
                matrix[row][col] = score

    return matrix, synset_cache

In [None]:
## FUNCTIONS FOR EVALUATING SIMILARITY MATRIX RESULTS

def getSimilarityPairs(matrix, print_n=None, id_lookup=None, sim_threshold=SIM_THRESHOLD):
    """
    Collect the off-diagonal (i, j) pairs whose similarity qualifies.

    A value qualifies when it is non-zero and, if `sim_threshold` is set,
    not below that threshold.

    --- Input ---
    matrix: n x n similarity matrix
    print_n: when set, print each collected pair and return as soon as a
             further qualifying pair is met after n were collected, default=None
    id_lookup: optional dictionary used to report words instead of ids
    sim_threshold: optional minimum similarity, default = SIM_THRESHOLD

    --- Return ---
    list of tuples (i, j), or (word_i, word_j) when `id_lookup` is given
    """
    pairs = []
    collected = 0
    indices = range(len(matrix))
    for row in indices:
        for col in indices:
            score = matrix[row][col]
            below_threshold = sim_threshold and score < sim_threshold
            if row == col or below_threshold or not score:
                continue

            # the print limit has been reached: stop at the next candidate
            if print_n and collected >= print_n:
                return pairs

            collected += 1
            if id_lookup:
                left, right = id_lookup[row], id_lookup[col]
            else:
                left, right = row, col
            if print_n:
                print("{},{} --> {}".format(left, right, score))
            pairs.append((left, right))
    return pairs
                
def countSimilarityPairs(matrix, sim_threshold=SIM_THRESHOLD):
    """
    Count the off-diagonal entries whose similarity qualifies.

    A value qualifies when it is non-zero and, if `sim_threshold` is set,
    not below that threshold.

    --- Input ---
    matrix: n x n similarity matrix
    sim_threshold: optional minimum similarity, default = SIM_THRESHOLD

    --- Return ---
    number of qualifying (i, j) entries with i != j
    """
    total = 0
    indices = range(len(matrix))
    for row in indices:
        for col in indices:
            score = matrix[row][col]
            if sim_threshold and score < sim_threshold:
                continue
            if not score:
                continue
            if row != col:
                total += 1
    return total

In [None]:
print "execution start --> {}".format(time.strftime('%a, %d %b %Y %H:%M:%S', time.localtime()))

In [None]:
%%time
# build adj similarity matrix
asimatrix, asyns = similarityMatrix(adjid2word, ADJ)

In [None]:
# Count non-zero similarities for adjectives at/above SIM_THRESHOLD, ignoring diagonal
countSimilarityPairs(asimatrix)

In [None]:
# Check adj similarity results, are they any good?
getSimilarityPairs(asimatrix, print_n=10, id_lookup=adjid2word)

# build the actual (to be dumped) variables <-- NOTE: Hypernyms will be built from here!
asimpairs_words = getSimilarityPairs(asimatrix, id_lookup=adjid2word)
asimpairs_ids = getSimilarityPairs(asimatrix)

In [None]:
len(asimpairs_words)

In [None]:
print "execution start --> {}".format(time.strftime('%a, %d %b %Y %H:%M:%S', time.localtime()))

In [None]:
%%time
# build noun similarity matrix (can take 30+ minutes!!!)
nsimatrix, nsyns = similarityMatrix(nounid2word, NOUN)

In [None]:
# Count non-zero similarities for nouns at/above SIM_THRESHOLD, ignoring diagonal
countSimilarityPairs(nsimatrix)

In [None]:
# Check noun similarity results, are they any good?
getSimilarityPairs(nsimatrix, print_n = 10, id_lookup=nounid2word)

# build the actual (to be dumped) variables <-- NOTE: Hypernyms will be built from here!
nsimpairs_words = getSimilarityPairs(nsimatrix, id_lookup=nounid2word)
nsimpairs_ids = getSimilarityPairs(nsimatrix)

In [None]:
len(nsimpairs_words)

###Save Synonym work
####Similarity Matrix and Synsets

In [None]:
# save asimatrix
# context manager so the file is flushed and closed once the dump finishes
with open(root_out+'asimatrix.p', "wb") as fp:
    pickle.dump(asimatrix, fp)

In [None]:
# flatten and save asyns
with open(root_out+'asyns.json', 'w') as fp:
    json.dump(flattenSynsetValues(asyns), fp)

In [None]:
# save nsimatrix
# context manager so the file is flushed and closed once the dump finishes
with open(root_out+'nsimatrix.p', "wb") as fp:
    pickle.dump(nsimatrix, fp)


In [None]:
# flatten and save nsyns
with open(root_out+'nsyns.json', 'w') as fp:
    json.dump(flattenSynsetValues(nsyns), fp)

####Similarity Pairs

In [None]:
with open(root_out+'asimpairs_ids.json', 'w') as fp:
    json.dump(asimpairs_ids, fp)  

In [None]:
with open(root_out+'asimpairs_words.json', 'w') as fp:
    json.dump(asimpairs_words, fp)

In [None]:
with open(root_out+'nsimpairs_ids.json', 'w') as fp:
    # the id pairs belong in this file; the word pairs go to nsimpairs_words.json
    json.dump(nsimpairs_ids, fp)

In [None]:
with open(root_out+'nsimpairs_words.json', 'w') as fp:
    json.dump(nsimpairs_words, fp)

##Hypernyms
Find the lowest common [hypernym](https://en.wikipedia.org/wiki/Hyponymy_and_hypernymy) between similar words (the similarity pairs built in the Synonyms section).

In [None]:
#Quick Test
Synset('dog.n.01').lowest_common_hypernyms(Synset('cat.n.01'))[0]

In [None]:
# ## CORE FUNCTIONS FOR BUILDING HYPERNYM -- THIS USES SIMATRIX

# def makeOrderedTuple(idx1, idx2):
#     if idx1 > idx2:
#         return (idx2,idx1) 
#     return (idx1,idx2) 

# def cachedHypernymOrBuild(idx1, idx2, syn_lookup, hypes, hype_as_str=True):
#     """
#     Build Hypernym for given `idxtuple`, using the `syns_lookup`.
#     Facilitate O(n) computational complexity by caching results
#     Will internally manage hypernym keys as ordered tuple.
    
#     --- Input ---
#     idx: tuple of id to build and cache
#     syn_lookup: existing dictionary of synsets, with k: id, v: Synset or None    
#     hypes: dictionary for hypernyms with k: ordered tuple, v: hypernym.
#     hype_as_str: optional build map with string values, default = True
#     --- Return ---
#     a hypernym Synset or None
#     """
#     ituple = makeOrderedTuple(idx1,idx2)    
#     if ituple in hypes: 
#         return hypes[ituple] 
    
#     try:    
#         s1 = syn_lookup[ituple[0]]
#         s2 = syn_lookup[ituple[1]]
#         h = s1.lowest_common_hypernyms(s2)[0]
        
#         if hype_as_str:
#             h = synsetStr(h)
            
#         hypes[ituple] = h
#         return h
#     except Exception:
#         hypes[ituple] = None
#         return None

# def lowestCommonHypernyms(simatrix, syn_lookup, sim_threshold=SIM_THRESHOLD, hype_as_str=True):
#     """
#     Build a matrix with hypernym where found.
#     Optionally, only evaluate values at/above a threshold.
    
#     --- Input ---
#     simatrix: tuple of id to build and cache
#     syn_lookup: existing dictionary of synsets, with k: id, v: Synset or None    
#     sim_threshold: optional threshold to use for establishing hypernyms, default = SIM_THRESHOLD
#     hype_as_str: optional build map with string values, default = True
    
#     --- Return ---
#     dictionary for hypernyms with k: ordered tuple, v: Synset.    
#     """
    
#     hypes = {} # dictionary to build up.
    
#     n = len(simatrix)
#     ns = range(n)          
#     for i in ns:
#         for j in ns:
#             v = simatrix[i][j] 
            
#             # handle sim_threshold
#             met_threshold = True
#             if sim_threshold and v < sim_threshold:
#                 met_threshold = False
#             elif not v:
#                 met_threshold = False
                    
#             if (i != j) and met_threshold:                                
#                 cachedHypernymOrBuild(i,j, syn_lookup, hypes, hype_as_str)
                
#     return hypes


In [None]:
## CORE FUNCTIONS FOR BUILDING HYPERNYM -- THIS USES SIMPAIR

def makeOrderedTuple(idx1, idx2):
    """
    Return the two values as a tuple with the smaller one first, so that
    (a, b) and (b, a) produce the same dictionary key.
    """
    return (idx2, idx1) if idx1 > idx2 else (idx1, idx2)

def lowestCommonHypernyms(simpair_words, syn_pos, hype_as_str=True):
    """
    Find the lowest common hypernym for every pair of similar words.

    Each pair is normalized with `makeOrderedTuple`, so (a, b) and (b, a) are
    resolved once. Only the first sense (`.01`) of each word is used; a pair
    whose lookup fails for any reason is recorded with None.

    --- Input ---
    simpair_words: iterable of word pairs, each indexable as pair[0], pair[1]
    syn_pos: pos letter placed into the Synset name, e.g. "n" or "a"
    hype_as_str: store the hypernym via synsetStr() rather than as a Synset,
                 default = True

    --- Return ---
    dictionary for hypernyms with k: ordered tuple, v: Synset | String | None
    """
    hypes = {}

    for pair in simpair_words:
        key = makeOrderedTuple(pair[0], pair[1])
        if key in hypes:
            continue  # this pair (in either order) is already resolved

        try:
            first = Synset("{}.{}.01".format(key[0], syn_pos))
            second = Synset("{}.{}.01".format(key[1], syn_pos))
            hypernym = first.lowest_common_hypernyms(second)[0]
            if hype_as_str:
                hypernym = synsetStr(hypernym)
        except Exception:
            hypernym = None

        hypes[key] = hypernym

    return hypes

In [None]:
## FUNCTIONS FOR EVALUATING HYPERNYMS

def countHypernyms(hypes, count_valid=True, count_invalid=True):
    """
    Count the entries of a hypernym dictionary.

    --- Input ---
    hypes: dictionary with v: hypernym, or a falsy value (e.g. None) when none was found
    count_valid: include entries with a truthy hypernym, default = True
    count_invalid: include entries with a falsy hypernym, default = True

    --- Return ---
    number of entries selected by the two flags
    """
    total = 0
    for _, hypernym in hypes.iteritems():
        # truthy hypernyms answer to count_valid, falsy ones to count_invalid
        wanted = count_valid if hypernym else count_invalid
        if wanted:
            total += 1
    return total

###Adjective Hypernyms

In [None]:
# find adj hypernyms, defaulting to only the string value
ahypes = lowestCommonHypernyms(asimpairs_words, ADJ)

In [None]:
# check results
print "how many adj hypernyms? ", countHypernyms(ahypes)
print "how many valid adj hypernyms? ", countHypernyms(ahypes, count_valid=True, count_invalid=False)
print "how many invalid adj hypernyms? ", countHypernyms(ahypes, count_valid=False, count_invalid=True)
print "example key: {}, value: {}".format(ahypes.keys()[0],ahypes[ahypes.keys()[0]])

In [None]:
ahypes

###Noun Hypernyms

In [None]:
# find noun hypernyms
nhypes = lowestCommonHypernyms(nsimpairs_words, NOUN)

In [None]:
# check results
print "how many noun hypernyms? ", countHypernyms(nhypes)
print "how many valid noun hypernyms? ", countHypernyms(nhypes, count_valid=True, count_invalid=False)
print "how many invalid noun hypernyms? ", countHypernyms(nhypes, count_valid=False, count_invalid=True)
print "example key: {}, value: {}".format(nhypes.keys()[0],nhypes[nhypes.keys()[0]])

In [None]:
nhypes

##Save Hypernyms

In [None]:
# save adj hypernyms
# context manager so the file is flushed and closed once the dump finishes
with open(root_out+'ahypes.p', "wb") as fp:
    pickle.dump(ahypes, fp)

In [None]:
# save noun hypernyms
# context manager so the file is flushed and closed once the dump finishes
with open(root_out+'nhypes.p', "wb") as fp:
    pickle.dump(nhypes, fp)

In [None]:
# New: do some conversion for a json file

def saveHypesAsJson(hypes,json_name,root_out=root_out):
    """
    Invert a hypernym dictionary into hypernym -> member words and save it as JSON.

    --- Input ---
    hypes: dictionary with k: tuple of two words, v: hypernym (or None)
    json_name: output file name, without the '.json' extension
    root_out: directory prefix of the output file, default = the `root_out`
              value in effect when this function is defined

    --- Return ---
    dictionary with k: hypernym, v: list of the words grouped under it,
    each word listed once in first-seen order
    """
    h = {}

    for ts,v in hypes.iteritems():
        # one shared path for new and existing hypernyms, so the de-duplication
        # also applies to the very first pair stored under a hypernym
        words = h.setdefault(v, [])
        for word in (ts[0], ts[1]):
            if word not in words:
                words.append(word)

    # save h
    with open(root_out+ json_name + '.json', 'w') as fp:
        json.dump(h, fp)

    return h

In [None]:
njhypes = saveHypesAsJson(nhypes,'noun_hype_syns_words')
njhypes

In [None]:
ajhypes = saveHypesAsJson(ahypes,'adj_hype_syns_words')
ajhypes

###Compare Synonym and Hypernym Lists

In [None]:
def compareWordLists(alist,blist):
    """
    Split two lists into shared and exclusive items.

    --- Input ---
    alist: first list
    blist: second list

    --- Return ---
    tuple of three sorted lists:
    t[0]: items of alist that also occur in blist (duplicates in alist are kept)
    t[1]: items of alist that do not occur in blist
    t[2]: items of blist that are not among the shared items
    """
    shared = []
    only_a = []
    for item in alist:
        # route every item of alist to exactly one of the two buckets
        (shared if item in blist else only_a).append(item)

    only_b = [item for item in blist if item not in shared]
    return sorted(shared), sorted(only_a), sorted(only_b)

In [None]:
# compare the flattened synset names against the saved hypernym groupings
# NOTE(review): `njhypes.values()` / `ajhypes.values()` are lists of word lists
# (see saveHypesAsJson), while the flattened synset values are plain strings, so
# the `a in blist` test inside compareWordLists cannot match here and the
# "same" list stays empty -- confirm whether the hypernym keys or the flattened
# member words were meant as the second argument
tncomp = compareWordLists(flattenSynsetValues(nsyns).values(),njhypes.values())
print "For FULL noun syn versus syn-hype words..."
print "\tHow many are same? ", len(tncomp[0])
print "\tHow many are only in syn? ", len(tncomp[1])
print "\tHow many are only in hype? ", len(tncomp[2])
print
tacomp = compareWordLists(flattenSynsetValues(asyns).values(),ajhypes.values())
print "For FULL adj syn versus syn-hype words..."
print "\tHow many are same? ", len(tacomp[0])
print "\tHow many are only in syn? ", len(tacomp[1])
print "\tHow many are only in hype? ", len(tacomp[2])

In [None]:
def flattenListOfLists(alist):
    """
    Flatten one level of nesting: return a single list holding the items of
    every inner iterable, in order.
    """
    return [item for inner in alist for item in inner]

In [None]:
incomp = compareWordLists(flattenListOfLists(njhypes.values()),njhypes.keys())
print "For ONLY hypernym relevant nouns, syn versus hype words..."
print "\tHow many are same? ", len(incomp[0])
print "\tHow many are only in syn? ", len(incomp[1])
print "\tHow many are only in hype? ", len(incomp[2])
print
iacomp = compareWordLists(flattenListOfLists(ajhypes.values()),ajhypes.keys())
print "For ONLY hypernym relevant adj, syn versus hype words..."
print "\tHow many are same? ", len(iacomp[0])
print "\tHow many are only in syn? ", len(iacomp[1])
print "\tHow many are only in hype? ", len(iacomp[2])