#Vocab Consolidation
### Adapted concepts from [HW1](https://github.com/cs109-students/michaeljohns-2015hw/blob/hw1/hw1.ipynb) and [HW5 Part1](https://github.com/cs109-students/michaeljohns-2015hw/blob/hw5/hw5part1.ipynb)

**This notebook should be run locally by issuing `vagrant up` from the project root, then opening the notebook at "http://localhost:4545". You may also need to issue `vagrant provision` to update any required resources.**

The following artifacts will be established by manipulating the output of the processing pipeline for harvesting data, file [use-this-master-lyricsdf-extracted.csv](../../data/conditioned/use-this-master-lyricsdf-extracted.csv):
* vocabs for noun and adj
* n-gram for noun and adj
* synonyms for noun and adj
* hypernyms for noun and adj

Other notes:
* this notebook leverages and finalizes exploratory work in [Data-Exploration Notebook](Data-Exploration.ipynb).
* outputs are anticipated to be combined in follow-on work for better latent factors, prediction, and recommendation processing (not reflected here)
* in other notebooks that use the exact same contents as here, we will establish n-gram and vocab per decade.



In [1]:
## SET THE DECADE FOR PROCESS FILTERING
## THIS WILL ALLOW SPECIAL PROCESSING
# decade = None # for no decade filtering, i.e. corpus-wide
# decade = 1970
# decade = 1980
# decade = 1990
# decade = 2000
decade = 2010

##Imports

In [2]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [3]:
## MLJ: Additional Extras
import os
import time
import itertools
import json
import pickle

##Handle Directory for Output

In [4]:
# adapted from https://justgagan.wordpress.com/2010/09/22/python-create-path-or-directories-if-not-exist/
def assureDirExists(path):
    """
    Make sure the directory portion of `path` exists, creating it if needed.

    Only `os.path.dirname(path)` is created, so a directory path must end with
    a separator (e.g. "out/dir/") for its last component to be included.

    --- Input ---
    path: file path, or directory path with a trailing separator

    --- Return ---
    None
    """
    d = os.path.dirname(path)

    # a bare filename has no directory part; os.makedirs("") would raise
    if not d or os.path.exists(d):
        return

    try:
        os.makedirs(d)
    except OSError:
        # something else may have created the directory between the check and
        # the call; only re-raise when the directory really is missing
        if not os.path.isdir(d):
            raise

In [5]:
# create requisite directory for processing
# choose where this run writes its artifacts: one directory per decade when a
# decade filter is set, otherwise the corpus-wide directory
if decade:
    root_out = "../../data/conditioned/decades/"+str(decade)+"/" #single decade
else:
    root_out = "../../data/conditioned/corpus_vocabs/" #entire corpus

assureDirExists(root_out)

##Spark Setup

In [6]:
import os
# os.environ['PYSPARK_PYTHON'] = '/anaconda/bin/python'

In [7]:
import findspark
findspark.init()
print findspark.find()
# Depending on your setup you might have to change this line of code
#findspark makes sure I dont need the below on homebrew.
#os.environ['SPARK_HOME']="/usr/local/Cellar/apache-spark/1.5.1/libexec/"
#the below actually broke my spark, so I removed it. 
#Depending on how you started the notebook, you might need it.
# os.environ['PYSPARK_SUBMIT_ARGS']="--master local pyspark --executor-memory 4g"

/home/vagrant/spark


In [8]:
import pyspark
conf = (pyspark.SparkConf()
    .setMaster('local[4]')
    .setAppName('pyspark')
    .set("spark.executor.memory", "2g"))
sc = pyspark.SparkContext(conf=conf)

In [9]:
sc._conf.getAll()

[(u'spark.executor.memory', u'2g'),
 (u'spark.master', u'local[4]'),
 (u'spark.rdd.compress', u'True'),
 (u'spark.driver.memory', u'8g'),
 (u'spark.serializer.objectStreamReset', u'100'),
 (u'spark.submit.deployMode', u'client'),
 (u'spark.app.name', u'pyspark')]

In [10]:
import sys
rdd = sc.parallelize(xrange(10),10)
rdd.map(lambda x: sys.version).collect()

['2.7.10 |Anaconda 2.3.0 (64-bit)| (default, May 28 2015, 17:02:03) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]',
 '2.7.10 |Anaconda 2.3.0 (64-bit)| (default, May 28 2015, 17:02:03) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]',
 '2.7.10 |Anaconda 2.3.0 (64-bit)| (default, May 28 2015, 17:02:03) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]',
 '2.7.10 |Anaconda 2.3.0 (64-bit)| (default, May 28 2015, 17:02:03) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]',
 '2.7.10 |Anaconda 2.3.0 (64-bit)| (default, May 28 2015, 17:02:03) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]',
 '2.7.10 |Anaconda 2.3.0 (64-bit)| (default, May 28 2015, 17:02:03) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]',
 '2.7.10 |Anaconda 2.3.0 (64-bit)| (default, May 28 2015, 17:02:03) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]',
 '2.7.10 |Anaconda 2.3.0 (64-bit)| (default, May 28 2015, 17:02:03) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]',
 '2.7.10 |Anaconda 2.3.0 (64-bit)| (default, May 28 2015, 17:02:03) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]',
 

In [11]:
sys.version

'2.7.10 |Anaconda 2.3.0 (64-bit)| (default, May 28 2015, 17:02:03) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]'

In [12]:
from pyspark.sql import SQLContext
sqlsc=SQLContext(sc)

#Load Finalized Conditioned Data Into Pandas Dataframe

In [13]:
# load the lyrics from the approved "master" dataframe
lyrics_pd_df = pd.read_csv("../../data/conditioned/use-this-master-lyricsdf-extracted.csv")  

In [14]:
#FILTER BY DECADE IF SET
if decade:
    lyrics_pd_df = lyrics_pd_df[lyrics_pd_df['decade'] == decade]

In [15]:
lyrics_pd_df.shape

(500, 11)

In [16]:
lyrics_pd_df.head()

Unnamed: 0,index,position,year,title.href,title,artist,lyrics,decade,song_key,lyrics_url,lyrics_abstract
4000,4000,1,2010,https://en.wikipedia.org/wiki/Tik_Tok,Tik Tok,Ke$ha,"Wake up in the morning, feelin' like P. Diddy ...",2010,2010-1,http://lyrics.wikia.com/Ke%24ha:Tik_Tok,"Wake up in the morning, feelin' like P. Diddy ..."
4001,4001,2,2010,https://en.wikipedia.org/wiki/Need_You_Now_(La...,Need You Now,Lady Antebellum,"Picture perfect memories, scattered all around...",2010,2010-2,http://lyrics.wikia.com/Lady_Antebellum:Need_Y...,"Picture perfect memories, scattered all around..."
4002,4002,3,2010,"https://en.wikipedia.org/wiki/Hey,_Soul_Sister","Hey, Soul Sister",Train,"Hey, hey, hey. Your lipstick stains. On the fr...",2010,2010-3,http://lyrics.wikia.com/Train:Hey%2C_Soul_Sister,"Hey, hey, hey. Your lipstick stains. On the fr..."
4003,4003,4,2010,https://en.wikipedia.org/wiki/California_Gurls,California Gurls,Katy Perry,. Greetings loved ones. Let's take a journey. ...,2010,2010-4,http://lyrics.wikia.com/Katy_Perry:California_...,. Greetings loved ones. Let's take a journey. ...
4004,4004,5,2010,https://en.wikipedia.org/wiki/OMG_(song),OMG,Usher,"Oh my gosh. Baby, let me. I did it again, so I...",2010,2010-5,http://lyrics.wikia.com/Usher:OMG,"Oh my gosh. Baby, let me. I did it again, so I..."


##Manipulate With Spark

In [17]:
# convert from pandas to spark dataframe
lyricsdf = sqlsc.createDataFrame(lyrics_pd_df)

In [18]:
# view output
lyricsdf.show(3)

+-----+--------+----+--------------------+----------------+---------------+--------------------+------+--------+--------------------+--------------------+
|index|position|year|          title.href|           title|         artist|              lyrics|decade|song_key|          lyrics_url|     lyrics_abstract|
+-----+--------+----+--------------------+----------------+---------------+--------------------+------+--------+--------------------+--------------------+
| 4000|       1|2010|https://en.wikipe...|         Tik Tok|          Ke$ha|Wake up in the mo...|  2010|  2010-1|http://lyrics.wik...|Wake up in the mo...|
| 4001|       2|2010|https://en.wikipe...|    Need You Now|Lady Antebellum|Picture perfect m...|  2010|  2010-2|http://lyrics.wik...|Picture perfect m...|
| 4002|       3|2010|https://en.wikipe...|Hey, Soul Sister|          Train|Hey, hey, hey. Yo...|  2010|  2010-3|http://lyrics.wik...|Hey, hey, hey. Yo...|
+-----+--------+----+--------------------+----------------+-----------

In [19]:
#view output
lyricsdf.show(3)

+-----+--------+----+--------------------+----------------+---------------+--------------------+------+--------+--------------------+--------------------+
|index|position|year|          title.href|           title|         artist|              lyrics|decade|song_key|          lyrics_url|     lyrics_abstract|
+-----+--------+----+--------------------+----------------+---------------+--------------------+------+--------+--------------------+--------------------+
| 4000|       1|2010|https://en.wikipe...|         Tik Tok|          Ke$ha|Wake up in the mo...|  2010|  2010-1|http://lyrics.wik...|Wake up in the mo...|
| 4001|       2|2010|https://en.wikipe...|    Need You Now|Lady Antebellum|Picture perfect m...|  2010|  2010-2|http://lyrics.wik...|Picture perfect m...|
| 4002|       3|2010|https://en.wikipe...|Hey, Soul Sister|          Train|Hey, hey, hey. Yo...|  2010|  2010-3|http://lyrics.wik...|Hey, hey, hey. Yo...|
+-----+--------+----+--------------------+----------------+-----------

In [20]:
#We cache the data to make sure it is only read once from disk
lyricsdf.cache()
print "How many songs do we have?", lyricsdf.count()

How many songs do we have? 500


In [21]:
print "What is the schema?", lyricsdf.printSchema()

What is the schema? root
 |-- index: long (nullable = true)
 |-- position: long (nullable = true)
 |-- year: long (nullable = true)
 |-- title.href: string (nullable = true)
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- lyrics: string (nullable = true)
 |-- decade: long (nullable = true)
 |-- song_key: string (nullable = true)
 |-- lyrics_url: string (nullable = true)
 |-- lyrics_abstract: string (nullable = true)

None


##Sample Lyrics (or Not)

Some initial sampling to take from each year.

In [22]:
# whether or not to sample lyrics, and how many to sample per year
sample_lyrics = False
PER_YEAR_SAMPLES=10

In [23]:
#(your code here)
def randomSubSampleLyrics(sparkdf,take=PER_YEAR_SAMPLES):
    """
    Randomly pick up to `take` song keys from every year present in `sparkdf`.

    --- Input ---
    sparkdf: Spark dataframe whose rows expose `year` and `song_key`
    take: number of song keys to sample per year, default=PER_YEAR_SAMPLES;
          years with fewer songs contribute all of their keys

    --- Return ---
    flat list of the sampled song keys (numpy scalar values, as produced by
    `np.random.choice`)
    """
    # one (year, song_key) pair per row
    year_key_pairs = sparkdf.map(lambda r: (r.year, r.song_key))

    # gather the song keys of each year and bring them to the driver
    keys_by_year = year_key_pairs.groupByKey().mapValues(lambda x: list(x)).collect()

    # sample without replacement after the collect; the size is capped at the
    # number of keys available because np.random.choice raises ValueError when
    # asked for more items than the population holds with replace=False
    sampled = [np.random.choice(v, size=min(take, len(v)), replace=False)
               for k,v in keys_by_year]

    # flatten the per-year samples into a single list
    return list(itertools.chain.from_iterable(sampled))
    
# only pay for the Spark groupBy/collect when sampling is actually requested;
# small_song_keys is only read in the `sample_lyrics` branches
small_song_keys = randomSubSampleLyrics(lyricsdf) if sample_lyrics else []

In [24]:
if sample_lyrics:
    print "How many small_song_keys? ", len(small_song_keys)
    small_song_keys[:5]
else:
    print "No lyric sampling, full processing (change `sample_lyrics` value to `True` to sample)"

No lyric sampling, full processing (change `sample_lyrics` value to `True` to sample)


In [25]:
print "execution start --> {}".format(time.strftime('%a, %d %b %Y %H:%M:%S', time.localtime()))

execution start --> Fri, 04 Dec 2015 18:33:35


In [26]:
%%time
#(your code here)
if sample_lyrics:
    ldf=lyricsdf[lyricsdf.song_key.isin(small_song_keys)]#creates new dataframe
else:
    ldf=lyricsdf

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 14.1 µs


In [27]:
# cache results
ldf.cache()

DataFrame[index: bigint, position: bigint, year: bigint, title.href: string, title: string, artist: string, lyrics: string, decade: bigint, song_key: string, lyrics_url: string, lyrics_abstract: string]

In [28]:
print "How many lyrics are in ldf? ", ldf.count()

How many lyrics are in ldf?  500


##NLP

In [29]:
from pattern.en import parse
from pattern.en import pprint
from pattern.vector import stem, PORTER, LEMMA
punctuation = list('.,;:!?()[]{}`''\"@#$^&*+-|=~_')

In [30]:
from sklearn.feature_extraction import text 
stopwords=text.ENGLISH_STOP_WORDS

In [31]:
import re
regex1=re.compile(r"\.{2,}")
regex2=re.compile(r"\-{2,}")

In [32]:
print "Quick Test of parse..."
parse("The world is the craziest place. I am working hard.", tokenize=True, lemmata=True)

Quick Test of parse...


u'The/DT/B-NP/O/the world/NN/I-NP/O/world is/VBZ/B-VP/O/be the/DT/B-NP/O/the craziest/JJ/I-NP/O/craziest place/NN/I-NP/O/place ././O/O/.\nI/PRP/B-NP/O/i am/VBP/B-VP/O/be working/VBG/I-VP/O/work hard/RB/B-ADVP/O/hard ././O/O/.'

In [33]:
def get_parts(thetext):
    """
    Split `thetext` into per-sentence lists of noun and adjective lemmas.

    Runs of two or more dots or dashes are replaced by a space before parsing.
    Tokens tagged NN/NNS are treated as nouns and JJ/JJR/JJS as adjectives.
    A lemma is dropped when it is a stopword, a single character, or starts or
    ends with a punctuation character. A sentence is kept only when it yields
    at least one noun AND at least one adjective.

    --- Input ---
    thetext: string to parse

    --- Return ---
    tuple (nouns, descriptives): two parallel lists holding, for each retained
    sentence, its list of noun lemmas and its list of adjective lemmas
    """
    adjective_tags = ['JJ', 'JJR', 'JJS']
    noun_tags = ['NN', 'NNS']

    # collapse "..." and "--" style runs so they do not glue words together
    cleaned = re.sub(regex2, ' ', re.sub(regex1, ' ', thetext))

    kept_nouns = []
    kept_adjs = []
    for sentence in parse(cleaned, tokenize=True, lemmata=True).split():
        sent_nouns = []
        sent_adjs = []
        for token in sentence:
            # token[1] is the part-of-speech tag, token[4] the lemma
            lemma = token[4]
            if len(lemma) == 0:
                continue

            tag = token[1]
            if tag in adjective_tags:
                bucket = sent_adjs
            elif tag in noun_tags:
                bucket = sent_nouns
            else:
                continue

            unusable = (lemma in stopwords
                        or lemma[0] in punctuation
                        or lemma[-1] in punctuation
                        or len(lemma) == 1)
            if not unusable:
                bucket.append(lemma)

        # keep only sentences that contribute to both vocabularies
        if len(sent_nouns) != 0 and len(sent_adjs) != 0:
            kept_nouns.append(sent_nouns)
            kept_adjs.append(sent_adjs)
    return kept_nouns, kept_adjs

In [34]:
print "Quick check of get_parts ..."
get_parts("Have had many other items and just love the food. The patio...job was and...perfect. Lunch is good, and the only egg is great")

Quick check of get_parts ...


([[u'patio', u'job'], [u'lunch', u'egg']], [[u'perfect'], [u'good', u'great']])

###Run Get Parts on Provided Data

In [35]:
#(your code here)
lyric_parts = ldf.map(lambda r : get_parts(r.lyrics))

In [36]:
# view output
lyric_parts.take(2)

[([[u'drop-topping', u'cd'],
   [u'bit'],
   [u'tonight', u'fight'],
   [u'tonight', u'fight'],
   [u'drunk'],
   [u'tonight', u'fight'],
   [u'tonight', u'fight'],
   [u'tonight', u'fight'],
   [u'tonight', u'fight']],
  [[u'favorite'],
   [u'little', u'tipsy'],
   [u'mma'],
   [u'mma'],
   [u'drunk'],
   [u'mma'],
   [u'mma'],
   [u'mma'],
   [u'mma']]),
 ([[u'memory', u'floor'], [u'quarter', u'drunk'], [u'drunk']],
  [[u'picture', u'perfect'], [u'little'], [u'little']])]

In [37]:
print "execution start --> {}".format(time.strftime('%a, %d %b %Y %H:%M:%S', time.localtime()))

execution start --> Fri, 04 Dec 2015 18:33:37


In [38]:
%%time
parseout=lyric_parts.collect()

CPU times: user 8.45 ms, sys: 12.5 ms, total: 20.9 ms
Wall time: 11 s


##Vocab
###Nouns

In [39]:
print "How many parseout entries? ", len(parseout)

How many parseout entries?  500


In [40]:
# flatten parseout to create initial noun rdd
nounrdd=sc.parallelize([ele[0] for ele in parseout]).flatMap(lambda l: l)

In [41]:
# view output
nounrdd.take(5)

[[u'drop-topping', u'cd'],
 [u'bit'],
 [u'tonight', u'fight'],
 [u'tonight', u'fight'],
 [u'drunk']]

In [42]:
# cache results
nounrdd.cache()

PythonRDD[34] at RDD at PythonRDD.scala:43

In [43]:
# straight reduce for overall word counts
nwordsrdd = (nounrdd.flatMap(lambda word: word)
             .map(lambda word: (word, 1))
             .reduceByKey(lambda a, b: a + b)
)

In [44]:
# view output
nwordsrdd.take(5)

[(u'jaja', 1), (u'liar', 2), (u'dance', 22), (u'diesel', 1), (u'dollar', 22)]

In [45]:
# top n, based on values, sorted descending
nwordsrdd.takeOrdered(10, key = lambda x: -x[1])

[(u'baby', 309),
 (u'girl', 276),
 (u'night', 199),
 (u'time', 178),
 (u'love', 165),
 (u'thing', 154),
 (u'bitch', 145),
 (u'life', 126),
 (u'tonight', 109),
 (u'way', 98)]

In [46]:
nwordsrdd.cache()

PythonRDD[41] at RDD at PythonRDD.scala:43

In [47]:
# collect all the words and cache
nounvocabtups = (nwordsrdd
             .map(lambda (x,y): x)
             .zipWithIndex()
)

In [48]:
# view output
nounvocabtups.take(3)

[(u'jaja', 0), (u'liar', 1), (u'dance', 2)]

In [49]:
# cache results
nounvocabtups.cache()

PythonRDD[44] at RDD at PythonRDD.scala:43

In [50]:
# collect results
nounvocab=nounvocabtups.collectAsMap()
nounid2word=nounvocabtups.map(lambda (x,y): (y,x)).collectAsMap()

In [51]:
# since sampling may be used, avoiding more common usage, e.g. `nounvocab['dance']`
nounid2word[0], nounvocab.keys()[5], nounvocab[nounvocab.keys()[5]]

(u'jaja', u'swag', 807)

In [52]:
print "How big is the noun vocabulary? ", len(nounvocab.keys())

How big is the noun vocabulary?  1605


###Adjectives

In [53]:
# create initial adj rdd from parseout
adjrdd=sc.parallelize([ele[1] for ele in parseout])

In [54]:
# view output
adjrdd.take(3)

[[[u'favorite'],
  [u'little', u'tipsy'],
  [u'mma'],
  [u'mma'],
  [u'drunk'],
  [u'mma'],
  [u'mma'],
  [u'mma'],
  [u'mma']],
 [[u'picture', u'perfect'], [u'little'], [u'little']],
 [[u'sweet'],
  [u'single'],
  [u'fair'],
  [u'single'],
  [u'glad', u'one-track'],
  [u'untrimmed'],
  [u'fair'],
  [u'single'],
  [u'fair'],
  [u'single'],
  [u'single']]]

In [55]:
# cache results
adjrdd.cache()

ParallelCollectionRDD[46] at parallelize at PythonRDD.scala:423

In [56]:
# straight reduce for overall word counts
awordsrdd = (adjrdd
             .flatMap(lambda l: l)
             .flatMap(lambda word: word)
             .map(lambda word: (word, 1))
             .reduceByKey(lambda a, b: a + b)
)

In [57]:
# view output
awordsrdd.take(5)

[(u"li'l", 1),
 (u'treble', 17),
 (u'super', 21),
 (u'convinced', 1),
 (u'true', 29)]

In [58]:
# top n, based on values, sorted descending
awordsrdd.takeOrdered(10, key = lambda x: -x[1])

[(u'good', 339),
 (u'little', 217),
 (u'bad', 200),
 (u'real', 147),
 (u'ooh', 101),
 (u'big', 92),
 (u'new', 91),
 (u'best', 85),
 (u'bright', 72),
 (u'high', 68)]

In [59]:
# cache results
awordsrdd.cache()

PythonRDD[54] at RDD at PythonRDD.scala:43

In [60]:
#(your code here)
adjvocabtups = (awordsrdd
              .map(lambda (x,y): x)
              .zipWithIndex()
)

In [61]:
# view output
adjvocabtups.take(3)

[(u"li'l", 0), (u'treble', 1), (u'super', 2)]

In [62]:
# cache results
adjvocabtups.cache()

PythonRDD[57] at RDD at PythonRDD.scala:43

In [63]:
# collect results
adjvocab=adjvocabtups.collectAsMap()
adjid2word=adjvocabtups.map(lambda (x,y): (y,x)).collectAsMap()

In [64]:
# since sampling may be used, avoiding more common usage, e.g. `adjvocab['exotic']`
adjid2word[0], adjvocab.keys()[5], adjvocab[adjvocab.keys()[5]]

(u"li'l", u'softer', 803)

In [65]:
print "How big is the adjective vocabulary? ", len(adjvocab)

How big is the adjective vocabulary?  918


##Document Corpus

In [66]:
##################################################################################################
# CITATION - Use of counter for reduce within each word list from:
# http://stackoverflow.com/questions/2600191/how-can-i-count-the-occurrences-of-a-list-item-in-python
##################################################################################################
from collections import Counter

# for each sentence, reduce into a list of (k, v) tuples where k=vocab index and v=count;
# each word list is sorted by occurrence count (most common first)
documents = nounrdd.map(lambda words: Counter([nounvocab[word] for word in words]).most_common())

In [67]:
# verify output
documents.take(1)

[[(522, 1), (1453, 1)]]

In [68]:
# gather spark results
corpus=documents.collect()

##Save Spark Conditioning

###Part of Speech Nouns / Adjectives (Original Lyrics Array)

In [69]:
# per-song noun / adjective sentence lists, taken directly from the already
# collected parse output (parseout lives on the driver, so sending it through
# sc.parallelize(...).collect() only returned the same lists)
ncollect = [ele[0] for ele in parseout]
acollect = [ele[1] for ele in parseout]

In [70]:
print "How many noun rows? ", len(ncollect)
print "How many adjective rows? ", len(acollect)

How many noun rows?  500
How many adjective rows?  500


In [71]:
print ncollect[:3]

[[[u'drop-topping', u'cd'], [u'bit'], [u'tonight', u'fight'], [u'tonight', u'fight'], [u'drunk'], [u'tonight', u'fight'], [u'tonight', u'fight'], [u'tonight', u'fight'], [u'tonight', u'fight']], [[u'memory', u'floor'], [u'quarter', u'drunk'], [u'drunk']], [[u'moonbeam'], [u'smell', u'dream'], [u'way'], [u'soul', u'sister', u'thing'], [u'mind'], [u'heart', u'right', u'chest'], [u'way'], [u'soul', u'sister', u'thing'], [u'way'], [u'soul', u'sister', u'thing'], [u'soul', u'sister', u'thing']]]


In [72]:
print acollect[:3]

[[[u'favorite'], [u'little', u'tipsy'], [u'mma'], [u'mma'], [u'drunk'], [u'mma'], [u'mma'], [u'mma'], [u'mma']], [[u'picture', u'perfect'], [u'little'], [u'little']], [[u'sweet'], [u'single'], [u'fair'], [u'single'], [u'glad', u'one-track'], [u'untrimmed'], [u'fair'], [u'single'], [u'fair'], [u'single'], [u'single']]]


In [73]:
# save ncollect
with open(root_out+'noun_collect.json', 'w') as fp:
    json.dump(ncollect, fp)

In [74]:
# save acollect
with open(root_out+'adj_collect.json', 'w') as fp:
    json.dump(acollect, fp)

###Unique words per lyric

In [75]:
# Word Reduction per document
def buildWordReduction(collected):
    """
    Collapse every document to its distinct words, in first-seen order.

    --- Input ---
    collected: list of documents, where each document is a list of sentences
               and each sentence is a list of (hashable) words

    --- Return ---
    list with one entry per document: the document's unique words, ordered by
    first appearance
    """
    reduced = []
    for doc in collected:
        seen = set()   # constant-time membership test
        unique = []    # preserves first-seen order for the output
        for sentence in doc:
            for word in sentence:
                if word not in seen:
                    seen.add(word)
                    unique.append(word)
        reduced.append(unique)
    return reduced

In [76]:
nreduction = buildWordReduction(ncollect)
areduction = buildWordReduction(acollect)

In [77]:
nreduction[2]

[u'moonbeam',
 u'smell',
 u'dream',
 u'way',
 u'soul',
 u'sister',
 u'thing',
 u'mind',
 u'heart',
 u'right',
 u'chest']

In [78]:
# save noun word reduction
with open(root_out+'noun-word-reduction.json', 'w') as fp:
    json.dump(nreduction, fp)

In [79]:
# save adj word reduction
with open(root_out+'adj-word-reduction.json', 'w') as fp:
    json.dump(areduction, fp)

###N-Gram Specific
**Want Raw n-gram for total words, then reduced n-gram for 1x per document max**

In [80]:
# save noun n-gram (raw)
with open(root_out+'noun-n-gram.json', 'w') as fp:
    json.dump(dict(nwordsrdd.collect()), fp)

In [81]:
# save adjective n-gram (raw)
with open(root_out+'adj-n-gram.json', 'w') as fp:
    json.dump(dict(awordsrdd.collect()), fp)

In [82]:
# build from nreduction and areduction to get actual counts.
def buildNgramReduced(reduction):
    """
    Count word occurrences across the word lists in `reduction` using Spark.

    When every list holds each of its words once (as a per-document word
    reduction does), the count for a word is the number of lists containing it.

    --- Input ---
    reduction: list of word lists

    --- Return ---
    list of (word, count) tuples, as returned by the RDD `collect()`
    """
    word_lists = sc.parallelize(reduction)
    word_ones = word_lists.flatMap(lambda word: word).map(lambda word: (word, 1))
    word_totals = word_ones.reduceByKey(lambda a, b: a + b)
    return word_totals.collect()

In [83]:
n_ngram_reduced = buildNgramReduced(nreduction)
a_ngram_reduced = buildNgramReduced(areduction)

In [84]:
# save reduced noun n-gram
with open(root_out+'noun_n-gram_reduced.json', 'w') as fp:
    json.dump(n_ngram_reduced, fp)

In [85]:
# save reduced adj n-gram
with open(root_out+'adj_n-gram_reduced.json', 'w') as fp:
    json.dump(a_ngram_reduced, fp)

###Vocab, id2word

In [86]:
# save noun vocab and id2word
with open(root_out+'nounvocab.json', 'w') as fp:
    json.dump(nounvocab, fp)
    
with open(root_out+'nounid2word.json', 'w') as fp:
    json.dump(nounid2word, fp)    

In [87]:
# save adj vocab and id2word
with open(root_out+'adjvocab.json', 'w') as fp:
    json.dump(adjvocab, fp)
    
with open(root_out+'adjid2word.json', 'w') as fp:
    json.dump(adjid2word, fp) 

###Corpus

In [88]:
# save corpus
# use a context manager so the file handle is flushed and closed
with open(root_out+'corpus.p', "wb") as fp:
    pickle.dump(corpus, fp)

##Synonyms

###Synonym Lookups
Focus on the WordNet python package within [nltk](http://www.nltk.org) via [textblob](https://textblob.readthedocs.org/en/dev/).
The main idea is to lookup all words in the noun and adj vocab dictionaries and attempt to collapse down -- where possible -- to synonyms. The synonyms can be used for common_support also.

In [89]:
from textblob.wordnet import Synset
from textblob.wordnet import NOUN
from textblob.wordnet import ADJ

SIM_THRESHOLD = 1.0 # Only act on values at/above threshold

In [90]:
## COMMON METHODS FOR SYNSETS
def synsetStr(syn):
    """
    Pull the leading word out of a Synset name, e.g. Synset('dog.n.01') gives 'dog'.
    Any failure while reading the name (for instance `syn` being None) yields
    None instead of raising.
    return String or None
    """
    try:
        dotted_name = syn.name()
        head = dotted_name.split('.')[0]
    except Exception:
        head = None
    return head
    
def flattenSynsetValues(syn_dict, skip_invalid=True, replace_invalid=None):
    """
    Build a new dictionary in which each Synset value is replaced by its word.

    --- Input ---
    syn_dict: dictionary with k: id, v: Synset or None
    skip_invalid: when True (default) keys whose value is falsy are left out;
                  when False they are kept with `replace_invalid` as the value
    replace_invalid: value stored for falsy entries when `skip_invalid` is False

    --- Return ---
    new dictionary with k: id, v: `synsetStr` of the Synset (or `replace_invalid`)
    """
    d = {}
    # .items() rather than the Python-2-only .iteritems(), so the function runs
    # unchanged on Python 2 and Python 3
    for k,v in syn_dict.items():
        if v:
            d[k] = synsetStr(v)
        elif not skip_invalid:
            d[k] = replace_invalid
    return d

In [91]:
## CORE FUNCTIONS FOR BUILDING SIMILARITY MATRIX

def posToSingle(pos):
    """
    Translate an implemented WordNet pos constant into its one-letter code.
    `NOUN` gives "n" and `ADJ` gives "a"; any other value gives None.
    """
    for known_pos, letter in ((NOUN, "n"), (ADJ, "a")):
        if pos == known_pos:
            return letter
    return None # pos value not implemented


def cachedSynsetOrBuild(idx, syns, p, id_lookup):
    """
    Return the Synset for `idx`, building it from `id_lookup` on first use.
    Results (including failures, stored as None) are cached in `syns`, so each
    id is resolved at most once per `syns` dictionary.

    --- Input ---
    idx: id to build and cache
    syns: existing dictionary of synsets, with k: id, v: Synset or None (updated in place)
    p: String pos value in the form needed for Synset generation, see `posToSingle`
    id_lookup: dictionary mapping id to word

    --- Return ---
    Synset or None
    """
    if idx not in syns:
        try:
            # only the first sense (`.01`) of the word is considered
            built = Synset("{}.{}.01".format(id_lookup[idx],p))
        except Exception:
            # unknown id or no such synset: remember the miss as None
            built = None
        syns[idx] = built
    return syns[idx]

def similarityMatrix(id2word, pos, take_n=None):
    """
    ##############################################################
    Build the pairwise path-similarity matrix for the words of `id2word`.
    Ids 0..n-1 are looked up in `id2word`; an id with no usable synset leaves
    its row and column at zero. Every ordered pair (i, j) is scored, diagonal
    included, so the work grows with n * n.

    --- Input ---
    id2word: dictionary mapping id to noun / adj word
    pos: WordNet position, `NOUN` or `ADJ` imported based on needs
    take_n: only use ids 0..take_n-1 (handy for testing), default=None for all

    --- Return ---
    return a tuple, t where
    t[0]: n x n numpy matrix with raw similarity score or zero
    t[1]: dictionary of synsets with k: id, v: Synset or None
    ##############################################################
    """
    synset_cache = {} # each id is resolved to a synset at most once
    pos_letter = posToSingle(pos)

    # matrix dimension: the whole vocabulary unless take_n is given
    size = len(id2word)
    if take_n:
        size = take_n

    matrix = np.zeros((size,size))

    for i in range(size):
        isyn = cachedSynsetOrBuild(i,synset_cache,pos_letter,id2word)
        if not isyn:
            # nothing to compare against: the row stays zero and no column
            # synsets are looked up on behalf of this row
            continue

        for j in range(size):
            jsyn = cachedSynsetOrBuild(j,synset_cache,pos_letter,id2word)
            if not jsyn:
                continue

            # record path_similarity between the i and j words when there is one
            ps = isyn.path_similarity(jsyn)
            if ps:
                matrix[i][j] = ps

    return matrix, synset_cache

In [92]:
## FUNCTIONS FOR EVALUATING SIMILARITY MATRIX RESULTS

def printSimilarityPairs(matrix, show_n=None, id_lookup=None, sim_threshold=SIM_THRESHOLD):
    """
    Print the off-diagonal entries of `matrix` that qualify as similar.
    An entry qualifies when it is non-zero and, if `sim_threshold` is set,
    at least `sim_threshold`.

    --- Input ---
    matrix: square similarity matrix
    show_n: optional, stop after printing this many pairs
    id_lookup: optional dictionary mapping id to word, used to print words instead of ids
    sim_threshold: optional minimum similarity, default = SIM_THRESHOLD
    """
    printed = 0
    size = len(matrix)
    for i in range(size):
        for j in range(size):
            v = matrix[i][j]

            # below the threshold (when one is set) or zero: not a match
            rejected = (sim_threshold and v < sim_threshold) or not v
            if i == j or rejected:
                continue

            # enough pairs shown already
            if show_n and printed >= show_n:
                return

            printed += 1
            left, right = i, j
            if id_lookup:
                left = id_lookup[i]
                right = id_lookup[j]
            print("{},{} --> {}".format(left,right,v))
                
def countSimilarityPairs(matrix, sim_threshold=SIM_THRESHOLD):
    """
    Count the off-diagonal entries of `matrix` that qualify as similar.
    An entry qualifies when it is non-zero and, if `sim_threshold` is set,
    at least `sim_threshold`. Both (i, j) and (j, i) are counted.

    --- Input ---
    matrix: square similarity matrix
    sim_threshold: optional minimum similarity, default = SIM_THRESHOLD

    --- Return ---
    number of qualifying ordered pairs
    """
    size = len(matrix)
    total = 0
    for i in range(size):
        for j in range(size):
            v = matrix[i][j]

            # skip the diagonal
            if i == j:
                continue

            # skip values under the threshold (when one is set) and zeros
            if (sim_threshold and v < sim_threshold) or not v:
                continue

            total += 1
    return total

In [93]:
print "execution start --> {}".format(time.strftime('%a, %d %b %Y %H:%M:%S', time.localtime()))

execution start --> Fri, 04 Dec 2015 18:33:53


In [94]:
%%time
# build adj similarity matrix
asimatrix, asyns = similarityMatrix(adjid2word, ADJ)

CPU times: user 3.11 s, sys: 15.3 ms, total: 3.12 s
Wall time: 3.26 s


In [95]:
# Count non-zero similarities for adjectives at/above SIM_THRESHOLD, ignoring diagonal
countSimilarityPairs(asimatrix)

50

In [96]:
# Check adj similarity results, are they any good?
printSimilarityPairs(asimatrix, show_n=10, id_lookup=adjid2word)

fine,okay --> 1.0
ruby,cherry --> 1.0
ruby,red --> 1.0
naked,bare --> 1.0
small,little --> 1.0
bare,naked --> 1.0
large,big --> 1.0
silly,wacky --> 1.0
yucky,foul --> 1.0
okay,fine --> 1.0


In [97]:
print "execution start --> {}".format(time.strftime('%a, %d %b %Y %H:%M:%S', time.localtime()))

execution start --> Fri, 04 Dec 2015 18:33:57


In [98]:
%%time
"""
NOTE: UNCOMMENT TO BUILD FRESH!!!
"""
# build noun similarity matrix (can take 30+ minutes!!!)
nsimatrix, nsyns = similarityMatrix(nounid2word, NOUN)

print


CPU times: user 2min 34s, sys: 1.12 s, total: 2min 36s
Wall time: 2min 36s


In [99]:
"""
NOTE: READING BACK IN FROM FILE!!!

# load nsimatrix from file
nsimatrix = pickle.load( open( root_out+"nsimatrix.p", "rb" ) )

# load nsyns from file
with open(root_out+'nsyns.json', 'r') as fp:
    nsyns = json.load(fp)
"""
print




In [100]:
# Count non-zero similarities for nouns at/above SIM_THRESHOLD, ignoring diagonal
countSimilarityPairs(nsimatrix)

102

In [101]:
# Check noun similarity results, are they any good?
printSimilarityPairs(nsimatrix, show_n = 10, id_lookup=nounid2word)

cheerio,goodbye --> 1.0
babe,baby --> 1.0
baby,babe --> 1.0
drama,play --> 1.0
dad,dada --> 1.0
dad,pop --> 1.0
dad,daddy --> 1.0
spot,place --> 1.0
tale,story --> 1.0
somebody,person --> 1.0


## Save Similarity Matrix


In [102]:
# save asimatrix
# use a context manager so the file handle is flushed and closed
with open(root_out+'asimatrix.p', "wb") as fp:
    pickle.dump(asimatrix, fp)

In [103]:
# flatten and save asyns
with open(root_out+'asyns.json', 'w') as fp:
    json.dump(flattenSynsetValues(asyns), fp)

In [104]:
"""
NOTE: DON'T SAVE THIS, IF ALREADY USING SAVED!!!
"""
# save nsimatrix
# use a context manager so the file handle is flushed and closed
with open(root_out+'nsimatrix.p', "wb") as fp:
    pickle.dump(nsimatrix, fp)


In [105]:
"""
NOTE: DON'T SAVE THIS, ALREADY USING SAVED!!!
"""
# flatten and save nsyns
with open(root_out+'nsyns.json', 'w') as fp:
    json.dump(flattenSynsetValues(nsyns), fp)

##Hypernyms
Find the lowest common [hypernym](https://en.wikipedia.org/wiki/Hyponymy_and_hypernymy) between similar words.

In [106]:
#Quick Test
Synset('dog.n.01').lowest_common_hypernyms(Synset('cat.n.01'))[0]

Synset('carnivore.n.01')

In [107]:
## CORE FUNCTIONS FOR BUILDING HYPERNYM

def makeOrderedTuple(idx1, idx2):
    """
    Return the two ids as a tuple with the smaller one first, so that
    (a, b) and (b, a) map to the same key.
    """
    return (idx2,idx1) if idx1 > idx2 else (idx1,idx2)

def cachedHypernymOrBuild(idx1, idx2, syn_lookup, hypes, hype_as_str=True):
    """
    Return the lowest common hypernym of the synsets for `idx1` and `idx2`,
    building it on first use. Results are cached in `hypes` under the ordered
    tuple of the two ids, so (a, b) and (b, a) share one entry. Any failure
    (missing id, None synset, no common hypernym) is cached and returned as None.

    --- Input ---
    idx1, idx2: the two ids to find a hypernym for
    syn_lookup: existing dictionary of synsets, with k: id, v: Synset or None
    hypes: dictionary for hypernyms with k: ordered tuple, v: hypernym (updated in place)
    hype_as_str: store / return the hypernym's word (see `synsetStr`) rather
                 than the Synset itself, default = True

    --- Return ---
    hypernym word (when `hype_as_str`), hypernym Synset, or None
    """
    pair_key = makeOrderedTuple(idx1,idx2)
    if pair_key not in hypes:
        try:
            low_syn = syn_lookup[pair_key[0]]
            high_syn = syn_lookup[pair_key[1]]
            # take the first of the lowest common hypernyms
            found = low_syn.lowest_common_hypernyms(high_syn)[0]

            if hype_as_str:
                found = synsetStr(found)
        except Exception:
            found = None
        hypes[pair_key] = found
    return hypes[pair_key]

def lowestCommonHypernyms(simatrix, syn_lookup, sim_threshold=SIM_THRESHOLD, hype_as_str=True):
    """
    Collect lowest common hypernyms for the qualifying pairs in `simatrix`.

    Every cell (i, j) with i != j is considered. A cell qualifies when its value
    is truthy and, if `sim_threshold` is truthy, is not below the threshold.
    Qualifying pairs are passed to `cachedHypernymOrBuild`, so (i, j) and (j, i)
    share one entry in the result.

    --- Input ---
    simatrix: structure read as simatrix[i][j] for i, j in range(len(simatrix))
    syn_lookup: existing dictionary of synsets, with k: id, v: Synset or None
    sim_threshold: optional minimum cell value; a falsy value disables the
                   threshold comparison, default = SIM_THRESHOLD
    hype_as_str: optional build map with string values, default = True

    --- Return ---
    dictionary for hypernyms with k: ordered tuple, v: hypernym
    (string or Synset per `hype_as_str`, or None where none could be built).
    """
    found = {}  # filled in place by cachedHypernymOrBuild

    size = len(simatrix)
    for i in range(size):
        for j in range(size):
            score = simatrix[i][j]

            # threshold comparison first (only when a threshold is set),
            # then reject empty/zero cells
            below_threshold = sim_threshold and score < sim_threshold
            if below_threshold or not score:
                continue

            # skip the diagonal: an id is not paired with itself
            if i != j:
                cachedHypernymOrBuild(i, j, syn_lookup, found, hype_as_str)

    return found

In [108]:
## FUNCTIONS FOR EVALUATING HYPERNYMS

def countHypernyms(hypes, count_valid=True, count_invalid=True):
    """
    Count entries in a hypernym dictionary by category.

    --- Input ---
    hypes: dictionary for hypernyms with k: ordered tuple, v: hypernym or None
    count_valid: optional, include entries whose value is truthy, default = True
    count_invalid: optional, include entries whose value is falsy (e.g. None), default = True

    --- Return ---
    number of entries in the selected categories; with both flags left at
    their defaults this is the total number of entries.
    """
    # Only the values matter here. values() behaves the same on Python 2 and 3,
    # unlike iteritems(), which exists only on Python 2.
    return sum(1 for v in hypes.values() if (count_valid if v else count_invalid))

### Adjective Hypernyms

In [109]:
# find adj hypernyms, defaulting to only the string value
# (sim_threshold and hype_as_str are left at their defaults)
ahypes = lowestCommonHypernyms(asimatrix, asyns)

In [110]:
# check results
# first line counts all entries, then the truthy (valid) and falsy (invalid)
# ones separately; the last line prints one arbitrary key/value pair
print "how many adj hypernyms? ", countHypernyms(ahypes)
print "how many valid adj hypernyms? ", countHypernyms(ahypes, count_valid=True, count_invalid=False)
print "how many invalid adj hypernyms? ", countHypernyms(ahypes, count_valid=False, count_invalid=True)
print "example key: {}, value: {}".format(ahypes.keys()[0],ahypes[ahypes.keys()[0]])

how many adj hypernyms?  25
how many valid adj hypernyms?  25
how many invalid adj hypernyms?  0
example key: (50, 599), value: small


In [111]:
# display the full adjective hypernym dictionary (k: ordered id tuple, v: hypernym)
ahypes

{(12, 217): u'all_right',
 (19, 261): u'red',
 (19, 521): u'red',
 (47, 67): u'bare',
 (50, 599): u'small',
 (74, 442): u'large',
 (177, 631): u'cockamamie',
 (214, 473): u'disgusting',
 (219, 661): u'cheery',
 (232, 912): u'nauseating',
 (239, 884): u'cunning',
 (241, 634): u'bitty',
 (250, 816): u'bogus',
 (253, 388): u'barbarous',
 (253, 637): u'barbarous',
 (261, 521): u'red',
 (273, 314): u'wide',
 (284, 408): u'grey',
 (295, 477): u'frigid',
 (326, 496): u'conceited',
 (388, 637): u'barbarous',
 (399, 545): u'brumous',
 (507, 603): u'fifth',
 (538, 768): u'jammed',
 (642, 756): u'average'}

### Noun Hypernyms

In [112]:
# find noun hypernyms
# (sim_threshold and hype_as_str are left at their defaults)
nhypes = lowestCommonHypernyms(nsimatrix, nsyns)

In [113]:
# check results
# first line counts all entries, then the truthy (valid) and falsy (invalid)
# ones separately; the last line prints one arbitrary key/value pair
print "how many noun hypernyms? ", countHypernyms(nhypes)
print "how many valid noun hypernyms? ", countHypernyms(nhypes, count_valid=True, count_invalid=False)
print "how many invalid noun hypernyms? ", countHypernyms(nhypes, count_valid=False, count_invalid=True)
print "example key: {}, value: {}".format(nhypes.keys()[0],nhypes[nhypes.keys()[0]])

how many noun hypernyms?  51
how many valid noun hypernyms?  51
how many invalid noun hypernyms?  0
example key: (83, 1074), value: dad


In [114]:
"""
NOTE: This won't work quite right unless nsyns is fully processed 
"""
# display the full noun hypernym dictionary (k: ordered id tuple, v: hypernym)
nhypes

{(12, 905): u'adieu',
 (60, 65): u'baby',
 (79, 1014): u'play',
 (83, 293): u'dad',
 (83, 1074): u'dad',
 (83, 1446): u'dad',
 (93, 155): u'topographic_point',
 (102, 971): u'narrative',
 (117, 589): u'person',
 (136, 1192): u'girl',
 (143, 407): u'buttocks',
 (167, 920): u'ma',
 (167, 1125): u'ma',
 (172, 704): u'grandfather',
 (190, 541): u'asshole',
 (190, 1012): u'asshole',
 (196, 682): u'wage',
 (240, 791): u'semen',
 (275, 970): u'hood',
 (279, 338): u'cast',
 (290, 808): u'animal',
 (293, 1074): u'dad',
 (293, 1446): u'dad',
 (294, 529): u'idea',
 (295, 725): u'clasp',
 (364, 385): u'pit',
 (366, 1505): u'breast',
 (368, 1364): u'shooting',
 (396, 878): u'smile',
 (421, 785): u'boom',
 (497, 1251): u'baggage',
 (541, 1012): u'asshole',
 (580, 1426): u'position',
 (606, 983): u'summer',
 (670, 694): u'center',
 (727, 885): u'manner',
 (727, 916): u'manner',
 (736, 746): u'religion',
 (781, 1551): u'check',
 (803, 1518): u'walk',
 (841, 1299): u'internet',
 (885, 916): u'manner',


## Save Hypernyms

In [115]:
# save adj hypernyms
# Use a context manager so the file handle is closed (and the pickle fully
# flushed to disk) as soon as the dump finishes, instead of relying on
# garbage collection of an anonymous handle.
with open(root_out+'ahypes.p', "wb") as fp:
    pickle.dump(ahypes, fp)

In [116]:
"""
NOTE: DON'T SAVE THIS, ALREADY USING SAVED!!!
"""
# save noun hypernyms
# Use a context manager so the file handle is closed (and the pickle fully
# flushed to disk) as soon as the dump finishes, instead of relying on
# garbage collection of an anonymous handle.
with open(root_out+'nhypes.p', "wb") as fp:
    pickle.dump(nhypes, fp)