In [91]:
import nltk
import pandas as pd
import numpy as np
import pickle
import pymongo
import string

from collections import defaultdict, Counter

## NLP
from nltk.corpus import treebank, stopwords
from nltk import bigrams, trigrams
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances

from textblob import TextBlob

## Database
from pymongo import MongoClient

## Visualization
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import matplotlib as mpl

%matplotlib inline

In [76]:
def pickleme(temp, filename):
    """Serialize ``temp`` with pickle into ``<filename>.pkl``.

    Parameters
    ----------
    temp : object
        Any picklable object.
    filename : str
        Destination path without the extension; '.pkl' is appended here.

    Returns
    -------
    None
    """
    ## Pickle streams are bytes, so the file must be opened in binary mode.
    ## Text mode ('w') raises TypeError on Python 3 and can corrupt the stream
    ## on platforms that translate newlines; 'wb' also mirrors the 'rb' used
    ## when the file is read back.
    with open(filename + '.pkl', 'wb') as picklefile:
        pickle.dump(temp, picklefile)

    return

In [77]:
def getpickle(filename):
    """Load and return the object stored in ``<filename>.pkl``.

    Parameters
    ----------
    filename : str
        Path without the extension; '.pkl' is appended here.

    Returns
    -------
    object or str
        The unpickled object. If the file cannot be opened or unpickled, an
        error message string is returned instead of raising, so callers that
        need to detect failure must check for that string.
    """
    ## NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    try:
        with open(filename + '.pkl', 'rb') as picklefile:
            return pickle.load(picklefile)
    except Exception:
        ## `except Exception` keeps the best-effort behaviour for I/O and
        ## unpickling errors but no longer swallows KeyboardInterrupt or
        ## SystemExit the way the previous bare `except:` did.
        return 'There was an error trying to read this file.  Please check the filename or path.'

In [78]:
def tokenizenstem(temp, stem = True, nouns = False):
    """Split ``temp`` into filtered, lower-cased tokens.

    Parameters
    ----------
    temp : str
        Text to tokenize.
    stem : bool
        When true, each kept token is passed through WordNetLemmatizer.lemmatize.
    nouns : bool
        When true, the candidates are TextBlob noun phrases; otherwise they
        come from nltk.word_tokenize.

    Returns
    -------
    list
        Lower-cased (and optionally lemmatized) tokens, in input order.
    """
    global stopwords

    ## Choose the source of candidate tokens
    candidates = TextBlob(temp).noun_phrases if nouns else nltk.word_tokenize(temp)

    lemmatizer = WordNetLemmatizer()

    kept = []
    for raw in candidates:
        lowered = raw.lower()
        ## Skip stop words, anything found inside string.punctuation, and
        ## tokens of one or two characters.
        if lowered in stopwords or raw in string.punctuation or len(raw) <= 2:
            continue
        kept.append(lemmatizer.lemmatize(lowered) if stem else lowered)

    return kept

In [79]:
def getstopwords():
    """Return one flat list combining both pickled stop word collections.

    The entries loaded from 'stopwords' come first, followed by the entries
    loaded from 'resumestopwords'; both are read through getpickle.
    """
    base_words = list(getpickle('stopwords'))
    resume_words = getpickle('resumestopwords')

    ## Concatenation builds a new list, so neither loaded object is modified.
    combined = base_words + resume_words

    return combined

In [80]:
def addtostoplist(tempword):
    """Add ``tempword`` to the pickled resume stop word list.

    Loads 'resumestopwords' through getpickle, appends the word, removes
    duplicates, writes the list back with pickleme and prints a confirmation.

    Parameters
    ----------
    tempword : str
        Word to add.

    Returns
    -------
    None
    """
    stopwords = getpickle('resumestopwords')

    stopwords.append(tempword)

    ## Round-trip through a set so adding the same word twice is harmless;
    ## note this does not preserve the list's order.
    stopwords = list(set(stopwords))

    pickleme(stopwords, 'resumestopwords')

    ## The message previously named the "State of the Union" list, but this
    ## function edits the resume list. The parenthesised single-argument
    ## print behaves the same on Python 2 and Python 3.
    print(tempword + ' added to resume stop word list.')

    return

In [81]:
# Persist 'was' in the resume stop word pickle (rewrites resumestopwords.pkl on disk).
addtostoplist('was')

was added to State of the Union stop word list.


#### WordCloud from amueller/GitHub

* [amueller/GitHub](https://github.com/amueller/word_cloud)

In [82]:
def cloud(tokens, maxwords):
    """Draw a word cloud for ``tokens`` with matplotlib.

    Parameters
    ----------
    tokens : str
        Passed unchanged to WordCloud.generate
        (presumably one text string; confirm against callers).
    maxwords : int
        Forwarded to WordCloud as ``max_words``.

    Returns
    -------
    None
    """
    ## STOP_WORDS is never assigned at module level in the visible notebook
    ## (the wordcloud import is spelled STOPWORDS), so reading it as a bare
    ## global raised NameError on a fresh kernel. Use it when some cell has
    ## defined it, otherwise fall back to wordcloud's built-in STOPWORDS.
    excluded = globals().get('STOP_WORDS', STOPWORDS)

    wordcloud = WordCloud(height = 500, width = 700,
                          background_color = 'white', mode = 'RGBA',
                          max_words = maxwords, stopwords = excluded,
                          margin = 10, random_state = 3).generate(tokens)
    plt.figure()

    plt.imshow(wordcloud.to_array())
    plt.axis("off")
    plt.show()

    return

#### Write the distance data to a JSON file for visualization with mbostock's D3 force-directed graph.

In [83]:
def writejson(df, closest, tempname):
    """Write a nodes/links JSON file describing each resume's nearest neighbours.

    Parameters
    ----------
    df : pandas.DataFrame
        Distance table with one column per resume, labelled 1..len(df), plus
        an 'index' column holding the resume number of each row.
    closest : int
        How many rows after the first (in ascending distance order) are
        considered as link targets for each resume.
    tempname : str
        Output path without the extension; '.json' is appended here.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If ``df`` has exactly one row, because there is no second-ranked row
        to use as the node's group.

    Notes
    -----
    A node's "group" is the resume number in the second row of its sorted
    distance column (the first row is presumably the resume itself at
    distance 0). A candidate link is dropped when its target is the resume
    itself or its distance rounds to 0 at four decimals; dropped candidates
    are not replaced (the original `j -= 1` before `continue` had no effect
    inside a `for ... in range` loop), so a resume may get fewer than
    ``closest`` links.
    """
    node_ids = range(1, len(df) + 1)

    ## Sort every distance column once instead of once per lookup.
    ranked = {}
    for i in node_ids:
        ranked[i] = df[['index', i]].sort_values(i).values

    node_lines = []
    for i in node_ids:
        group = int(ranked[i][1][0])
        node_lines.append('    {"name":"Resume ' + str(i).zfill(3) + '","group":' + str(group) + '}')

    link_lines = []
    for i in node_ids:
        ## Slicing keeps this safe when closest exceeds the available rows.
        for row in ranked[i][1:1 + closest]:
            target = int(row[0])
            if target == i or round(row[1], 4) == 0:
                continue
            ## Source/target are 0-based positions in the "nodes" array.
            link_lines.append('    {"source":' + str(i - 1) + ',"target":' + str(target - 1) + ',"value":' + str(1) + '}')

    ## Everything is computed before the file is opened, and separators are
    ## produced by join, so the output is valid JSON even when the last
    ## resume contributes no link or several links.
    with open(tempname + '.json', 'w') as f:
        f.write('{\n  "nodes":[\n')
        if node_lines:
            f.write(',\n'.join(node_lines) + '\n')
        f.write('  ],\n  "links":[\n')
        if link_lines:
            f.write(',\n'.join(link_lines) + '\n')
        f.write('  ]\n}')

    print(tempname + ' JSON file written.')

    return

In [84]:
# Module-level state used by the helper functions: cloud() declares `colors`
# as a global and tokenizenstem() reads `stopwords`.
colors = getpickle('tableaucolors')
# Rebinds `stopwords`, which the import cell bound to nltk.corpus.stopwords;
# from here on the name refers to the list returned by getstopwords().
stopwords = getstopwords()

### Start up MongoDB and load the resume database

In [85]:
# Start the local MongoDB service through Homebrew (this shell command is Homebrew-specific).
! brew services start mongodb

Service `mongodb` already started, use `brew services restart mongodb` to restart.


In [86]:
# ! brew services stop mongodb

In [87]:
## Initialize MongoDB for use
# MongoClient() is called without arguments, so PyMongo's default connection settings apply.
client = MongoClient()
# Database and collection handles used by the cells below.
db = client['resume_db']
resumes = db.resume_collection

In [88]:
# Sanity check on the collection. The %-formatted, parenthesised prints emit
# the same text as the former print statements on Python 2 and also run on
# Python 3.
# NOTE(review): Collection.count() is deprecated in newer PyMongo releases in
# favour of count_documents({}) - confirm against the installed version.
print("Count of number of documents: %s" % resumes.count())
print("Keys in one example document: %s" % (list(resumes.find_one().keys()),))
# resumes.find_one() # Uncomment this line to see what a full document looks like.

Count of number of documents: 73
Keys in one example document: [u'_id', u'Id', u'Resume']


In [89]:
# corpus holds the raw resume text; tokens holds the lemmatized token list for
# the same resume at the same position.
corpus = []
tokens = []

# The projection returns only the 'Resume' field of every document.
for resume in resumes.find({}, {'Resume' : 1, "_id": 0}):
    corpus.append(resume['Resume'])
    tokens.append(tokenizenstem(resume['Resume'], stem = True, nouns = False))

# Both lengths should match; formatted so the line is valid on Python 2 and 3.
print("%d %d" % (len(corpus), len(tokens)))

73 73


### Find the pairwise distances between resumes (Euclidean, cosine, Manhattan)

* [sklearn.feature_extraction.text.TfidfVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)
* [sklearn.feature_extraction.text.CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)
* [sklearn.metrics.pairwise.pairwise_distances](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html)

In [90]:
nouns = False
ngrams = 3

mxdf = 0.45
mindf = 0.05

# True -> raw term counts (CountVectorizer); False -> TF-IDF weights.
cvtfidf = [True, False]
metrics = ['euclidean', 'cosine', 'manhattan', 'cityblock', 'l1', 'l2']

# Keyword arguments common to both vectorizers, defined once so the two
# configurations cannot drift apart. token_pattern is now a raw string: in the
# former plain literal '\b' was a backspace character rather than a regex word
# boundary (scikit-learn does not use the pattern while a custom tokenizer is
# supplied, so the vectorizer output is unaffected).
vectorizer_kwargs = dict(input = 'content', encoding = 'utf-8', decode_error = 'strict',
                         strip_accents = None, lowercase = True, preprocessor = None,
                         tokenizer = tokenizenstem, stop_words = stopwords,
                         token_pattern = r'(?u)\b\w\w+\b', ngram_range = (1, ngrams),
                         analyzer = 'word', max_df = mxdf, min_df = mindf, max_features = None,
                         vocabulary = None, binary = False)

# 1-based resume numbers, used as row labels, column labels and the 'index'
# column that writejson reads.
resume_ids = np.arange(1, len(corpus) + 1)

for cv in cvtfidf:

    if cv:
        vect = CountVectorizer(**vectorizer_kwargs)
    else:
        vect = TfidfVectorizer(norm = 'l2', use_idf = True, smooth_idf = True,
                               sublinear_tf = False, **vectorizer_kwargs)

    wookie = vect.fit_transform(corpus)

    for metric in metrics:

        jsondata = pairwise_distances(wookie, metric = metric)

        df = pd.DataFrame(jsondata, index = resume_ids, columns = resume_ids)

        df['index'] = resume_ids

        # Output file name: <metric>cv or <metric>tfidf.
        tempname = metric + ('cv' if cv else 'tfidf')

        writejson(df, 1, tempname)

euclideancv JSON file written.
cosinecv JSON file written.
manhattancv JSON file written.
cityblockcv JSON file written.
l1cv JSON file written.
l2cv JSON file written.
euclideantfidf JSON file written.
cosinetfidf JSON file written.
manhattantfidf JSON file written.
cityblocktfidf JSON file written.
l1tfidf JSON file written.
l2tfidf JSON file written.
