# 04 - Gensim Word2Vec

This IPython notebook evaluates the Gensim Word2Vec model. Word2Vec would have been an interesting method for evaluating and clustering resumes, but the corpus for this study was too small for it to be useful.

In [1]:
import nltk
import pandas as pd
import numpy as np
import pickle
import pymongo
import seaborn as sns
import string

## NLP
from nltk.corpus import treebank, stopwords
from nltk import bigrams, trigrams
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import gensim
from gensim import corpora, models, similarities

## Database
from pymongo import MongoClient

In [2]:
def pickleme(temp, filename):
    """Serialize ``temp`` to ``<filename>.pkl`` with pickle.

    Args:
        temp: Any picklable object.
        filename: Path of the target file, without the ``.pkl`` extension.

    Returns:
        None.
    """
    # Pickle output is a byte stream, so the file must be opened in binary
    # mode: text mode fails outright on Python 3 and can corrupt the stream
    # through newline translation on Windows.
    with open(filename + '.pkl', 'wb') as picklefile:
        pickle.dump(temp, picklefile)

    return

In [3]:
def getpickle(filename):
    """Load and return the object stored in ``<filename>.pkl``.

    Args:
        filename: Path of the pickle file, without the ``.pkl`` extension.

    Returns:
        The unpickled object, or an error-message string if the file could
        not be opened or unpickled. Callers that need to tell failure apart
        from data must check for that string themselves.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only use this
    # on files from a trusted source.
    try:
        with open(filename + '.pkl', 'rb') as picklefile:
            return pickle.load(picklefile)
    # Catch Exception instead of using a bare except so that
    # KeyboardInterrupt and SystemExit still propagate rather than being
    # reported as a bad file.
    except Exception:
        return 'There was an error trying to read this file.  Please check the filename or path.'

In [4]:
def tokenizenstem(temp, stem = True, nouns = False):
    """Tokenize ``temp`` and return its filtered, lower-cased tokens.

    A token is dropped when its lower-cased form is in the module-level
    ``stopwords`` collection, when it is found in ``string.punctuation``,
    or when it is two characters or shorter.

    Args:
        temp: Text to tokenize.
        stem: If True, each kept token is lemmatized with WordNetLemmatizer;
            otherwise it is only lower-cased.
        nouns: If True, use TextBlob noun phrases as the tokens instead of
            the output of ``nltk.word_tokenize``.

    Returns:
        list: The kept tokens, in their original order.
    """
    if nouns:
        # TextBlob is not imported at the top of the notebook, so this
        # branch used to fail with a NameError; import it where it is needed.
        from textblob import TextBlob
        tokens = TextBlob(temp).noun_phrases
    else:
        tokens = nltk.word_tokenize(temp)

    # Build the lookup set once per call so each membership test is not a
    # linear scan of the module-level ``stopwords`` list.
    stop_set = set(stopwords)

    ##  Creates instance of WordNetLemmatizer
    wnl = WordNetLemmatizer()

    temptokens = []
    for word in tokens:
        lowered = word.lower()
        if lowered not in stop_set and word not in string.punctuation and len(word) > 2:
            temptokens.append(wnl.lemmatize(lowered) if stem else lowered)

    return temptokens

In [5]:
def getstopwords():
    """Return the combined stop-word list.

    Reads the 'stopwords' and 'resumestopwords' pickles and concatenates
    them, with the 'stopwords' entries first.

    Returns:
        list: A new list holding every entry of both pickles.
    """
    base_words = list(getpickle('stopwords'))
    extra_words = getpickle('resumestopwords')

    # Concatenation already produces a fresh flat list.
    return base_words + extra_words

In [6]:
def addtostoplist(tempword):
    """Add ``tempword`` to the pickled resume stop-word list.

    Loads 'resumestopwords', appends the word, removes duplicates, writes
    the list back to the same pickle, and prints a confirmation.

    Args:
        tempword: The word (string) to add.

    Returns:
        None.
    """
    stopwords = getpickle('resumestopwords')

    stopwords.append(tempword)

    # De-duplicate through a set; the order of the stored list is therefore
    # not preserved.
    stopwords = list(set(stopwords))

    pickleme(stopwords, 'resumestopwords')

    # The confirmation used to name the "State of the Union" list, a
    # leftover from another project; this function updates the resume list.
    # The parenthesized single-argument form prints the same text on
    # Python 2 and Python 3.
    print(tempword + ' added to resume stop word list.')

    return

In [7]:
# Load the saved 'tableaucolors' pickle (presumably a plotting palette - TODO confirm).
colors = getpickle('tableaucolors')
# NOTE: this rebinds `stopwords`, replacing the `nltk.corpus.stopwords` import
# from the first cell with the combined list that tokenizenstem reads.
stopwords = getstopwords()

In [8]:
# Start the MongoDB service through Homebrew (shell command; requires Homebrew to be installed).
! brew services start mongodb

Service `mongodb` already started, use `brew services restart mongodb` to restart.


In [9]:
# ! brew services stop mongodb

In [10]:
## Open a MongoClient with default settings and select the resume collection
client = MongoClient()
db = client.resume_db
resumes = db['resume_collection']

In [11]:
# Sanity-check the collection: how many documents it holds and which fields one document carries.
print "Count of number of documents:", resumes.count()
print "Keys in one example document:", resumes.find_one().keys()
# resumes.find_one() # Uncomment this line to see what a full document looks like.

Count of number of documents: 73
Keys in one example document: [u'_id', u'Id', u'Resume']


In [12]:
corpus = []
tokens = []

for resume in resumes.find({}, {'Resume' : 1, "_id": 0}):
    corpus.append(resume['Resume'])
    tokens.append(tokenizenstem(resume['Resume'], stem = True, nouns = False))
    
print len(corpus), len(tokens)

73 73


## Gensim WORD2VEC

In [13]:
# Count unigrams through trigrams, using the notebook's own tokenizer and stop-word list.
vect = CountVectorizer(min_df=1, ngram_range=(1, 3), stop_words = stopwords,
                       analyzer = 'word', tokenizer = tokenizenstem, strip_accents = None)

# fit_transform learns the vocabulary and builds the count matrix in one pass;
# separate fit() and transform() calls ran the tokenizer over every resume twice.
x = vect.fit_transform(corpus)
# Dense copy of the sparse count matrix.
x_back = x.toarray()

In [14]:
# One row per resume in `corpus`, one column per n-gram in the fitted vocabulary.
features = pd.DataFrame(x_back, columns=vect.get_feature_names())

In [15]:
# Train skip-gram (sg=1) Word2Vec on the tokenized resumes: 100-dimensional vectors,
# a context window of 5, and min_count=1 so that no token is dropped for being rare.
# NOTE(review): no seed is passed and workers=4, so the vectors may differ between runs.
model = gensim.models.Word2Vec(tokens, size=100, window=5, min_count=1, workers=4,sg=1)



In [16]:
# Show the learned vector for the token 'military'.
print model['military']

[ 0.15180503  0.07556337 -0.05489494 -0.44112751 -0.14328073 -0.02504365
  0.14446692  0.19581757  0.17993115 -0.19747534  0.34007633  0.03845399
  0.23336329  0.14575806  0.05106496  0.11972934  0.11491957  0.01751838
 -0.19310498 -0.39826015  0.02164103 -0.30877906  0.19131209 -0.05012842
  0.12639877  0.11491076 -0.43837157 -0.04495127 -0.03541079 -0.1174932
 -0.21457539 -0.18045032 -0.13747507  0.1743906  -0.04182204  0.01783394
 -0.10643984  0.09667073  0.17975636 -0.17835863 -0.00385763  0.29290849
 -0.33673871  0.26445672 -0.1941783  -0.22924146 -0.3450107   0.0698831
 -0.26027599 -0.03138917  0.11805285  0.0379095   0.30093375  0.17852123
  0.18834978  0.03420704 -0.08233708  0.01690477 -0.06490424  0.09191529
 -0.14534323 -0.55950749  0.08534367 -0.12314319  0.01467513  0.06861869
 -0.24395157 -0.19296676  0.07391211 -0.38301036  0.15676999 -0.35816967
 -0.33674219  0.02628455  0.11333016  0.17183445  0.39647102  0.16004615
 -0.3437691   0.28943467  0.091345   -0.2048196  -0.0

In [17]:
# The five tokens whose vectors are most similar to that of 'command', with their similarity scores.
model.most_similar(positive=['command'] ,topn=5)

[(u'staff', 0.9861191511154175),
 (u'chief', 0.9860691428184509),
 (u'joint', 0.9857847690582275),
 (u'liaison', 0.984187126159668),
 (u'division', 0.9793525338172913)]