### Training a word2vec model from CV texts with gensim

#### SETUP

In [1]:
from __future__ import print_function
import sys
import numpy as np
import pandas as pd
import pickle
import gc
import csv
import os

In [2]:
from gensim.parsing.preprocessing import preprocess_documents
from gensim.models import Word2Vec
from gensim.models import FastText
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import *
from gensim.models import Phrases
from gensim.models.phrases import Phraser

import mypykit as mpk

gc.collect()

# Some of the text fields can be very big, so raise the csv field size limit
# as far as the platform allows. The limit is stored in a C long, so
# sys.maxsize raises OverflowError where a C long is 32 bits (e.g. Windows);
# shrink the value until it is accepted.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 10

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


131072

-------

#### GET TEXTS

In [None]:
# Alternative based on a one-shot generator, kept for reference:
#texts_stream = mpk.generator_csv2text("../data/sample_candidates_texts.csv", text_column_index=text_col)

# Path of the sample CSV and the index of its text column.
filepath = "../data/sample_candidates_texts.csv"
text_col = 1
# Clean-up filters handed to the iterator: lowercase first, then the gensim
# preprocessing helpers strip_tags, strip_punctuation and
# strip_multiple_whitespaces.
custom_filters = [lambda x: x.lower(),
                  strip_tags,
                  strip_punctuation,
                  strip_multiple_whitespaces]

# Word2Vec iterates over the documents several times, so it cannot consume a
# one-shot generator; use the re-iterable CSVIterator instead.
# NOTE(review): cut_char presumably caps the characters read per text - confirm in mypykit.
texts_stream = mpk.CSVIterator(filepath, custom_filters, col_index=text_col, cut_char=5000000)

#### PREPROCESSING

Testing with sample:

In [None]:
%%time
# Alternative: tokenize here with gensim instead of relying on the iterator.
#texts = [preprocess_string(filters=custom_filters, s=text) for text in texts_stream]
# Pull the whole stream into memory once so it can be inspected.
texts = list(texts_stream)
n_docs = len(texts)
print("Number of documents in the stream:", n_docs)
# sys.getsizeof is shallow: it measures the outer list only, not the
# documents it references.
size_mb = sys.getsizeof(texts) / 1e+6
print("texts object size:", size_mb, 'megabytes')

In [None]:
%%time
mpk.write_txt("../data/sample_tokenized_texts.txt", texts_stream)

In [None]:
%%time
# Build the gensim token dictionary from the stream and persist it.
# NOTE(review): the file name below is spelled 'samlple' (sic); it is left
# untouched because other code may load it by this exact path - confirm
# before renaming.
dictionary = corpora.Dictionary(texts_stream)
dictionary.save('../models/samlple_cvs_dict_lvl_1.dict')
print(dictionary)

#### TRAIN WORD2VEC

In [None]:
import ast
class Txts2Lists:
    """Re-iterable view over a text file holding one Python literal per line.

    Each call to ``iter()`` reopens the file, so the object can be traversed
    several times (unlike a generator).
    """

    def __init__(self, path):
        """Remember the path of the file to read; nothing is opened yet."""
        self.path = path

    def __iter__(self):
        """Yield the parsed literal of every line longer than two characters."""
        with open(self.path, "r") as handle:
            for raw in handle:
                # Lines of at most two characters (e.g. a bare newline) carry
                # no document and are skipped.
                if len(raw) <= 2:
                    continue
                yield ast.literal_eval(raw)

In [None]:
gc.collect()

In [None]:
%%time
# Word2Vec iterates over the documents several times, so it needs a
# re-iterable object rather than a one-shot generator.
texts_stream = Txts2Lists("../data/all_tokenized_texts.txt")
# NOTE(review): `size` and `iter` are the gensim < 4.0 keyword names (renamed
# to `vector_size` and `epochs` in 4.0) - confirm the pinned gensim version.
model = Word2Vec(texts_stream, size=100, window=5, min_count=10, workers=4, iter=5)
print(model)

In [None]:
model.save("../models/cvs_w2v_lvl_1")

In [None]:
# Build the embedding matrix from the current model with the mypykit helper.
# NOTE(review): vector_dim presumably has to match the model's size=100 - confirm.
embedding_matrix = mpk.w2v_embedding_matrix(model, vector_dim=100)
weights = embedding_matrix  # second name for the same object, not a copy
# Persist the weights using the highest pickle protocol available.
with open('../data/cvs_w2v_weights_lvl_1.pickle', 'wb') as handle:
    pickle.dump(weights, handle, protocol=pickle.HIGHEST_PROTOCOL)

Save embeddings with vocab to csv for R EDA:

In [7]:
# Dump the embedding weights as a comma-separated matrix.
np.savetxt('../data/embeddings_lvl3.csv', weights, delimiter=',')
# Iterating the vocab mapping yields its keys, i.e. the tokens.
vocab = list(model.wv.vocab)
df_vocab = pd.DataFrame({"token": vocab})
df_vocab.to_csv("../data/vocab_lvl3.csv")

In [3]:
model = Word2Vec.load("../models/cvs_w2v_lvl_3")

In [7]:
model.wv.most_similar(positive=['war'])

[('invasion', 0.7615526914596558),
 ('cold_war', 0.7589760422706604),
 ('wars', 0.7434630393981934),
 ('terror', 0.7399106621742249),
 ('civil_war', 0.7309882640838623),
 ('protest', 0.7121016979217529),
 ('arab_spring', 0.7008945345878601),
 ('riots', 0.6994553804397583),
 ('genocide', 0.6945540308952332),
 ('bombing', 0.6943058967590332)]

In [6]:
model.wv.most_similar(positive=["japan", "paris"], negative=["france"])

[('tokyo', 0.8857423067092896),
 ('taipei', 0.780282199382782),
 ('seoul', 0.7592018842697144),
 ('beijing', 0.7475929260253906),
 ('fukuoka', 0.7475562691688538),
 ('osaka', 0.7459617853164673),
 ('hong_kong', 0.7339567542076111),
 ('japan_tokyo', 0.7339449524879456),
 ('tokyo_japan', 0.7141210436820984),
 ('taiwan', 0.7088636159896851)]

#### Adding bigrams with phrases

In [None]:
%%time
texts_stream = Txts2Lists("../data/all_tokenized_texts.txt")
# Collect phrase statistics over the tokenized texts (min_count=10).
phrases = Phrases(texts_stream, min_count=10)
# Wrap the trained Phrases model in a Phraser to apply it to the texts.
bigram_transformer = Phraser(phrases)
# Write the phrase-transformed texts out as the level-2 corpus.
mpk.write_txt("../data/all_tokenized_texts_lvl2.txt", bigram_transformer[texts_stream])

In [None]:
dict2 = corpora.Dictionary(bigram_transformer[texts_stream])
dict2.save('../models/cvs_dict_lvl_2.dict')

#### Adding trigrams with phrases

In [None]:
%%time
# Second phrase pass: run Phrases over the level-2 corpus to produce level 3.
texts_stream = Txts2Lists("../data/all_tokenized_texts_lvl2.txt")
phrases = Phrases(texts_stream, min_count=10)
# NOTE: the name `bigram_transformer` is reused here although this pass
# produces the level-3 corpus; it overwrites the level-2 transformer.
bigram_transformer = Phraser(phrases)
mpk.write_txt("../data/all_tokenized_texts_lvl3.txt", bigram_transformer[texts_stream])

In [None]:
dict3 = corpora.Dictionary(bigram_transformer[texts_stream])
dict3.save('../models/cvs_dict_lvl_3.dict')

In [None]:
mpk.save_pickle("../data/trigram_phrases", phrases)

#### Retrain Word2Vec including ngrams

In [None]:
model = Word2Vec.load("../models/cvs_w2v_lvl_1")

In [None]:
%%time
print("Updating W2V model with phrases - level 2...")
texts_stream = Txts2Lists("../data/all_tokenized_texts_lvl2.txt")
# update=True adds the new corpus to the vocabulary of the already-trained model.
model.build_vocab(texts_stream, update=True)
# Continue training on the level-2 corpus for 4 epochs; total_examples is
# read from model.corpus_count.
model.train(texts_stream, total_examples=model.corpus_count, epochs=4)

In [None]:
model.save("../models/cvs_w2v_lvl_2")

In [None]:
%%time
print("Updating W2V model with phrases - level 3...")
texts_stream = Txts2Lists("../data/all_tokenized_texts_lvl3.txt")
# update=True adds the new corpus to the vocabulary of the already-trained model.
model.build_vocab(texts_stream, update=True)
# Continue training on the level-3 corpus for 4 epochs; total_examples is
# read from model.corpus_count.
model.train(texts_stream, total_examples=model.corpus_count, epochs=4)

In [None]:
model.save("../models/cvs_w2v_lvl_3")

In [None]:
#a = model_lvl_3.wv.get_keras_embedding() !!!!

-------

#### EXPLORE VECTORS

In [None]:
def words_algebra(positive=None, negative=None, n=1, w2v_model=None):
    """Return the ``n`` tokens closest to ``positive`` minus ``negative``.

    Thin wrapper around ``wv.most_similar`` of a word2vec model.

    Args:
        positive: tokens that contribute positively; ``None`` means no tokens.
        negative: tokens that contribute negatively; ``None`` means no tokens.
        n: number of results to return, forwarded as ``topn``.
        w2v_model: model to query; when ``None`` the notebook-level ``model``
            is used.

    Returns:
        Whatever ``most_similar`` returns for the given arguments.
    """
    # None defaults avoid sharing one mutable list object across calls.
    positive = [] if positive is None else positive
    negative = [] if negative is None else negative
    target = model if w2v_model is None else w2v_model
    # Always forward `negative`: dropping it when it is non-empty would
    # silently turn "a - b" into plain "a".
    return target.wv.most_similar(positive=positive, negative=negative, topn=n)

In [None]:
model.wv.similarity('good', 'great')

In [None]:
model.wv.similarity('good', 'bad')

In [14]:
model.wv.similarity('python', 'perl')

0.7531644973978595

In [15]:
model.wv.similarity('python', 'excel')

0.4378327513749595

In [8]:
model.wv.most_similar(positive=['fb'], topn=10)

[('facebook', 0.7660043835639954),
 ('twitter', 0.7028908133506775),
 ('instagram', 0.6729579567909241),
 ('sns', 0.6694015860557556),
 ('fb_twitter', 0.6657751798629761),
 ('utm_source', 0.6575230956077576),
 ('facebok', 0.6562955975532532),
 ('facbook', 0.6500971913337708),
 ('facebook_twitter_foursquare', 0.6500629186630249),
 ('facebook_twitter_youtube', 0.6386256814002991)]

In [9]:
model.wv.most_similar(positive=['big_data'], negative=['statistics'], topn=10)

[('bigdata', 0.6311461329460144),
 ('next_gen', 0.6159036159515381),
 ('saas_paas_iaas', 0.6142393946647644),
 ('paas_iaas', 0.609619677066803),
 ('predix', 0.607494592666626),
 ('cloud_iaas', 0.601280927658081),
 ('devops', 0.6005290746688843),
 ('nutanix', 0.5990670323371887),
 ('thoughtworks', 0.5987235903739929),
 ('ibm_bluemix', 0.5975252985954285)]

In [10]:
model.wv.most_similar(positive=['back_end'], negative=['server'], topn=1)

[('frontend', 0.4715600311756134)]

In [None]:
model.wv.most_similar(positive = ['china'], topn = 10)

In [None]:
model.wv.most_similar(positive = ['japan', 'berlin'], negative = ['germany'], topn = 10)

In [11]:
model.wv.most_similar(positive = ['google', 'china'], topn = 1)

[('baidu', 0.7313236594200134)]

In [12]:
model.wv.most_similar(positive = ['excel', 'network'], topn = 1)

[('database', 0.6572868227958679)]

In [13]:
model.wv.most_similar(positive = ['excel'], negative=['server'], topn = 1)

[('formulas_vlookups', 0.5358480215072632)]

In [14]:
model.wv.most_similar(positive = ['excel', 'text'], topn = 1)

[('word', 0.7538812756538391)]

In [15]:
model.wv.most_similar(positive = ['excel', 'code'], topn = 1)

[('macros', 0.713035523891449)]

In [16]:
model.wv.most_similar(positive = ['excel', 'google'], topn = 1)

[('google_sheets', 0.8080769181251526)]

In [None]:
model.wv.most_similar(positive = ['big_data', 'server'], topn = 1)

In [17]:
model.wv.most_similar(positive = ['spss', 'programming'], topn = 1)

[('matlab', 0.8165830373764038)]

In [None]:
model.wv.most_similar(positive = ['spss'], negative=['statistics'], topn = 10)

In [None]:
model.wv.most_similar(positive = ['python'], negative=['statistics'], topn = 10)

In [18]:
model.wv.most_similar(positive = ['man', 'success'], topn = 10)

[('strength', 0.5720359683036804),
 ('happiness', 0.5691606402397156),
 ('successful', 0.5664874911308289),
 ('driving_force', 0.553283154964447),
 ('goal', 0.543483555316925),
 ('grit', 0.5434126853942871),
 ('succes', 0.5314896106719971),
 ('greatness', 0.5310738682746887),
 ('feeling', 0.5299428701400757),
 ('woman', 0.5228298902511597)]

In [19]:
model.wv.most_similar(positive = ['woman', 'success'], topn = 10)

[('happiness', 0.6242830753326416),
 ('feeling', 0.6065638661384583),
 ('ambition', 0.6007509231567383),
 ('life', 0.5921899676322937),
 ('entrepreneurial_spirit', 0.5919184684753418),
 ('heart', 0.58869868516922),
 ('driving_force', 0.5848144292831421),
 ('growth', 0.5792322158813477),
 ('goal', 0.5706526637077332),
 ('challenge', 0.5669323801994324)]

-----

#### EXPLORE WITH TENSORBOARD

https://stackoverflow.com/questions/50492676/visualize-gensim-word2vec-embeddings-in-tensorboard-projector/50499090

In [None]:
import tensorflow as tf

In [None]:
emb = tf.Variable(embedding_matrix, name='word_embeddings')

In [None]:
init_op = tf.global_variables_initializer()

In [None]:
# Add ops to save and restore all the variables.
saver = tf.train.Saver()

In [None]:
with tf.Session() as sess:
    # Run the initializer, then checkpoint the variables under ../model_dir,
    # the directory TensorBoard is pointed at further down.
    sess.run(init_op)
    save_path = saver.save(sess, "../model_dir/model.ckpt")
    print("Model saved in path: %s" % save_path)

**metadata**

In [None]:
# NOTE(review): model_lvl_3 is not assigned anywhere in the visible notebook -
# presumably the model saved as ../models/cvs_w2v_lvl_3; confirm or load it
# explicitly before running this cell.
model = model_lvl_3

In [None]:
keys = model.wv.vocab.keys()

In [None]:
# One token per line, written as metadata.tsv next to the saved checkpoint.
# Iterating the vocab mapping yields its keys, so no list()/keys() is needed.
# NOTE(review): the line order must match the row order of embedding_matrix
# for the labels to line up - confirm against mypykit.w2v_embedding_matrix.
words = '\n'.join(model.wv.vocab)

# Write explicitly as UTF-8 so non-ASCII tokens do not depend on the
# platform's default encoding.
with open(os.path.join('../model_dir', 'metadata.tsv'), 'w', encoding='utf-8') as f:
    f.write(words)

In [None]:
!tensorboard --logdir ../model_dir

localhost:6006