# Word2Vec

## A natural language processing model with Tensorflow

In [1]:
import math
import itertools
from time import time
import multiprocessing

# File management
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
from os import path, remove

# NLP
from gensim.models import Word2Vec
from gensim.models.word2vec import Text8Corpus
import spacy
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/eem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Downloading the corpus

In [2]:
corpus_uri = 'http://mattmahoney.net/dc/text8.zip'
target_name = './corpus.txt'
corpus_language = 'english'

if not path.exists(target_name):
    try:
        resp = urlopen(corpus_uri)
        file = ZipFile(BytesIO(resp.read()))

        target_file = open(target_name, 'w')
        for line in file.open(file.namelist()[0]).readlines():
            target_file.write(line.decode('utf-8'))
        target_file.close()
    except:
        if path.exists(target_name):
            remove(target_name)
            
corpus = open(target_name, 'r')
content = corpus.read()
print(content[:1024])

 anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing interpretations of what this means anarchism also refers to related social movements that advocate the elimination of authoritarian institutions particularly the state the word anarchy as most anarchists use it does not imply chaos nihilism or anomie but rather a harmonious anti authoritarian society in place of what are regarded as authoritarian political structures and coercive economic instituti

### Pre processing

In [3]:
# Split text into sentences
stopwords = nltk.corpus.stopwords.words('english')
sentences = list(itertools.islice(Text8Corpus(target_name),None))
sentences = [[word for word in sentence if word not in stopwords] for sentence in sentences]

### Creating and training the language model 

In [9]:
def build_vocabulary(model, sentences):
    t = time()
    model.build_vocab(sentences, progress_per=10000)
    print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

In [10]:
def train(model, sentences, epochs, corpus_size):
    t = time()
    model.train(sentences, total_examples=corpus_size, epochs=epochs, report_delay=1)
    print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

In [22]:
def build_model(sentences, min_count, window, vector_size, alpha, epochs, sg, corpus_size, model_name):
    cores = multiprocessing.cpu_count()
    model = Word2Vec(min_count=min_count, window=window, vector_size=vector_size, alpha=0.001, workers=cores-1, sg=sg)
    corpus_size = model.corpus_count if corpus_size == 0 else corpus_size
    
    build_vocabulary(model, sentences)
    train(model, sentences, epochs, corpus_size)
    
    model.save(model_name)
    
def get_model_name(sg, window, vector_size, epochs, corpus_size):
    return "./models/{sg}-{window}-{vector_size}-{epochs}-{corpus_size}.model".format(
        sg = 'skipgram' if sg == 1 else 'cbow',
        window = window,
        vector_size = vector_size,
        epochs = epochs,
        corpus_size = corpus_size
    )
    
def build_if_not_exists(sentences, sg=1, window=2, vector_size=100, epochs=30, corpus_size=0):
    model_name = get_model_name(sg, window, vector_size, epochs, corpus_size)
    
    if path.isfile(model_name):
        print('model {} already exists'.format(model_name))
    else:
        return build_model(
            sentences = sentences,
            min_count = 10,
            window = window,
            vector_size = vector_size,
            alpha = 0.001,
            epochs = epochs,
            sg = sg,
            corpus_size = corpus_size,
            model_name = model_name
        )

In [23]:
vector_sizes = [50, 100, 300]
windows = [2, 5, 10]
corpus_sizes = [math.floor(len(sentences)*.33), math.floor(len(sentences)*.66), len(sentences)]
epochs = [10, 20, 30]

params = [list(i) for i in itertools.product(windows, vector_sizes, epochs, corpus_sizes)]

#### Skipgram

In [24]:
for param in params:
    build_if_not_exists(sentences, 1, param[0], param[1], param[2], param[3])

model ./models/skipgram-2-50-10-561.model already exists
model ./models/skipgram-2-50-10-1122.model already exists
model ./models/skipgram-2-50-10-1701.model already exists
model ./models/skipgram-2-50-20-561.model already exists
model ./models/skipgram-2-50-20-1122.model already exists
model ./models/skipgram-2-50-20-1701.model already exists
model ./models/skipgram-2-50-30-561.model already exists
model ./models/skipgram-2-50-30-1122.model already exists
model ./models/skipgram-2-50-30-1701.model already exists
model ./models/skipgram-2-100-10-561.model already exists
model ./models/skipgram-2-100-10-1122.model already exists
model ./models/skipgram-2-100-10-1701.model already exists
model ./models/skipgram-2-100-20-561.model already exists
model ./models/skipgram-2-100-20-1122.model already exists
model ./models/skipgram-2-100-20-1701.model already exists
model ./models/skipgram-2-100-30-561.model already exists
model ./models/skipgram-2-100-30-1122.model already exists
model ./mode

#### Cbow

In [25]:
for param in params:
    build_if_not_exists(sentences, 0, param[0], param[1], param[2], param[3])

model ./models/cbow-2-50-10-561.model already exists
model ./models/cbow-2-50-10-1122.model already exists
model ./models/cbow-2-50-10-1701.model already exists
model ./models/cbow-2-50-20-561.model already exists
model ./models/cbow-2-50-20-1122.model already exists
model ./models/cbow-2-50-20-1701.model already exists
model ./models/cbow-2-50-30-561.model already exists
model ./models/cbow-2-50-30-1122.model already exists
model ./models/cbow-2-50-30-1701.model already exists
model ./models/cbow-2-100-10-561.model already exists
model ./models/cbow-2-100-10-1122.model already exists
model ./models/cbow-2-100-10-1701.model already exists
Time to build vocab: 0.05 mins
Time to train the model: 1.47 mins
Time to build vocab: 0.05 mins
Time to train the model: 1.44 mins
Time to build vocab: 0.05 mins
Time to train the model: 1.4 mins
Time to build vocab: 0.05 mins
Time to train the model: 2.17 mins
Time to build vocab: 0.05 mins
Time to train the model: 2.16 mins
Time to build vocab: 0.0

### Retrieving a model

In [33]:
def get_model(corpus_size, sg=1, window=2, vector_size=100, epochs=30):
    model_name = get_model_name(sg, window, vector_size, epochs, corpus_size)
    
    if path.isfile(model_name):
        return Word2Vec.load(model_name).wv
    else:
        print('Model not trained')

### Analogies

In [105]:
def analogy(model, word, is_to, as_word):
    result = model.most_similar(negative=[word], positive=[is_to, as_word])
    return result[0][0]

In [154]:
model = get_model(corpus_size=len(sentences), sg=1, window=5, vector_size=200, epochs=20)

In [165]:
analogy(model, word='germany', is_to='berlin', as_word='canada')

'ontario'

In [156]:
model.most_similar(positive=['moon'])

[('sun', 0.8934993147850037),
 ('planet', 0.8837549686431885),
 ('lunar', 0.8803530931472778),
 ('jupiter', 0.8790703415870667),
 ('venus', 0.8754491209983826),
 ('orbit', 0.8723046779632568),
 ('eclipse', 0.8718109130859375),
 ('moons', 0.8517736792564392),
 ('planets', 0.8502889275550842),
 ('sky', 0.8460605144500732)]