Custom training for Word2Vec : 
word2vec is nothing but a static embedding technique.

In [2]:
# importing libraries
import pandas as pd
import numpy as np
import os
import gensim
import nltk

import warnings
warnings.filterwarnings('ignore')

In [4]:
# downloading the all files from the nltk
# nltk.download('all')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mehed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# importing sent_tokenizer from nltk and
# importing simple_preprocess from gensim.utils
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [6]:
# creating one empty list for storing all the sentence_tokens that will be collected from the dataset
story=[]

In [7]:
for file_name in os.listdir('data'):
  file_path=os.path.join('data',file_name)
  with open(file_path,encoding='unicode_escape') as f:
    corpus=f.read()
  raw_sent=sent_tokenize(corpus)
  for sent in raw_sent:
    story.append(simple_preprocess(sent))

In [8]:
# showing the data through story list
story[:2]
len(story)

141218

In [9]:
# now we are importing Word2Vec() class from gensim.models
from gensim.models import Word2Vec

In [10]:
# model creation of Word2Vec() function and passing few arguments
model=Word2Vec(
    window=10,
    min_count=5,
    vector_size=150
)

In [11]:
# now we will call the model and passing the whole dataset that
# are stored into the story list
# build_vocab() function creates unique vocabulary
model.build_vocab(story)

In [12]:
# showing few attribute for the word2vec model
model.corpus_count
model.epochs
model.corpus_total_words

1725638

In [13]:
# now we will train our model
model.train(
    story,
    total_examples=model.corpus_count,
    epochs=model.epochs
)

(6482182, 8628190)

In [14]:
# now we will check the vector of the word 'king' through this model
model.wv['king']

# checking the dimension of the king's vector
len(model.wv['king'])

150

In [15]:
# finding out the most similar word of 'king' through most_similar() function
model.wv.most_similar('king')
model.wv.most_similar('daenerys')
model.wv.most_similar('queen')

[('princess', 0.736301839351654),
 ('margaery', 0.7111654877662659),
 ('daenerys', 0.6761711239814758),
 ('cersei', 0.6538811326026917),
 ('joffrey', 0.6476494669914246),
 ('mother', 0.6363057494163513),
 ('prince', 0.6259520649909973),
 ('myrcella', 0.6250472068786621),
 ('stormborn', 0.6177440881729126),
 ('elia', 0.5968142747879028)]

In [16]:
# using of doesnt_match() function
model.wv.doesnt_match(['jon','rikon','robb','arya','sansa','bran'])
model.wv.doesnt_match(['cersei','jaime','bronn','tyrion'])

'bronn'

In [17]:
# showing all the vectors in the form of Array
model.wv.get_normed_vectors()

# length of the normed vectors
len(model.wv.get_normed_vectors())

# shape of the vector
model.wv.get_normed_vectors().shape

# all the words of the vector
model.wv.index_to_key

# no of words for training
len(model.wv.index_to_key)

11760

In [None]:
# saving the model as .model extension
model.save('word2vec.model')

# saving the model as .bin extension
model.save('word2vec.bin')