Custom training for Word2Vec:
Word2Vec is a static embedding technique: every word is mapped to one fixed vector, regardless of the context in which the word appears.

In [2]:
# importing libraries
import pandas as pd
import numpy as np
import os
import gensim
import nltk

import warnings
# Silence every warning category for the rest of the kernel session so that
# cell output stays uncluttered.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below - consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

In [None]:
# Download only the sentence-tokenizer data that sent_tokenize() relies on,
# instead of the complete NLTK collection ('all'), which is very large.
# 'punkt' is the resource name used by older NLTK releases and 'punkt_tab' by
# newer ones; requesting both keeps the notebook working on either.
# NOTE(review): add further resource ids here if other cells need more NLTK data.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
# importing sent_tokenizer from nltk and
# importing simple_preprocess from gensim.utils
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [None]:
# Accumulator for the corpus: each entry appended later is the token list of
# one sentence taken from the dataset.
story = list()

In [None]:
# Build the training corpus: one list of tokens per sentence, collected from
# every regular file located directly inside the 'data' directory.

# Start from an empty list so that re-running this cell does not append the
# whole corpus a second time.
story = []

# sorted() makes the sentence order independent of the arbitrary order in
# which os.listdir() returns directory entries.
for file_name in sorted(os.listdir('data')):
  file_path = os.path.join('data', file_name)
  # Skip sub-directories (e.g. .ipynb_checkpoints); open() would fail on them.
  if not os.path.isfile(file_path):
    continue
  # NOTE(review): 'unicode_escape' also interprets backslash sequences found in
  # the text - confirm this is the intended encoding for the data files.
  with open(file_path, encoding='unicode_escape') as f:
    corpus = f.read()
  # Split the raw text into sentences, then tokenise each sentence with
  # gensim's simple_preprocess.
  for sent in sent_tokenize(corpus):
    story.append(simple_preprocess(sent))

In [None]:
# Preview the first two tokenised sentences and the total number of sentences.
# A notebook cell renders only its last bare expression, so display() is used
# to make both values visible.
display(story[:2])
display(len(story))

In [None]:
# now we are importing Word2Vec() class from gensim.models
from gensim.models import Word2Vec

In [None]:
# Instantiate an (untrained) Word2Vec model with the chosen hyper-parameters:
#   vector_size - dimensionality of each word vector
#   window      - context window size around the target word
#   min_count   - words occurring fewer times than this are dropped
model = Word2Vec(vector_size=150, window=10, min_count=5)

In [None]:
# Scan every tokenised sentence held in `story` and let the model build its
# vocabulary of unique words from them.
model.build_vocab(corpus_iterable=story)

In [None]:
# Show a few attributes of the Word2Vec model after the vocabulary scan.
# Each value is printed explicitly because a notebook cell only renders its
# last bare expression.
print('corpus_count:', model.corpus_count)
print('epochs:', model.epochs)
print('corpus_total_words:', model.corpus_total_words)

In [None]:
# Train the model on the tokenised sentences, reusing the sentence count and
# the epoch setting already stored on the model object.
model.train(
    corpus_iterable=story,
    epochs=model.epochs,
    total_examples=model.corpus_count,
)

In [None]:
# Look up the learned vector for the word 'king' once and reuse it.
king_vector = model.wv['king']

# display() is required for the vector to be shown: only the last bare
# expression of a cell is rendered.
display(king_vector)

# Dimension of the vector; expected to equal the vector_size the model was
# created with (150).
display(len(king_vector))

In [None]:
# Find the words most similar to each query word with most_similar().
# The results are displayed inside the loop because a cell would otherwise
# render only the last call's result.
for query_word in ('king', 'daenerys', 'queen'):
    print(f'most similar to {query_word!r}:')
    display(model.wv.most_similar(query_word))

In [None]:
# doesnt_match() returns the word that fits least well with the others.
# Both results are passed to display() so that the first one is not silently
# discarded (only a cell's last bare expression is rendered).
# NOTE(review): 'rikon' was changed to 'rickon' on the assumption that it was a
# typo - verify against the spelling used in the corpus.
display(model.wv.doesnt_match(['jon', 'rickon', 'robb', 'arya', 'sansa', 'bran']))
display(model.wv.doesnt_match(['cersei', 'jaime', 'bronn', 'tyrion']))

In [None]:
# Fetch the normalised word vectors once and reuse the array below instead of
# recomputing it for every inspection.
normed_vectors = model.wv.get_normed_vectors()

# All normalised vectors as an array (one row per vocabulary word).
# display() is used throughout because only the last bare expression of a
# cell would otherwise be rendered.
display(normed_vectors)

# Number of vectors.
display(len(normed_vectors))

# Shape of the vector array.
display(normed_vectors.shape)

# Vocabulary words, in index order; only a short preview is shown to avoid
# dumping the entire vocabulary into the cell output.
display(model.wv.index_to_key[:20])

# Vocabulary size, i.e. the number of distinct words that have a vector.
display(len(model.wv.index_to_key))

In [None]:
# saving the model as .model extension
model.save('/content/word2vec.model')

# saving the model as .bin extension
model.save('/content/word2vec.bin')

# saving the model as .bin.gzip extension
! gzip word2vec.bin > word2vec.bin.gz