In [25]:
import re
import json
import operator
import numpy as np
from itertools import islice
from bs4 import BeautifulSoup
from collections import Counter
from gensim.models import Word2Vec
from nltk.tokenize import TweetTokenizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Notes

* [Handling of stopwords when training word2vec](https://stackoverflow.com/questions/34721984/stopword-removing-when-using-the-word2vec)

* [Removing plural,ed, ing](https://www.geeksforgeeks.org/python-lemmatization-with-nltk/)

* [Genism can rapidly build word2vec model](https://machinelearningmastery.com/develop-word-embeddings-python-gensim/)
* [Update the vocabs and weights using Genism](https://stackoverflow.com/questions/42357678/gensim-word2vec-array-dimensions-in-updating-with-online-word-embedding)
* [Loading Large corpus to word2vec](https://stackoverflow.com/questions/63459657/how-to-load-large-dataset-to-gensim-word2vec-model)

### Build word2vec using negative sampling and skip-gram based on Genism

In [23]:
# Raw document 
doc = "This guideline emphasizes considerations of both safety and quality risk management in establishing levels of mutagenic impurities that are expected to pose negligible carcinogenic risk. It outlines recommendations for assessment and control of mutagenic impurities that reside or are reasonably expected to reside in final drug substance or product, taking into consideration the intended conditions of human use."

# Prepare paragraph and tokens
sentences = doc.split(".")

# Lemmatiziation 
def lemmatizie_sentence(split_sentence):
    clean_sentence = []
    wnl = WordNetLemmatizer()
    for token in split_sentence:
        token = wnl.lemmatize(token)
        token = token.lower()
        token = re.sub(' +',' ',token)
        token = token.strip()
        clean_sentence.append(token)
    return clean_sentence

clean_sentences = []
for sentence in sentences:
    
    split_sentence = sentence.split(" ")

    # Remove empty string
    split_sentence  = list(filter(None, split_sentence ))
    clean_sentence = lemmatizie_sentence(split_sentence)
    if len(clean_sentence) == 0:
        pass
    else: clean_sentences.append(clean_sentence)

In [28]:
# train model
model = Word2Vec(clean_sentences, min_count=1)

# summarize the loaded model
model

<gensim.models.word2vec.Word2Vec at 0x7f25fa52d610>

In [29]:
# summarize vocabulary
words = list(model.wv.index_to_key )
words

['of',
 'expected',
 'are',
 'that',
 'consideration',
 'impurity',
 'mutagenic',
 'and',
 'risk',
 'to',
 'or',
 'reside',
 'in',
 'establishing',
 'level',
 'use',
 'quality',
 'safety',
 'both',
 'emphasizes',
 'guideline',
 'management',
 'negligible',
 'pose',
 'human',
 'condition',
 'intended',
 'the',
 'into',
 'taking',
 'product,',
 'substance',
 'drug',
 'final',
 'reasonably',
 'control',
 'assessment',
 'for',
 'recommendation',
 'outline',
 'it',
 'carcinogenic',
 'this']

In [34]:
#Word to vec dictionary
word2vec_ = {}
for word in words:
    word2vec_[word] = model.wv.get_vector(word)
word2vec_

{'of': array([-5.3123105e-04,  2.3834883e-04,  5.1217917e-03,  9.0283453e-03,
        -9.3221292e-03, -7.1413638e-03,  6.4794831e-03,  8.9952862e-03,
        -5.0259517e-03, -3.8203108e-03,  7.3947948e-03, -1.5349931e-03,
        -4.5430562e-03,  6.5427669e-03, -4.8491773e-03, -1.8086353e-03,
         2.9007718e-03,  9.7424659e-04, -8.2994383e-03, -9.4806748e-03,
         7.3140739e-03,  5.0963294e-03,  6.7813708e-03,  7.5563055e-04,
         6.3404315e-03, -3.3886656e-03, -9.5903676e-04,  5.7959720e-03,
        -7.5399699e-03, -3.9044118e-03, -7.5171725e-03, -9.3808764e-04,
         9.5395623e-03, -7.3654409e-03, -2.3430411e-03, -1.9259612e-03,
         8.0755660e-03, -5.9407139e-03,  2.1596634e-05, -4.7397390e-03,
        -9.5842201e-03,  5.0038607e-03, -8.7780626e-03, -4.3930365e-03,
        -6.7269498e-06, -3.1447902e-04, -7.6715280e-03,  9.6118404e-03,
         5.0055417e-03,  9.2519466e-03, -8.1617599e-03,  4.4922782e-03,
        -4.1434523e-03,  8.0579537e-04,  8.4930751e-03, -4

In [26]:
# save model
model.save('model.bin')

# load model
new_model = Word2Vec.load('model.bin')
print(new_model)

Word2Vec(vocab=43, vector_size=100, alpha=0.025)
['of', 'expected', 'are', 'that', 'consideration', 'impurity', 'mutagenic', 'and', 'risk', 'to', 'or', 'reside', 'in', 'establishing', 'level', 'use', 'quality', 'safety', 'both', 'emphasizes', 'guideline', 'management', 'negligible', 'pose', 'human', 'condition', 'intended', 'the', 'into', 'taking', 'product,', 'substance', 'drug', 'final', 'reasonably', 'control', 'assessment', 'for', 'recommendation', 'outline', 'it', 'carcinogenic', 'this']
[-0.0021954  -0.00970765  0.00929529  0.00203197 -0.00116118 -0.00550371
 -0.00850983 -0.00990348  0.00894438 -0.00249943  0.00459238 -0.00451736
  0.00995806  0.00365472  0.00102569 -0.0040441   0.00121062 -0.0026468
  0.00735074  0.00447825  0.00098668  0.00348309  0.00371435 -0.00678538
  0.00893076  0.00173623 -0.00578747  0.00865668 -0.00129148  0.00818704
 -0.00150687  0.00698765  0.0027264  -0.00435973 -0.00374793  0.00919178
  0.00159187 -0.00600701  0.00034962 -0.0019624   0.00158453 -0.0

### Build word2vec from scratch using negative sampling and skip-gram

In [3]:
# Count of words
counts = dict(Counter(tokens_cleaned))
sorted_counts = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
word2index = dict([(my_tuple[0],idx) for idx,my_tuple in enumerate(sorted_counts,1)])
inverted_word2index = {v:k for k,v in word2index.items()}

In [4]:
def get_windows(seq, n=5):
    """
    Parameters:
    ------------
        seq: list
            Sentence as lit of words
        n:integer
            the window size
    yield:
    -----------
        result: generator object
            Sliding windows
        
    """
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result

def sample_examples(docs,max_window_size,n_windows,do_negs):
    '''generate target,context pairs and negative examples'''
    windows = []
    for i,doc in enumerate(docs):
        window_size = int(np.random.choice(range(1,max_window_size+1),1)) # dynamic window
        to_append = list(get_windows(doc,2*window_size+1))
        to_append = [list((i,) + elt) for elt in to_append] # convert to list to support del and pop
        windows.append(to_append)

    windows = [elt for sublist in windows for elt in sublist] # flatten
    random_idxs = np.random.choice(range(len(windows)),size=n_windows,replace=False)
    windows = [windows[idx] for idx in random_idxs]
    
    if do_negs:
        all_negs = list(np.random.choice(token_ints,size=n_negs*len(windows),p=neg_distr))
        to_return = windows,all_negs
    else:
        to_return = windows

    return to_return