In [1]:
import re
import json
import operator
import numpy as np
from itertools import islice
from bs4 import BeautifulSoup
from collections import Counter
from nltk.tokenize import TweetTokenizer
#from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Notes

Handling of stopwords when training word2vec. 
* [stackoverflow](https://stackoverflow.com/questions/34721984/stopword-removing-when-using-the-word2vec)

* [Removing plural,ed, ing](https://www.geeksforgeeks.org/python-lemmatization-with-nltk/)

* [Genism can rapidly build word2vec model](https://machinelearningmastery.com/develop-word-embeddings-python-gensim/)
* [Update the vocabs and weights using Genism](https://stackoverflow.com/questions/42357678/gensim-word2vec-array-dimensions-in-updating-with-online-word-embedding)

In [31]:
doc = "This guideline emphasizes considerations of both safety and quality risk management in establishing levels of mutagenic impurities that are expected to pose negligible carcinogenic risk. It outlines recommendations for assessment and control of mutagenic impurities that reside or are reasonably expected to reside in final drug substance or product, taking into consideration the intended conditions of human use."
# Prepare paragraph

sentences = doc.split(".")
tokens =[]
tokens_cleaned = []
for sentence in sentences:
    
    split_sentence = sentence.split(" ")

    # Remove empty string
    split_sentence  = list(filter(None, split_sentence ))
    tokens = tokens + split_sentence

wnl = WordNetLemmatizer()
#ls = LancasterStemmer()
for token in tokens:
    token = wnl.lemmatize(token)
    #token = ls.stem(token)
    token = token.lower()
    token = re.sub(' +',' ',token)
    token = token.strip()
    tokens_cleaned.append(token)

In [32]:
sentences_for_genism = [i.split(" ") for i in sentences]

In [3]:
# Count of words
counts = dict(Counter(tokens_cleaned))
sorted_counts = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
word2index = dict([(my_tuple[0],idx) for idx,my_tuple in enumerate(sorted_counts,1)])
inverted_word2index = {v:k for k,v in word2index.items()}

In [4]:
def get_windows(seq, n=5):
    """
    Parameters:
    ------------
        seq: list
            Sentence as lit of words
        n:integer
            the window size
    yield:
    -----------
        result: generator object
            Sliding windows
        
    """
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result

def sample_examples(docs,max_window_size,n_windows,do_negs):
    '''generate target,context pairs and negative examples'''
    windows = []
    for i,doc in enumerate(docs):
        window_size = int(np.random.choice(range(1,max_window_size+1),1)) # dynamic window
        to_append = list(get_windows(doc,2*window_size+1))
        to_append = [list((i,) + elt) for elt in to_append] # convert to list to support del and pop
        windows.append(to_append)

    windows = [elt for sublist in windows for elt in sublist] # flatten
    random_idxs = np.random.choice(range(len(windows)),size=n_windows,replace=False)
    windows = [windows[idx] for idx in random_idxs]
    
    if do_negs:
        all_negs = list(np.random.choice(token_ints,size=n_negs*len(windows),p=neg_distr))
        to_return = windows,all_negs
    else:
        to_return = windows

    return to_return

In [29]:
from gensim.models import Word2Vec
# define training data

# train model
model = Word2Vec(sentences_for_genism, min_count=1)

# summarize the loaded model
print(model)

# summarize vocabulary
words = list(model.wv.index_to_key )
print(words)

# access vector for one word
print(model.wv.get_vector("substance"))

# save model
model.save('model.bin')

# load model
new_model = Word2Vec.load('model.bin')
print(new_model)

Word2Vec(vocab=45, vector_size=100, alpha=0.025)
['of', 'in', 'or', 'expected', '', 'are', 'that', 'impurities', 'mutagenic', 'reside', 'to', 'risk', 'and', 'management', 'pose', 'quality', 'establishing', 'levels', 'safety', 'both', 'considerations', 'emphasizes', 'guideline', 'use', 'carcinogenic', 'negligible', 'human', 'conditions', 'intended', 'the', 'consideration', 'into', 'taking', 'product,', 'substance', 'drug', 'final', 'reasonably', 'control', 'assessment', 'for', 'recommendations', 'outlines', 'It', 'This']
[-1.5137838e-03 -4.0351679e-03 -4.3876749e-03 -4.6231006e-03
 -5.5938833e-03 -5.3086961e-03 -8.0218911e-03  9.5268693e-03
  6.3952962e-03 -3.6234658e-03  2.4789372e-03 -7.6554990e-03
  7.5364923e-03  8.3134081e-03  7.9721858e-04 -6.8381261e-03
 -2.9402738e-03  4.7376757e-03 -2.9422196e-03  3.1599402e-03
  9.4145061e-03  4.3545435e-03 -5.1368116e-03  5.4527549e-03
 -2.8703287e-03 -6.3819331e-03  6.9985930e-03 -9.2326459e-03
 -1.1393481e-03 -1.3735906e-03 -8.4067304e-03 -

In [70]:
model.wv.key_to_index

{'of': 0,
 'in': 1,
 'or': 2,
 'expected': 3,
 '': 4,
 'are': 5,
 'that': 6,
 'impurities': 7,
 'mutagenic': 8,
 'reside': 9,
 'to': 10,
 'risk': 11,
 'and': 12,
 'management': 13,
 'pose': 14,
 'quality': 15,
 'establishing': 16,
 'levels': 17,
 'safety': 18,
 'both': 19,
 'considerations': 20,
 'emphasizes': 21,
 'guideline': 22,
 'use': 23,
 'carcinogenic': 24,
 'negligible': 25,
 'human': 26,
 'conditions': 27,
 'intended': 28,
 'the': 29,
 'consideration': 30,
 'into': 31,
 'taking': 32,
 'product,': 33,
 'substance': 34,
 'drug': 35,
 'final': 36,
 'reasonably': 37,
 'control': 38,
 'assessment': 39,
 'for': 40,
 'recommendations': 41,
 'outlines': 42,
 'It': 43,
 'This': 44}