In [1]:
import re
import json
import operator
import numpy as np
from itertools import islice
from bs4 import BeautifulSoup
from collections import Counter
from gensim.models import Word2Vec
from nltk.tokenize import TweetTokenizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
import config
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Notes

* [Handling of stopwords when training word2vec](https://stackoverflow.com/questions/34721984/stopword-removing-when-using-the-word2vec). Basically, you don't need to do it in word2vec. But if you decided you want to, follow [this](https://gist.githubusercontent.com/abhishek-shrm/6b3d72988e87ac12e9bb2e9a0715b855/raw/ea4de010b2c6c3691808b3b6ddb45b70270627ec/IR-W2V-19.py)

* [Removing plural,ed, ing](https://www.geeksforgeeks.org/python-lemmatization-with-nltk/)
* [Fixing wrong lemmatization](https://stackoverflow.com/questions/32957895/wordnetlemmatizer-not-returning-the-right-lemma-unless-pos-is-explicit-python)
* [Genism can rapidly build word2vec model](https://machinelearningmastery.com/develop-word-embeddings-python-gensim/)
* [Update the vocabs and weights using Genism](https://stackoverflow.com/questions/42357678/gensim-word2vec-array-dimensions-in-updating-with-online-word-embedding)
* [Loading Large corpus to word2vec](https://stackoverflow.com/questions/63459657/how-to-load-large-dataset-to-gensim-word2vec-model)
* [Data: David Copperfield by Charles Dickens from Project Gutenberg](https://www.gutenberg.org/)

In [2]:
# Raw document 
with open('data/corpus.txt') as f:
    doc = f.readlines()
sentences = "".join(doc).split(".")

### Build word2vec using negative sampling and skip-gram based on Genism

In [3]:
# Prepare paragraph and tokens
sentences = doc

# Function for lemmatiziation 
def lemmatizie_sentence(split_sentence):
    clean_sentence = []
    wnl = WordNetLemmatizer()
    for token in split_sentence:
        token = wnl.lemmatize(token)
        token = token.lower()
        token=re.sub('\w*\d\w*','', token)
        token=re.sub('\n',' ',token)
        token=re.sub(r"http\S+", "", token)
        token=re.sub('[^a-z]',' ',token)
        token = re.sub(' +',' ',token)
        token = token.strip()
        clean_sentence.append(token)
    return clean_sentence

#Expand contractions
contractions_dict = config.contractions_dict
contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))

# Function for expanding contractions
def expand_contractions(text,contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)

clean_sentences = []
for sentence in sentences:
    
    split_sentence = sentence.split(" ")

    # Remove empty string
    split_sentence  = list(filter(None, split_sentence ))
    clean_sentence = lemmatizie_sentence(split_sentence)
    clean_sentence = [expand_contractions(i, contractions_dict) for i in clean_sentence]
    if len(clean_sentence) == 0:
        pass
    else:
        clean_sentences.append(clean_sentence)

In [4]:
clean_sentences

[['the',
  'project',
  'gutenberg',
  'ebook',
  'of',
  'david',
  'copperfield',
  'by',
  'charles',
  'dickens'],
 [''],
 ['this',
  'ebook',
  'is',
  'for',
  'the',
  'use',
  'of',
  'anyone',
  'anywhere',
  'at',
  'no',
  'cost',
  'and',
  'with'],
 ['almost',
  'no',
  'restriction',
  'whatsoever',
  'you',
  'may',
  'copy',
  'it',
  'give',
  'it',
  'away',
  'or'],
 ['re use',
  'it',
  'under',
  'the',
  'term',
  'of',
  'the',
  'project',
  'gutenberg',
  'license',
  'included'],
 ['with', 'this', 'ebook', 'or', 'online', 'at', 'www gutenberg org'],
 [''],
 [''],
 ['title', 'david', 'copperfield'],
 [''],
 ['author', 'charles', 'dickens'],
 [''],
 ['release', 'date', 'december', '', 'etext', ''],
 ['posting', 'date', 'november', '', ''],
 ['last', 'updated', 'september', '', ''],
 [''],
 ['language', 'english'],
 [''],
 ['character', 'set', 'encoding', 'utf'],
 [''],
 ['',
  'start',
  'of',
  'this',
  'project',
  'gutenberg',
  'ebook',
  'david',
  'copper

In [5]:
# train model
vector_size = 200
model = Word2Vec(clean_sentences, vector_size=vector_size, window=5, min_count=1)

# summarize vocabulary
words = list(model.wv.index_to_key )
print("vocab", len(words))

#Word to vec dictionary
word2vec_ = {}
for word in words:
    word2vec_[word] = model.wv.get_vector(word)

# save model
model.save('model/model.bin')

# load model
new_model = Word2Vec.load('model/model.bin')
print(new_model)

vocab 16425
Word2Vec(vocab=16425, vector_size=200, alpha=0.025)


In [6]:
def generate_sentence_embedding(sentence, vector_size=300):
    """
    This is a function to take sentence, retrieve embedding for each words and return average embedding.  
    
    Parameters:
    ------------
        sentence: list
        vector_size: integer
    
    Returns:
    ------------
        vector: numpy array
    
    """
    
    sentence_embedding = []
    if len(sentence) == 0:
        return np.zeros(vector_size)
    else:
        for token in sentence:
            if token in model.wv.index_to_key:
                sentence_embedding.append(model.wv.get_vector(token))
            else:
                sentence_embedding.append(np.random.rand(vector_size))                
        return np.mean(sentence_embedding, axis=0)
    
generate_sentence_embedding(clean_sentences[0], vector_size)

array([ 1.02806330e-01, -1.28444344e-01, -1.95184901e-01,  1.56219557e-01,
        1.60848603e-01, -1.76008463e-01,  1.99486703e-01,  3.66925597e-01,
       -8.88615102e-02, -5.05533293e-02, -3.53404507e-02, -1.06646106e-01,
       -2.48278666e-04,  2.71814317e-01, -9.54377577e-02, -8.51596296e-02,
        1.23054972e-02,  4.50532809e-02, -3.57561335e-02, -3.81331176e-01,
        2.23134786e-01, -5.36635146e-02,  3.03363919e-01, -4.70579341e-02,
        1.28177881e-01,  7.09928339e-03, -7.80028477e-03, -1.71369359e-01,
       -2.21183032e-01, -5.23244441e-02, -7.12723564e-03,  3.51734087e-02,
        3.43220234e-01,  6.71256557e-02, -2.71753930e-02,  1.18188165e-01,
        3.94353628e-01,  8.55819210e-02, -1.26324907e-01, -1.17670655e-01,
       -1.56811789e-01, -4.02623266e-02, -1.36028202e-02, -2.09149215e-02,
        3.44245821e-01, -2.10875608e-02, -1.73863098e-01, -9.54576954e-02,
        2.55705237e-01,  2.39286378e-01, -6.15478754e-02, -2.01969936e-01,
       -2.83654749e-01, -

In [7]:
import pandas as pd
pd.DataFrame(clean_sentences)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,the,project,gutenberg,ebook,of,david,copperfield,by,charles,dickens,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,
2,this,ebook,is,for,the,use,of,anyone,anywhere,at,no,cost,and,with,,,,,
3,almost,no,restriction,whatsoever,you,may,copy,it,give,it,away,or,,,,,,,
4,re use,it,under,the,term,of,the,project,gutenberg,license,included,,,,,,,,
5,with,this,ebook,or,online,at,www gutenberg org,,,,,,,,,,,,
6,,,,,,,,,,,,,,,,,,,
7,,,,,,,,,,,,,,,,,,,
8,title,david,copperfield,,,,,,,,,,,,,,,,
9,,,,,,,,,,,,,,,,,,,


In [17]:
print(clean_sentences[400])

['thought mr', 'copperfield', 'thought it', 'wa', 'quite', 'a', 'large', 'rookery', 'but']


### Build word2vec from scratch using negative sampling and skip-gram