In [1]:
import gensim
import logging
import pandas as pd
from gensim.models import Word2Vec, KeyedVectors
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# Requires gensim==3.8: intersect_word2vec_format has a problem in gensim==4.0.

# Configure the root logger: timestamped records at INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

ModuleNotFoundError: No module named 'gensim'

TODO:
- Fine-tuning should use the whole corpus, i.e. both articles and comments.
- Should we remove stop words?

In [None]:
# Read the pickled comments table and preview its first five rows.
data_file = "../processed/comments.pkl"
df = pd.read_pickle(data_file)
df.head(n=5)

In [None]:
# Missing comments (NaN/None) are replaced with '' before the string cast;
# a bare astype('str') would turn them into the literal words 'nan'/'None',
# which would then be tokenized and counted as real vocabulary.
sentences = df['comment'].fillna('').astype('str').tolist()
print(len(sentences))
print(sentences[0])
# r'\w+' keeps runs of word characters, so punctuation is dropped.
tokenizer = RegexpTokenizer(r'\w+')
# Lower-case, then tokenize, in a single pass over the corpus.
sentences_tokenized = [tokenizer.tokenize(s.lower()) for s in sentences]

In [None]:
# Preview the first few tokenized comments. Slicing (instead of indexing with
# range(5)) also works when the corpus holds fewer than five comments.
for tokens in sentences_tokenized[:5]:
    print(tokens)

In [None]:
%%time
# processing with stop words and lemmatizing
lemmatize = True
remove_stop_word = False
if lemmatize:
    lemmatizer = WordNetLemmatizer()
if remove_stop_word:
    stop_words = set(stopwords.words('english'))
else:
    stop_words = None
sentences_step2 = []
for tokens in sentences_tokenized:
    if stop_words:
        tokens = [token for token in tokens if token not in stop_words]
    if lemmatize:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    sentences_step2.append(tokens)
print(sentences_step2[0:5])

In [None]:
# Create an untrained model with 300-dimensional vectors and build its
# vocabulary from the preprocessed comments (min_count=5).
# (To build from the un-lemmatized tokens, pass sentences_tokenized instead.)
model = Word2Vec(min_count=5, size=300)
model.build_vocab(sentences_step2)
# Corpus size as recorded by build_vocab; reused later by model.train().
total_examples = model.corpus_count

In [None]:
# Show the shape of the embedding matrix; per the runs recorded below this is
# (vocabulary size, 300).
print(model.wv.vectors.shape)   # model.wv.vectors == model.wv.syn0
# Optional spot checks, left disabled:
# print(model.wv['organic'])
# print(model.wv.index2word[100], model.wv.vectors[100])
# before_example = model.wv['why']
# print(before_example)

# Shapes recorded from earlier runs:
# (59861, 300) for min_count=1
# (19160, 300) for min_count=5
# (16675, 300) for min_count=5 after stop_words and lemmatize
# (16803, 300) for min_count=5 after lemmatize

In [None]:
# Sanity-check that the different lookup paths agree for one probe word.
# The row index is read from the vocabulary entry instead of being hard-coded,
# because a word's position changes whenever the corpus or the preprocessing
# changes.
probe_word = 'organic'
probe_index = model.wv.vocab[probe_word].index
print(model.wv.vocab[probe_word], probe_index)
print(model.wv.index2word[probe_index])
print((model.wv.word_vec(probe_word) == model.wv.vectors[probe_index]).all())
print((model.wv.get_vector(probe_word) == model.wv.word_vec(probe_word)).all())
print(list(model.wv.vocab)[:5])


In [None]:
# PCA should be run on the whole corpus?
def plot_pca_samples(model, word_list, fname=None):
    '''
    Scatter-plot selected words on the first two principal components.

    PCA is fitted on the model's entire embedding matrix (model.wv.vectors);
    only the rows belonging to word_list are then drawn and annotated.

    model: Word2Vec
    word_list: a list of words; each one must be in the model's vocabulary
    fname: optional path; when given, the figure is also saved there

    Raises KeyError if a word in word_list is not in the vocabulary.
    '''
    # Resolve the rows first so that an unknown word fails before an empty
    # figure has been created.
    idx = [model.wv.vocab[word].index for word in word_list]

    # A fixed random_state keeps the projection repeatable between calls in
    # case scikit-learn picks a randomized SVD solver for this matrix.
    pca = PCA(n_components=2, random_state=0)
    result = pca.fit_transform(model.wv.vectors)[idx]

    fig, ax = plt.subplots(figsize=(15, 15))
    ax.scatter(result[:, 0], result[:, 1])
    for i, word in enumerate(word_list):
        ax.annotate(word, xy=(result[i, 0], result[i, 1]))
    ax.set_xlabel('PC 1')
    ax.set_ylabel('PC 2')
    if fname:
        fig.savefig(fname)
    plt.show()

In [None]:
# Baseline plot: the embeddings as initialized, before pre-trained vectors
# are merged in or any training happens.
# A fixed slice of 50 vocabulary words is kept in draw_vocab so the later
# plots show the same words.
draw_vocab = list(model.wv.vocab.keys())[50:100]
print(draw_vocab)
plot_pca_samples(model, word_list=draw_vocab)

In [None]:
%%time
# model_pretrained = KeyedVectors.load_word2vec_format("../wv/GoogleNews-vectors-negative300.bin.gz", binary=True)
# model.build_vocab([list(model_pretrained.vocab.keys())], update=True)     # why size of vocab remains the same?
model.intersect_word2vec_format('../wv/GoogleNews-vectors-negative300.bin.gz', binary=True, lockf=1.0)
# pre-trained word2vec models can be downloaded from https://code.google.com/archive/p/word2vec/

In [None]:
# Same words as the baseline plot, now with the pre-trained vectors merged in,
# to compare against the freshly initialized embeddings.
plot_pca_samples(model, word_list=draw_vocab)

In [None]:
# Fine-tune for 20 epochs on sentences_step2, the same corpus the vocabulary
# was built from.
# TODO: use sentences_tokenized or sentences_step2 ???
# (alternative) model.train(sentences_tokenized, total_examples=total_examples, epochs=5)
model.train(
    sentences_step2,
    epochs=20,
    total_examples=total_examples,
)

In [None]:
# Same words again, after fine-tuning, to compare with the pre-trained plot;
# this time the figure is also written to disk.
# NOTE(review): in the model-file legend '_raw' means "not trained on the
# corpus", yet this figure is drawn after training -- confirm the file name.
plot_pca_samples(model, word_list=draw_vocab, fname='train_raw.png')

In [None]:
# Save the fine-tuned model. Change the file name to match the variant being
# saved; the string below is only a legend for the name suffixes and has no
# effect when the cell runs.
'''
_comment: text from comment
_raw: not trained on the corpus, pre-trained model
_trained: trained on the corpus
_stopword: without stopword
'''
# Exactly one of the following should be active, matching the switches used
# earlier in the notebook (stop-word removal, whether model.train() was run).
# model.save("../wv/w2v_comment_raw.model")
model.save("../wv/w2v_comment_trained.model")
# model.save("../wv/w2v_comment_stopword_raw.model")
# model.save("../wv/w2v_comment_stopword_trained.model")