## Question 5

In [1]:
import pandas as pd
import numpy as np
import re
import math
from sklearn.decomposition import TruncatedSVD
from gensim.models import word2vec

In [2]:
import nltk
from nltk.stem import WordNetLemmatizer
import sys
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.parse.malt import MaltParser
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/mengyuan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mengyuan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mengyuan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mengyuan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

- SVD to obtain word embedding

(a) Parse the reviews in "200Reviews.csv"

In [3]:
q5_rd = pd.read_csv("200Reviews.csv")
q5_df = q5_rd.drop(["Unnamed: 0", "id"], axis=1)
# remove <br /> of raw text
q5_df["review"] =  q5_df["review"].apply(lambda x: re.sub("<br />", " ", x))
# sentence segmentation
q5_df["sentence"] = q5_df["review"].apply(lambda x: nltk.sent_tokenize(x))
# word tokenization
q5_df["word"] = q5_df["sentence"].apply(lambda x: [nltk.word_tokenize(x[k]) for k in range(len(x))])
# parts of speech
q5_df["parts_of_speech"] = q5_df["word"].apply(lambda x: [nltk.pos_tag(x[k]) for k in range(len(x))])

In [4]:
# lemmatization
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''
    
    
def transfer(w):
    wordnettag = get_wordnet_pos(w[1])
    if wordnettag == '':
        lemmatizedword = wordnet_lemmatizer.lemmatize(w[0].lower())
    else:
        lemmatizedword = wordnet_lemmatizer.lemmatize(w[0].lower(),pos=wordnettag)
    if w[0].istitle():
        lemmatizedword = lemmatizedword.capitalize()
    elif w[0].upper()==w[0]:
        lemmatizedword = lemmatizedword.upper()
    else:
        lemmatizedword = lemmatizedword
    return lemmatizedword
    
    
wordnet_lemmatizer = WordNetLemmatizer()

q5_df["lemmatization"] = q5_df["parts_of_speech"].apply(lambda x: [[transfer(w) for w in x[k]] for k in range(len(x))])

In [5]:
# remove stop word
stopWords = set(stopwords.words('english'))
q5_df["no_stop_word"] = q5_df["lemmatization"].apply(lambda x: [[w for w in x[k] if w.lower() not in stopWords] for k in range(len(x))])
# filter out not alpha
q5_df["no_stop_word_isalpha"] = q5_df["no_stop_word"].apply(lambda x: [[w for w in x[k] if w.isalpha()] for k in range(len(x))])

In [6]:
q5_df.head()

Unnamed: 0,sentiment,review,sentence,word,parts_of_speech,lemmatization,no_stop_word,no_stop_word_isalpha
0,1,"""With all this stuff going down at the moment ...","[""With all this stuff going down at the moment...","[[``, With, all, this, stuff, going, down, at,...","[[(``, ``), (With, IN), (all, DT), (this, DT),...","[[``, With, all, this, stuff, go, down, at, th...","[[``, stuff, go, moment, MJ, 've, start, liste...","[[stuff, go, moment, MJ, start, listen, music,..."
1,1,"""\""The Classic War of the Worlds\"" by Timothy ...","[""\""The Classic War of the Worlds\"" by Timothy...","[[``, \, '', The, Classic, War, of, the, World...","[[(``, ``), (\, NN), ('', ''), (The, DT), (Cla...","[[``, \, '', The, Classic, War, of, the, World...","[[``, \, '', Classic, War, Worlds\, '', Timoth...","[[Classic, War, Timothy, Hines, entertaining, ..."
2,0,"""The film starts with a manager (Nicholas Bell...","[""The film starts with a manager (Nicholas Bel...","[[``, The, film, starts, with, a, manager, (, ...","[[(``, ``), (The, DT), (film, NN), (starts, VB...","[[``, The, film, start, with, a, manager, (, N...","[[``, film, start, manager, (, Nicholas, Bell,...","[[film, start, manager, Nicholas, Bell, give, ..."
3,0,"""It must be assumed that those who praised thi...","[""It must be assumed that those who praised th...","[[``, It, must, be, assumed, that, those, who,...","[[(``, ``), (It, PRP), (must, MD), (be, VB), (...","[[``, It, must, be, assume, that, those, who, ...","[[``, must, assume, praise, film, (, \, '', gr...","[[must, assume, praise, film, great, filmed, o..."
4,1,"""Superbly trashy and wondrously unpretentious ...","[""Superbly trashy and wondrously unpretentious...","[[``, Superbly, trashy, and, wondrously, unpre...","[[(``, ``), (Superbly, RB), (trashy, JJ), (and...","[[``, Superbly, trashy, and, wondrously, unpre...","[[``, Superbly, trashy, wondrously, unpretenti...","[[Superbly, trashy, wondrously, unpretentious,..."


(b) Create co-occurance matrix for all remaining words

In [7]:
def co_matix(corpus, unique_word, window_left_right):
    res = np.zeros((len(unique_word), len(unique_word)))
    for i in range(len(corpus)):
        if i - window_left_right < 0:
            for j in range(i + window_left_right):
                if i != j:
                    res[unique_word.index(corpus[i])][unique_word.index(corpus[j])] += 1
        elif i + window_left_right > len(corpus)-1:
            for j in range(i - window_left_right, len(corpus)):
                if i != j:
                    res[unique_word.index(corpus[i])][unique_word.index(corpus[j])] += 1
        else:
            for j in range(i - window_left_right, i + window_left_right):
                if i != j:
                    res[unique_word.index(corpus[i])][unique_word.index(corpus[j])] += 1
    return res

In [8]:
# get corpus
corpus = []
for i in q5_df["no_stop_word_isalpha"].values:
    for j in i:
        corpus.extend(j)
        
# corpus

In [9]:
len(corpus)

22174

In [10]:
# unique words of corpus
unique_word = list(set(corpus))
# get co-occurance matrix for all remaining words with window size equals to 5
comatrix = co_matix(corpus, unique_word, 5)

(c) Apply SVD and obtain word embeddings of size 100

In [24]:
n = 100
svd = TruncatedSVD(n_components=n)
U_sigma_trunc = svd.fit_transform(comatrix)

In [25]:
U_sigma_trunc.shape

(6309, 100)

In [11]:
# result very strange if using u[:, [...100]] as word embedding
u, s, vh = np.linalg.svd(comatrix)
u_100 = u[:, [i for i in range(100)]]
print('Word embedding of size 100\n', u[:, [i for i in range(100)]]) # 100-d
print('Variance along multiple dimensions (a.k.a. singular values) are',s)

Word embedding of size 100
 [[-3.28890728e-03  2.40686586e-03  2.55713093e-03 ... -7.42758964e-03
  -2.01360065e-02  1.18918214e-02]
 [-9.34052394e-04 -1.68790609e-03 -1.24143145e-03 ...  8.75562048e-03
  -7.33235196e-03 -7.66339901e-03]
 [-1.96255008e-03 -6.03827509e-03  2.73098486e-03 ... -1.25629275e-02
  -2.27103072e-03 -1.66592747e-02]
 ...
 [-1.00509221e-03 -2.37027010e-03 -2.59928759e-03 ...  8.15989654e-03
   5.03173176e-03  4.34367231e-03]
 [-2.64913717e-04  1.20950646e-04 -2.61093839e-04 ...  3.29193952e-03
   2.44940134e-04 -4.64032485e-05]
 [-7.55955495e-04 -1.08285398e-04 -2.78799052e-03 ... -1.28230287e-03
   6.16774724e-04 -2.72557732e-03]]
Variance along multiple dimensions (a.k.a. singular values) are [3.90681189e+02 1.00904670e+02 9.06125119e+01 ... 1.94520435e-03
 3.20897453e-04 1.93486336e-05]


- Word2vec to obtain word embedding

In [16]:
sentences = []
for i in q5_df["no_stop_word_isalpha"].values:
    for j in i:
        sentences.append(j)

# sentences

In [17]:
# Creating the model and setting values for the various parameters
num_features = 100  # Word vector dimensionality
min_word_count = 1 # Minimum word count
num_workers = 4     # Number of parallel threads
context = 5        # Context window size
downsampling = 1e-3 # (0.001) Downsample setting for frequent words

# Initializing the train model
print("Training model....")
model = word2vec.Word2Vec(sentences,\
                          workers=num_workers,\
                          size=num_features,\
                          min_count=min_word_count,\
                          window=context,
                          sample=downsampling)

# To make the model memory efficient
model.init_sims(replace=True)

# Saving the model for later use. Can be loaded using Word2Vec.load()
model_name = "q5_word2vec"
model_path = f"{model_name}"
model.save(model_path)
print("model saved")

Training model....
model saved


In [18]:
model = word2vec.Word2Vec.load(model_path)
model.wv.doesnt_match("there are many average".split()) 

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'many'

In [19]:
model.wv.vocab

{'stuff': <gensim.models.keyedvectors.Vocab at 0x7f8c8a5be9d0>,
 'go': <gensim.models.keyedvectors.Vocab at 0x7f8c6627fc10>,
 'moment': <gensim.models.keyedvectors.Vocab at 0x7f8c6627f690>,
 'MJ': <gensim.models.keyedvectors.Vocab at 0x7f8c6627f750>,
 'start': <gensim.models.keyedvectors.Vocab at 0x7f8c6627f790>,
 'listen': <gensim.models.keyedvectors.Vocab at 0x7f8c6627f850>,
 'music': <gensim.models.keyedvectors.Vocab at 0x7f8c5cabbb10>,
 'watch': <gensim.models.keyedvectors.Vocab at 0x7f8c5cabbed0>,
 'odd': <gensim.models.keyedvectors.Vocab at 0x7f8c5cabbf50>,
 'documentary': <gensim.models.keyedvectors.Vocab at 0x7f8c5cabbc90>,
 'Wiz': <gensim.models.keyedvectors.Vocab at 0x7f8c5cabbc50>,
 'Moonwalker': <gensim.models.keyedvectors.Vocab at 0x7f8c5cabbd90>,
 'Maybe': <gensim.models.keyedvectors.Vocab at 0x7f8c5cabbbd0>,
 'want': <gensim.models.keyedvectors.Vocab at 0x7f8c5cabbe90>,
 'get': <gensim.models.keyedvectors.Vocab at 0x7f8c5cabbcd0>,
 'certain': <gensim.models.keyedvectors.

- Compare two methods

In [20]:
# calculate similarity for SVD
def most_similar_words(word, topn, word_matrix, word_to_id, id_to_word):
    row = word_to_id[word]
    vec = word_matrix[row,:]
    m = word_matrix
    dot_m_v = m.dot(vec.T) # vector
    dot_m_m = np.sum(m * m, axis=1) # vector
    dot_v_v = vec.dot(vec.T) # float
    sims = dot_m_v / (math.sqrt(dot_v_v) * np.sqrt(dot_m_m))
    
    return [(id_to_word[id], round(sims[id],4)) for id in (-sims).argsort()[1:topn+1]]

In [21]:
# Build unigram <-> index lookup.
word_to_id, id_to_word = {}, {}
for i, x in enumerate(unique_word):
    word_to_id[x] = i
    id_to_word[i] = x

In [24]:
# comparison between two methods
def compare(word, w2v_model, topn, word_matrix, word_to_id, id_to_word):
    print(f"Most similar words to '{word}':")
    print()
    print("SVD: ", most_similar_words(word, 10, word_matrix, word_to_id, id_to_word))
    print()
    print("Word2Vec: ", [(i, round(j,4)) for i, j in model.wv.most_similar(word)])

In [58]:
compare("go", model, 10, U_sigma_trunc, word_to_id, id_to_word)

Most similar words to 'go':

SVD:  [('one', 0.7156), ('good', 0.7121), ('make', 0.7085), ('see', 0.6987), ('beginning', 0.6966), ('like', 0.6927), ('watch', 0.6863), ('idea', 0.6841), ('bad', 0.6814), ('really', 0.6789)]

Word2Vec:  [('movie', 0.8942), ('one', 0.8888), ('film', 0.8825), ('get', 0.8754), ('scene', 0.8594), ('take', 0.8564), ('like', 0.8554), ('make', 0.8506), ('even', 0.8486), ('character', 0.8463)]


In [59]:
compare("film", model, 10, U_sigma_trunc, word_to_id, id_to_word)

Most similar words to 'film':

SVD:  [('movie', 0.774), ('one', 0.7732), ('good', 0.745), ('bad', 0.742), ('make', 0.7345), ('something', 0.7199), ('understand', 0.7198), ('actually', 0.7191), ('even', 0.7159), ('really', 0.7124)]

Word2Vec:  [('movie', 0.9024), ('go', 0.8825), ('make', 0.8808), ('like', 0.8799), ('scene', 0.8799), ('get', 0.8731), ('one', 0.8729), ('play', 0.8583), ('time', 0.8578), ('actor', 0.8577)]


In [60]:
compare("actor", model, 10, U_sigma_trunc, word_to_id, id_to_word)

Most similar words to 'actor':

SVD:  [('good', 0.6758), ('one', 0.6706), ('make', 0.6686), ('film', 0.6627), ('see', 0.6572), ('movie', 0.6559), ('watch', 0.6542), ('idea', 0.6463), ('bad', 0.6452), ('really', 0.6441)]

Word2Vec:  [('film', 0.8577), ('go', 0.8431), ('first', 0.835), ('movie', 0.8342), ('scene', 0.8253), ('one', 0.817), ('take', 0.8119), ('make', 0.8109), ('much', 0.8084), ('get', 0.8079)]


In [61]:
compare("character", model, 10, U_sigma_trunc, word_to_id, id_to_word)

Most similar words to 'character':

SVD:  [('one', 0.6141), ('story', 0.6141), ('film', 0.6027), ('like', 0.5901), ('see', 0.5842), ('mean', 0.5836), ('write', 0.5751), ('never', 0.5745), ('really', 0.5739), ('make', 0.573)]

Word2Vec:  [('get', 0.8582), ('much', 0.8535), ('go', 0.8463), ('like', 0.8425), ('way', 0.8294), ('take', 0.8277), ('give', 0.8276), ('look', 0.8272), ('play', 0.8262), ('movie', 0.8245)]


In [62]:
compare("love", model, 10, U_sigma_trunc, word_to_id, id_to_word)

Most similar words to 'love':

SVD:  [('fall', 0.6341), ('truly', 0.6212), ('home', 0.6034), ('highly', 0.5986), ('one', 0.5967), ('many', 0.5876), ('One', 0.587), ('whole', 0.5855), ('movie', 0.5793), ('away', 0.5762)]

Word2Vec:  [('movie', 0.802), ('film', 0.7951), ('scene', 0.7766), ('go', 0.7361), ('actor', 0.734), ('like', 0.728), ('get', 0.7257), ('make', 0.7196), ('seem', 0.7172), ('people', 0.7168)]


In [63]:
compare("story", model, 10, U_sigma_trunc, word_to_id, id_to_word)

Most similar words to 'story':

SVD:  [('write', 0.7389), ('film', 0.7005), ('stupid', 0.6551), ('overall', 0.6535), ('one', 0.6521), ('make', 0.6489), ('song', 0.6447), ('understand', 0.6349), ('see', 0.63), ('may', 0.6293)]

Word2Vec:  [('take', 0.7848), ('movie', 0.7773), ('one', 0.7772), ('film', 0.7694), ('get', 0.7632), ('play', 0.7629), ('go', 0.7607), ('come', 0.7547), ('character', 0.7527), ('way', 0.7473)]


In [64]:
compare("good", model, 10, U_sigma_trunc, word_to_id, id_to_word)

Most similar words to 'good':

SVD:  [('bad', 0.8227), ('make', 0.8078), ('movie', 0.7922), ('one', 0.7799), ('definitely', 0.7726), ('watch', 0.7718), ('see', 0.7716), ('thing', 0.7712), ('overall', 0.7706), ('really', 0.7685)]

Word2Vec:  [('film', 0.8374), ('give', 0.8294), ('movie', 0.8107), ('make', 0.8107), ('even', 0.8035), ('like', 0.8023), ('first', 0.7998), ('go', 0.7988), ('try', 0.798), ('play', 0.797)]


In [65]:
compare("happy", model, 10, U_sigma_trunc, word_to_id, id_to_word)

Most similar words to 'happy':

SVD:  [('pacing', 0.7168), ('Man', 0.6879), ('sale', 0.6829), ('MERK', 0.6769), ('research', 0.6709), ('history', 0.6648), ('Moon', 0.6537), ('Branagh', 0.6533), ('Try', 0.651), ('video', 0.6502)]

Word2Vec:  [('Wrestlemania', 0.3519), ('depressive', 0.3431), ('sparse', 0.3208), ('inexplicable', 0.3197), ('channel', 0.3153), ('Ulli', 0.3044), ('joint', 0.2995), ('Amuse', 0.2988), ('flee', 0.2972), ('unexplained', 0.2862)]


In [66]:
compare("Man", model, 10, U_sigma_trunc, word_to_id, id_to_word)

Most similar words to 'Man':

SVD:  [('see', 0.7756), ('favorite', 0.7608), ('watch', 0.7506), ('must', 0.7409), ('Moon', 0.7404), ('superior', 0.7351), ('far', 0.731), ('among', 0.7255), ('numerous', 0.7207), ('five', 0.7195)]

Word2Vec:  [('defeat', 0.3451), ('CID', 0.3369), ('wit', 0.3296), ('Rather', 0.324), ('male', 0.3233), ('futuristic', 0.319), ('enter', 0.3147), ('exact', 0.3086), ('people', 0.3072), ('bring', 0.3043)]


In [32]:
# using u from linalg.svd
compare("film", model, 10, u_100, word_to_id, id_to_word)

Most similar words to 'film':

SVD:  [('Thelma', 0.1579), ('Louise', 0.1561), ('Expressionist', 0.151), ('Josef', 0.1486), ('roller', 0.1386), ('Werching', 0.133), ('Office', 0.13), ('Dalmations', 0.1295), ('Hickox', 0.1148), ('commend', 0.1142)]

Word2Vec:  [('movie', 0.9143), ('go', 0.8918), ('make', 0.8886), ('one', 0.8848), ('character', 0.8837), ('get', 0.8757), ('actor', 0.8722), ('take', 0.8712), ('time', 0.8706), ('scene', 0.8678)]
