# Word vector and PCA analysis of tokens

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append('pymodules')
# Gensim
from gensim.models import Word2Vec
# making the plot look good ...
from adjustText import adjust_text
from sklearn.decomposition import PCA
# this module reads the raw input and tokenizes it comprehensively for use with modeling
import pymodules.read_and_tokenize as contacts_utils




### Read file and preprocess to generate tokens

In [2]:
# Path to the Excel workbook with the review data, relative to the notebook's working directory.
filename = "data/Master-data_Q42021.xlsx"
# read_file returns two values: the pre-processed comments object (its .tokens and
# .bigrams attributes are used in later cells) and `df`, presumably a pandas
# DataFrame of the retained columns — confirm against pymodules/read_and_tokenize.
prep_comments, df = contacts_utils.read_file(filename)

Read sheet 'Scrubbed_data' ...
Columns:Index(['TITLE', 'COMMENTS', 'OVERALL_RATING', 'COMFORT_RATING',
       'VISION_RATING', 'VALUE_FOR_MONEY', 'AUTHOR', 'PROS', 'CONS',
       'ORIGINAL_SOURCE', 'REPLY_FROM_ACCUVUE', 'FINAL_PRODUCT_NAME',
       'PRODUCT_LINK', 'WEBSITE', 'RATING', 'PRODUCT', 'BRAND'],
      dtype='object')
Columns dropped: ['OVERALL_RATING', 'COMFORT_RATING', 'VISION_RATING', 'VALUE_FOR_MONEY', 'PROS', 'CONS', 'ORIGINAL_SOURCE', 'REPLY_FROM_ACCUVUE', 'PRODUCT_LINK', 'WEBSITE']
 Drop the Author column and replace it with gender of author ...
Consolidate all the comments into one column called COMMENT
Make ratings into integers
Tokenize data based on regex found from experimentation and common usage ...
Comments before tokenization at index[0]:
 Acucue 2 Contact Lenses I have used these lenses for a long time and I have to say that the service from Lens.com is great and the lenses work great for my needs!  I highly recommend them!
Comments after tokenization at index

### Add bigrams to the word tokens so that sentiments are expressed better by word tokens and word-pairs

In [3]:
# Toggle: when True, every document's token list is extended with its bigrams.
require_bigrams = True
if require_bigrams:
    # The two lists must describe the same documents, position by position.
    assert len(prep_comments.tokens) == len(prep_comments.bigrams), \
        "tokens and bigrams must have one entry per comment"
    for i, (unigrams, bigrams) in enumerate(zip(prep_comments.tokens, prep_comments.bigrams)):
        # The merge mutates prep_comments.tokens in place, so re-running this cell
        # used to append the bigrams a second (third, ...) time. Skip documents whose
        # token list already ends with its bigrams to keep the cell idempotent.
        already_merged = bool(bigrams) and unigrams[-len(bigrams):] == bigrams
        if not already_merged:
            prep_comments.tokens[i] = unigrams + bigrams

# Show the first and the last document as a quick visual check of the merge.
test_index = 0
print(f"Comments at index[{test_index}] after addition of bigrams:\n {prep_comments.tokens[test_index]}")
print(f"Comments at index[{-1}] after addition of bigrams:\n {prep_comments.tokens[-1]}")

Comments at index[0] after addition of bigrams:
 ['used', 'for', 'long', 'time', 'and', 'say', 'service', 'from', 'com', 'great', 'and', 'work', 'great', 'for', 'needs', 'highly', 'recommend', 'used-for', 'for-long', 'long-time', 'time-and', 'and-say', 'say-service', 'service-from', 'from-com', 'com-great', 'great-and', 'and-work', 'work-great', 'great-for', 'for-needs', 'needs-highly', 'highly-recommend']
Comments at index[-1] after addition of bigrams:
 ['buy', 'again', 'order', 'came', 'fast', 'without', 'any', 'issues', 'and', 'candy', 'nice', 'touch', 'buy-again', 'again-order', 'order-came', 'came-fast', 'fast-without', 'without-any', 'any-issues', 'issues-and', 'and-candy', 'candy-nice', 'nice-touch']


In [4]:
# Store each comment's token list in a new TOKENS column of df.
# assumes prep_comments.tokens has exactly one entry per row of df, in row order — TODO confirm
df['TOKENS'] = prep_comments.tokens

### Train gensim model to generate word embeddings
* Word embeddings vector is of size 100
* It is based on universal dictionary
* Each word/token is now expressed as a vector of 100 arbitrary, deterministic features, i.e. each word is embedded in the vector space $\mathbb{R}^{100}$

In [5]:
# train Gensim's Word2Vec model
# NOTE: a fixed `seed` only gives a reproducible run when training uses a single
# worker thread; with several workers the OS thread scheduling changes the order
# of updates. Across interpreter launches PYTHONHASHSEED must be fixed as well.
gensim_model = Word2Vec(sentences=prep_comments.tokens,      # corpus
                        vector_size=100,            # embedding dimension
                        window=4,                   # words before and after to take into consideration
                        sg=1,                       # use skip-gram
                        negative=5,                 # number of negative examples for each positive one
                        alpha=0.025,                # initial learning rate
                        min_alpha=0.0001,           # minimum learning rate
                        epochs=10,                  # number of passes through the data
                        min_count=1,                # words that appear fewer times than this are removed
                        workers=1,                  # we use 1 to ensure replicability
                        seed=92                     # for replicability
                        )

### Do some gensim validation to ensure that word embeddings have been generated

In [6]:
# extract the word embeddings from the model
# `word_vectors` is reused by later cells for lookups and for the PCA plot.
word_vectors = gensim_model.wv
# Display the shape of the embedding matrix as the cell output.
word_vectors.vectors.shape  # vocab_size x embeddings dimension

(33417, 100)

In [7]:
# Sanity check on the embedding matrix: rows are vocabulary entries, columns are embedding dimensions.
word_vectors_weights = gensim_model.wv.vectors
vocab_size, embedding_size = word_vectors_weights.shape
print(f"Vocabulary Size: {vocab_size} - Embedding Dim: {embedding_size}")

Vocabulary Size: 33417 - Embedding Dim: 100


In [8]:
# Some validation on the quality of the Word2Vec model
# Nearest neighbours of a few domain words: related terms should come back.
for query in ('product', 'price', 'service', 'quality'):
    print(gensim_model.wv.most_similar(query, topn=3))
# Vector arithmetic check: tokens close to 'comfort' + 'fit' - 'dry'.
print(gensim_model.wv.most_similar(positive=['comfort', 'fit'], negative=['dry'], topn=3))

def word2token(word):
    """Return the integer vocabulary index of ``word`` in the trained model.

    Words that are not in the vocabulary map to ``0``.
    NOTE(review): 0 is presumably also the index of a real vocabulary entry,
    so callers cannot tell an unknown word from that entry — confirm this
    fallback is intended.
    """
    return gensim_model.wv.key_to_index.get(word, 0)

def token2word(token):
    """Return the vocabulary word stored at integer index ``token``."""
    vocabulary = gensim_model.wv.index_to_key
    return vocabulary[token]

[('oasys-excellent', 0.8602176904678345), ('item', 0.852508544921875), ('complaints', 0.8524138927459717)]
[('reasonable', 0.8703557252883911), ('inexpensive', 0.8693474531173706), ('competition', 0.8648508787155151)]
[('prompt', 0.9417879581451416), ('support', 0.928848385810852), ('code', 0.9202567338943481)]
[('inexpensive', 0.9133619666099548), ('high', 0.8889140486717224), ('consistent', 0.8813645839691162)]
[('comfort-and', 0.8005489110946655), ('and-comfort', 0.7833516001701355), ('clarity-and', 0.7733805179595947)]


### Encode word embeddings
* Test key to index for word vectors, so we can go back and forth between word and its embedding

In [9]:
# Go from the word to its vocabulary index, then from the index to its embedding.
dry_index = word_vectors.key_to_index['dry']
word_vectors[dry_index]

array([-0.6087963 ,  0.83213675, -0.01774994,  1.0955026 , -0.709372  ,
        0.28719607, -0.17731366,  0.2512339 , -0.42672557,  0.04003854,
        0.8356113 ,  0.35474655,  0.18324469, -0.5902421 , -0.35245588,
       -0.6147679 , -0.09731842, -0.3747716 ,  0.7382623 , -1.1122829 ,
        0.86704224, -0.5590537 , -0.02057269,  0.43406862,  0.09812734,
       -0.08496324,  0.37695116,  0.3626994 , -0.07350893,  0.3663185 ,
        0.7480082 ,  0.44878685, -0.9531721 ,  0.50114197,  0.4475289 ,
        0.43834916,  0.4913415 , -0.1794017 ,  0.81263703, -0.29967096,
        0.384822  ,  0.07385941, -0.30465424, -0.06272677,  0.16529787,
        0.58321077,  0.30905285,  0.09786947, -0.37936872, -0.07588644,
        0.7403388 ,  0.14558838,  0.06595115, -0.14408989, -0.6099965 ,
        0.00377616, -0.24677503,  1.0769644 ,  0.8690996 , -0.5703846 ,
       -0.976352  ,  1.3870922 ,  0.2050694 ,  0.85799634,  0.1062097 ,
       -0.12252022,  0.5408719 , -0.74582046,  0.12647001, -1.15

### Plot the scatter matrix of word embeddings to see relative distance of words

In [10]:
# use a PCA decomposition to visualize the embeddings in 2D
def pca_scatterplot(model, words):
    """Plot the embeddings of ``words`` projected onto two principal components.

    Parameters
    ----------
    model : mapping-like
        Object supporting ``word in model`` and ``model[word]``, where the
        lookup returns the embedding vector of the word (e.g. the
        ``word_vectors`` object used in this notebook).
    words : iterable of str
        Tokens to plot. Tokens that are not in ``model`` are reported and
        skipped, so one unknown token no longer aborts the whole plot with a
        ``KeyError``.

    Raises
    ------
    ValueError
        If fewer than two of ``words`` are present in ``model``; a
        two-component PCA cannot be fitted on fewer than two samples.
    """
    # Materialise once so an iterator/generator is not exhausted before labelling.
    words = list(words)
    known_words = [w for w in words if w in model]
    missing_words = [w for w in words if w not in model]
    if missing_words:
        print(f"Skipping {len(missing_words)} token(s) not in the vocabulary: {missing_words[:10]}")
    if len(known_words) < 2:
        raise ValueError(
            f"need at least 2 in-vocabulary words for a 2-component PCA, got {len(known_words)}"
        )

    # Fixed random_state keeps the projection identical between runs.
    pca = PCA(n_components=2, random_state=92)
    embeddings = np.array([model[w] for w in known_words])
    low_dim_emb = pca.fit_transform(embeddings)
    plt.figure(figsize=(21,10))
    plt.scatter(low_dim_emb[:,0], low_dim_emb[:,1], edgecolors='blue', c='blue')
    plt.title("PCA projection of word embeddings")
    plt.xlabel("Component 1")
    plt.ylabel("Component 2")

    # get the text of the plotted words (slightly offset from the marker)
    texts = []
    for word, (x,y) in zip(known_words, low_dim_emb):
        texts.append(plt.text(x+0.01, y+0.01, word, rotation=0))

    # adjust the position of the labels so that they dont overlap
    adjust_text(texts)
    # show plot
    plt.show()

In [None]:
# define the tokens to use in the plot
tokens_of_interest = ['dryer', 'usual', 'service', 'great-service',  'shelf', 'awhile', 'disappointed']
print(tokens_of_interest)
# plot
# To plot only the hand-picked tokens instead, call:
#   pca_scatterplot(word_vectors, tokens_of_interest)
all_tokens = prep_comments.tokens
import itertools
flat_list_tokens = list(itertools.chain.from_iterable(all_tokens))
# De-duplicate while keeping first-seen order. list(set(...)) orders strings by
# their randomised hashes, so the slice below selected a different subset of
# tokens on every kernel start and the plot was not reproducible.
all_tks = list(dict.fromkeys(flat_list_tokens))
# Cap the number of plotted tokens so the labels stay readable.
MAX_TOKENS_TO_PLOT = 1000
pca_scatterplot(word_vectors, all_tks[:MAX_TOKENS_TO_PLOT])

['dryer', 'usual', 'service', 'great-service', 'shelf', 'awhile', 'disappointed']
