<a href="https://colab.research.google.com/github/nmcphers128/DS5001_FINALProj_nhm5as/blob/main/FinalProj_M09_03_word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Module 9: Word2Vec -- Final Project: Word Embeddings from a Corpus of Sci-Fi Novels (Neil McPherson)

* DS 5001
* Adapted from the original notebook by Raf Alvarado; modified for the final project by Neil McPherson

We train word embeddings on a corpus of novels using word2vec and visualize the results with t-SNE.

# Set Up

## Configuration

In [None]:
# Index levels of the token table, ordered from book down to individual token.
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']
# Grouping level that defines one training "document" for word2vec.
# NOTE(review): OHCO[:2] is ['book_id', 'chap_num'], i.e. one bag per chapter;
# paragraph-level bags would be OHCO[:3] — confirm which was intended.
BAG = OHCO[:2] # Chapters (book_id, chap_num)
# BAG = OHCO[:4] # Sentences (OHCO[:5] would put every token in its own bag)
# Context window size handed to Word2Vec.
window = 5
# Folder (on the mounted Google Drive) that holds the input CSV files.
input_dir = "/content/drive/MyDrive/DS5001_finalProj/outputFiles/"

In [None]:
# Google Drive access (Colab only).
# NOTE(review): input_dir points under /content/drive, so reading the token CSV
# presumably requires Drive to be mounted; the mount call below is commented out —
# uncomment it when running on a fresh runtime.

from google.colab import drive
#drive.mount('/content/drive')


## Imports

In [None]:
import pandas as pd
import numpy as np
from gensim.models import word2vec
from sklearn.manifold import TSNE
#!pip install plotly_express
import plotly_express as px

In [None]:
# Render matplotlib figures inline.
# NOTE(review): the plots visible in this notebook section use plotly, not matplotlib — confirm this magic is still needed.
%matplotlib inline

# Process

## Import TOKENS and convert to a corpus for Gensim

We import data from the TOKEN table of the novels corpus, excluding proper nouns.

In [None]:
# Load the token table and index it by the OHCO levels.
TOKENS = (
    pd.read_csv(input_dir + 'TOKEN2.csv')
    .set_index(keys=OHCO)
)

In [None]:
TOKENS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,term_id
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
201,1,1,0,0,"('I', 'PRP')",PRP,I,i,13814
201,1,1,0,1,"('call', 'VBP')",VBP,call,call,4124
201,1,1,0,2,"('our', 'PRP$')",PRP$,our,our,19395
201,1,1,0,3,"('world', 'NN')",NN,world,world,31242
201,1,1,0,4,"('Flatland,', 'NNP')",NNP,"Flatland,",flatland,10824


In [None]:
# Build the training corpus for gensim: one list of terms per BAG group.
#  * na=False: a missing POS tag counts as "not a proper noun" instead of producing NaN
#    in the mask (which would break the ~ negation).
#  * dropna: terms that are missing in the CSV (read back as NaN) would otherwise be
#    handed to word2vec as float tokens.
corpus = (
    TOKENS[~TOKENS.pos.str.match('NNPS?', na=False)]  # drop NNP / NNPS (proper nouns)
    .dropna(subset=['term_str'])
    .groupby(BAG)
    .term_str.agg(list)
    .tolist()
)

In [None]:
#corpus

## Generate word embeddings with Gensim's library

In [None]:
# Train the word2vec model on the bag-level term lists.
# NOTE(review): vector_size=246 and min_count=200 are hand-picked values — confirm they are intentional.
model = word2vec.Word2Vec(
    sentences=corpus,
    vector_size=246,
    window=window,      # context window from the Configuration cell
    min_count=200,
    workers=4,
)

## Visualize with tSNE

### Generate coordinates to plot

In [None]:
# Vocabulary terms in the model's own index order; index_to_key is used because
# (per the original author's note) the attribute previously used for this was removed.
words = [*model.wv.index_to_key]
# Empty frame with one row per vocabulary term; columns are added in the next cells.
coords = pd.DataFrame(index=pd.RangeIndex(len(words)))


In [None]:
# One row per vocabulary term: the term itself and its embedding vector.
coords['label'] = list(words)
coords['vector'] = coords['label'].map(model.wv.get_vector)

In [None]:
coords.head()

Unnamed: 0,label,vector
0,the,"[0.09223459, 0.23895992, 0.43558276, -0.125412..."
1,and,"[-0.19047944, 0.16385114, 0.013735715, -0.0535..."
2,of,"[-0.19079347, 0.29703873, 0.16622424, -0.12320..."
3,to,"[-0.72444695, -0.40828565, 0.2088216, -0.00032..."
4,a,"[-0.1588598, -0.33320782, -0.6935593, -0.27226..."


In [None]:
# Collect the per-term vectors into a plain Python list.
X = coords['vector'].to_list()

In [None]:
import numpy as np
# Convert the list of vectors to a single array for t-SNE
# (presumably shape (n_terms, vector_size) — TODO confirm).
X = np.asarray(X)

### Use scikit-learn's TSNE class

In [None]:
# scikit-learn renamed TSNE's `n_iter` argument to `max_iter` (1.5) and later removed the
# old name, so pick whichever keyword the installed version accepts.
tsne_iter_kw = 'max_iter' if 'max_iter' in TSNE().get_params() else 'n_iter'
tsne_model = TSNE(
    perplexity=40,
    n_components=2,     # two output columns, used as x / y for the scatter plot
    init='pca',
    random_state=23,    # fixed seed so the layout is repeatable
    **{tsne_iter_kw: 2500},
)


In [None]:
# Fit t-SNE on the embedding matrix; columns 0 and 1 of the result become x and y below.
tsne_values = tsne_model.fit_transform(X)

In [None]:
# Store the two t-SNE output columns as plotting coordinates.
coords['x'], coords['y'] = tsne_values[:, 0], tsne_values[:, 1]

In [None]:
coords.head()

Unnamed: 0,label,vector,x,y
0,the,"[0.09223459, 0.23895992, 0.43558276, -0.125412...",-9.934068,-6.38476
1,and,"[-0.19047944, 0.16385114, 0.013735715, -0.0535...",4.000057,6.383457
2,of,"[-0.19079347, 0.29703873, 0.16622424, -0.12320...",-9.777105,-0.03844
3,to,"[-0.72444695, -0.40828565, 0.2088216, -0.00032...",-5.506907,11.255379
4,a,"[-0.1588598, -0.33320782, -0.6935593, -0.27226...",-5.289559,-1.910845


### Plot the coordinates

In [None]:
# Text-only scatter: each term is drawn as its label at its t-SNE position.
(
    px.scatter(data_frame=coords, x='x', y='y', text='label', height=1000)
    .update_traces(mode='text')
)

# Semantic Algebra

## Analogy

$A : B :: C : D? \rightarrow B - A + C = D$


In [None]:

def complete_analogy(A, B, C, n=2, wv=None):
    """Complete the analogy ``A : B :: C : ?`` using vector arithmetic (B - A + C).

    Parameters
    ----------
    A, B, C : str
        Terms of the analogy; B and C are added, A is subtracted.
    n : int, default 2
        Number of candidates to return. It is forwarded as ``topn`` so that
        requests larger than the library's default result size are honoured
        (slicing the default result silently capped the answer).
    wv : optional
        Object exposing ``most_similar(positive=..., negative=..., topn=...)``.
        Defaults to the notebook-level ``model.wv``.

    Returns
    -------
    list of (term, similarity) tuples, or ``None`` if a term is not in the
    vocabulary (the ``KeyError`` message is printed instead of raised so the
    notebook keeps running).
    """
    if wv is None:
        # Fall back to the model trained earlier in the notebook.
        wv = model.wv
    try:
        return wv.most_similar(positive=[B, C], negative=[A], topn=n)
    except KeyError as e:
        print('Error:', e)
        return None

In [None]:
# man : boy :: woman : ?
complete_analogy(A='man', B='boy', C='woman')

[('dear', 0.8591245412826538), ('friend', 0.7405809760093689)]

In [None]:
# woman : daughter :: man : ?
complete_analogy(A='woman', B='daughter', C='man')

Error: "Key 'daughter' not present in vocabulary"


In [None]:
# man : ship :: woman : ?
complete_analogy(A='man', B='ship', C='woman')

[('vessel', 0.5515872836112976), ('boat', 0.5395776033401489)]

In [None]:
# ship : vessel :: man : ?
complete_analogy(A='ship', B='vessel', C='man')

[('terrible', 0.5774155855178833), ('fellow', 0.5292904376983643)]

In [None]:
# Terms most similar to 'being'.
model.wv.most_similar(positive=['being'])

[('existence', 0.697110652923584),
 ('nature', 0.6315579414367676),
 ('means', 0.6276397109031677),
 ('fact', 0.5904484987258911),
 ('weight', 0.590161144733429),
 ('state', 0.5879693627357483),
 ('natural', 0.5725457668304443),
 ('known', 0.5660088658332825),
 ('perfect', 0.55817711353302),
 ('human', 0.5453854203224182)]

In [None]:
# Terms most similar to 'man'.
model.wv.most_similar(positive=['man'])

[('woman', 0.7789977192878723),
 ('fellow', 0.6721122860908508),
 ('young', 0.6714513301849365),
 ('creature', 0.6433677673339844),
 ('poor', 0.5994147658348083),
 ('voice', 0.5865850448608398),
 ('who', 0.5801407098770142),
 ('old', 0.5549318790435791),
 ('word', 0.5499228835105896),
 ('he', 0.5436770915985107)]

In [None]:
# closed + open - stopped
model.wv.most_similar(positive=['closed', 'open'], negative=['stopped'])

[('wall', 0.6624631285667419),
 ('across', 0.6570006608963013),
 ('opening', 0.6556872725486755),
 ('shadows', 0.630023181438446),
 ('edge', 0.6174498796463013),
 ('sand', 0.6002607941627502),
 ('forest', 0.6001043319702148),
 ('bed', 0.5979138612747192),
 ('hole', 0.5973831415176392),
 ('glass', 0.596697986125946)]

In [None]:
# man - woman
model.wv.most_similar(
    positive=['man'],
    negative=['woman'],
)

[('himself', 0.41914764046669006),
 ('he', 0.37353360652923584),
 ('his', 0.28568923473358154),
 ('him', 0.28325802087783813),
 ('day', 0.2784633934497833),
 ('even', 0.2715204060077667),
 ('colonists', 0.2471391260623932),
 ('time', 0.2441082000732422),
 ('nothing', 0.24226224422454834),
 ('still', 0.24155829846858978)]

In [None]:
# captain - woman
model.wv.most_similar(
    positive=['captain'],
    negative=['woman'],
)

[('next', 0.5530993342399597),
 ('engineer', 0.5208451747894287),
 ('wait', 0.5076542496681213),
 ('replied', 0.49187171459198),
 ('continued', 0.4848463535308838),
 ('reporter', 0.45916566252708435),
 ('going', 0.4522090554237366),
 ('shall', 0.44410011172294617),
 ('yes', 0.4331345558166504),
 ('will', 0.4325847327709198)]

In [None]:
# Terms most similar to 'lake'.
model.wv.most_similar(positive=['lake'])

[('waters', 0.9212956428527832),
 ('mountain', 0.8980160355567932),
 ('ocean', 0.8507169485092163),
 ('valley', 0.8431262969970703),
 ('narrow', 0.841944694519043),
 ('stream', 0.8364641666412354),
 ('shore', 0.8356046080589294),
 ('rays', 0.832175612449646),
 ('river', 0.8310551047325134),
 ('north', 0.8119277954101562)]

In [None]:
# Terms most similar to 'vessel', ranked with the cosmul variant of the similarity query.
model.wv.most_similar_cosmul(positive=['vessel'])

[('coast', 0.8662401437759399),
 ('projectile', 0.8630532026290894),
 ('point', 0.8504204750061035),
 ('island', 0.8387320637702942),
 ('nautilus', 0.8379210829734802),
 ('moon', 0.8253390192985535),
 ('ship', 0.814764678478241),
 ('distance', 0.80967777967453),
 ('shore', 0.8041897416114807),
 ('city', 0.8021414875984192)]

In [None]:
# green - blue
model.wv.most_similar(positive=['green'], negative=['blue'])

[('men', 0.7213428020477295),
 ('people', 0.7139459252357483),
 ('women', 0.672744870185852),
 ('creatures', 0.6370847225189209),
 ('those', 0.6352526545524597),
 ('others', 0.6081732511520386),
 ('animals', 0.605560302734375),
 ('whom', 0.5824300050735474),
 ('things', 0.5411033034324646),
 ('who', 0.5398451685905457)]

In [None]:
# hours - minutes
model.wv.most_similar(positive=['hours'], negative=['minutes'])

[('surely', 0.6285554766654968),
 ('unto', 0.6269900798797607),
 ('alway', 0.5990321636199951),
 ('truly', 0.5822149515151978),
 ('somewhat', 0.5769465565681458),
 ('yet', 0.5734373331069946),
 ('knowledge', 0.5638192892074585),
 ('because', 0.510525107383728),
 ('ever', 0.5064544081687927),
 ('utter', 0.503858745098114)]

In [None]:
# man - gentleman
model.wv.most_similar(positive=['man'], negative=['gentleman'])