# DS 5001 Module 9 Lab: FastText

We create word embeddings from a corpus of novels using Gensim's FastText and visualize the results with t-SNE.

## Set Up

In [1]:
# Locations and file-name prefix used to build data file paths in this notebook.
data_in = './data_in'  # input directory
data_out = './data_out'  # output directory (the Save cell writes here)
data_prefix = 'novels'  # prefix of the data file names, e.g. '<prefix>-W2V.csv'

In [2]:
# Ordered hierarchy of content objects: index levels from book down to token.
OHCO = ['book', 'chapter', 'para_num', 'sent_num', 'token_num']
# Each slice ends at the level it is named after; one level deeper would make
# PARA identify sentences and SENT identify individual tokens.
PARA = OHCO[:3] # Paragraphs
SENT = OHCO[:4] # Sentences

In [3]:
# OHCO level chosen as the "bag"; presumably the unit that counts as one document.
# NOTE(review): BAG is not referenced again in the visible cells - confirm it is still needed.
BAG = PARA

In [4]:
import ast

import numpy as np
import pandas as pd
import plotly_express as px
from gensim.models import FastText
from sklearn.manifold import TSNE

## Process

### Import Gensim Corpus

In [5]:
def _parse_doc(raw):
    """Convert one stored row of the docs file into a list of token strings.

    A row that looks like the repr of a Python list (e.g. "['a', 'b']", which is
    what ``pd.Series(list_of_token_lists).to_csv(...)`` writes) is decoded with
    ``ast.literal_eval``. Any other row is split on whitespace.
    """
    text = str(raw).strip()
    if text.startswith('[') and text.endswith(']'):
        try:
            return [str(token) for token in ast.literal_eval(text)]
        except (ValueError, SyntaxError):
            pass  # not a list literal after all; fall back to whitespace splitting
    return text.split()


# Gensim expects every document to be a list of tokens; a plain string would be
# iterated character by character. The path is built from the config cell so it
# stays in step with the Save cell.
DOCS = [
    _parse_doc(row)
    for row in pd.read_csv('{}/{}-GENSIM_DOCS.csv'.format(data_out, data_prefix), header=None)[0].dropna()
]

In [6]:
# Vocabulary table; later cells read its pos_max, tfidf_max and term_str columns.
# The path is built from the config cell instead of being hard-coded.
VOCAB = pd.read_csv('{}/{}-VOCAB.csv'.format(data_in, data_prefix))

In [7]:
# The 1000 highest-TFIDF terms whose dominant POS tag is a common noun or a verb.
# `NN(?!P)` accepts NN and NNS but rejects NNP/NNPS; the earlier `NN[^P]` required
# a character after "NN" and therefore dropped plain NN tags.
# `na=False` keeps rows with a missing tag from breaking the boolean mask.
V = (
    VOCAB[VOCAB.pos_max.str.match(r'^(NN(?!P)|VB)', na=False)]
    .sort_values('tfidf_max', ascending=False)
    .head(1000)
    .term_str
    .to_list()
)

## Generate word embeddings with Gensim's FastText

In [8]:
# IPython help lookup: shows the FastText constructor signature and its defaults.
FastText?

Init signature:
FastText(
    sentences=None,
    corpus_file=None,
    sg=0,
    hs=0,
    size=100,
    alpha=0.025,
    window=5,
    min_count=5,
    max_vocab_size=None,
    word_ngrams=1,
    sample=0.001,
    seed=1,

In [None]:
# Train a FastText model on DOCS with the constructor defaults listed in the
# signature output above (e.g. size=100, window=5, min_count=5, sg=0).
# NOTE(review): gensim expects each item of DOCS to be a list of tokens - confirm.
model = FastText(sentences=DOCS)

## Visualize with tSNE

### Generate coordinates to plot

In [None]:
# One row per selected term. The frame is sized from V itself, so it still works
# when fewer than 1000 terms passed the vocabulary filter.
coords = pd.DataFrame({'label': V})
# Look up each term's embedding; every cell of 'vector' holds one vector.
coords['vector'] = coords['label'].apply(lambda term: model.wv[term])

In [None]:
# Preview the first rows: each has a term label and its embedding vector.
coords.head()

### Use ScikitLearn's TSNE library

In [None]:
# Reduce the word vectors to two dimensions so they can be drawn on a plane.
tsne_engine = TSNE(
    n_components=2,
    perplexity=40,
    init='pca',
    n_iter=2500,
    random_state=23,  # fixed seed
)
word_vectors = coords['vector'].tolist()
tsne_model = tsne_engine.fit_transform(word_vectors)

In [None]:
# The TSNE result has one column per component; transposing lets both columns
# be unpacked in a single statement.
coords['x'], coords['y'] = tsne_model.T

In [None]:
# Preview again, now with the x and y plotting coordinates added.
coords.head()

### Plot the coordinates

In [None]:
# Draw each term as its own text label at its t-SNE position (no point markers).
fig = px.scatter(coords, x='x', y='y', text='label', height=1000)
fig.update_traces(mode='text')

## Semantic Algebra

### Analogies

$A : B :: C : D? \rightarrow B - A + C = D$


In [None]:
def complete_analogy(A, B, C, n=2):
    """Answer the analogy "A is to B as C is to ?" using the trained model.

    Asks the model for the terms most similar to ``B - A + C``.

    Parameters
    ----------
    A, B, C : str
        Terms of the analogy ``A : B :: C : ?``.
    n : int, default 2
        Number of candidate answers to return. It is passed to the model as
        ``topn`` so that values above the library default of 10 are honoured.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``term`` and ``sim``, one row per candidate. ``None`` if the
        model raises ``KeyError`` for a term (the error is printed).
    """
    try:
        matches = model.wv.most_similar(positive=[B, C], negative=[A], topn=n)
    except KeyError as e:
        # Best effort: report the unknown term and let the notebook carry on.
        print('Error:', e)
        return None
    return pd.DataFrame(matches, columns=['term', 'sim'])
    
def get_most_similar(positive, negative=None):
    """Return the model's nearest terms for a query as a two-column table.

    Parameters
    ----------
    positive : str or list of str
        Term(s) that pull the query towards them.
    negative : list of str, optional
        Term(s) that push the query away from them.

    Returns
    -------
    pandas.DataFrame
        Columns ``term`` and ``sim``, in the order the model reports them.
    """
    neighbours = model.wv.most_similar(positive, negative)
    table = pd.DataFrame(neighbours, columns=['term', 'sim'])
    return table

In [None]:
# man : boy :: woman : ?  (up to 3 candidates)
complete_analogy('man', 'boy', 'woman', 3)

In [None]:
# girl : daughter :: boy : ?  (up to 3 candidates)
complete_analogy('girl', 'daughter', 'boy', 3)

In [None]:
# girl : sister :: boy : ?  (up to 3 candidates)
complete_analogy('girl', 'sister', 'boy', 3)

In [None]:
# man : gentleman :: woman : ?  (up to 5 candidates)
complete_analogy('man', 'gentleman', 'woman', 5)

In [None]:
# woman : lady :: man : ?  (up to 5 candidates)
complete_analogy('woman', 'lady', 'man', 5)

In [None]:
# day : sun :: night : ?  (up to 5 candidates)
complete_analogy('day', 'sun', 'night', 5)

### Similarities

In [None]:
# Terms the model ranks as most similar to 'joy'.
get_most_similar('joy')

In [None]:
# Terms the model ranks as most similar to 'man'.
get_most_similar('man')

In [None]:
# Positive terms 'woman' and 'girl', negative term 'man'.
get_most_similar(['woman','girl'], ['man'])

In [None]:
# Positive term 'man', negative term 'woman'.
get_most_similar(positive=['man'], negative=['woman'])

In [None]:
# Positive term 'woman', negative term 'girl'.
get_most_similar(positive=['woman'], negative=['girl'])

In [None]:
# Terms most similar to 'woman', passing the query by keyword.
get_most_similar(positive='woman')

In [None]:
# Same query as the previous cell, passed positionally.
get_most_similar('woman')

In [None]:
# Positive term 'woman', negative term 'marriage'.
get_most_similar(['woman'],['marriage'])

In [None]:
# Positive term 'woman', negative term 'lady'.
get_most_similar(['woman'],['lady'])

In [None]:
# Positive term 'man', negative term 'gentleman'.
get_most_similar(['man'],['gentleman'])

## Save

In [None]:
# Row i of the vector matrix belongs to model.wv.index2word[i]. The key order of
# model.wv.vocab is not guaranteed to match that row order, so index2word
# supplies the row labels.
model.wv.init_sims()  # makes sure vectors_norm exists even if no similarity query ran yet
W2V = pd.DataFrame(model.wv.vectors_norm, index=model.wv.index2word)
W2V.to_csv('{}/{}-W2V.csv'.format(data_out, data_prefix))
# Write the documents back to the output directory alongside the vectors.
pd.Series(DOCS).to_csv('{}/{}-GENSIM_DOCS.csv'.format(data_out, data_prefix), index=False, header=False)