# Metadata &mdash; IN PROGRESS

```
Course:   DS 5001
Module:   09 Lab
Topic:    Using SVD
Author:   R.C. Alvarado

Purpose:  We create word vectors by applying a singular value decomposition to a pointwise mutual information word-word matrix. 
```

# Set Up

In [2]:
data_home = "../data"
data_prefix = 'novels'
OHCO = ['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']

In [3]:
import pandas as pd
import numpy as np
import scipy as sp

# Import Data

In [5]:
TOKENS = pd.read_csv(f'{data_home}/{data_prefix}/{data_prefix}-CORPUS_STANFORD.csv').set_index(OHCO)

In [6]:
TOKENS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,term_str,pos
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
adventures,1,0,1,0,a,DT
adventures,1,0,1,1,scandal,NN
adventures,1,0,1,2,in,IN
adventures,1,0,1,3,bohemia,NN
adventures,1,1,0,0,i,PRP
...,...,...,...,...,...,...
usher,1,47,4,68,of,IN
usher,1,47,4,69,the,DT
usher,1,47,4,70,house,NN
usher,1,47,4,71,of,IN


# Gather Sentences

In [10]:
SENTS = TOKENS.groupby(OHCO[:-1]).apply(lambda x: x.term_str.tolist()).reset_index(drop=True)

# Create Windows

In [11]:
w = 2
q = (2 * w) + 1
spad = ['<s>' for i in range(w)]
PAIRS = []

In [12]:
def get_pairs(row_id, row):
    sent = spad + row + spad
    for i in range(len(sent) - q):
        for j, s in enumerate(sent[i:q+i]):
            PAIRS.append((row_id, i, j - w, s))

In [13]:
%time _ = [get_pairs(i, sent) for i, sent in enumerate(SENTS)]

CPU times: user 1.44 s, sys: 138 ms, total: 1.58 s
Wall time: 1.58 s


In [16]:
SKIPGRAMS = pd.DataFrame(PAIRS, columns=['bag_id', 'window_id', 'offset','term_str'])\
    .set_index(['bag_id','window_id','offset'])\
    .unstack()
SKIPGRAMS.columns = SKIPGRAMS.columns.droplevel(0)
SKIPGRAMS = SKIPGRAMS.reset_index().set_index(['bag_id','window_id', 0]).stack().to_frame('probe')
SKIPGRAMS.index.names = ['bag_id','window_id','target','distance']

In [17]:
SG = SKIPGRAMS.reset_index().value_counts(['target','probe','distance']).to_frame('n').sort_index().reset_index()

In [21]:
SG

Unnamed: 0,target,probe,distance,n
0,a,<s>,-2,2137
1,a,<s>,-1,1087
2,a,<s>,2,1801
3,a,a,-2,110
4,a,a,2,110
...,...,...,...,...
1744648,ça,y,1,1
1744649,émeutes,prefect,2,1
1744650,émeutes,serious,-1,1
1744651,émeutes,several,-2,1


# Add Weights (as GloVe does)

In [25]:
SG.query("target == 'the' and distance == 1").sort_values('n', ascending=False).head(10)

Unnamed: 0,target,probe,distance,n
1436866,the,door,1,1059
1452613,the,same,1,878
1452250,the,room,1,781
1442276,the,house,1,749
1439366,the,first,1,741
1447841,the,other,1,690
1434694,the,count,1,536
1446512,the,most,1,521
1432546,the,castle,1,518
1456996,the,time,1,480


In [26]:
SG.query("target == 'castle' and distance == -1").sort_values('n', ascending=False).head(10)

Unnamed: 0,target,probe,distance,n
264134,castle,the,-1,433
264148,castle,this,-1,19
263890,castle,his,-1,19
263681,castle,a,-1,12
264000,castle,old,-1,11
263754,castle,blaize,-1,6
264244,castle,your,-1,5
263975,castle,my,-1,5
264130,castle,that,-1,3
263864,castle,grahams,-1,3


In [27]:
SG['glove_weight'] = np.abs(1 / SG['distance']) * SG['n']

In [28]:
SG.sort_values('glove_weight', ascending=False).query("distance == 1").head(20)

Unnamed: 0,target,probe,distance,n,glove_weight
1030601,of,the,1,10256,10256.0
743444,in,the,1,6230,6230.0
1531326,to,the,1,4750,4750.0
1045569,on,the,1,3140,3140.0
789390,it,was,1,3078,3078.0
1517031,to,be,1,2910,2910.0
98928,and,the,1,2797,2797.0
151318,at,the,1,2653,2653.0
709950,i,have,1,2355,2355.0
785222,it,is,1,2189,2189.0


# Get Unigram Probabilities

In [29]:
VOCAB = TOKENS.term_str.value_counts().to_frame('n')
VOCAB.index.name = 'term_str'

In [33]:
VOCAB.loc['<s>', 'n'] = len(SENTS) * 4 # Not sure about this

In [34]:
VOCAB['p'] = VOCAB.n / VOCAB.n.sum() # Is the denominator correct?

In [32]:
VOCAB.sort_index()

Unnamed: 0_level_0,n,p
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1
<s>,337128.0,1.832865e-01
a,28533.0,1.551255e-02
aback,9.0,4.893033e-06
abaft,2.0,1.087341e-06
abandon,44.0,2.392149e-05
...,...,...
à,3.0,1.631011e-06
æt,1.0,5.436703e-07
ætat,1.0,5.436703e-07
ça,2.0,1.087341e-06


In [319]:
VOCAB.sort_values('p', ascending=False).head()

Unnamed: 0_level_0,n,p
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1
<s>,337128.0,0.183286
the,85329.0,0.046391
to,45176.0,0.024561
and,44991.0,0.02446
of,42638.0,0.023181


# Get $P(x)$

In [321]:
p_x = VOCAB.p

In [322]:
p_x.sort_values(ascending=False).head()

term_str
<s>    0.183286
the    0.046391
to     0.024561
and    0.024460
of     0.023181
Name: p, dtype: float64

# Compute Normalized PMI for Skipgrams

**PMI**

$log \dfrac{P(x,y)}{P(x)P(y)}$

**NMPI**

$\dfrac{log\dfrac{P(x,y)}{P(x)P(y)}}{-log P(x,y)}$

See [G. Bouma 2009, eq. 7](https://pdfs.semanticscholar.org/1521/8d9c029cbb903ae7c729b2c644c24994c201.pdf)

# Create compressed skipgram table

In [323]:
SG2 = SG.groupby(['target','probe']).probe.count()\
    .to_frame().rename(columns={'probe':'n'})\
    .reset_index().set_index(['target','probe'])

In [324]:
SG2.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,n
target,probe,Unnamed: 2_level_1
a,<s>,3
a,a,2
a,abandon,2
a,abandons,1
a,abated,1
a,abatement,1
a,abbess,2
a,abbey,1
a,abbot,2
a,abelwhite,1


### Compute $P(x,y)$

In [325]:
N = SG2.n.sum()

In [326]:
SG2['p_xy'] = SG2.n / N

In [327]:
SG2.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy
target,probe,Unnamed: 2_level_1,Unnamed: 3_level_1
a,<s>,3,1.71954e-06
a,a,2,1.14636e-06
a,abandon,2,1.14636e-06
a,abandons,1,5.731799e-07
a,abated,1,5.731799e-07
a,abatement,1,5.731799e-07
a,abbess,2,1.14636e-06
a,abbey,1,5.731799e-07
a,abbot,2,1.14636e-06
a,abelwhite,1,5.731799e-07


### Compute $PMI(x;y)$

In [328]:
SG2['pmi_xy'] = SG2.apply(lambda row: np.log(row.p_xy / (p_x.loc[row.name[0]] * p_x.loc[row.name[1]])), 1)

In [329]:
SG2.sort_values('pmi_xy', ascending=False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy,pmi_xy
target,probe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
rosss,caterers,1,5.731799e-07,14.477779
ac,simul,1,5.731799e-07,14.477779
oegipans,satyrs,1,5.731799e-07,14.477779
lorrain,margaretta,1,5.731799e-07,14.477779
innocui,furores,1,5.731799e-07,14.477779
unwarlike,harried,1,5.731799e-07,14.477779
diresti,sospirando,1,5.731799e-07,14.477779
diresti,ripentita,1,5.731799e-07,14.477779
diresti,riamando,1,5.731799e-07,14.477779
diresti,amato,1,5.731799e-07,14.477779


In [330]:
SG2['npmi_xy'] = SG2.pmi_xy / -( np.log(SG2.p_xy) )

In [331]:
SG2.sort_values('npmi_xy', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy,pmi_xy,npmi_xy
target,probe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
fracto,nunc,2,1e-06,14.477779,1.058401
patria,nunc,2,1e-06,14.477779,1.058401
charly,magne,2,1e-06,14.477779,1.058401
rien,cest,2,1e-06,14.477779,1.058401
loeuvre,cest,2,1e-06,14.477779,1.058401


### Keep only positives

Changed since lab.

In [332]:
SG2.loc[SG2.npmi_xy < 0, 'pnpmi_xy'] = 0
SG2.loc[SG2.npmi_xy >= 0, 'pnpmi_xy'] =  SG2.npmi_xy

In [333]:
SG2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy,pmi_xy,npmi_xy,pnpmi_xy
target,probe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
a,<s>,3,1.71954e-06,-7.410643,-0.558306,0.0
a,a,2,1.14636e-06,-5.346707,-0.390872,0.0
a,abandon,2,1.14636e-06,1.12792,0.082457,0.082457
a,abandons,1,5.731799e-07,3.525816,0.245324,0.245324
a,abated,1,5.731799e-07,1.734056,0.120655,0.120655


## Create PNPMI Matrix

In [38]:
SGM = SG2.npmi_xy.unstack().fillna(0)

NameError: name 'SG2' is not defined

In [335]:
SGM.head()

probe,<s>,a,aback,abaft,abandon,abandoned,abandoning,abandons,abasement,abashed,...,zoöphagy,zufalle,zum,zuniga,zusammen,à,æt,ætat,ça,émeutes
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a,-0.558306,-0.390872,0.0,0.0,0.082457,0.0,0.0,0.245324,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aback,-0.031148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abaft,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abandon,-0.070518,0.030251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abandoned,-0.103314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [336]:
SGM.loc['man'].sort_values(ascending=False).head()

probe
ennuyé          0.466298
recognizes      0.466298
specifically    0.466298
incites         0.466298
cackled         0.466298
Name: man, dtype: float64

In [337]:
SG2.loc['prussian'].sort_values('n', ascending=False)

Unnamed: 0_level_0,n,p_xy,pmi_xy,npmi_xy,pnpmi_xy
probe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<s>,1,5.731799e-07,1.749561,0.121733,0.121733
franco,1,5.731799e-07,14.477779,1.007355,1.007355
the,1,5.731799e-07,3.12351,0.217332,0.217332
war,1,5.731799e-07,10.215099,0.710761,0.710761


## SVD

In [35]:
from scipy import sparse
import scipy.sparse.linalg as linalg

In [36]:
sparse = sparse.csr_matrix(SGM.values)

NameError: name 'SGM' is not defined

In [37]:
SVD = linalg.svds(sparse, k=256)

ValueError: not enough values to unpack (expected 2, got 0)

In [341]:
U, S, V = SVD

In [342]:
U.shape, S.shape, V.shape

((26263, 256), (256,), (256, 27319))

In [353]:
word_vecs = U + V.T
word_vecs_norm = word_vecs / np.sqrt(np.sum(word_vecs * word_vecs, axis=1, keepdims=True))

ValueError: operands could not be broadcast together with shapes (26263,256) (27319,256) 

In [None]:
WE = pd.DataFrame(word_vecs_norm, index=SGM.index)
WE.index.name = 'word_str'

In [None]:
WE.head()

In [None]:
def word_sims(word, n=10):
    try:
        sims = SGM.loc[word].sort_values(ascending=False).head(n).reset_index().values
        return sims
    except KeyError as e:
        print('Word "{}" not in vocabulary.'.format(word))
        return None

In [None]:
print(word_sims('happy'))

In [None]:
def word_sim_report(word):
    sims = word_sims(word)
    for sim_word, score in sims:
        context = ' '.join(SG2.loc[sim_word].index.values.tolist()[:5])
        print("{} ({}) {}".format(sim_word.upper(), score, context))
        print('-'*80)

In [None]:
word_sim_report('woman')

In [None]:
word_sim_report('man')

In [None]:
word_sim_report('young')

## Define some semantic functions

Added after lecture.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
def get_word_vector(term_str):
    """Get a numpy array from the glove matrix and shape for input into cosine function"""
    return SGM.loc[term_str].values.reshape(-1, 1).T

def get_nearest_vector(wv, method='cosine', n=1):
    """Get the nearest word vectors to a given word vector"""
    if method == 'cosine':
        sims = cosine_similarity(SGM.values, wv)
    elif method == 'euclidean':
        eds = euclidean_distances(SGM.values, wv)
        sims = 1 - (eds/eds.max())
    else:
        print('Invalid method {}; defaulting to cosine.'.format(method))
        sims = cosine_similarity(SGM.values, wv)
    return pd.DataFrame(sims, index=SGM.index, columns=['score']).sort_values('score',ascending=False).head(n+1).iloc[1:]

def get_sims(term_str, method='cosine', n=10):
    """Get the top n words for a given word based on cosine similarity"""
    wv = get_word_vector(term_str)
    sims =  get_nearest_vector(wv, method=method, n=n) 
    return sims

def get_analogy(a, b, c, method='cosine'):
    """Infer missing analogical term"""
    print()
    try:
        A = get_word_vector(a)
        B = get_word_vector(b)
        C = get_word_vector(c)
        D = np.add(np.subtract(B, A), C)
        X = get_nearest_vector(C, method=method, n=1)
        return X.iloc[0].name
    except ValueError as e:
        print(e)
        return None

In [None]:
get_nearest_vector(get_word_vector('woman'),  n=10)

In [None]:
def get_opposite(a, b, method='cosine'):
    A = get_word_vector(a)
    B = get_word_vector(b)
    C = np.subtract(A, B)
    X = get_nearest_vector(C, n=1, method=method)
    return X
#     return X.iloc[0].name

In [None]:
get_sims('woman')

In [None]:
test = get_nearest_vector(get_word_vector('king'), n=10)

In [None]:
test

In [None]:
get_sims('love')

In [None]:
get_opposite('man','beard')

In [None]:
get_analogy('man','boy','girl')

In [None]:
get_analogy('male', 'king', 'female')

In [None]:
SGM