# Metadata &mdash; IN PROGRESS

```
Course:   DS 5001
Module:   09 Lab
Topic:    Using SVD
Author:   R.C. Alvarado

Purpose:  We create word vectors by applying a singular value decomposition to a pointwise mutual information word-word matrix. 
```

# Set Up

In [1]:
data_home = "../data"
data_prefix = 'novels'
OHCO = ['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']

In [2]:
import pandas as pd
import numpy as np
import scipy as sp

# Import Data

In [3]:
TOKENS = pd.read_csv(f'{data_home}/{data_prefix}/{data_prefix}-CORPUS_STANFORD.csv').set_index(OHCO)

In [4]:
TOKENS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,term_str,pos
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
adventures,1,0,1,0,a,DT
adventures,1,0,1,1,scandal,NN
adventures,1,0,1,2,in,IN
adventures,1,0,1,3,bohemia,NN
adventures,1,1,0,0,i,PRP
...,...,...,...,...,...,...
usher,1,47,4,68,of,IN
usher,1,47,4,69,the,DT
usher,1,47,4,70,house,NN
usher,1,47,4,71,of,IN


# Gather Sentences

In [5]:
SENTS = TOKENS.groupby(OHCO[:-1]).apply(lambda x: x.term_str.tolist()).reset_index(drop=True)

# Create Windows

In [6]:
w = 2
q = (2 * w) + 1
spad = ['<s>' for i in range(w)]
PAIRS = []

In [7]:
def get_pairs(row_id, row):
    sent = spad + row + spad
    for i in range(len(sent) - q):
        for j, s in enumerate(sent[i:q+i]):
            PAIRS.append((row_id, i, j - w, s))

In [8]:
%time _ = [get_pairs(i, sent) for i, sent in enumerate(SENTS)]

CPU times: user 1.43 s, sys: 138 ms, total: 1.57 s
Wall time: 1.57 s


In [167]:
PAIRS[:q*3]

[(0, 0, -2, '<s>'),
 (0, 0, -1, '<s>'),
 (0, 0, 0, 'a'),
 (0, 0, 1, 'scandal'),
 (0, 0, 2, 'in'),
 (0, 1, -2, '<s>'),
 (0, 1, -1, 'a'),
 (0, 1, 0, 'scandal'),
 (0, 1, 1, 'in'),
 (0, 1, 2, 'bohemia'),
 (0, 2, -2, 'a'),
 (0, 2, -1, 'scandal'),
 (0, 2, 0, 'in'),
 (0, 2, 1, 'bohemia'),
 (0, 2, 2, '<s>')]

In [9]:
SKIPGRAMS = pd.DataFrame(PAIRS, columns=['bag_id', 'window_id', 'offset','term_str'])\
    .set_index(['bag_id','window_id','offset'])\
    .unstack()
SKIPGRAMS.columns = SKIPGRAMS.columns.droplevel(0)
SKIPGRAMS = SKIPGRAMS.reset_index().set_index(['bag_id','window_id', 0]).stack().to_frame('probe')
SKIPGRAMS.index.names = ['bag_id','window_id','target','distance']

In [10]:
SG = SKIPGRAMS.reset_index().value_counts(['target','probe','distance']).to_frame('n').sort_index().reset_index()

In [11]:
SG

Unnamed: 0,target,probe,distance,n
0,a,<s>,-2,2137
1,a,<s>,-1,1087
2,a,<s>,2,1801
3,a,a,-2,110
4,a,a,2,110
...,...,...,...,...
1744648,ça,y,1,1
1744649,émeutes,prefect,2,1
1744650,émeutes,serious,-1,1
1744651,émeutes,several,-2,1


# Add Weights (as GloVe does)

In [12]:
SG.query("target == 'the' and distance == 1").sort_values('n', ascending=False).head(10)

Unnamed: 0,target,probe,distance,n
1436866,the,door,1,1059
1452613,the,same,1,878
1452250,the,room,1,781
1442276,the,house,1,749
1439366,the,first,1,741
1447841,the,other,1,690
1434694,the,count,1,536
1446512,the,most,1,521
1432546,the,castle,1,518
1456996,the,time,1,480


In [13]:
SG.query("target == 'castle' and distance == -1").sort_values('n', ascending=False).head(10)

Unnamed: 0,target,probe,distance,n
264134,castle,the,-1,433
264148,castle,this,-1,19
263890,castle,his,-1,19
263681,castle,a,-1,12
264000,castle,old,-1,11
263754,castle,blaize,-1,6
264244,castle,your,-1,5
263975,castle,my,-1,5
264130,castle,that,-1,3
263864,castle,grahams,-1,3


In [14]:
SG['glove_weight'] = np.abs(1 / SG['distance']) * SG['n']

In [15]:
SG.sort_values('glove_weight', ascending=False).query("distance == 1").head(20)

Unnamed: 0,target,probe,distance,n,glove_weight
1030601,of,the,1,10256,10256.0
743444,in,the,1,6230,6230.0
1531326,to,the,1,4750,4750.0
1045569,on,the,1,3140,3140.0
789390,it,was,1,3078,3078.0
1517031,to,be,1,2910,2910.0
98928,and,the,1,2797,2797.0
151318,at,the,1,2653,2653.0
709950,i,have,1,2355,2355.0
785222,it,is,1,2189,2189.0


In [169]:
SG.value_counts('target').sort_index()

target
a            16917
aback           17
abaft            7
abandon        103
abandoned      150
             ...  
zum              8
zuniga           4
à               12
ça               7
émeutes          4
Length: 26263, dtype: int64

# Get Unigram Probabilities

In [170]:
VOCAB = TOKENS.term_str.value_counts().to_frame('n')
VOCAB.index.name = 'term_str'

In [184]:
VOCAB.loc['<s>', 'n'] = len(SENTS) * 4 # ?

In [185]:
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()

In [186]:
VOCAB.sort_index()

Unnamed: 0_level_0,n,p
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1
<s>,337128.0,1.832865e-01
a,28533.0,1.551255e-02
aback,9.0,4.893033e-06
abaft,2.0,1.087341e-06
abandon,44.0,2.392149e-05
...,...,...
à,3.0,1.631011e-06
æt,1.0,5.436703e-07
ætat,1.0,5.436703e-07
ça,2.0,1.087341e-06


In [187]:
VOCAB.sort_values('p', ascending=False).head()

Unnamed: 0_level_0,n,p
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1
<s>,337128.0,0.183286
the,85329.0,0.046391
to,45176.0,0.024561
and,44991.0,0.02446
of,42638.0,0.023181


# Get $P(x)$

In [188]:
p_x = VOCAB.p

In [189]:
p_x.sort_values(ascending=False).head()

term_str
<s>    0.183286
the    0.046391
to     0.024561
and    0.024460
of     0.023181
Name: p, dtype: float64

# Compute Normalized PMI for Skipgrams

**PMI**

$log \dfrac{P(x,y)}{P(x)P(y)}$

**NMPI**

$\dfrac{log\dfrac{P(x,y)}{P(x)P(y)}}{-log P(x,y)}$

See [G. Bouma 2009, eq. 7](https://pdfs.semanticscholar.org/1521/8d9c029cbb903ae7c729b2c644c24994c201.pdf)

# Get Skipgram Types

In [190]:
SG2 = SG.groupby(['target','probe']).probe.count()\
    .to_frame().rename(columns={'probe':'n'})\
    .reset_index().set_index(['target','probe'])

In [191]:
SG2.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,n
target,probe,Unnamed: 2_level_1
a,<s>,3
a,a,2
a,abandon,2
a,abandons,1
a,abated,1
a,abatement,1
a,abbess,2
a,abbey,1
a,abbot,2
a,abelwhite,1


# Compute $P(x,y)$

In [192]:
N = SG2.n.sum() + len(VOCAB)**2

In [193]:
SG2['p_xy'] = SG2.n / N

In [194]:
SG2.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy
target,probe,Unnamed: 2_level_1,Unnamed: 3_level_1
a,<s>,3,3.98901e-09
a,a,2,2.65934e-09
a,abandon,2,2.65934e-09
a,abandons,1,1.32967e-09
a,abated,1,1.32967e-09
a,abatement,1,1.32967e-09
a,abbess,2,2.65934e-09
a,abbey,1,1.32967e-09
a,abbot,2,2.65934e-09
a,abelwhite,1,1.32967e-09


# Compute $PMI(x;y)$

In [195]:
SG2['pmi_xy'] = SG2.apply(lambda row: np.log(row.p_xy / (p_x.loc[row.name[0]] * p_x.loc[row.name[1]])), 1)

In [196]:
SG2.sort_values('pmi_xy', ascending=False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy,pmi_xy
target,probe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
unburned,margins,1,1.32967e-09,8.411511
multifarious,slough,1,1.32967e-09,8.411511
marstons,spinney,1,1.32967e-09,8.411511
belshazzars,abrahams,1,1.32967e-09,8.411511
buffalo,wallowed,1,1.32967e-09,8.411511
viscount,viscountess,1,1.32967e-09,8.411511
vatican,cameos,1,1.32967e-09,8.411511
belphegor,machiavelli,1,1.32967e-09,8.411511
einen,natur,1,1.32967e-09,8.411511
belphegor,gresset,1,1.32967e-09,8.411511


In [197]:
SG2['npmi_xy'] = SG2.pmi_xy / -( np.log(SG2.p_xy) )

In [198]:
SG2.sort_values('npmi_xy', ascending=False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy,pmi_xy,npmi_xy
target,probe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
rien,cest,2,2.65934e-09,8.411511,0.426003
nunc,fracto,2,2.65934e-09,8.411511,0.426003
loeuvre,cest,2,2.65934e-09,8.411511,0.426003
charly,magne,2,2.65934e-09,8.411511,0.426003
magne,charly,2,2.65934e-09,8.411511,0.426003
nunc,patria,2,2.65934e-09,8.411511,0.426003
patria,nunc,2,2.65934e-09,8.411511,0.426003
fracto,nunc,2,2.65934e-09,8.411511,0.426003
cest,rien,2,2.65934e-09,8.411511,0.426003
marquand,wholeman,2,2.65934e-09,8.411511,0.426003


# Keep only positives

In [112]:
SG2.loc[SG2.npmi_xy < 0, 'pnpmi_xy'] = 0
SG2.loc[SG2.npmi_xy >= 0, 'pnpmi_xy'] =  SG2.npmi_xy

In [113]:
SG2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy,pmi_xy,npmi_xy,pnpmi_xy
target,probe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
a,<s>,3,3.98901e-09,-13.476912,-0.696851,0.0
a,a,2,2.65934e-09,-11.412975,-0.578013,0.0
a,abandon,2,2.65934e-09,-4.938348,-0.250104,0.0
a,abandons,1,1.32967e-09,-2.540453,-0.124298,0.0
a,abated,1,1.32967e-09,-4.332213,-0.211965,0.0


# Create PNPMI Matrix

In [114]:
SGM = SG2.npmi_xy.unstack().fillna(0)

In [115]:
SGM.head()

probe,<s>,a,aback,abaft,abandon,abandoned,abandoning,abandons,abasement,abashed,...,zoöphagy,zufalle,zum,zuniga,zusammen,à,æt,ætat,ça,émeutes
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a,-0.696851,-0.578013,0.0,0.0,-0.250104,0.0,0.0,-0.124298,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aback,-0.318711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abaft,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abandon,-0.362067,-0.275536,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abandoned,-0.384576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [119]:
SGM.loc['woman'].sort_values(ascending=False).head(10)

probe
reformed         0.101509
silhouetted      0.101509
grabs            0.101509
unmentionable    0.101509
stabbing         0.067595
lacleur          0.067595
faultless        0.067595
blonde           0.067595
complexioned     0.067595
laundry          0.047757
Name: woman, dtype: float64

In [120]:
SG2.loc['prussian'].sort_values('n', ascending=False)

Unnamed: 0_level_0,n,p_xy,pmi_xy,npmi_xy,pnpmi_xy
probe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<s>,1,1.32967e-09,-4.316707,-0.211206,0.0
franco,1,1.32967e-09,8.411511,0.411556,0.411556
the,1,1.32967e-09,-2.942759,-0.143982,0.0
war,1,1.32967e-09,4.148831,0.202993,0.202993


# SVD

In [121]:
from scipy import sparse
import scipy.sparse.linalg as linalg

In [122]:
sparse = sparse.csr_matrix(SGM.values)

In [125]:
# SVD = linalg.svds(sparse, k=256)
# U, S, V = SVD

In [148]:
U, S, V = linalg.svds(sparse, k=256)

In [126]:
U.shape, S.shape, V.shape

((26263, 256), (256,), (256, 27319))

In [149]:
word_vecs = U #U.dot(V) # U + V.T
word_vecs_norm = word_vecs / np.sqrt(np.sum(word_vecs * word_vecs, axis=1, keepdims=True))

In [150]:
WE = pd.DataFrame(word_vecs_norm, index=SGM.index)
WE.index.name = 'word_str'

In [151]:
WE.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
word_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a,0.001957,-0.003753,-0.001006,0.002309,-0.000852,0.001551,-7.4e-05,-0.00082,-0.003564,0.002296,...,-0.129799,0.141087,-0.128295,0.110994,-0.001522,-0.000623,-0.08934,-0.134086,-0.152443,0.115638
aback,-0.050649,-0.017379,0.032329,-0.008904,0.044806,0.009325,-0.010766,0.002045,0.013533,-0.01023,...,0.010272,-0.05736,0.016806,0.024349,0.002004,-0.288737,0.064215,-0.076994,0.052267,0.026633
abaft,0.045691,0.008232,-0.014873,0.030345,-0.020468,-0.03705,-0.019306,-0.034767,-0.00115,0.021679,...,0.203292,0.35779,-0.193397,-0.166884,-0.22034,0.216153,0.162084,-0.08538,0.036732,0.014258
abandon,0.030902,0.024955,-0.004986,0.046249,-0.028428,0.010879,-0.031449,-0.038656,-0.039592,0.059304,...,-0.01994,0.016265,-0.117609,-0.093523,0.033993,-0.013715,0.006835,-0.056007,0.072498,0.049694
abandoned,0.032134,-0.090266,-0.0071,-0.063397,0.021494,0.025242,0.022419,0.099457,0.102039,-0.000866,...,-0.010554,0.042848,-0.10867,-0.075067,0.080976,-0.029122,-0.043751,-0.025981,0.074454,0.05667


In [132]:
# def word_sims(word, n=10):
#     try:
#         sims = SGM.loc[word].sort_values(ascending=False).head(n).reset_index().values
#         return sims
#     except KeyError as e:
#         print('Word "{}" not in vocabulary.'.format(word))
#         return None

# def word_sim_report(word):
#     sims = word_sims(word)
#     for sim_word, score in sims:
#         context = ' '.join(SG2.loc[sim_word].index.values.tolist()[:5])
#         print("{} ({}) {}".format(sim_word.upper(), score, context))
#         print('-'*80)

In [147]:
# print(word_sims('man'))

In [146]:
# word_sim_report('woman')

In [145]:
# word_sim_report('man')

# Define some semantic functions

In [136]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [139]:
def get_word_vector(term_str):
    """Get a numpy array from the glove matrix and shape for input into cosine function"""
    return SGM.loc[term_str].values.reshape(-1, 1).T

def get_nearest_vector(wv, method='cosine', n=1):
    """Get the nearest word vectors to a given word vector"""
    if method == 'cosine':
        sims = cosine_similarity(SGM.values, wv)
    elif method == 'euclidean':
        eds = euclidean_distances(SGM.values, wv)
        sims = 1 - (eds/eds.max())
    else:
        print('Invalid method {}; defaulting to cosine.'.format(method))
        sims = cosine_similarity(SGM.values, wv)
    return pd.DataFrame(sims, index=SGM.index, columns=['score']).sort_values('score',ascending=False).head(n+1).iloc[1:]

def get_sims(term_str, method='cosine', n=10):
    """Get the top n words for a given word based on cosine similarity"""
    wv = get_word_vector(term_str)
    sims =  get_nearest_vector(wv, method=method, n=n) 
    return sims

def get_analogy(a, b, c, method='cosine'):
    """Infer missing analogical term"""
    print()
    try:
        A = get_word_vector(a)
        B = get_word_vector(b)
        C = get_word_vector(c)
        D = np.add(np.subtract(B, A), C)
        X = get_nearest_vector(C, method=method, n=1)
        return X.iloc[0].name
    except ValueError as e:
        print(e)
        return None

def get_opposite(a, b, method='cosine'):
    A = get_word_vector(a)
    B = get_word_vector(b)
    C = np.subtract(A, B)
    X = get_nearest_vector(C, n=1, method=method)
    return X

In [153]:
get_sims('woman')

Unnamed: 0_level_0,score
word_str,Unnamed: 1_level_1
girl,0.65466
family,0.644767
lucy,0.641948
fellow,0.641659
turn,0.63989
object,0.637495
sister,0.637196
people,0.636239
gentleman,0.635682
knowledge,0.63518


In [154]:
get_sims('love')

Unnamed: 0_level_0,score
word_str,Unnamed: 1_level_1
believe,0.647324
poor,0.622738
sister,0.620528
unhappy,0.616758
why,0.616114
truth,0.615704
doubt,0.612861
lucy,0.612605
trust,0.612133
known,0.610654


In [155]:
get_analogy('man','boy','girl')




'lucy'

In [156]:
get_analogy('male', 'king', 'female')




'injury'