# Configuration

In [1]:
#proj = '/Users/rca2t/Dropbox/Courses/DSI/DS5559/UVA_DSI_REPO'
#pwd = '{}/play/wordembedding'.format(proj)
db_file = 'corpus1.db'
#lib = "{}/lib".format(proj)

# Word Embedding
window = 3

# Libraries

In [4]:
import pandas as pd
import numpy as np
import scipy as sp
import sqlite3
#import sys; sys.path.append(lib)
import textman as tx

In [5]:
%matplotlib inline

## Extract skipgrams from tokens with SQL

### Build SQL query from configs

In [6]:
in_clause = ', '.join(['x.token_num + {0}, x.token_num - {0}'.format(i) for i in range(1, window + 1)])
pos_clause = "AND pos NOT LIKE 'NNP%' " # Remove proper nouns

In [7]:
sql = """
WITH mytoken(book, chapter, para_num, sent_num,token_num,term_str,term_id) 
AS (
    SELECT book, chapter, para_num, sent_num,token_num,term_str,term_id
    FROM token 
    WHERE term_id IN (SELECT term_id FROM vocab WHERE stop = 0) 
        AND term_str is not NULL
        {}       
)

SELECT x.term_str as target, y.term_str as probe, (y.token_num - x.token_num) AS dist
FROM mytoken x 
JOIN mytoken y USING(book, chapter, para_num, sent_num)
WHERE y.token_num IN ({})
ORDER BY target, dist, probe
""".format(pos_clause, in_clause)

### Pull from DB

In [8]:
skipgrams = tx.get_sql(sql, db_file)

In [9]:
skipgrams.head(10)

Unnamed: 0,target,probe,dist
0,aback,dont,-3
1,aback,dont,-3
2,aback,dont,-3
3,aback,dont,-3
4,aback,clearly,-2
5,aback,completely,-2
6,aback,im,-2
7,aback,sorely,-2
8,aback,took,-2
9,aback,little,-1


## Get Unigram Probabilities

### Import vocab table

In [10]:
vocab = tx.get_table('vocab', db_file, index_col=['term_id'])
vocab = vocab[vocab.stop == 0]

In [11]:
vocab.sort_values('p', ascending=False).head()

Unnamed: 0_level_0,term_str,n,p,port_stem,stop,df,tf_sum,tf_mean,tf_max,tfidf_sum,tfidf_mean,tfidf_max,tfth_sum,tfth_mean,tfth_max,th_sum,th_mean,th_max
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
21223,mr,27957,0.007315,mr,0,17.0,0.238011,0.014001,0.038059,0.0,0.0,0.0,0.336572,0.019798,0.053819,1.4141,0.083182,0.179471
28038,said,27619,0.007227,said,0,17.0,0.270791,0.015929,0.022446,0.0,0.0,0.0,0.435261,0.025604,0.036079,1.607368,0.094551,0.122946
19251,little,9798,0.002564,littl,0,17.0,0.093399,0.005494,0.009802,0.0,0.0,0.0,0.064996,0.003823,0.006821,0.6959,0.040935,0.065407
29560,sir,8364,0.002188,sir,0,17.0,0.074818,0.004401,0.010413,0.0,0.0,0.0,0.042622,0.002507,0.005932,0.569676,0.03351,0.068574
21226,mrs,8309,0.002174,mr,0,16.0,0.069298,0.004076,0.008535,0.006061,0.000357,0.000747,0.036265,0.002133,0.004467,0.523315,0.030783,0.058657


In [12]:
p_x = vocab[['term_str','p']].reset_index().set_index('term_str')['p']

In [13]:
p_x.sort_values(ascending=False).head()

term_str
mr        0.007315
said      0.007227
little    0.002564
sir       0.002188
mrs       0.002174
Name: p, dtype: float64

### Create compressed skipgram table

In [14]:
skipgrams2 = skipgrams.groupby(['target','probe']).probe.count()\
    .to_frame().rename(columns={'probe':'n'})\
    .reset_index().set_index(['target','probe'])

In [15]:
skipgrams2.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,n
target,probe,Unnamed: 2_level_1
aback,abruptness,1
aback,admitted,1
aback,clearly,1
aback,completely,1
aback,creetur,1
aback,dont,4
aback,honest,1
aback,im,1
aback,lad,1
aback,lady,1


In [16]:
N = skipgrams2.n.sum()

In [17]:
skipgrams2['p_xy'] = skipgrams2.n / N

In [18]:
skipgrams2.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy
target,probe,Unnamed: 2_level_1,Unnamed: 3_level_1
aback,abruptness,1,3.733411e-07
aback,admitted,1,3.733411e-07
aback,clearly,1,3.733411e-07
aback,completely,1,3.733411e-07
aback,creetur,1,3.733411e-07
aback,dont,4,1.493364e-06
aback,honest,1,3.733411e-07
aback,im,1,3.733411e-07
aback,lad,1,3.733411e-07
aback,lady,1,3.733411e-07


### Compute PMI(x;y)

In [19]:
skipgrams2['pmi_xy'] = skipgrams2.apply(lambda row: np.log(row.p_xy / (p_x.loc[row.name[0]] * p_x.loc[row.name[1]])), 1)

In [21]:
skipgrams2.sort_values('pmi_xy', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy,pmi_xy
target,probe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
vividness,overstate,1,3.733411e-07,15.511701
tinges,sickle,1,3.733411e-07,15.511701
belfries,vanes,1,3.733411e-07,15.511701
rump,te,1,3.733411e-07,15.511701
rummest,superlativest,1,3.733411e-07,15.511701


In [23]:
skipgrams2['npmi_xy'] = skipgrams2.pmi_xy / -( np.log(skipgrams2.p_xy) )

In [24]:
skipgrams2.sort_values('npmi_xy', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy,pmi_xy,npmi_xy
target,probe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ra,ra,22,8e-06,13.997573,1.19538
tink,tink,28,1e-05,13.565791,1.182867
ducky,ducky,6,2e-06,15.106236,1.161213
drip,drip,24,9e-06,13.144578,1.130938
grudged,grudged,6,2e-06,14.084585,1.082679


### Keep only positives

In [25]:
skipgrams2.loc[skipgrams2.npmi_xy < 0, 'pnpmi_xy'] = 0
skipgrams2.loc[skipgrams2.npmi_xy >= 0, 'pnpmi_xy'] =  skipgrams2.npmi_xy

In [26]:
skipgrams2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy,pmi_xy,npmi_xy,pnpmi_xy
target,probe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
aback,abruptness,1,3.733411e-07,10.254206,0.692816,0.692816
aback,admitted,1,3.733411e-07,7.833838,0.529286,0.529286
aback,clearly,1,3.733411e-07,7.695687,0.519952,0.519952
aback,completely,1,3.733411e-07,7.670208,0.51823,0.51823
aback,creetur,1,3.733411e-07,8.154145,0.550927,0.550927


## Create PNPMI Matrix

In [27]:
SGM = skipgrams2.npmi_xy.unstack().fillna(0)

In [28]:
SGM.head()

probe,aback,abaft,abandon,abandoned,abandoning,abandonment,abandons,abase,abased,abasement,...,zebra,zenith,zephyrs,zest,zests,zig,zigzag,zigzagged,zone,zounds
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aback,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abaft,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abandon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abandoned,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abandoning,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
SGM.loc['sir'].sort_values(ascending=False).head()

probe
fessor     0.459327
cub        0.459327
growen     0.459327
replied    0.458908
thankee    0.449227
Name: sir, dtype: float64

In [32]:
skipgrams2.loc['class'].sort_values('n', ascending=False)

Unnamed: 0_level_0,n,p_xy,pmi_xy,npmi_xy,pnpmi_xy
probe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
people,7,2.613387e-06,4.911750,0.382093,0.382093
man,4,1.493364e-06,3.042015,0.226771,0.226771
men,4,1.493364e-06,4.655832,0.347075,0.347075
gentlemen,4,1.493364e-06,5.023766,0.374503,0.374503
society,4,1.493364e-06,5.768969,0.430055,0.430055
practitioners,3,1.120023e-06,8.909566,0.650231,0.650231
large,3,1.120023e-06,4.830036,0.352502,0.352502
last,2,7.466821e-07,3.305604,0.234313,0.234313
poorer,2,7.466821e-07,7.726396,0.547675,0.547675
stern,2,7.466821e-07,6.439474,0.456453,0.456453


## SVD

In [33]:
sparse = sp.sparse.csr_matrix(SGM.values)

In [34]:
SVD = sp.sparse.linalg.svds(sparse, k=256)

In [35]:
U, S, V = SVD

In [36]:
word_vecs = U + V.T
word_vecs_norm = word_vecs / np.sqrt(np.sum(word_vecs * word_vecs, axis=1, keepdims=True))

In [37]:
WE = pd.DataFrame(word_vecs_norm, index=SGM.index)
WE.index.name = 'word_str'

In [38]:
WE.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
word_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aback,0.101402,-6.298608e-14,0.059655,0.12075,1.482372e-14,4.370856e-14,0.066952,0.013392,6.98142e-15,0.099905,...,-0.003934,-0.070831,-7.70472e-16,0.025861,0.060325,-0.066824,-1.665949e-14,-0.011991,-0.022711,-0.040413
abaft,0.015114,1.2994e-13,-0.069192,0.004256,-8.2215e-15,3.340749e-14,0.047896,0.088055,9.698433e-15,0.101617,...,0.01119,0.046835,3.256972e-16,0.077967,-0.027554,0.003299,4.744259e-16,0.041145,-0.004012,-0.014973
abandon,-0.107764,-2.10068e-13,0.112975,0.069961,4.937431e-14,-1.058963e-14,-0.033733,-0.005372,1.150891e-15,0.000711,...,0.056889,0.037494,2.351667e-16,0.045345,0.019892,0.005662,9.406667e-16,0.020038,-0.132326,-0.063688
abandoned,0.053963,-2.301541e-13,0.154301,-0.061643,2.134106e-14,-4.067577e-14,-0.063376,-0.001141,1.340564e-15,0.022743,...,0.024186,-0.083121,-8.367278e-16,0.071454,-0.057538,0.034942,8.610199e-15,0.007148,-0.066448,-0.08384
abandoning,-0.069279,1.203558e-13,-0.075803,0.19145,9.67934e-14,-7.85541e-15,-0.121178,0.050466,9.364896e-15,0.033413,...,0.03935,0.013268,4.1824540000000003e-17,-0.014941,0.010092,0.035096,8.726121e-15,0.006764,-0.014507,-0.034802


In [39]:
def word_sims(word, n=10):
    try:
        sims = SGM.loc[word].sort_values(ascending=False).head(n).reset_index().values
        return sims
    except KeyError as e:
        print('Word "{}" not in vocabulary.'.format(word))
        return None

In [65]:
print(word_sims('landlord'))

[['goldian' 0.6777390549583162]
 ['gammonin' 0.6777390549583162]
 ['vacillating' 0.6309072334951363]
 ['drains' 0.603512374098708]
 ['prophesy' 0.603512374098708]
 ['deputed' 0.5689989329682099]
 ['duplicate' 0.5689989329682099]
 ['reciprocated' 0.5566805526355281]
 ['piecemeal' 0.5566805526355281]
 ['potboy' 0.5566805526355281]]


In [43]:
def word_sim_report(word):
    sims = word_sims(word)
    for sim_word, score in sims:
        context = ' '.join(skipgrams2.loc[sim_word].index.values.tolist()[:5])
        print("{} ({}) {}".format(sim_word.upper(), score, context))
        print('-'*80)

In [44]:
word_sim_report('woman')

TINY (0.5746792154016084) alone arranged back beast bending
--------------------------------------------------------------------------------
LOVELY (0.5666348800604555) accomplished agreeable aid amazement amiable
--------------------------------------------------------------------------------
GROOMED (0.5619155800799732) admirably best woman
--------------------------------------------------------------------------------
GIVETH (0.5536945618811708) know married said woman
--------------------------------------------------------------------------------
OLD (0.5452844685716556) aback abandoned abbey abhorrence abide
--------------------------------------------------------------------------------
BIGODD (0.5396434137140869) nonsense woman
--------------------------------------------------------------------------------
FU (0.5396434137140869) faults woman working
--------------------------------------------------------------------------------
SEDUCES (0.5396434137140869) mankind tis woman

In [45]:
word_sim_report('man')

OLD (0.5859896183373231) aback abandoned abbey abhorrence abide
--------------------------------------------------------------------------------
YOUNG (0.5432029481037005) abroad abruptness absorbed abstinence accept
--------------------------------------------------------------------------------
OUTBIDS (0.5115249113936188) guineas man
--------------------------------------------------------------------------------
NOSED (0.4922529775050922) aquiline became blue blunt boned
--------------------------------------------------------------------------------
BLIND (0.4815405034980952) accurately adding admiration admit alley
--------------------------------------------------------------------------------
HUMP (0.48068927688075114) backed bedridden boy little man
--------------------------------------------------------------------------------
PRIM (0.4722936239545925) arrival arrived borne clean comfortable
--------------------------------------------------------------------------------
PAR

In [67]:
word_sim_report('cheapside')

Word "cheapside" not in vocabulary.


TypeError: 'NoneType' object is not iterable

In [49]:
word_sim_report('country')

SHELLY (0.5955544767690311) country lair
--------------------------------------------------------------------------------
AGENCIES (0.5955544767690311) come country good happening
--------------------------------------------------------------------------------
DOPTED (0.5955544767690311) country
--------------------------------------------------------------------------------
PASTURING (0.5955544767690311) country
--------------------------------------------------------------------------------
REGISTRIES (0.5955544767690311) country preserved
--------------------------------------------------------------------------------
UNPENSIONING (0.5955544767690311) country ungrateful
--------------------------------------------------------------------------------
RESURRECTIONS (0.5955544767690311) country makes occasional
--------------------------------------------------------------------------------
JUDICATURE (0.5955544767690311) country court supreme
------------------------------------------

## Define some semantic functions

In [50]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

def get_word_vector(term_str):
    """Get a numpy array from the glove matrix and shape for input into cosine function"""
    return SGM.loc[term_str].values.reshape(-1, 1).T

def get_nearest_vector(wv, method='cosine', n=1):
    """Get the nearest word vectors to a given word vector"""
    if method == 'cosine':
        sims = cosine_similarity(SGM.values, wv)
    elif method == 'euclidean':
        eds = euclidean_distances(SGM.values, wv)
        sims = 1 - (eds/eds.max())
    else:
        print('Invalid method {}; defaulting to cosine.'.format(method))
        sims = cosine_similarity(SGM.values, wv)
    return pd.DataFrame(sims, index=SGM.index, columns=['score']).sort_values('score',ascending=False).head(n+1).iloc[1:]

def get_sims(term_str, method='cosine', n=10):
    """Get the top n words for a given word based on cosine similarity"""
    wv = get_word_vector(term_str)
    sims =  get_nearest_vector(wv, method=method, n=n) 
    return sims

def get_analogy(a, b, d, method='cosine'):
    """Infer missing analogical term"""
    try:
        A = get_word_vector(a)
        B = get_word_vector(b)
        D = get_word_vector(d)
        C = np.add(np.subtract(A, B), D)
        X = get_nearest_vector(C, method=method, n=1)
        return X.iloc[0].name
    except ValueError as e:
        print(e)
        return None

In [51]:
get_nearest_vector(get_word_vector('woman'),  n=10)

Unnamed: 0_level_0,score
word_str,Unnamed: 1_level_1
man,0.262905
old,0.234189
girl,0.232142
lady,0.230307
young,0.211862
said,0.208251
gentleman,0.20725
little,0.202206
child,0.201667
friend,0.196422


In [53]:
get_sims('poor')

Unnamed: 0_level_0,score
word_str,Unnamed: 1_level_1
dear,0.229913
child,0.220793
little,0.215721
mother,0.212047
old,0.208295
young,0.2017
said,0.201665
good,0.196738
girl,0.192567
man,0.19204


In [54]:
def get_opposite(a, b, method='cosine'):
    A = get_word_vector(a)
    B = get_word_vector(b)
    C = np.subtract(A, B)
    X = get_nearest_vector(C, n=1, method=method)
    return X

In [56]:
get_opposite('man','rich')

Unnamed: 0_level_0,score
word_str,Unnamed: 1_level_1
gentleman,0.209356


In [57]:
with sqlite3.connect(db_file) as db:
    tx.put_to_db(db, skipgrams, 'skipgrams', index=False, if_exists='replace')
    tx.put_to_db(db, WE, 'wordembeddings', index=True, if_exists='replace')