# Synopsis

As an aside, we demonstrate use of Pandas' correlation function, `corr()`, to get pairwise similarities of words and documents.

# Configuration

In [2]:
# Path of the SQLite database file that the Process section connects to.
db_name = 'novels.db'

# Libraries

In [3]:
import sqlite3
import pandas as pd
import numpy as np

# Pragmas

In [4]:
%matplotlib inline

# Process

In [57]:
# `with sqlite3.connect(...)` only wraps a transaction (commit / rollback);
# it does NOT close the connection. Close it explicitly so the database file
# handle is released as soon as the three tables have been read.
db = sqlite3.connect(db_name)
try:
    # Vocabulary table, indexed by term_id.
    vocab = pd.read_sql('SELECT * FROM vocab', db, index_col='term_id')
    # TFIDF weights in narrow form, indexed by (bag_id, term_id).
    tfidf = pd.read_sql('SELECT * FROM tfidf_small', db, index_col=['bag_id', 'term_id'])
    # Bag metadata table, indexed by bag_id.
    bags = pd.read_sql('SELECT * FROM bag', db, index_col='bag_id')
finally:
    db.close()

## Expand TFIDF Matrix

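The TFIDF matrix is stored in narrow (long) format in the database, indexed by bag ID and term ID. We unstack it into a wide bag-by-term matrix and replace the term IDs in the column labels with their term strings to make the exercise easier.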

In [51]:
# Pivot the narrow (bag_id, term_id) table into a wide frame with one row per
# bag and one column per term_id. Then drop the outer column level left by
# unstack() and relabel each term_id column with its term string from `vocab`.
TFIDF = tfidf.unstack()
TFIDF.columns = vocab.term_str.loc[TFIDF.columns.droplevel(0)]

In [52]:
TFIDF.head()

term_str,able,ablewhite,absence,account,across,act,action,added,address,advanced,...,write,writing,written,wrong,year,years,yes,yesterday,young,youth
bag_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.762597,0.0,0.0,0.624051,0.0,0.0,...,0.55576,0.55576,0.433268,0.0,0.522879,0.210684,0.464706,0.0,1.391807,0.0
1,0.0,0.0,0.0,0.0,1.143895,0.0,0.591336,0.0,0.536667,0.0,...,0.0,0.0,0.0,0.448245,0.0,0.210684,0.77451,1.131261,0.463936,0.0
2,0.267104,0.0,0.0,0.0,0.381298,0.0,0.0,0.0,0.536667,0.0,...,0.0,0.0,0.433268,0.448245,0.0,0.0,0.619608,1.131261,0.231968,0.0
3,0.267104,0.0,0.0,0.0,1.143895,0.0,0.0,0.0,1.610001,0.0,...,0.0,0.0,0.0,0.0,0.0,0.210684,1.239216,0.0,0.695903,0.0
4,0.267104,0.0,0.0,0.0,0.0,0.0,0.591336,0.624051,0.0,0.0,...,0.55576,0.0,0.0,0.0,1.045757,0.0,0.619608,0.0,0.231968,0.0


## Compute Similarities

Useful discussion of the relationship between cosine similarity and correlation:

[Brendan T. O'Connor on Cosine similarity, Pearson correlation, and OLS coefficients](https://brenocon.com/blog/2012/03/cosine-similarity-pearson-correlation-and-ols-coefficients/)

### Word-Word Comparisons

In [53]:
# Pairwise correlation between the term columns of TFIDF.
# Pearson is pandas' default method; it is spelled out here for the reader.
term_corr = TFIDF.corr(method='pearson')

In [54]:
term_corr.head()

term_str,able,ablewhite,absence,account,across,act,action,added,address,advanced,...,write,writing,written,wrong,year,years,yes,yesterday,young,youth
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
able,1.0,0.061072,0.141624,0.253928,0.160673,0.149674,0.183702,0.020535,0.117363,0.03254,...,0.214802,0.191193,0.217432,0.087715,0.239345,0.184937,0.200448,0.15491,0.085637,0.088826
ablewhite,0.061072,1.0,-0.009004,0.109823,-0.046098,0.09013,-0.027158,0.101558,-0.004449,0.025643,...,0.212574,0.114509,0.231814,0.133603,0.190809,-0.006915,0.008624,0.107772,0.007695,-0.032687
absence,0.141624,-0.009004,1.0,0.301882,-0.023458,0.135321,0.190657,0.162444,0.062981,0.170881,...,0.015605,0.116263,0.226704,0.077514,0.119075,0.181828,0.035777,0.119735,0.153262,0.19443
account,0.253928,0.109823,0.301882,1.0,-0.05639,0.153673,0.216019,0.119819,0.138966,0.294758,...,0.153667,0.165174,0.22674,0.148797,0.171684,0.268621,0.043725,0.14558,0.226627,0.221435
across,0.160673,-0.046098,-0.023458,-0.05639,1.0,0.053741,0.115498,-0.08661,0.121187,-0.128949,...,0.091462,0.09469,0.119586,0.113591,0.104988,0.136132,0.266985,0.112903,0.030434,-0.113305


In [55]:
def get_termlist(df, term_str, limit=15):
    """Print the rows of ``df`` that score highest in column ``term_str``.

    Parameters
    ----------
    df : pandas.DataFrame
        Similarity matrix whose columns are looked up by term string,
        e.g. ``term_corr``.
    term_str : str
        Column (term) to rank against.
    limit : int, default 15
        Maximum number of rows to print.

    Returns
    -------
    None
        The ranking is printed, not returned. The looked-up term itself is
        not excluded from the ranking. If ``term_str`` is not a column of
        ``df``, ``"<term_str> not in vocab"`` is printed instead.
    """
    # Only the column lookup is guarded: that is the one step that signals
    # an unknown term.
    try:
        scores = df[term_str]
    except KeyError:
        print(term_str, 'not in vocab')
        return

    # Named `ranked` rather than `list` so the builtin is not shadowed.
    ranked = scores.sort_values(ascending=False).head(limit).reset_index()
    print(ranked)

In [39]:
get_termlist(term_corr, 'love')

     term_str      love
0        love  1.000000
1       heart  0.644291
2   affection  0.606222
3     passion  0.601325
4       bosom  0.543533
5        feel  0.519854
6      longer  0.519612
7        pity  0.514114
8     conceal  0.506583
9   concealed  0.499347
10    despair  0.490020
11     wishes  0.481847
12       quit  0.479313
13       stay  0.476010
14     suffer  0.470613


In [48]:
get_termlist(term_corr, 'knowledge')

       term_str  knowledge
0     knowledge   1.000000
1        single   0.423368
2         means   0.418402
3   opportunity   0.412537
4         agnes   0.394915
5       success   0.387304
6        action   0.386748
7      prepared   0.382378
8    difficulty   0.370884
9           use   0.368804
10         hand   0.366470
11         time   0.362611
12         life   0.359449
13         mine   0.357370
14         past   0.355924


In [40]:
get_termlist(term_corr, 'murder')

     term_str    murder
0      murder  1.000000
1   committed  0.834717
2        body  0.821819
3    evidence  0.748893
4      period  0.738834
5      thrown  0.716075
6        fact  0.694741
7    supposed  0.686107
8    although  0.682517
9   suspicion  0.672503
10      river  0.655437
11      found  0.639893
12      point  0.613830
13       thus  0.605615
14      known  0.591836


In [43]:
get_termlist(term_corr, 'death')

     term_str     death
0       death  1.000000
1   existence  0.504091
2     despair  0.488443
3        eyes  0.483253
4        vain  0.481634
5       grave  0.476854
6        fate  0.475554
7      horror  0.474909
8        soul  0.458681
9    strength  0.456004
10      hopes  0.444222
11        lay  0.444061
12       arms  0.441666
13       died  0.441654
14     raised  0.438287


### Doc-Doc Comparisons

In [56]:
# corr() correlates columns, so transpose first to put bags on the columns;
# the result is a bag-by-bag matrix. Pearson is pandas' default method.
doc_corr = TFIDF.transpose().corr(method='pearson')

In [66]:
def get_doclist(df, doc_id, limit=15):
    """Print metadata and scores for the bags that score highest against ``doc_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Similarity matrix whose columns are looked up by bag id,
        e.g. ``doc_corr``.
    doc_id : hashable
        Column (bag id) to rank against.
    limit : int, default 15
        Maximum number of rows to print.

    Returns
    -------
    None
        Prints the matching rows of the notebook-level ``bags`` frame with an
        extra column ``w`` holding the score, highest first. The looked-up
        bag itself is not excluded. If ``doc_id`` is not a column of ``df``,
        ``"<doc_id> not in docs"`` is printed instead.

    Raises
    ------
    KeyError
        If ``bags`` has no row for one of the ranked bag ids. This is a
        metadata problem, so it is raised rather than reported as a
        missing document.
    """
    # Only the column lookup is guarded, so that a failed metadata lookup
    # below is never misreported as "<doc_id> not in docs".
    try:
        scores = df[doc_id]
    except KeyError:
        print(doc_id, 'not in docs')
        return

    # Named `top` rather than `list` so the builtin is not shadowed.
    top = scores.sort_values(ascending=False).head(limit)
    # .loc returns a new frame; assign() aligns the scores on the bag index.
    ranked = bags.loc[top.index].assign(w=top)
    print(ranked)

In [67]:
bags.loc[bags.author == 'poe']

Unnamed: 0_level_0,genre,author,book,chapter
bag_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
134,d,poe,marieroget,1
135,d,poe,ruemorgue,1
184,g,poe,pitandpendulum,1
185,g,poe,reddeath,1
186,g,poe,usher,1


In [65]:
get_doclist(doc_corr, 186) 

       genre   author            book  chapter         w
bag_id                                                  
186        g      poe           usher        1  1.000000
184        g      poe  pitandpendulum        1  0.460863
185        g      poe        reddeath        1  0.272976
135        d      poe       ruemorgue        1  0.239617
269        g  shelley    frankenstein       44  0.151190
257        g  shelley    frankenstein       32  0.146313
134        d      poe      marieroget        1  0.145544
278        g  shelley    frankenstein       53  0.142943
267        g  shelley    frankenstein       42  0.140324
265        g  shelley    frankenstein       40  0.121911
266        g  shelley    frankenstein       41  0.117814
264        g  shelley    frankenstein       39  0.116814
270        g  shelley    frankenstein       45  0.110083
275        g  shelley    frankenstein       50  0.109279
281        g  shelley    frankenstein       56  0.101768


In [None]:
# END