# Synopsis

As an aside, we demonstrate use of Pandas' correlation function, `corr()`, to get pairwise similarities of words and documents.

# Configuration

In [1]:
import os

# Location of the SQLite database read by the cells below. The default is the
# original absolute path; set the HARRYPOTTER_DB environment variable to point
# the notebook at a copy stored elsewhere without editing this cell.
db_name = os.environ.get("HARRYPOTTER_DB", "/sfs/qumulo/qhome/sk5be/DS5559/HarryPotter.db")

# Libraries

In [2]:
import sqlite3
import pandas as pd
import numpy as np

# Pragmas

In [3]:
# Render matplotlib figures inline in the notebook output.
# NOTE(review): no plotting call appears in the visible cells — confirm this pragma is still needed.
%matplotlib inline

In [5]:
# Read the whole `vocab` table, indexed by term_id.
# sqlite3's connection context manager only wraps a transaction (commit or
# rollback); it does not close the connection. Close it explicitly so the
# database handle is released as soon as the table is in memory.
db = sqlite3.connect(db_name)
try:
    vocab = pd.read_sql('SELECT * FROM vocab', db, index_col='term_id')
finally:
    db.close()

vocab.head()

Unnamed: 0_level_0,term_str,n,p,port_stem,stop,df,tf_sum,tf_mean,tf_max,tfidf_sum,tfidf_mean,tfidf_max,tfth_sum,tfth_mean,tfth_max,th_sum,th_mean,th_max
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,''just,1,8.912283e-07,''just,0,1.0,0.000246,1e-06,0.000246,0.001876,9e-06,0.001875,7.236781e-07,3.636573e-09,7.234508e-07,0.002947,1.5e-05,0.002944
1,''professor,1,8.912283e-07,''professor,0,1.0,0.000332,2e-06,0.000332,0.002534,1.3e-05,0.002534,1.273337e-06,6.398678e-09,1.273041e-06,0.003837,1.9e-05,0.003835
2,''was,1,8.912283e-07,''wa,0,1.0,0.000465,2e-06,0.000464,0.003548,1.8e-05,0.003547,2.390064e-06,1.201037e-08,2.389668e-06,0.005145,2.6e-05,0.005143
3,'a,52,4.634387e-05,'a,0,30.0,0.015691,7.9e-05,0.002221,0.042832,0.000215,0.006063,0.00261353,1.313332e-05,0.0003699704,0.166562,0.000837,0.019579
4,'aaaaaah,1,8.912283e-07,'aaaaaah,0,1.0,0.000496,2e-06,0.000496,0.003785,1.9e-05,0.003784,2.697485e-06,1.35552e-08,2.697066e-06,0.005443,2.7e-05,0.00544


# Process

In [6]:
# Load the three tables used in this section:
#   vocab - one row per term, indexed by term_id
#   tfidf - narrow table indexed by (bag_id, term_id)
#   bags  - one row per bag, indexed by bag_id
# sqlite3's connection context manager only wraps a transaction; it does not
# close the connection, so close it explicitly once all reads are done.
db = sqlite3.connect(db_name)
try:
    vocab = pd.read_sql('SELECT * FROM vocab', db, index_col='term_id')
    tfidf = pd.read_sql('SELECT * FROM tfidf_small', db, index_col=['bag_id', 'term_id'])
    bags = pd.read_sql('SELECT * FROM bag', db, index_col='bag_id')
finally:
    db.close()

## Expand TFIDF Matrix

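The TFIDF matrix is stored in narrow (long) format in the database, with one row per (`bag_id`, `term_id`) pair. We unstack it into a wide bag-by-term matrix and relabel the columns from term IDs to term strings to make the exercise easier.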

In [7]:
# Pivot the innermost index level (term_id) into columns: one row per bag.
TFIDF = tfidf.unstack()
# Unstacking leaves a two-level column index; keep only the term_id level.
term_ids = TFIDF.columns.droplevel(0)
# Relabel the columns with the term strings looked up in the vocabulary table.
TFIDF.columns = vocab.loc[term_ids, 'term_str']

In [10]:
# Preview the first rows of the wide matrix (rows: bag_id, columns: term_str).
TFIDF.head()

term_str,'a,'come,'d,'did,'dumbledore,'er,'good,'how,'just,'let,...,wrapped,wrenched,wrote,yard,yell,yelled,yelling,yellow,yesterday,younger
bag_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,1.750208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.290253,0.0,0.0,0.767374,0.0
1,0.0,0.0,1.235441,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.686069,0.0,0.0,0.0,0.0,0.580506,0.0,0.0,0.0,0.707788
2,0.0,0.0,2.676789,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.675604,0.0,0.0,0.0,0.290253,0.0,0.566459,1.534748,0.0
3,0.0,0.0,1.441348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.580506,0.0,0.0,0.0,0.0
4,0.0,0.0,1.853162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.372138,0.0,0.626755,0.0,0.0,0.0,0.0,0.566459,0.767374,0.0


## Compute Similarities

Useful discussion of the relationship between cosine similarity and correlation:

[Brendan T. O'Connor on Cosine similarity, Pearson correlation, and OLS coefficients](https://brenocon.com/blog/2012/03/cosine-similarity-pearson-correlation-and-ols-coefficients/)

### Word-Word Comparisons

In [11]:
# Pairwise correlation between the term columns of TFIDF (`corr()` defaults to
# Pearson), giving a square term-by-term matrix.
term_corr = TFIDF.corr()

In [12]:
# Preview the first rows of the term-by-term correlation matrix.
term_corr.head()

term_str,'a,'come,'d,'did,'dumbledore,'er,'good,'how,'just,'let,...,wrapped,wrenched,wrote,yard,yell,yelled,yelling,yellow,yesterday,younger
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'a,1.0,0.326774,0.035933,0.345414,0.424231,0.327078,0.296025,0.449304,0.349865,0.302013,...,0.040831,0.055389,-0.055591,-0.057057,0.037726,-0.039531,-0.000275,-0.091764,0.100431,-0.1348
'come,0.326774,1.0,0.069805,0.460285,0.47415,0.448596,0.611282,0.522061,0.424987,0.477513,...,0.150609,0.176215,0.099595,-0.054896,0.068281,0.090447,0.043589,-0.057321,-0.11645,-0.130803
'd,0.035933,0.069805,1.0,0.189461,0.132064,0.088373,0.018583,0.104577,0.112088,0.015813,...,0.068951,-0.045883,-0.059218,-0.006588,0.06486,-0.039285,-0.058986,0.089363,0.070223,-0.098231
'did,0.345414,0.460285,0.189461,1.0,0.700442,0.390625,0.468463,0.554776,0.495897,0.406795,...,0.031464,0.120356,-0.018499,-0.038035,0.079179,0.01992,-0.051088,-0.100151,-0.035772,-0.103093
'dumbledore,0.424231,0.47415,0.132064,0.700442,1.0,0.4607,0.338704,0.615089,0.420307,0.41166,...,0.034112,0.132858,0.018261,-0.08488,0.134226,0.033431,0.01034,-0.110534,-0.009409,-0.067469


In [13]:
def get_termlist(df, term_str, limit=15):
    """Print the rows of ``df`` that score highest in the ``term_str`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Similarity matrix whose columns are looked up by label
        (for example ``term_corr``).
    term_str : str
        Column label to rank by.
    limit : int, default 15
        Number of top-ranked rows to print.

    Returns
    -------
    None
        The ranked table is printed, highest score first. If a ``KeyError``
        is raised (``term_str`` is not a column of ``df``), the message
        ``"<term_str> not in vocab"`` is printed instead.
    """
    try:
        scores = df[term_str]
        ranked = scores.sort_values(ascending=False)
        # reset_index() turns the row labels into an ordinary column so the
        # printed table shows a 0..limit-1 rank next to each label.
        top = ranked.head(limit).reset_index()
        print(top)
    except KeyError:
        print(term_str, 'not in vocab')

In [20]:
# Look up the terms most correlated with 'harry'. The recorded output shows the
# term is not a column of term_corr.
# NOTE(review): presumably tfidf_small holds a reduced vocabulary — confirm.
get_termlist(term_corr, 'harry')

harry not in vocab


In [17]:
# Top 15 terms by correlation with 'knowledge'; the term itself ranks first at 1.0.
get_termlist(term_corr, 'knowledge')

      term_str  knowledge
0    knowledge   1.000000
1        cared   0.372325
2      subject   0.355079
3   discovered   0.328046
4      perhaps   0.321031
5        'that   0.302534
6        chose   0.298926
7      neither   0.293932
8    portraits   0.285299
9       dreams   0.281030
10     'sirius   0.280732
11       known   0.268305
12   attempted   0.268217
13    evidence   0.261986
14     learned   0.251437


In [18]:
# Top 15 terms by correlation with 'murder'; the term itself ranks first at 1.0.
get_termlist(term_corr, 'murder')

      term_str    murder
0       murder  1.000000
1   understood  0.321534
2       indeed  0.309073
3      walking  0.285827
4     startled  0.239590
5     decision  0.232935
6      discuss  0.231606
7        grave  0.224179
8       closer  0.222892
9      village  0.217540
10  difference  0.216600
11    listened  0.214316
12     windows  0.211851
13    breathed  0.211032
14        soul  0.210213


In [19]:
# Look up 'death'. The recorded output shows it is not a column of term_corr.
# NOTE(review): presumably excluded from the reduced tfidf_small vocabulary — confirm.
get_termlist(term_corr, 'death')

death not in vocab


### Doc-Doc Comparisons

In [21]:
# Transpose so that bags become the columns; `corr()` then correlates every
# pair of bags, giving a square bag-by-bag matrix.
doc_corr = TFIDF.T.corr()

In [22]:
def get_doclist(df, doc_id, limit=15, bag_table=None):
    """Print the bags that score highest against ``doc_id``, with their metadata.

    Parameters
    ----------
    df : pandas.DataFrame
        Similarity matrix whose columns are looked up by bag id
        (for example ``doc_corr``).
    doc_id : hashable
        Column label (bag id) to rank by.
    limit : int, default 15
        Number of top-ranked bags to print.
    bag_table : pandas.DataFrame, optional
        Metadata frame indexed by bag id. Defaults to the notebook-level
        ``bags`` frame, which keeps existing calls working unchanged.

    Returns
    -------
    None
        Prints the metadata rows of the top-ranked bags with the score added
        as column ``w``. If ``doc_id`` is not a column of ``df``, prints
        ``"<doc_id> not in docs"`` instead.

    Raises
    ------
    KeyError
        If a top-ranked bag id is missing from ``bag_table``. This was
        previously reported as "not in docs", which hid the real cause.
    """
    if bag_table is None:
        # Fall back to the frame loaded earlier in the notebook.
        bag_table = bags

    # Guard only the column lookup, so that "not in docs" is printed solely
    # when doc_id really is absent from the similarity matrix.
    try:
        scores = df[doc_id]
    except KeyError:
        print(doc_id, 'not in docs')
        return

    top = scores.sort_values(ascending=False).head(limit)
    # Copy so that adding the score column never writes into the shared frame.
    report = bag_table.loc[top.index].copy()
    report['w'] = top
    print(report)

In [25]:
# Bags most correlated with bag 186, printed with their book and chapter numbers.
get_doclist(doc_corr, 186) 

        book_num  chap_num         w
bag_id                              
186            6        24  1.000000
168            6         6  0.337443
181            6        19  0.274829
136            5         4  0.251899
98             4         4  0.251011
166            6         4  0.239514
189            6        27  0.234159
191            6        29  0.220336
116            4        22  0.213018
146            5        14  0.212689
61             3         4  0.205929
55             2        20  0.201717
36             2         1  0.199822
193            6        31  0.197856
172            6        10  0.196509


In [26]:
# Show only the first ten rows of the bag-by-bag correlation matrix rather
# than the whole frame.
doc_corr.head(10)

bag_id,0,1,2,3,4,5,6,7,8,9,...,189,190,191,192,193,194,195,196,197,198
bag_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.000000,0.141837,0.188669,0.119322,0.136947,0.121790,0.089076,0.040480,0.130508,0.072226,...,0.203513,-0.008207,0.161591,0.003824,0.049512,0.054520,-0.009519,0.039201,0.036537,0.084325
1,0.141837,1.000000,0.148974,0.054696,0.153774,0.115819,0.177364,0.166011,0.171491,0.134760,...,0.104016,-0.017952,0.086448,-0.029190,0.073833,-0.050228,-0.010890,-0.020085,0.018688,0.073094
2,0.188669,0.148974,1.000000,0.164834,0.203024,0.138258,0.022630,0.121066,0.149762,0.166102,...,0.086658,-0.012979,0.102751,-0.038960,0.002276,0.017563,-0.018264,-0.048576,-0.026768,0.012042
3,0.119322,0.054696,0.164834,1.000000,0.569806,0.050384,0.029903,0.154142,0.061768,0.070392,...,0.061018,-0.032525,0.024247,0.087588,0.009523,-0.026612,-0.015601,-0.012115,0.010593,0.024647
4,0.136947,0.153774,0.203024,0.569806,1.000000,0.226298,0.084835,0.176183,0.115519,0.113850,...,0.116995,0.031061,0.125509,0.079475,0.064359,-0.013380,0.002383,-0.013984,0.013647,0.068502
5,0.121790,0.115819,0.138258,0.050384,0.226298,1.000000,0.186000,0.181971,0.151267,0.176770,...,0.150982,0.000840,0.154977,-0.011623,0.077322,0.101965,-0.039205,-0.006850,0.072173,0.255389
6,0.089076,0.177364,0.022630,0.029903,0.084835,0.186000,1.000000,0.298343,0.163617,0.180713,...,0.085502,0.037166,0.092742,0.069620,0.063435,-0.031243,-0.021221,-0.015560,0.079253,0.065758
7,0.040480,0.166011,0.121066,0.154142,0.176183,0.181971,0.298343,1.000000,0.177003,0.172366,...,0.096141,0.003880,0.087280,0.008462,0.034541,-0.068795,0.063608,-0.008736,0.045434,0.048675
8,0.130508,0.171491,0.149762,0.061768,0.115519,0.151267,0.163617,0.177003,1.000000,0.299784,...,0.119170,0.018958,0.174046,0.006257,0.127844,-0.068256,-0.024435,-0.020809,0.043173,0.073413
9,0.072226,0.134760,0.166102,0.070392,0.113850,0.176770,0.180713,0.172366,0.299784,1.000000,...,0.120767,0.036764,0.128959,0.057544,0.088721,-0.045010,-0.046838,-0.038589,0.068941,0.064732


In [None]:
# END