# References

This notebook was derived from a copy of the M05_01_BOW_TFIDF.ipynb file provided by Dr. Alvarado as starter code.

# Metadata

```
Course:   DS 5001 
Module:   05 Lab
Topic:    BOW, Vector Spaces, and TFIDF
Author:   R.C. Alvarado
```

# Overview

In this notebook, we explore Luhn's concept of term significance in light of Zipf's Law, TFIDF, and vector space models of text. 

Recall Luhn's (1958) representation of the problem:

<img src="./data_in/luhn.png"/>

# Set Up

In [1]:
data_home = "../data"
local_lib = "../lib"
data_prefix = 'austen-melville'

In [2]:
OHCO = ['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']

In [3]:
SENTS = OHCO[:4]
PARAS = OHCO[:3]
CHAPS = OHCO[:2]
BOOKS = OHCO[:1]

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly_express as px
import sys

In [5]:
sys.path

['/Users/nkeeley/ETA/M5',
 '/Users/nkeeley/opt/anaconda3/lib/python38.zip',
 '/Users/nkeeley/opt/anaconda3/lib/python3.8',
 '/Users/nkeeley/opt/anaconda3/lib/python3.8/lib-dynload',
 '',
 '/Users/nkeeley/.local/lib/python3.8/site-packages',
 '/Users/nkeeley/opt/anaconda3/lib/python3.8/site-packages',
 '/Users/nkeeley/opt/anaconda3/lib/python3.8/site-packages/aeosa',
 '/Users/nkeeley/opt/anaconda3/lib/python3.8/site-packages/IPython/extensions',
 '/Users/nkeeley/.ipython']

In [6]:
sns.set()

# Assignment: Get Data

In [7]:
## Same code below

# Acquire Data

We grab our **analytical edition** of a selection of works from Austen and Melville.

In [8]:
LIB = pd.read_csv(f"{data_home}/output/{data_prefix}-LIB.csv").set_index(BOOKS)
CORPUS = pd.read_csv(f"{data_home}/output/{data_prefix}-CORPUS.csv").set_index(OHCO)
VOCAB = pd.read_csv(f"{data_home}/output/{data_prefix}-VOCAB.csv").set_index('term_str')
VOCAB = VOCAB.drop('cat_pos', 1) # Drop this column for readability

FileNotFoundError: [Errno 2] No such file or directory: '../data/output/austen-melville-LIB.csv'

In [9]:
LIB.head()

Unnamed: 0_level_0,source_file_path,author,title,chap_regex,book_len,n_chaps
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
105,../data/gutenberg/austen-melville-set/AUSTEN_J...,"AUSTEN, JANE",PERSUASION,^Chapter\s+\d+$,83624,24
121,../data/gutenberg/austen-melville-set/AUSTEN_J...,"AUSTEN, JANE",NORTHANGER ABBEY,^CHAPTER\s+\d+$,77601,31
141,../data/gutenberg/austen-melville-set/AUSTEN_J...,"AUSTEN, JANE",MANSFIELD PARK,^CHAPTER\s+[IVXLCM]+$,160378,48
158,../data/gutenberg/austen-melville-set/AUSTEN_J...,"AUSTEN, JANE",EMMA,^\s*CHAPTER\s+[IVXLCM]+\s*$,160926,55
161,../data/gutenberg/austen-melville-set/AUSTEN_J...,"AUSTEN, JANE",SENSE AND SENSIBILITY,^CHAPTER\s+\d+$,119873,50


In [10]:
VOCAB.head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,n_pos,stop,stem_porter,stem_snowball,stem_lancaster
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2,1,1e-06,19.919781,CD,1,0,0,0,0
1,21,1,1.1e-05,16.527464,CD,3,0,1,1,1
10,6,2,3e-06,18.334819,CD,1,0,10,10,10
100,2,3,1e-06,19.919781,CD,1,0,100,100,100
1000,2,4,1e-06,19.919781,CD,1,0,1000,1000,1000


In [89]:
CORPUS.iloc[100000:1000001]
CORPUS.index.names

FrozenList(['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num'])

# Assignment: Generate BOW Function

In [12]:
## Inputs: TOKENS (CORPUS) dataframe; a choice of bag (OHCO) level

def create_BOW(tokens, level):
    
    ## Inputs
    
    bag = level
    
    ## Create BOW table from tokens table
    
    bow = tokens.groupby(bag+['term_str']).term_str.count().to_frame('n') 
    
    ## Return values
    
    return pd.DataFrame(bow)

In [13]:
## Test out the BOW function

%%time
a=create_BOW(CORPUS, SENTS)
#a.query("chap_id==3 & sent_num == 2")
#a.n.unstack()
temp=a.reset_index()
len(temp.book_id.unique())
a.shape
a.head(50)
#temp.sort_values(by=["book_id","chap_id","para_num", "sent_num"]).head(30)
a.n.unstack().shape[0]

CPU times: user 8.39 s, sys: 9.33 s, total: 17.7 s
Wall time: 21.9 s


90361

In [17]:
#a.loc[(105,1,2,0)].head()
a.count()

n    1710128
dtype: int64

In [32]:
## Inputs: BOW table, measure

def create_TFIDF(bow, tf_method):
    
    ## Generate DTCM
    
    DTCM = bow.n.unstack()
    
    ## Generate TF
    
    if tf_method == 'sum':
        TF = DTCM.T / DTCM.T.sum()
    elif tf_method == 'max':
        TF = DTCM.T / DTCM.T.max()
    elif tf_method == 'log':
        TF = np.log10(DTCM.T + 1)
    elif tf_method == 'raw':
        TF = DTCM.T
    elif tf_method == 'bool':
        TF = DTCM.T.astype('bool') #.astype('int')
    TF = TF.T
    
    ## Generate IDF
    
    N=DTCM.shape[0] # The number of chosen bags
    DF=DTCM.count() # Vector containing the number of bags containing any term in vocabulary
    IDF = np.log10(N / DF)
    
    ## Compute TFIDF
    
    TFIDF=TF*IDF
    
    return TFIDF


In [33]:
test1 =create_TFIDF(a, tf_method="sum")

In [34]:
test1


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,term_str,0,1,10,100,1000,10000,10440,10800,10th,118952,...,zoroaster,zozo,zuma,zur,à,æneas,æniad,æson,æsops,ł20000
book_id,chap_id,para_num,sent_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
105,1,1,0,,,,,,,,,,,...,,,,,,,,,,
105,1,1,1,,,,,,,,,,,...,,,,,,,,,,
105,1,2,0,,,,,,,,,,,...,,,,,,,,,,
105,1,3,0,,0.232846,,,,,,,,,...,,,,,,,,,,
105,1,3,1,,0.095526,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34970,114,19,3,,,,,,,,,,,...,,,,,,,,,,
34970,114,20,0,,,,,,,,,,,...,,,,,,,,,,
34970,114,20,1,,,,,,,,,,,...,,,,,,,,,,
34970,114,21,0,,,,,,,,,,,...,,,,,,,,,,


## Question 2

In [38]:
## TFIDF mean top 20 in corpus using bag: book, max method

bow1=create_BOW(CORPUS, BOOKS)
bow1.head(30)
tfidf1=create_TFIDF(bow1, "max")

In [49]:
## Explore

result1=tfidf1.apply(np.mean, axis=0)
result1.sort_values(ascending=False).head(20).index

Index(['elinor', 'vernon', 'darcy', 'reginald', 'frederica', 'crawford',
       'elliot', 'weston', 'pierre', 'knightley', 'tilney', 'elton', 'bingley',
       'wentworth', 'courcy', 'woodhouse', 'churchhill', 'marianne',
       'babbalanja', 'mainwaring'],
      dtype='object', name='term_str')

NameError: name 'np' is not defined

## Question 3

In [51]:
## TFIDF mean top 20 in corpus using bag: paragraph, max method

bow2=create_BOW(CORPUS, PARAS)
bow2.head(30)
tfidf2=create_TFIDF(bow2, "max")

In [54]:
## Explore

result2=tfidf2.apply(np.mean, axis=0)
result2.sort_values(ascending=False).head(20).index
result2.head(20)

term_str
0         4.467786
1         1.107899
10        1.208855
100       0.520844
1000      0.992085
10000     0.624271
10440     1.489262
10800     0.487023
10th      4.166756
118952    0.203081
11th      4.166756
12        2.692202
125000    0.893557
12th      4.467786
13        0.744631
13000     1.116946
139       0.636588
1399      0.372315
13th      3.990664
140       0.248210
dtype: float64

## Question 4

In [91]:
## Filter IDs

LIB.shape # 18 books
Austen_ID=list(LIB[LIB.author=="AUSTEN, JANE"].index)
Austen_ID
Melville_ID=list(LIB[LIB.author=="MELVILLE, HERMAN"].index)
Austen_ID

[105, 121, 141, 158, 161, 946, 1212, 1342]

In [96]:
## Flatten CORPUS

CORPUS=CORPUS.reset_index()

In [97]:
## Make new corpuses

Corpus_A=CORPUS[CORPUS.book_id.isin(Austen_ID)]
Corpus_A
Corpus_B=CORPUS[CORPUS.book_id.isin(Melville_ID)]
Corpus_B

Unnamed: 0,index,book_id,chap_id,para_num,sent_num,token_num,pos_tuple,pos,token_str,term_str
780873,780873,1900,1,0,0,0,"('THE', 'DT')",DT,THE,the
780874,780874,1900,1,0,0,1,"('SEA', 'NNP')",NNP,SEA,sea
780875,780875,1900,1,0,0,2,"('LONGINGS', 'NNP')",NNP,LONGINGS,longings
780876,780876,1900,1,0,0,3,"('FOR', 'NNP')",NNP,FOR,for
780877,780877,1900,1,0,0,4,"('SHORE', 'NNP')",NNP,SHORE,shore
...,...,...,...,...,...,...,...,...,...,...
1984031,1984031,34970,114,24,0,6,"('The', 'DT')",DT,The,the
1984032,1984032,34970,114,24,0,7,"('Ambiguities,', 'NNP')",NNP,"Ambiguities,",ambiguities
1984033,1984033,34970,114,24,0,8,"('by', 'IN')",IN,by,by
1984034,1984034,34970,114,24,0,9,"('Herman', 'NNP')",NNP,Herman,herman


In [131]:
## Make corpuses narrow again

names=['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']
Corpus_A=Corpus_A.drop("index", axis=1)

KeyError: "['index'] not found in axis"

In [None]:
Corpus_A=Corpus_A.set_index(names)

In [137]:
names=['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']
Corpus_B=Corpus_B.drop("index", axis=1)
Corpus_B=Corpus_B.set_index(names)

In [138]:
Corpus_B

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1900,1,0,0,0,"('THE', 'DT')",DT,THE,the
1900,1,0,0,1,"('SEA', 'NNP')",NNP,SEA,sea
1900,1,0,0,2,"('LONGINGS', 'NNP')",NNP,LONGINGS,longings
1900,1,0,0,3,"('FOR', 'NNP')",NNP,FOR,for
1900,1,0,0,4,"('SHORE', 'NNP')",NNP,SHORE,shore
...,...,...,...,...,...,...,...,...
34970,114,24,0,6,"('The', 'DT')",DT,The,the
34970,114,24,0,7,"('Ambiguities,', 'NNP')",NNP,"Ambiguities,",ambiguities
34970,114,24,0,8,"('by', 'IN')",IN,by,by
34970,114,24,0,9,"('Herman', 'NNP')",NNP,Herman,herman


In [160]:
## Generate by author TFIDF

bow_A=create_BOW(Corpus_A, CHAPS)
bow_A.head(30)
tfidf_A=create_TFIDF(bow_A, "max")

## Explore

result_A=tfidf_A.apply(np.mean, axis=0)
result_A.sort_values(ascending=False).head(20).index
result_A=pd.DataFrame(result_A)
result_A=result_A.rename(columns={0:"tfidf"})
result_A
result_A=result_A.reset_index()
result_A=result_A.sort_values(by="term_str")
result_A["max_pos"]=max_pos_A.max_pos
result_A=result_A.sort_values(["tfidf"],ascending=False)
result_A[result_A.max_pos=="JJ"]

Unnamed: 0,term_str,tfidf,max_pos
13659,undismayed,0.630937,JJ
9876,precarious,0.229431,JJ
4139,dreary,0.210312,JJ
9530,perverted,0.180268,JJ
4631,eoconomical,0.148456,JJ
...,...,...,...
9443,perfidious,0.007109,JJ
13849,unpolluted,0.007109,JJ
9468,pernicious,0.007109,JJ
13908,unsimpathetic,0.007109,JJ


In [161]:
## Generate by author TFIDF

bow_B=create_BOW(Corpus_B, CHAPS)
bow_B.head(30)
tfidf_B=create_TFIDF(bow_B, "max")

## Explore

result_B=tfidf_B.apply(np.mean, axis=0)
result_B.sort_values(ascending=False).head(20).index
result_B=pd.DataFrame(result_B)
result_B=result_B.rename(columns={0:"tfidf"})
result_B
result_B=result_B.reset_index()
result_B=result_B.sort_values(by="term_str")
result_B["max_pos"]=max_pos_B.max_pos
result_B=result_B.sort_values(["tfidf"],ascending=False)
result_B[result_B.max_pos=="JJ"]

Unnamed: 0,term_str,tfidf,max_pos
19638,manchineels,0.450680,JJ
12861,forereaching,0.225340,JJ
25777,quoggy,0.225340,JJ
34437,unpolluted,0.225340,JJ
16337,incestuous,0.220186,JJ
...,...,...,...
34038,unentered,0.005051,JJ
2237,austere,0.005051,JJ
28616,serried,0.005051,JJ
33898,underbush,0.004599,JJ


In [140]:
tfidf_A

Unnamed: 0_level_0,term_str,0,1,10,10000,10th,11th,12,12th,1399,13th,...,youthful,youths,yrs,z,zeal,zealous,zealously,zephyr,zigzags,ł20000
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
105,1,,0.03585,,,,,,,,,...,0.01307,,,0.020353,,,,,,
105,2,,,,,,,,,,,...,,,,,,0.01857,,,,
105,3,,,,,,,,,,,...,,,,,0.011138,,,,,
105,4,,,,,,,,,,,...,,,,,,,,,,
105,5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1342,57,,,,,,,,,,,...,,,,,,,,,,
1342,58,,,,,,,,,,,...,,,,,,,,,,
1342,59,,,,,,,,,,,...,,,,,,,,,,
1342,60,,,,,,,,,,,...,,,,,,,,,,


In [121]:
## Generate max pos

max_pos_A=Corpus_A.groupby("term_str").agg({"pos":"max"})
max_pos_A.rename(columns={"pos":"max_pos"}, inplace=True)
max_pos_A=max_pos_A.reset_index()
max_pos_A=max_pos_A.sort_values(by="term_str")
max_pos_A

max_pos_B=Corpus_B.groupby("term_str").agg({"pos":"max"})
max_pos_B.rename(columns={"pos":"max_pos"}, inplace=True)
max_pos_B=max_pos_B.reset_index()
max_pos_B=max_pos_B.sort_values(by="term_str")

max_pos_B

Unnamed: 0,term_str,max_pos
0,1,NNP
1,10,CD
2,100,CD
3,1000,CD
4,10000,CD
...,...,...
36831,à,NNP
36832,æneas,NN
36833,æniad,NN
36834,æson,NN


In [129]:
## Create vocabs and merge with max_pos

VOCAB_A = Corpus_A.term_str.value_counts().to_frame('n').reset_index().rename(columns={'index':'term_str'})
VOCAB_A.index.name = 'term_id'
VOCAB_A=VOCAB_A.sort_values(by="term_str")
VOCAB_A["max_pos"]=max_pos_A.max_pos
VOCAB_A

VOCAB_B = Corpus_B.term_str.value_counts().to_frame('n').reset_index().rename(columns={'index':'term_str'})
VOCAB_B.index.name = 'term_id'
VOCAB_B=VOCAB_B.sort_values(by="term_str")
VOCAB_B["max_pos"]=max_pos_B.max_pos
VOCAB_B

Unnamed: 0_level_0,term_str,n,max_pos
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5675,1,18,VBN
23386,10,2,NNP
24024,100,2,NNP
24473,1000,2,VB
32142,10000,1,NNS
...,...,...,...
14101,à,5,RB
27767,æneas,1,NN
30059,æniad,1,NNP
23217,æson,2,NN


In [None]:
## Check the vocabs to see most common POS

18

## Gather POS information

In [2]:
pos_info = f'{data_home}/misc/upenn_tagset.txt'
POS = pd.DataFrame([(line.split()[0], ' '.join(line.split()[1:])) 
                    for line in open(pos_info, 'r').readlines()])
# POS.head()

NameError: name 'data_home' is not defined

In [3]:
POS.columns = ['pos_id', 'pos_def']
POS.head()

NameError: name 'POS' is not defined

In [4]:
POS = POS.set_index('pos_id')
POS['pos_group'] = POS.apply(lambda x: x.name[:2], 1)

NameError: name 'POS' is not defined

In [5]:
POS.head()

NameError: name 'POS' is not defined

In [6]:
VOCAB['max_pos_group'] =  VOCAB.max_pos.apply(lambda x: x[:2])
CORPUS['pos_group'] = CORPUS.pos.apply(lambda x: x[:2])

NameError: name 'VOCAB' is not defined

# Zipf's Law

First, we explore Zipf's Law, which may be expressed by $f \propto \frac{1}{r}$ and the constant $k =  fr$.

Essentially, it says the frequency of tokens in a language is logarithmic. It is a property of all known human languages. 

Specifically, it says the tokens can be ranked $r = 1,2,3,4, \dots ,N$ where (1) the rank is inversely proportional to the frequency, i.e. the lower the rank number, the higher the frequency, and (2) the frequency of a rank $r$ word is $1/r$ times that the most frequent word $r = 1$. So, the rank 2 word occurs half as often as the rank 1 word, the rank 3 word one-third as often, the rank 4 word one-fourth as often, and so forth. The law breaks down at around $r = 1000$.

The law turns out to need some tweaking. 

Here we try to reproduce it with our data.

In [7]:
VOCAB.n.sort_values().plot(logy=True, style='.', rot=45);

NameError: name 'VOCAB' is not defined

## Add Term Rank $r$ to `VOCAB`

In [None]:
if 'term_rank' not in VOCAB.columns:
    VOCAB = VOCAB.sort_values('n', ascending=False).reset_index()
    VOCAB.index.name = 'term_rank'
    VOCAB = VOCAB.reset_index()
    VOCAB['term_rank'] = VOCAB['term_rank'] + 1
    VOCAB = VOCAB.set_index('term_str')

In [None]:
VOCAB.term_rank.plot(logx=False, rot=45, title="Y = Rank");

## Alternate Rank

Note this the above method does not group words with the same frequency, and thus arbitrarily assigns ranks for words with the same frequency. So we try coming up with rank as a grouping feature.

In [None]:
new_rank = VOCAB.n.value_counts()\
    .sort_index(ascending=False).reset_index().reset_index()\
    .rename(columns={'level_0':'term_rank2', 'index':'n', 'n':'nn'})\
    .set_index('n')

In [None]:
new_rank
VOCAB.head()

In [None]:
VOCAB['term_rank2'] = VOCAB.n.map(new_rank.term_rank2) + 1

In [None]:
VOCAB.term_rank2.plot(logx=False, rot=45, title="Y = Rank");

## Compute Zipf's $k$

We compute $k$ using both types of rank. Note that this *should* be a constant, but it is not. This is a function of the data size (support) and the fact that Zipf's law is not perfect. 

In [None]:
VOCAB['zipf_k'] = VOCAB.n * VOCAB.term_rank
VOCAB['zipf_k2'] = VOCAB.n * VOCAB.term_rank2

In [None]:
VOCAB.zipf_k.plot(style=',', rot=45);

In [None]:
VOCAB.zipf_k2.plot(style=',', rot=45);

##  Rank and N

In [None]:
px.scatter(VOCAB.reset_index(), 
           x='term_rank2', y='n', 
           log_y=False, log_x=False,
           hover_name='term_str',
           color='max_pos_group',
           height=500, width=800)

## Demo Rank Index

In [None]:
rank_index = [1, 2, 3, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 125, 150, 175, 200, 225, 250, 275, 300, 325, 350, 375, 400, 450, 500, 550, 600, 700, 800]
demo = VOCAB.loc[VOCAB.term_rank2.isin(rank_index), 
                 ['term_rank2', 'n', 'zipf_k2', 'max_pos']].head(50)

In [None]:
demo.style.background_gradient(cmap='YlGnBu', high=.5)

# BOW

In [None]:
bag = CHAPS

In [None]:
# count_method = 'n'      # 'c' or 'n' # n = n tokens, c = distinct token (term) count
# tf_method = 'sum'       # sum, max, log, double_norm, raw, binary
# tf_norm_k = .5          # only used for double_norm
# idf_method = 'standard' # standard, max, smooth
gradient_cmap = 'GnBu'  # YlGn, GnBu, YlGnBu; For tables; see https://matplotlib.org/3.1.0/tutorials/colors/colormaps.html 

In [None]:
BOW = CORPUS.groupby(bag+['term_str']).term_str.count().to_frame('n') 

In [None]:
# BOW['c'] = BOW.n.astype('bool').astype('int')

In [None]:
BOW.head()

In [None]:
BOW.loc[(105,2)].sort_values('n', ascending=False).head(10)

## Document-Term Matrix

We create a document-term count matrix. Note that we can create a matrix for any of the features in BOW. Also, see how the OHCO helps us distinguish between features and observation identity.

Note, these operations are slower than using `groupby()`.

## Create Count Matrix

In [None]:
# DTCM = BOW[count_method].unstack() #.fillna(0).astype('int')

In [None]:
DTCM = BOW.n.unstack() #.fillna(0).astype('int')

In [None]:
DTCM.head(10)

# TFIDF

## Compute TF

In [None]:
tf_method = 'sum' # sum, max, log, double_norm, raw, binary

In [None]:
print('TF method:', tf_method)
if tf_method == 'sum':
    TF = DTCM.T / DTCM.T.sum()
elif tf_method == 'max':
    TF = DTCM.T / DTCM.T.max()
elif tf_method == 'log':
    TF = np.log10(DTCM.T + 1)
elif tf_method == 'raw':
    TF = DTCM.T
elif tf_method == 'bool':
    TF = DTCM.T.astype('bool') #.astype('int')
TF = TF.T

In [None]:
TF.head()

## Compute DF

In [None]:
# DF = DTCM[DTCM > 0].count()
DF = DTCM.count() # THIS WORKS IF WE KEPT NULLS IN DTCM

In [None]:
DF

## Compute IDF

In [None]:
idf_method = 'standard' # standard, max, smooth

In [None]:
N = DTCM.shape[0]

In [None]:
print('IDF method:', idf_method)
if idf_method == 'standard':
    IDF = np.log10(N / DF)
elif idf_method == 'max':
    IDF = np.log10(DF.max() / DF) 
elif idf_method == 'smooth':
    IDF = np.log10((1 + N) / (1 + DF)) + 1

In [None]:
IDF

## Compute TFIDF

In [None]:
TFIDF = TF * IDF

In [None]:
TFIDF.head()

## Move things to their places

In [None]:
VOCAB['df'] = DF
VOCAB['idf'] = IDF

In [None]:
VOCAB.head()

In [None]:
BOW['tf'] = TF.stack()
BOW['tfidf'] = TFIDF.stack()

In [None]:
BOW.head()

# Aggregate TFIDF

## Add aggregates to VOCAB

In [None]:
VOCAB['tfidf_mean'] = TFIDF[TFIDF > 0].mean().fillna(0) # EXPLAIN
VOCAB['tfidf_max'] = TFIDF.max()
VOCAB.head()

In [2]:
VOCAB['dfidf'] = VOCAB.df * VOCAB.idf

NameError: name 'VOCAB' is not defined

In [None]:
px.scatter(VOCAB.reset_index(), x='term_rank2', y='dfidf', color='max_pos', hover_name='term_str', height=500, width=800)

**What does `DFIDF` look like?**

## Observe results

In [None]:
my_cols = "n term_rank2 zipf_k2 max_pos tfidf_mean tfidf_max dfidf".split()

In [None]:
VOCAB[my_cols].sort_values('term_rank2', ascending=True).head(100).style.background_gradient(cmap=gradient_cmap)

In [None]:
VOCAB[my_cols].sort_values('tfidf_mean', ascending=False).head(50)\
    .style.background_gradient(cmap=gradient_cmap)

In [None]:
VOCAB[my_cols].sort_values('dfidf', ascending=False).head(50)\
    .style.background_gradient(cmap=gradient_cmap)

In [None]:
BOW.sort_values('tfidf', ascending=False).head(20).style.background_gradient(cmap=gradient_cmap, high=.5)

## More Visualizations

#### Rank and TFIDF Mean

In [None]:
px.scatter(VOCAB.reset_index(), 
           x='term_rank2', y='tfidf_mean', 
           color='max_pos', size='n_pos',
           hover_name='term_str', hover_data=['n','i'],
           log_y=True, log_x=False,
#            height=500, width=800
          )

In [None]:
px.scatter(VOCAB.reset_index(), 
           x='term_rank2', y='dfidf', 
           color='max_pos', size='n_pos',
           hover_name='term_str', hover_data=['n','i'],
#            height=500, width=800
          )

#### Show Demo Table with TFIDF

In [None]:
demo2 = VOCAB.loc[VOCAB.term_rank2.isin(rank_index), my_cols]

In [None]:
demo2.style.background_gradient(cmap='YlGnBu')

## Reduce VOCAB

In [None]:
key_col = 'dfidf'
key_quantile = .9
key_min = VOCAB[key_col].quantile(key_quantile)

In [None]:
SIGS = VOCAB.loc[VOCAB[key_col] >= key_min].sort_values(key_col, ascending=False)

In [None]:
SIGS.shape[0]

In [None]:
SIGS[my_cols].sample(100).style.background_gradient(cmap=gradient_cmap)

# Save Work

In [None]:
VOCAB.to_csv(f'{data_home}/output/{data_prefix}-VOCAB2.csv') # USED IN HW
CORPUS.to_csv(f'{data_home}/output/{data_prefix}-CORPUS2.csv')
BOW.to_csv(f'{data_home}/output/{data_prefix}-BOW.csv')
DTCM.to_csv(f'{data_home}/output/{data_prefix}-DTCM.csv')

In [None]:
TFIDF.to_csv(f'{data_home}/output/{data_prefix}-TFIDF.csv')
SIGS.to_csv(f'{data_home}/output/{data_prefix}-SIGS.csv')
WCM.to_csv(f'{data_home}/output/{data_prefix}-WCM.csv')

In [None]:
TFIDF[SIGS.index].to_csv('{}/{}-TFIDF_REDUCED.csv'.format(data_out, data_prefix))