# TFIDF

In [1]:
import pandas as pd
import numpy as np
import pickle
# We need this path entry so that collection_vocabulary.py can be found here; otherwise the col.pkl object cannot be unpickled.
import sys
sys.path.append('../0_Collection_and_Inverted_Index/')
# NOTE(review): pickle can execute arbitrary code while loading - only load pickle files produced by this project.
# The file handle is called col_file (not input) so that the builtin input() is not shadowed for the rest of the session.
with open('../0_Collection_and_Inverted_Index/pickle/col.pkl', 'rb') as col_file:
    col = pickle.load(col_file)
inverted_index = pd.read_pickle('../0_Collection_and_Inverted_Index/pickle/inverted_index.pkl')

### IDF

In [2]:
# Document frequency: for every term (row), count the documents (columns) in which it occurs at least once.
df = inverted_index.gt(0).sum(axis="columns")
# Raw inverse document frequency: collection size divided by document frequency.
raw_idf = col.collection_size / df
raw_idf.tail()

zucchini               3633.0
zugesetztem            3633.0
zusatzstoffe-online    3633.0
zygote                 3633.0
zymography             1816.5
dtype: float64

In [3]:
# Log-scale the raw idf (base 10) to obtain the final idf weight of every term.
idf= np.log10(raw_idf)
# Persist the collection-wide idf weights so that they can be reused for queries later.
idf.to_pickle('pickle/idf.pkl')
idf.tail()

zucchini               3.560265
zugesetztem            3.560265
zusatzstoffe-online    3.560265
zygote                 3.560265
zymography             3.259235
dtype: float64

In [4]:
# Sanity check: the largest raw idf score (reached by a term that occurs in exactly one document)
# should be equal to the number of docs in the collection...
raw_idf.max() == col.collection_size

True

In [5]:
# Sanity check: ... and max idf score should be substantially lower
# (idf is a Series, so a single max() already yields the scalar maximum)
idf.max()

3.5602653978627146

### TF
Raw term frequency is what we obtain when we look column-wise at the *inverted_index* dataframe (one column per document).
As discussed in the lecture, we normalize this frequency by the raw frequency of the most frequent term in each document. We also take the logarithm (any base will do the job), since we assume that relevance does not increase linearly with term frequency. Concretely, the cells below compute $tf_{t,d} = \frac{1 + \log_{10} f_{t,d}}{1 + \log_{10} \max_{t'} f_{t',d}}$ for $f_{t,d} > 0$, and $0$ otherwise.

In [6]:
# Numerator part of the tf weight: 1 + log10(raw count) for every term that
# occurs in a document; zero counts stay 0.
# (The variable keeps the name `nominator` because the tf cell further down reads it.)
with np.errstate(divide='ignore'):
    # log10 is evaluated on the whole matrix, including the zero counts, which
    # yields -inf there. Those entries are never selected by mask() below, so
    # the divide-by-zero warning is expected and silenced for this statement only.
    log_scaled_counts = np.log10(inverted_index) + 1
nominator = inverted_index.mask(inverted_index != 0, other=log_scaled_counts)
nominator.shape

  


(29052, 3633)

In [11]:
# Denominator part of the tf weight: 1 + log10 of the highest raw term count in each document.
most_frequent_term = inverted_index.max(axis="index")  # highest raw count per document (column)
denominator = np.log10(most_frequent_term) + 1
denominator.shape

(3633,)

In [12]:
# Sanity check: the smallest denominator must not be zero,
# because the tf cell divides by these values.
denominator.min()

1.0

In [13]:
# tf: scale every document (column) by its own denominator
tf = nominator.divide(denominator, axis="columns")

### TFIDF
Bringing the pieces together.

In [14]:
# Weight every term (row) of the tf matrix with that term's idf score.
tfidf = tf.multiply(idf, axis="index")
# Persist the finished tf-idf matrix.
tfidf.to_pickle('pickle/tfidf.pkl')
tfidf.shape

(29052, 3633)

In [16]:
# Summary statistics of the tf-idf weights, one column per document.
tfidf.describe()

Unnamed: 0,MED-10,MED-14,MED-118,MED-301,MED-306,MED-329,MED-330,MED-332,MED-334,MED-335,...,MED-938,MED-939,MED-940,MED-892,MED-906,MED-917,MED-941,MED-942,MED-952,MED-961
count,29052.0,29052.0,29052.0,29052.0,29052.0,29052.0,29052.0,29052.0,29052.0,29052.0,...,29052.0,29052.0,29052.0,29052.0,29052.0,29052.0,29052.0,29052.0,29052.0,29052.0
mean,0.002354,0.002171,0.001638,0.003018,0.003187,0.00379,0.002481,0.003265,0.002747,0.003261,...,0.002941,0.003522,0.003612,0.002682,0.002665,0.002355,0.003051,0.002124,0.002002,0.002293
std,0.046716,0.044176,0.038856,0.064724,0.058811,0.072123,0.051291,0.057346,0.055517,0.064487,...,0.06083,0.062996,0.064073,0.052539,0.061833,0.056672,0.061891,0.054927,0.050109,0.045917
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.041651,1.760288,1.830837,3.20769,2.150165,2.726365,2.329816,2.329816,2.8923,3.083144,...,2.42432,3.478811,2.285474,3.053258,3.560265,2.169824,3.083144,2.789468,3.259235,1.727563
