# Example computations for TFIDF using pandas data frames

In [1]:
import pandas as pd
from collections import Counter
import numpy as np

# Toy corpus of three tiny "documents"; each one is tokenized on whitespace.
docstrs = [
    "in the new york times in",  # note: "in" occurs twice
    "the new york post",
    "the los angeles times",
]
d1, d2, d3 = docstrs

docs = list(map(str.split, docstrs))  # one token list per document
N = len(docstrs)                      # number of documents in the corpus
docs

[['in', 'the', 'new', 'york', 'times', 'in'],
 ['the', 'new', 'york', 'post'],
 ['the', 'los', 'angeles', 'times']]

In [2]:
# Vocabulary: every distinct whitespace-separated token in the corpus, alphabetized.
uniq = sorted({word for text in docstrs for word in text.split()})
uniq

['angeles', 'in', 'los', 'new', 'post', 'the', 'times', 'york']

# Word vectors and term counts

First, let's do this the hard way to see the actual computations.

In [3]:
# Empty frame whose index is the vocabulary; per-document count columns get attached later.
tf = pd.DataFrame({'word': uniq}).set_index('word')
tf

angeles
in
los
new
post
the
times
york


In [4]:
# (word, count) pairs for the first document, in first-seen order.
[*Counter(d1.split()).items()]

[('in', 2), ('the', 1), ('new', 1), ('york', 1), ('times', 1)]

In [5]:
# Build the term-count table: one column per document (d1, d2, ...), one row per word.
#
# The columns are accumulated in a fresh frame that starts from the vocabulary
# index only, rather than merging into `tf` itself. That keeps this cell
# idempotent: re-running it rebuilds the same d1..dN columns instead of
# producing suffixed duplicates (d1_x, d1_y, ...).
counts = pd.DataFrame(index=tf.index)
for i, d in enumerate(docs):
    c = Counter(d)  # word -> number of occurrences in this document
    doc_items = pd.DataFrame.from_records(list(c.items()),
                                          columns=['word', f'd{i+1}'])
    doc_items = doc_items.set_index('word')
    # Left join on the word index: words absent from this document become NaN.
    counts = counts.join(doc_items, how='left')

# Absent words have a count of zero; restore integer dtype after the NaN fill.
tf = counts.fillna(0).astype('int')
tf

Unnamed: 0_level_0,d1,d2,d3
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
angeles,0,0,1
in,2,0,0
los,0,0,1
new,1,1,0
post,0,1,0
the,1,1,1
times,1,0,1
york,1,1,0


In [6]:
# Flip to one row per document, one column per word (the usual document-term layout).
tf.transpose()

word,angeles,in,los,new,post,the,times,york
d1,0,2,0,1,0,1,1,1
d2,0,0,0,1,1,1,0,1
d3,1,0,1,0,0,1,1,0


# Document frequencies

In [7]:
# Document counts: for each vocabulary word, how many documents contain it at least once.
# contains[r, c] is True when word r appears in document c.
contains = np.array([[w in d for d in docs] for w in uniq])
df = pd.DataFrame({'doc count': contains.sum(axis=1)},
                  index=pd.Index(uniq, name='word'))
df

Unnamed: 0_level_0,doc count
word,Unnamed: 1_level_1
angeles,1
in,1
los,1
new,2
post,1
the,3
times,2
york,2


In [8]:
# Document frequency with "additive smoothing": add 1 to both the count and the corpus size.
# (For the unsmoothed variant use df['doc count'] / N instead.)
smoothed_df = (df['doc count'] + 1) / (N + 1)
inverse_df = 1 / smoothed_df

df['df'] = smoothed_df
df['idf'] = inverse_df
df['log idf'] = np.log10(inverse_df)  # base-10 log damps the weight of rare words
df

Unnamed: 0_level_0,doc count,df,idf,log idf
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
angeles,1,0.5,2.0,0.30103
in,1,0.5,2.0,0.30103
los,1,0.5,2.0,0.30103
new,2,0.75,1.333333,0.124939
post,1,0.5,2.0,0.30103
the,3,1.0,1.0,0.0
times,2,0.75,1.333333,0.124939
york,2,0.75,1.333333,0.124939


# Term frequencies

In [9]:
# Put the per-word document statistics and the per-document term counts side by side,
# aligned on the shared word index.
tfidf = pd.concat((df, tf), axis='columns')
tfidf

Unnamed: 0_level_0,doc count,df,idf,log idf,d1,d2,d3
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
angeles,1,0.5,2.0,0.30103,0,0,1
in,1,0.5,2.0,0.30103,2,0,0
los,1,0.5,2.0,0.30103,0,0,1
new,2,0.75,1.333333,0.124939,1,1,0
post,1,0.5,2.0,0.30103,0,1,0
the,3,1.0,1.0,0.0,1,1,1
times,2,0.75,1.333333,0.124939,1,0,1
york,2,0.75,1.333333,0.124939,1,1,0


In [10]:
# Term frequency: each word's count divided by the total number of tokens in
# that document. Looping over `docs` (instead of one hand-written line per
# document) keeps this cell correct if the corpus grows or shrinks.
for i, doc in enumerate(docs, start=1):
    tfidf[f'd{i} tf'] = tfidf[f'd{i}'] / len(doc)
tfidf

Unnamed: 0_level_0,doc count,df,idf,log idf,d1,d2,d3,d1 tf,d2 tf,d3 tf
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
angeles,1,0.5,2.0,0.30103,0,0,1,0.0,0.0,0.25
in,1,0.5,2.0,0.30103,2,0,0,0.333333,0.0,0.0
los,1,0.5,2.0,0.30103,0,0,1,0.0,0.0,0.25
new,2,0.75,1.333333,0.124939,1,1,0,0.166667,0.25,0.0
post,1,0.5,2.0,0.30103,0,1,0,0.0,0.25,0.0
the,3,1.0,1.0,0.0,1,1,1,0.166667,0.25,0.25
times,2,0.75,1.333333,0.124939,1,0,1,0.166667,0.0,0.25
york,2,0.75,1.333333,0.124939,1,1,0,0.166667,0.25,0.0


# TF-IDF

In [11]:
# TF-IDF: weight each document's term frequency by the word's log inverse
# document frequency. Every existing '<doc> tf' column gets a matching
# '<doc> tfidf' column, so nothing here is hard-coded to three documents.
tf_columns = [col for col in tfidf.columns if col.endswith(' tf')]
for tf_col in tf_columns:
    tfidf[f'{tf_col}idf'] = tfidf[tf_col] * tfidf['log idf']  # 'd1 tf' -> 'd1 tfidf'
tfidf

Unnamed: 0_level_0,doc count,df,idf,log idf,d1,d2,d3,d1 tf,d2 tf,d3 tf,d1 tfidf,d2 tfidf,d3 tfidf
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
angeles,1,0.5,2.0,0.30103,0,0,1,0.0,0.0,0.25,0.0,0.0,0.075257
in,1,0.5,2.0,0.30103,2,0,0,0.333333,0.0,0.0,0.100343,0.0,0.0
los,1,0.5,2.0,0.30103,0,0,1,0.0,0.0,0.25,0.0,0.0,0.075257
new,2,0.75,1.333333,0.124939,1,1,0,0.166667,0.25,0.0,0.020823,0.031235,0.0
post,1,0.5,2.0,0.30103,0,1,0,0.0,0.25,0.0,0.0,0.075257,0.0
the,3,1.0,1.0,0.0,1,1,1,0.166667,0.25,0.25,0.0,0.0,0.0
times,2,0.75,1.333333,0.124939,1,0,1,0.166667,0.0,0.25,0.020823,0.0,0.031235
york,2,0.75,1.333333,0.124939,1,1,0,0.166667,0.25,0.0,0.020823,0.031235,0.0


# TextVectorizer

OK, now get the term counts the easy way.

In [12]:
# Reminder of the raw document strings (untokenized) that get passed to the vectorizer below.
docstrs

['in the new york times in', 'the new york post', 'the los angeles times']

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Default settings: the vectorizer does its own tokenizing of the raw strings.
vectorizer = CountVectorizer()

# Learn the vocabulary from the corpus and count terms in one step.
# The result is a sparse documents-by-vocabulary count matrix.
X = vectorizer.fit_transform(docstrs)
X

<3x8 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [14]:
# Printing a sparse matrix lists only its stored entries, one per line, as
# "(row, column)  value" -- i.e. it maps an (i, j) key to a count.
print(X)

  (0, 1)	2
  (0, 5)	1
  (0, 3)	1
  (0, 7)	1
  (0, 6)	1
  (1, 5)	1
  (1, 3)	1
  (1, 7)	1
  (1, 4)	1
  (2, 5)	1
  (2, 6)	1
  (2, 2)	1
  (2, 0)	1


In [15]:
# Expand the sparse matrix into a dense array, with zeros filled in explicitly.
X.toarray()

array([[0, 2, 0, 1, 0, 1, 1, 1],
       [0, 0, 0, 1, 1, 1, 0, 1],
       [1, 0, 1, 0, 0, 1, 1, 0]])

In [16]:
# Vocabulary in column order. `get_feature_names()` was deprecated in
# scikit-learn 1.0 and removed in 1.2; `get_feature_names_out()` is its
# replacement and returns an array, so wrap it in list() for a plain listing.
list(vectorizer.get_feature_names_out())

['angeles', 'in', 'los', 'new', 'post', 'the', 'times', 'york']

In [17]:
# Label the dense count matrix with the learned vocabulary.
# Uses get_feature_names_out(): get_feature_names() was removed in scikit-learn 1.2.
pd.DataFrame(data=X.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,angeles,in,los,new,post,the,times,york
0,0,2,0,1,0,1,1,1
1,0,0,0,1,1,1,0,1
2,1,0,1,0,0,1,1,0


Compare to manually computed values. Heh, we match!

In [18]:
# Hand-built counts, transposed to the same documents-by-words layout for comparison.
tf.transpose()

word,angeles,in,los,new,post,the,times,york
d1,0,2,0,1,0,1,1,1
d2,0,0,0,1,1,1,0,1
d3,1,0,1,0,0,1,1,0


In [19]:
from sklearn.feature_extraction.text import TfidfTransformer

# Turn the raw count matrix into TF-IDF weights using sklearn's defaults,
# then densify so it can be shown as a labelled table.
trans = TfidfTransformer()
D = trans.fit_transform(X).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
pd.DataFrame(data=D, columns=vectorizer.get_feature_names_out())

Unnamed: 0,angeles,in,los,new,post,the,times,york
0,0.0,0.810839,0.0,0.308332,0.0,0.239447,0.308332,0.308332
1,0.0,0.0,0.0,0.480458,0.631745,0.373119,0.0,0.480458
2,0.584483,0.0,0.584483,0.0,0.0,0.345205,0.444514,0.0


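Compare to the manually computed values below. The numbers differ because sklearn's `TfidfTransformer` defaults are not the formula used above: it takes the raw term counts as tf (no division by document length), computes idf as `ln((1 + n) / (1 + df(t))) + 1` (natural log, with one added so that words appearing in every document are not zeroed out), and then L2-normalizes each document's row. The sklearn docs also say "*Note that the idf formula above differs from the standard textbook notation that defines the idf as idf(t) = log [ n / (df(t) + 1) ]*", so we're a bit different.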

In [20]:
# Manually computed TF-IDF scores, one row per document, for comparison with sklearn's table.
tfidf_columns = ['d1 tfidf', 'd2 tfidf', 'd3 tfidf']
tfidf.loc[:, tfidf_columns].transpose()

word,angeles,in,los,new,post,the,times,york
d1 tfidf,0.0,0.100343,0.0,0.020823,0.0,0.0,0.020823,0.020823
d2 tfidf,0.0,0.0,0.0,0.031235,0.075257,0.0,0.0,0.031235
d3 tfidf,0.075257,0.0,0.075257,0.0,0.0,0.0,0.031235,0.0
