In [1]:
# Importing the libraries
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize

In [2]:
# Modifying the WhitespaceTokenizer function so that result is stored in a list
def tokenize(text):
    """Split every document in ``text`` on whitespace.

    ``text`` is any iterable of strings (one string per document).
    Returns a list with one entry per document, each entry being the list
    of tokens produced by NLTK's ``WhitespaceTokenizer`` for that document.
    """
    splitter = nltk.tokenize.WhitespaceTokenizer()
    return [splitter.tokenize(document) for document in text]

In [4]:
# Toy corpus: one string per document
text = [
    "money gives material benefits",
    "material thing is required in life . Without any material benefits we cant live our life fully",
]

text

['money gives material benefits',
 'material thing is required in life . Without any material benefits we cant live our life fully']

In [5]:
# l1 holds one token list per document, in the same order as `text`.
l1 = tokenize(text)
l1

[['money', 'gives', 'material', 'benefits'],
 ['material',
  'thing',
  'is',
  'required',
  'in',
  'life',
  '.',
  'Without',
  'any',
  'material',
  'benefits',
  'we',
  'cant',
  'live',
  'our',
  'life',
  'fully']]

In [7]:
# Getting the document number

# One entry per token: the 0-based position of the document it belongs to.
# enumerate() is used rather than l1.index(doc) because index() returns the
# first *equal* list, which would give two identical documents the same
# number (and it rescans l1 for every single token).
temp_ID=[doc_pos for doc_pos, doc_tokens in enumerate(l1) for _ in doc_tokens]
# Shift to 1-based document numbers.
Doc_ID=[doc_pos+1 for doc_pos in temp_ID]
Doc_ID

[1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]

In [8]:
# We need to convert this into a pandas data frame
# One single-column frame is built per document and the frames are stacked.
# Each frame keeps its own 0..n-1 index, so row labels restart with every
# document (visible in the output below).
l2=[pd.DataFrame(pd.Series(i)) for i in l1]
df=pd.concat(l2)
df

Unnamed: 0,0
0,money
1,gives
2,material
3,benefits
0,material
1,thing
2,is
3,required
4,in
5,life


In [9]:
# Create the column for 'Doc_ID'
# NOTE(review): `counter` is not read again in any cell visible here.
counter=df.index
# Doc_ID is a plain list, so it is assigned by position and must contain
# exactly one entry per row of df. This adds the column to df in place.
df['Doc_ID']=Doc_ID
df

Unnamed: 0,0,Doc_ID
0,money,1
1,gives,1
2,material,1
3,benefits,1
0,material,2
1,thing,2
2,is,2
3,required,2
4,in,2
5,life,2


In [11]:
# Give the token column (labelled 0 until now) a readable name.
df.columns=['Tokens','Doc_ID']
df.head()

Unnamed: 0,Tokens,Doc_ID
0,money,1
1,gives,1
2,material,1
3,benefits,1
0,material,2


In [13]:
# Removing the punctuation
import string
from string import punctuation
res = []

# A token is treated as punctuation when EVERY character in it is a
# punctuation mark. Do not use `token not in string.punctuation`: that is a
# substring test against the punctuation string, so tokens such as "..." or
# "!!" (not a contiguous slice of it) would be kept.
pos=[not all(ch in string.punctuation for ch in token) for token in df['Tokens']]
pos # pos is True for rows that hold a real token (not pure punctuation)
# We need to have only those records in df where pos is equal to True

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

In [17]:
# filtering out the punctuations
# pos is a list of booleans with one entry per row of df; indexing with it
# keeps only the rows flagged True.
df2=df[pos]
df2

Unnamed: 0,Tokens,Doc_ID
0,money,1
1,gives,1
2,material,1
3,benefits,1
0,material,2
1,thing,2
2,is,2
3,required,2
4,in,2
5,life,2


In [18]:
# At this point df2 holds one row per token occurrence:
# 1: Tokens - the token text
# 2: Doc_ID - the number of the document the token came from
# Next we derive the columns needed to calculate TF-IDF.
# Preview the first rows of df2 for convenience.
df2.head()

Unnamed: 0,Tokens,Doc_ID
0,money,1
1,gives,1
2,material,1
3,benefits,1
0,material,2


In [19]:
# Calculating the TF for each word with the Document
# This cell only produces the raw count of each token per document; the
# division by the document length happens in a later cell.
g=df2.groupby(['Doc_ID','Tokens'])
# size() counts the rows of each (Doc_ID, Tokens) group; reset_index() turns
# the group keys back into columns and leaves the count column labelled 0.
df3=g.size().reset_index()
df3

Unnamed: 0,Doc_ID,Tokens,0
0,1,benefits,1
1,1,gives,1
2,1,material,1
3,1,money,1
4,2,Without,1
5,2,any,1
6,2,benefits,1
7,2,cant,1
8,2,fully,1
9,2,in,1


In [20]:
# Adding 'Frequency' column name to df3
# (replaces the default label 0 of the count column)
df3.columns=['Doc_ID','Tokens','Frequency']
df3.head()

Unnamed: 0,Doc_ID,Tokens,Frequency
0,1,benefits,1
1,1,gives,1
2,1,material,1
3,1,money,1
4,2,Without,1


In [21]:
# Calculating the TF component
# TF(token, doc) = occurrences of the token in doc / total tokens counted in doc
grp=df3.groupby(['Doc_ID','Tokens']).agg({'Frequency':'sum'})
# transform('sum') broadcasts every document's total back onto its own rows
# and keeps the (Doc_ID, Tokens) index unchanged. groupby(...).apply(...) is
# avoided on purpose: on pandas >= 2.0 it prepends the group key to the index,
# which makes reset_index() fail with "cannot insert Doc_ID, already exists".
df4=(grp/grp.groupby(level=0).transform('sum')).reset_index()
df4

Unnamed: 0,Doc_ID,Tokens,Frequency
0,1,benefits,0.25
1,1,gives,0.25
2,1,material,0.25
3,1,money,0.25
4,2,Without,0.0625
5,2,any,0.0625
6,2,benefits,0.0625
7,2,cant,0.0625
8,2,fully,0.0625
9,2,in,0.0625


In [22]:
# Calculating the IDF component
# l1 holds one token list per document, so its length is the corpus size.
Total_Doc=len(l1)
Total_Doc
# Total_Doc is the total number of documents in the data

2

In [23]:
# Checking whether 'benefits' is present in Doc_ID=1 and in Doc_ID=2
# pres_l0 is a tuple of two booleans, one per document (both IDs are hard-coded).
pres_l0='benefits' in list(df4[df4['Doc_ID']==1]['Tokens']), 'benefits' in list(df4[df4['Doc_ID']==2]['Tokens'])
# NOTE(review): pres_l1 is not read again in the cells visible here.
pres_l1=list(pres_l0)
sum(pres_l0)# Number of documents containing 'benefits'; the result 2 means the word is present in both of the 2 documents

2

In [24]:
# The above step has to be repeated for each and every row in df4['Tokens']
# For every token, count the distinct documents it occurs in, then look that
# count up for each row of df4. This works for any number of documents,
# instead of testing the hard-coded Doc_ID values 1 and 2 only.
present_l0=df4['Tokens'].map(df4.groupby('Tokens')['Doc_ID'].nunique()).tolist()

present_l0

[2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1]

In [26]:
# Adding this to df4
# present_l0 was built by iterating df4['Tokens'] in row order, so it has one
# entry per row and the positional assignment lines up.
df4['Doc_Present']=present_l0
df4

Unnamed: 0,Doc_ID,Tokens,Frequency,Doc_Present
0,1,benefits,0.25,2
1,1,gives,0.25,1
2,1,material,0.25,2
3,1,money,0.25,1
4,2,Without,0.0625,1
5,2,any,0.0625,1
6,2,benefits,0.0625,2
7,2,cant,0.0625,1
8,2,fully,0.0625,1
9,2,in,0.0625,1


In [54]:
# Calculating the IDF component
# IDF =log(Total Number of Documents/Total documents in which the token is present)
# IDF =-log(Total documents in which the token is present/Total Number of Documents)
import math
# math.log is the natural logarithm. Every IDF value is rounded to 2 decimals
# before it is used, so log(2) is stored as 0.69 (see the output below).
IDF=[round(math.log(Total_Doc/i),2) for i in df4['Doc_Present']]
IDF

[0.0,
 0.69,
 0.0,
 0.69,
 0.69,
 0.69,
 0.0,
 0.69,
 0.69,
 0.69,
 0.69,
 0.69,
 0.69,
 0.0,
 0.69,
 0.69,
 0.69,
 0.69]

In [56]:
# IDF was built by iterating df4['Doc_Present'] in row order, so the list
# lines up with the rows of df4.
df4['IDF']=IDF
df4.head()

Unnamed: 0,Doc_ID,Tokens,Frequency,Doc_Present,IDF
0,1,benefits,0.25,2,0.0
1,1,gives,0.25,1,0.69
2,1,material,0.25,2,0.0
3,1,money,0.25,1,0.69
4,2,Without,0.0625,1,0.69


In [65]:
# TF-IDF will be nothing but multiplication of Frequency and IDF columns
# Element-wise product of the two columns (no positional .iloc loop needed),
# converted to a plain list with one score per row of df4.
# NOTE: a later cell (In [167]) re-binds the name TF_IDF to a function, so
# re-running this cell after that one would replace the function with a list.
TF_IDF=(df4['Frequency']*df4['IDF']).tolist()
TF_IDF

[0.0,
 0.1725,
 0.0,
 0.1725,
 0.043125,
 0.043125,
 0.0,
 0.043125,
 0.043125,
 0.043125,
 0.043125,
 0.08625,
 0.043125,
 0.0,
 0.043125,
 0.043125,
 0.043125,
 0.043125]

In [70]:
# Attach the per-row TF-IDF scores (one list entry per row of df4) as a new column.
df4['TF-IDF']=TF_IDF
df4.head()

Unnamed: 0,Doc_ID,Tokens,Frequency,Doc_Present,IDF,TF-IDF
0,1,benefits,0.25,2,0.0,0.0
1,1,gives,0.25,1,0.69,0.1725
2,1,material,0.25,2,0.0,0.0
3,1,money,0.25,1,0.69,0.1725
4,2,Without,0.0625,1,0.69,0.043125


In [72]:
# Let's sort df4 by TF-IDF, highest score first
# The sorted frame is only displayed; the result is not assigned back to df4.
df4.sort_values('TF-IDF',ascending=False)
# The tokens with the highest TF-IDF (gives, money, then life) are the most distinctive for their document

Unnamed: 0,Doc_ID,Tokens,Frequency,Doc_Present,IDF,TF-IDF
1,1,gives,0.25,1,0.69,0.1725
3,1,money,0.25,1,0.69,0.1725
11,2,life,0.125,1,0.69,0.08625
9,2,in,0.0625,1,0.69,0.043125
16,2,thing,0.0625,1,0.69,0.043125
15,2,required,0.0625,1,0.69,0.043125
14,2,our,0.0625,1,0.69,0.043125
12,2,live,0.0625,1,0.69,0.043125
10,2,is,0.0625,1,0.69,0.043125
17,2,we,0.0625,1,0.69,0.043125


**Let's create a function that does this**

In [77]:
# Input for the function: a data frame holding the raw documents in one text column
text = [
    "money gives material benefits",
    "material thing is required in life . Without any material benefits we cant live our life fully",
]

df_TFIDF = pd.DataFrame({'Text_Data': text})
df_TFIDF.head()

Unnamed: 0,Text_Data
0,money gives material benefits
1,material thing is required in life . Without a...


In [167]:
def TF_IDF(data_frame,colname):
    """Compute a per-document TF-IDF table for one text column.

    Every value of ``data_frame[colname]`` is treated as one document.
    Documents are numbered 1, 2, ... by their position in the column, so two
    documents with identical text still get different ``Doc_ID`` values.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame that holds the documents.
    colname : str
        Name of the column containing the raw text of each document.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (document, token) pair, ordered by ``Doc_ID``
        and then ``Tokens``, with the columns

        * ``Doc_ID``      - 1-based document number
        * ``Tokens``      - the token text
        * ``Frequency``   - count of the token in the document divided by the
          number of (non-punctuation) tokens in that document
        * ``Doc_Present`` - number of documents that contain the token
        * ``IDF``         - ``round(ln(total documents / Doc_Present), 2)``
        * ``TF-IDF``      - ``Frequency * IDF``

        An empty frame with these columns is returned when no tokens remain
        (empty corpus, or nothing but punctuation).

    Notes
    -----
    Relies on the notebook-level ``tokenize`` helper and the ``pd`` alias.
    """
    import math
    import string

    result_columns = ['Doc_ID', 'Tokens', 'Frequency', 'Doc_Present', 'IDF', 'TF-IDF']

    # Tokenize the data: one token list per document
    tokenized_docs = tokenize(data_frame[colname])
    # Every document counts towards the corpus size, even one that ends up
    # with no tokens after punctuation removal.
    total_docs = len(tokenized_docs)

    # Build (Doc_ID, token) pairs and drop punctuation in the same pass.
    # enumerate() numbers documents by position; list.index() would give
    # identical documents the same number. A token is punctuation when every
    # character is a punctuation mark ("." as well as "..." or "!!").
    punctuation_chars = set(string.punctuation)
    records = [
        (doc_id, token)
        for doc_id, doc_tokens in enumerate(tokenized_docs, start=1)
        for token in doc_tokens
        if not all(ch in punctuation_chars for ch in token)
    ]
    if not records:
        return pd.DataFrame(columns=result_columns)

    tokens_df = pd.DataFrame(records, columns=['Doc_ID', 'Tokens'])

    # Raw count of every token inside every document
    result = (
        tokens_df.groupby(['Doc_ID', 'Tokens'])
        .size()
        .reset_index(name='Frequency')
    )

    # TF component: count / total tokens of the same document.
    # transform('sum') keeps the row alignment and behaves the same on old
    # and new pandas (groupby.apply + reset_index breaks on pandas >= 2.0).
    doc_lengths = result.groupby('Doc_ID')['Frequency'].transform('sum')
    result['Frequency'] = result['Frequency'] / doc_lengths

    # Number of documents in which each token is present. `result` has exactly
    # one row per (Doc_ID, Tokens) pair, so the number of rows per token is
    # the number of documents containing it - for any corpus size.
    result['Doc_Present'] = result.groupby('Tokens')['Tokens'].transform('size')

    # IDF = log(total number of documents / documents containing the token)
    # The value is rounded to 2 decimals before use; this is kept so results
    # match the outputs already shown in the notebook.
    result['IDF'] = [round(math.log(total_docs / n_docs), 2) for n_docs in result['Doc_Present']]

    # TF-IDF is the product of the Frequency and IDF columns
    result['TF-IDF'] = result['Frequency'] * result['IDF']

    return result

TF_IDF(df_TFIDF,'Text_Data')

Unnamed: 0,Doc_ID,Tokens,Frequency,Doc_Present,IDF,TF-IDF
0,1,benefits,0.25,2,0.0,0.0
1,1,gives,0.25,1,0.69,0.1725
2,1,material,0.25,2,0.0,0.0
3,1,money,0.25,1,0.69,0.1725
4,2,Without,0.0625,1,0.69,0.043125
5,2,any,0.0625,1,0.69,0.043125
6,2,benefits,0.0625,2,0.0,0.0
7,2,cant,0.0625,1,0.69,0.043125
8,2,fully,0.0625,1,0.69,0.043125
9,2,in,0.0625,1,0.69,0.043125


In [169]:
# Let's try it on a second corpus
text = [
    "Virat Kohli is Indian Cricket Team Captain",
    "Virat is an excellent batsman . He plays really well for the Indian Team",
]

df_TFIDF = pd.DataFrame({'Text_Data': text})
df_TFIDF.head()

Unnamed: 0,Text_Data
0,Virat Kohli is Indian Cricket Team Captain
1,Virat is an excellent batsman . He plays reall...


In [170]:
# Applying the TF_IDF function
TF_IDF(df_TFIDF,'Text_Data')

Unnamed: 0,Doc_ID,Tokens,Frequency,Doc_Present,IDF,TF-IDF
0,1,Captain,0.142857,1,0.69,0.098571
1,1,Cricket,0.142857,1,0.69,0.098571
2,1,Indian,0.142857,2,0.0,0.0
3,1,Kohli,0.142857,1,0.69,0.098571
4,1,Team,0.142857,2,0.0,0.0
5,1,Virat,0.142857,2,0.0,0.0
6,1,is,0.142857,2,0.0,0.0
7,2,He,0.076923,1,0.69,0.053077
8,2,Indian,0.076923,2,0.0,0.0
9,2,Team,0.076923,2,0.0,0.0
