**Importing all the relevant libraries**

**Creating Custom Functions for  
1: Doing tokenization based on white space    
2: Removing punctuation from the text data   
3: Creating frequency summary by Doc_ID, Tokens  
4: Creating TF-IDF summary by Doc_ID, Tokens**



In [85]:
# Importing all the libraries
# Creating custom functions

import pandas as pd
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
import string
import re


# White space tokenizer
def tokenize(text):
    """Split every document in *text* on runs of whitespace.

    Parameters
    ----------
    text : iterable of str, or str
        The documents to tokenize (a list, a pandas Series, ...).  A bare
        string is treated as one document instead of being iterated
        character by character.

    Returns
    -------
    list of list of str
        One token list per document, in input order.  Empty or
        whitespace-only documents yield an empty list.
    """
    if isinstance(text, str):
        text = [text]
    # str.split() without a separator splits on any run of whitespace and
    # never yields empty strings -- the same tokens that
    # nltk.tokenize.WhitespaceTokenizer produces, without building a
    # tokenizer object on every call.
    return [item.split() for item in text]

# Creating a function for removing the punctuation from text
def rem_punctuation(data_frame,colname):
    """Drop stand-alone punctuation tokens from a text column.

    Every value of ``data_frame[colname]`` is split on whitespace with
    ``tokenize``.  Tokens that consist solely of ``string.punctuation``
    characters (".", "...", "?!") are discarded and the remaining tokens
    are joined back with single spaces.  Punctuation attached to a word
    ("life.") is left untouched.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Table holding the text.
    colname : str
        Name of the column that contains the text data.

    Returns
    -------
    pandas.DataFrame
        A new frame with a single column ``'Text'``, one row per input
        row, indexed 0..n-1.  The input frame is not modified.
    """
    punct = set(string.punctuation)
    # Test every character of the token: a plain `token in string.punctuation`
    # is a substring check, which drops "./" but keeps "..." or "?!".
    cleaned = [
        " ".join(tok for tok in tokens if not all(ch in punct for ch in tok))
        for tokens in tokenize(list(data_frame[colname]))
    ]
    # Naming the column at construction also works for an empty input;
    # pd.DataFrame([]) has no columns, so renaming it afterwards would raise.
    return pd.DataFrame({'Text': cleaned})

# Creating a function to create frequency profile for ngrams
# 'data_frame' is the imported table/csv file
# 'colname' is the column that contains the text data
# 'ngram' contains the list of ngram sizes we require in the data like 1,2,3...

def TF_NGRAM(data_frame,colname,ngram):
    """List every n-gram of every document, one row per occurrence.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Table holding the text.  A ``'Doc_ID'`` column (1-based row
        position) is added to it in place, overwriting any existing column
        of that name.
    colname : str
        Name of the column that contains the text data (strings).
    ngram : iterable of int
        The n-gram sizes to extract, e.g. ``[1, 2, 3]``.  Any positive size
        is supported.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Doc_ID'`` and ``'Tokens'``.  Rows are ordered by
        document, then by the order of the sizes in *ngram*, then by
        position in the text.  The words of an n-gram are joined with
        ``" , "`` (for example ``"money , gives"``).
    """
    # Materialise once so that a one-shot iterable is usable for every row.
    sizes = list(ngram)

    # Appending the Doc_ID column
    data_frame['Doc_ID'] = list(range(1, data_frame.shape[0] + 1))

    doc_ids = []
    tokens = []
    # Walk the rows once; filtering the whole frame for each Doc_ID would
    # make the extraction quadratic in the number of documents.
    for doc_id, text in zip(data_frame['Doc_ID'], data_frame[colname]):
        words = text.split()
        for n in sizes:
            # Zipping n views of the word list, each shifted by one more
            # position, yields the sliding window of n consecutive words.
            for gram in zip(*(words[start:] for start in range(n))):
                # Token and Doc_ID are always appended together so the two
                # columns cannot drift apart, whatever the n-gram size.
                tokens.append(" , ".join(gram))
                doc_ids.append(doc_id)

    # Creating a Data Frame out of it
    return pd.DataFrame({'Doc_ID': doc_ids, 'Tokens': tokens})


# Function for TF-IDF
def TF_IDF(data_frame,colname):
    """Compute TF, IDF and TF-IDF for every (document, token) pair.

    Documents are the rows of ``data_frame[colname]``, numbered from 1 in
    row order.  Each document is split on whitespace with ``tokenize`` and
    tokens made up solely of ``string.punctuation`` characters are ignored.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Table holding the text.  It is not modified.
    colname : str
        Name of the column that contains the text data.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (Doc_ID, Tokens) pair, sorted by both, with
        the columns

        * ``Doc_ID``      - 1-based document number
        * ``Tokens``      - the token
        * ``Frequency``   - term frequency: occurrences of the token in the
          document divided by the document's number of kept tokens
        * ``Doc_Present`` - number of documents that contain the token
        * ``IDF``         - ``log(total documents / Doc_Present)`` (natural
          log), rounded to 2 decimals
        * ``TF-IDF``      - ``Frequency * IDF``

        The frame is empty (same columns) when no token is left.
    """
    import math

    # Tokenize the data
    docs = tokenize(data_frame[colname])

    # Total Doc present (documents without any token still count)
    total_docs = len(docs)

    # One (Doc_ID, token) row per kept occurrence.  enumerate() gives every
    # document its own ID even when two documents have identical text.
    punct = set(string.punctuation)
    rows = [
        (doc_id, tok)
        for doc_id, toks in enumerate(docs, start=1)
        for tok in toks
        if not all(ch in punct for ch in tok)
    ]

    columns = ['Doc_ID', 'Tokens', 'Frequency', 'Doc_Present', 'IDF', 'TF-IDF']
    if not rows:
        return pd.DataFrame(columns=columns)

    occurrences = pd.DataFrame(rows, columns=['Doc_ID', 'Tokens'])

    # Raw count of each token within each document
    result = (
        occurrences.groupby(['Doc_ID', 'Tokens'])
        .size()
        .reset_index(name='Frequency')
    )

    # Calculating the TF component: count / number of tokens in the document
    tokens_per_doc = result.groupby('Doc_ID')['Frequency'].transform('sum')
    result['Frequency'] = result['Frequency'] / tokens_per_doc

    # Calculate number of Docs in which Token is present.  `result` holds
    # exactly one row per (document, token), so counting a token's rows
    # counts its documents -- across the whole corpus.
    result['Doc_Present'] = result.groupby('Tokens')['Doc_ID'].transform('count')

    # Calculating the IDF component
    # IDF = log(Total Number of Documents / documents in which the token is present)
    result['IDF'] = [
        round(math.log(total_docs / present), 2)
        for present in result['Doc_Present']
    ]

    # TF-IDF is the product of the Frequency and IDF columns
    result['TF-IDF'] = result['Frequency'] * result['IDF']

    return result


**Testing it out on a small data set**

In [76]:
# Two-document toy corpus, held in a one-column data frame
text = [
    "money gives material benefits",
    "material thing is required in life . Without any material benefits we cant live our life fully",
]

df_TFIDF = pd.DataFrame({'Text_Data': text})
df_TFIDF.head()

Unnamed: 0,Text_Data
0,money gives material benefits
1,material thing is required in life . Without a...


In [77]:
# Removing stand-alone punctuation tokens
df1=rem_punctuation(df_TFIDF,'Text_Data')

# Creating the n-gram profile (1-, 2- and 3-grams) of df1
df2=TF_NGRAM(df1,'Text',ngram=[1,2,3])

# TF_NGRAM joins the words of an n-gram with " , ".  Collapse that whole
# separator to one space first: deleting only the comma would leave the
# words two spaces apart ("money  gives").  Any remaining commas are then
# removed as before.
df2['Tokens']=df2['Tokens'].apply(lambda x: re.sub(',','',x.replace(' , ',' ')))
df2.head(10)

 


Unnamed: 0,Doc_ID,Tokens
0,1,money
1,1,gives
2,1,material
3,1,benefits
4,1,money gives
5,1,gives material
6,1,material benefits
7,1,money gives material
8,1,gives material benefits
9,2,material


In [78]:
# Building the frequency table: one row per (Doc_ID, Tokens) pair with the
# number of times the token occurs in that document in 'Frequency'
df2['Frequency'] = 1  # every row of df2 is a single occurrence
df3 = (
    df2.groupby(['Doc_ID', 'Tokens'])['Frequency']
    .sum()
    .reset_index()
)

# Keeping only the feature tokens 'material', 'money' and 'benefits'
token = ['material', 'money', 'benefits']

pos = df3['Tokens'].isin(token).tolist()
df4 = df3[pos]
df4

Unnamed: 0,Doc_ID,Tokens,Frequency
0,1,benefits,1
4,1,material,1
6,1,money,1
15,2,benefits,1
35,2,material,2


In [79]:
# Pivoting df4 into a document-term matrix: one row per Doc_ID, one column
# per feature token, counts as values; missing combinations become 0
(
    df4
    .pivot(columns='Tokens', index='Doc_ID', values='Frequency')
    .fillna(0)
)


Tokens,benefits,material,money
Doc_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.0,1.0,1.0
2,1.0,2.0,0.0


In [80]:
# Computing the TF-IDF table (Doc_ID, Tokens, Frequency, Doc_Present, IDF,
# TF-IDF) for the toy corpus
df_TF_IDF = TF_IDF(data_frame=df_TFIDF, colname='Text_Data')
df_TF_IDF

Unnamed: 0,Doc_ID,Tokens,Frequency,Doc_Present,IDF,TF-IDF
0,1,benefits,0.25,2,0.0,0.0
1,1,gives,0.25,1,0.69,0.1725
2,1,material,0.25,2,0.0,0.0
3,1,money,0.25,1,0.69,0.1725
4,2,Without,0.0625,1,0.69,0.043125
5,2,any,0.0625,1,0.69,0.043125
6,2,benefits,0.0625,2,0.0,0.0
7,2,cant,0.0625,1,0.69,0.043125
8,2,fully,0.0625,1,0.69,0.043125
9,2,in,0.0625,1,0.69,0.043125


In [81]:
# Keeping only the columns needed for a TF-IDF document-term matrix
wanted_columns = ['Doc_ID', 'Tokens', 'TF-IDF']
df_TF_IDF2 = df_TF_IDF.loc[:, wanted_columns]
del wanted_columns  # leave the notebook namespace as it was
df_TF_IDF2.head()

Unnamed: 0,Doc_ID,Tokens,TF-IDF
0,1,benefits,0.0
1,1,gives,0.1725
2,1,material,0.0
3,1,money,0.1725
4,2,Without,0.043125


In [82]:
# Selecting 'gives', 'money' and 'life' as features, then pivoting
# df_TF_IDF2 into a TF-IDF document-term matrix
token = ['gives', 'money', 'life']

pos = df_TF_IDF2['Tokens'].isin(token).tolist()
df_TF_IDF3 = df_TF_IDF2[pos]

df_TF_IDF_DTM = df_TF_IDF3.pivot(
    values='TF-IDF',
    index='Doc_ID',
    columns='Tokens',
)
# Combinations that do not occur are shown as 0 (the stored matrix keeps NaN)
df_TF_IDF_DTM.fillna(0)

Tokens,gives,life,money
Doc_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.1725,0.0,0.1725
2,0.0,0.08625,0.0


**Testing it out on a Data Frame with a large number of records (~145k)**

In [87]:
# Let's test it out on a much larger data frame, read from a local CSV file
# NOTE(review): the name `dir` shadows the built-in dir() from here on
dir="C:\\R Codes\\Text Mining R"
df=pd.read_csv(dir+"\\text.csv")  # Windows path built by string concatenation

df.shape # has 145152 records
df.columns  # last expression of the cell, so this is the value displayed

Index(['Description'], dtype='object')

In [89]:
# Preparing the large frame for a document-term matrix

# Step 1: drop the punctuation tokens of the 'Description' column
df1 = rem_punctuation(data_frame=df, colname='Description')

# Step 2: expand every cleaned document into its 1-, 2- and 3-grams
df2 = TF_NGRAM(data_frame=df1, colname='Text', ngram=[1, 2, 3])


In [58]:
# Peek at the first ten (Doc_ID, Tokens) rows of the n-gram table
# NOTE(review): this cell's execution count (58) is lower than that of the
# preceding cell (89), so the output shown below may come from an earlier
# run -- confirm by re-running.
df2.head(10)

Unnamed: 0,Doc_ID,Tokens
0,1,money
1,1,gives
2,1,material
3,1,benefits
4,1,"money , gives"
5,1,"gives , material"
6,1,"material , benefits"
7,1,"money , gives , material"
8,1,"gives , material , benefits"
9,2,material


In [90]:
# Removing the commas from tokens.
# TF_NGRAM joins the words of an n-gram with " , ", so that whole separator
# is collapsed to one space first; deleting only the comma would leave the
# words two spaces apart ("money  gives").  Any remaining commas are then
# removed as before.
df2['Tokens']=df2['Tokens'].apply(lambda x: re.sub(',','',x.replace(' , ',' ')))


In [91]:
# Building the frequency table with the columns Doc_ID, Tokens, Frequency
df2['Frequency'] = 1  # every row of df2 is a single occurrence
df3 = (
    df2.groupby(['Doc_ID', 'Tokens'])['Frequency']
    .sum()
    .reset_index()
)

# Keeping only the feature tokens 'material', 'money' and 'benefits'
token = ['material', 'money', 'benefits']

pos = df3['Tokens'].isin(token).tolist()
df4 = df3[pos]

# Pivoting df4 into a document-term matrix; documents without a given
# feature get 0 for it
df5 = df4.pivot(
    values='Frequency',
    index='Doc_ID',
    columns='Tokens',
).fillna(0)
df5.head(20)

Tokens,benefits,material,money
Doc_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.0,1.0,1.0
2,0.0,2.0,0.0
4,1.0,1.0,1.0
5,0.0,2.0,0.0
7,1.0,1.0,1.0
8,0.0,2.0,0.0
10,1.0,1.0,1.0
11,0.0,2.0,0.0
13,1.0,1.0,1.0
14,0.0,2.0,0.0
