In [2]:
import numpy as np                     # numpy
import pandas as pd                    # pandas

In [8]:
def _read_documents(pathToTextFile: str) -> list:
    """Parse a collection file into one list of terms per document.

    Every non-blank line must have the form ``<label>: <term> <term> ...``.
    Only the first colon separates the label from the terms, so a term may
    itself contain a colon. Terms are separated by any run of whitespace.

    Raises:
        ValueError: if a non-blank line contains no ``:`` separator.
    """
    documents = []
    with open(pathToTextFile, 'r') as textFile:
        for lineNumber, line in enumerate(textFile, start=1):
            if not line.strip():
                # Blank lines (e.g. a trailing newline at the end of the file)
                # carry no document and must not abort the whole run.
                continue
            _label, separator, termsPart = line.partition(":")
            if not separator:
                raise ValueError(
                    f"line {lineNumber} of {pathToTextFile!r} has no ':' "
                    "separating the document label from its terms"
                )
            # split() without arguments drops the empty strings produced by
            # repeated spaces and strips the trailing newline in one pass.
            documents.append(termsPart.split())
    return documents


def calculate_tfidf(pathToTextFile: str) -> pd.DataFrame:
    """Build a tf / df / idf / tf-idf table for a small document collection.

    The file is read one document per line (``<label>: <terms>``). For every
    distinct term the result holds one row with these columns:

    * ``'sorted inv. term indices'`` - the term itself
    * ``'overall tf'`` - total number of occurrences over all documents
    * ``'individual tf'`` - list of raw counts, one entry per document,
      in file order
    * ``'df / idf'`` - ``[df, idf]`` where ``df`` is the number of documents
      containing the term and ``idf = log10(N / df)`` rounded to 4 decimals
      (``N`` = number of documents)
    * ``'individual tf-idf'`` - list with ``(1 + log10(tf)) * idf`` per
      document, rounded to 4 decimals, and ``0.0`` where ``tf == 0``

    Args:
        pathToTextFile: path of the collection file.

    Returns:
        A DataFrame sorted ascending by term with a unique ``0..n-1`` index.
        An empty collection yields an empty frame with the same columns.

    Raises:
        ValueError: if a non-blank line has no ``:`` separator.
        OSError: if the file cannot be opened.
    """
    columns = ['sorted inv. term indices', 'overall tf', 'individual tf',
               'df / idf', 'individual tf-idf']

    documentCollection = _read_documents(pathToTextFile)
    documentCount = len(documentCollection)

    # Count each document once up front instead of calling list.count()
    # for every (term, document) pair.
    termCountsPerDocument = []
    vocabulary = set()
    for document in documentCollection:
        counts = {}
        for term in document:
            counts[term] = counts.get(term, 0) + 1
        termCountsPerDocument.append(counts)
        vocabulary.update(counts)

    records = []
    for term in sorted(vocabulary):
        individualTermFrequencyList = [
            counts.get(term, 0) for counts in termCountsPerDocument
        ]
        # df >= 1 is guaranteed because the term was taken from a document.
        df = sum(1 for tf in individualTermFrequencyList if tf > 0)
        idf = np.log10(documentCount / df)

        individualTfIdfList = []
        for tf in individualTermFrequencyList:
            # Log-scaled term frequency weight; absent terms weigh nothing.
            wtd = 1 + np.log10(tf) if tf > 0 else 0
            # float() keeps the cell contents plain Python numbers.
            individualTfIdfList.append(float(round(wtd * idf, 4)))

        records.append({
            'sorted inv. term indices': term,
            'overall tf': sum(individualTermFrequencyList),
            'individual tf': individualTermFrequencyList,
            'df / idf': [df, float(round(idf, 4))],
            'individual tf-idf': individualTfIdfList,
        })

    # Construct the frame once: appending with pd.concat inside the loop is
    # quadratic and gives every row the same index label 0.
    return pd.DataFrame(records, columns=columns)
    

In [9]:
# Compute the tf-idf table for the sample collection; as the last expression
# of the cell, the returned DataFrame is what the notebook displays.
calculate_tfidf(pathToTextFile='../collection1.txt')

Unnamed: 0,sorted inv. term indices,overall tf,individual tf,df / idf,individual tf-idf
0,all,1,"[0, 0, 0, 0, 1]","[1, 0.699]","[0.0, 0.0, 0.0, 0.0, 0.699]"
0,did,2,"[0, 1, 0, 1, 0]","[2, 0.3979]","[0.0, 0.3979, 0.0, 0.3979, 0.0]"
0,exam,5,"[0, 2, 1, 1, 1]","[4, 0.0969]","[0.0, 0.1261, 0.0969, 0.0969, 0.0969]"
0,passed,1,"[0, 0, 0, 0, 1]","[1, 0.699]","[0.0, 0.0, 0.0, 0.0, 0.699]"
0,prepare,2,"[0, 1, 0, 1, 0]","[2, 0.3979]","[0.0, 0.3979, 0.0, 0.3979, 0.0]"
0,so,1,"[0, 0, 0, 0, 1]","[1, 0.699]","[0.0, 0.0, 0.0, 0.0, 0.699]"
0,successful,1,"[0, 0, 0, 0, 1]","[1, 0.699]","[0.0, 0.0, 0.0, 0.0, 0.699]"
0,test,7,"[3, 1, 1, 2, 0]","[4, 0.0969]","[0.1431, 0.0969, 0.0969, 0.1261, 0.0]"
0,the,9,"[2, 2, 2, 2, 1]","[5, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0]"
0,to,3,"[1, 1, 0, 1, 0]","[3, 0.2218]","[0.2218, 0.2218, 0.0, 0.2218, 0.0]"
