In [2]:
import pandas as pd
import re
import numpy

In [15]:
#create an unsorted posting list from a collection file (step outlined in lecture chapter 2.3.2.2 )
def create_postings_from_file(filename, optional_query=None):
    """Build an unsorted posting list from a collection file.

    Every non-blank line of the file is expected to look like
    ``<index>: <text>``. Everything up to the first ``": "`` is dropped and
    the remaining text is split on whitespace into terms. Documents are
    numbered 1, 2, ... in the order in which they appear.

    Parameters
    ----------
    filename : str
        Path of the collection file.
    optional_query : str, optional
        If a string is passed, it is appended as one additional document
        after all documents of the file, so it receives the highest id.

    Returns
    -------
    list
        ``[posting_list, collection_sz]`` where ``posting_list`` is a list of
        ``(term, doc_id)`` tuples and ``collection_sz`` is the number of
        documents (including the query, if one was passed).

    Raises
    ------
    IndexError
        If a non-blank line does not contain the ``": "`` separator.
    """
    #the context manager closes the file even if reading fails
    with open(filename, "r") as file_handle:
        #blank lines (e.g. a trailing newline at the end of the file) are not documents
        lines = [line for line in file_handle if line.strip()]

    #option to pass a query string, which gets evaluated with respect to document table
    if isinstance(optional_query, str):
        lines.append("query: " + optional_query)

    #split only at the FIRST ": " so that text which itself contains ": " is not cut off,
    #then split the remaining text by whitespace into terms
    term_list = [line.split(": ", 1)[1].split() for line in lines]
    collection_sz = len(term_list)

    #generate postings: one (term, doc_id) pair per term occurrence, ids start at 1
    posting_list = [
        (term, doc_id)
        for doc_id, doc in enumerate(term_list, start=1)
        for term in doc
    ]

    return [posting_list, collection_sz]

In [4]:
# sort posting list alphabetically (step outlined in lecture chapter 2.3.2.3 )
def sort_postings(posting_list):
    """Return a new list with the postings ordered alphabetically by term.

    Only the term (first tuple element) is used as sort key; the sort is
    stable, so postings of the same term keep their relative order. The
    input is not modified.
    """
    ordered = list(posting_list)
    ordered.sort(key=lambda posting: posting[0])
    return ordered

In [5]:
# reduce the sorted list, calculate df (step outlined in lecture chapter 2.3.2.4 )
def reduce_list(posting_list_sorted):
    """Merge runs of equal terms in a sorted posting list.

    Parameters
    ----------
    posting_list_sorted : list
        ``(term, doc_id)`` entries in which equal terms are adjacent.

    Returns
    -------
    list
        One ``(term, count, doc_ids)`` tuple per run of equal terms, where
        ``count`` is the number of merged postings and ``doc_ids`` lists their
        document ids in input order (duplicates are kept).
    """
    #each group is [term, doc_ids]; a list is used so doc_ids can grow while scanning
    groups = []

    for posting in posting_list_sorted:
        term = posting[0]
        doc_id = posting[1]

        #the list is sorted, so a posting either continues the current run or starts a new one
        if groups and groups[-1][0] == term:
            groups[-1][1].append(doc_id)
        else:
            groups.append([term, [doc_id]])

    return [(term, len(doc_ids), doc_ids) for term, doc_ids in groups]

In [6]:
#idf as in lecture chapter 5.3.3
def calc_idf(posting, nr_of_docs):
    """Return the inverse document frequency of one reduced posting.

    ``posting[2]`` is read as the list of document ids of the term; the
    document frequency is the number of distinct ids in it. The result is
    ``log10(nr_of_docs / df)``.
    """
    distinct_docs = numpy.unique(posting[2])
    ratio = nr_of_docs / distinct_docs.size
    return numpy.log10(ratio)

In [7]:
#log-frequency weight of tf as in lecture chapter 5.2.2
def calc_log_freq_weight(tf):
    """Return the log-frequency weight of a term frequency.

    The weight is ``1 + log10(tf)`` for ``tf > 0`` and ``0`` otherwise.
    """
    if tf > 0:
        return 1 + numpy.log10(tf)
    return 0

In [8]:
#build one dataframe row per term: overall tf, per-document tf, df / idf and per-document tf-idf
def create_dataframe(reduced_list, nr_of_docs):
    """Build the tf / df / idf / tf-idf table for a reduced posting list.

    Parameters
    ----------
    reduced_list : list
        Entries of the form ``(term, overall_tf, doc_ids)``; ``doc_ids`` holds
        one 1-based document id per occurrence of the term.
    nr_of_docs : int
        Number of documents in the collection; every per-document list in the
        result has this length.

    Returns
    -------
    pandas.DataFrame
        One row per term with the columns 'sorted inv. term indices',
        'overall tf', 'individual tf', 'df / idf' and 'individual tf-idf'.
        The idf and tf-idf values are strings formatted to four decimals.
    """
    dataframe_list = []
    for el in reduced_list:
        doc_ids = el[2]

        #count the occurrences of the term in each document (doc ids are 1-based)
        tf_each_doc = numpy.zeros(nr_of_docs, dtype=int).tolist()
        for doc_id in doc_ids:
            tf_each_doc[doc_id - 1] += 1

        #df, idf and tf-idf only depend on the finished counts, so they are computed
        #once per term after the counting loop instead of once per occurrence
        doc_freq = len(numpy.unique(doc_ids))
        idf = calc_idf(el, nr_of_docs)
        tf_idf_each_doc = ["{0:.4f}".format(calc_log_freq_weight(tf) * idf) for tf in tf_each_doc]

        row = [el[0], el[1], tf_each_doc, [doc_freq, "{0:.4f}".format(idf)], tf_idf_each_doc]
        dataframe_list.append(row)

    df = pd.DataFrame(dataframe_list, columns = ['sorted inv. term indices', 'overall tf', 'individual tf', 'df / idf', 'individual tf-idf'])
    return df

In [9]:
#combine all of the above functions to deliver dataframe
def calculate_tfidf(file_path, optional_query=None):
    """Run the whole pipeline for one collection file and return the table.

    The postings are created from ``file_path`` (plus ``optional_query`` if
    given), sorted, reduced and turned into a dataframe. The column axis of
    the returned dataframe is named after ``file_path``.
    """
    #map
    unsorted_postings, nr_of_docs = create_postings_from_file(file_path, optional_query)

    #sort, then reduce
    merged_postings = reduce_list(sort_postings(unsorted_postings))

    #create a dataframe out of reduced list (tf and idf are calculated there), label it with the source file
    table = create_dataframe(merged_postings, nr_of_docs)
    return table.rename_axis(file_path, axis=1)

In [16]:
#df = calculate_tfidf("../collection1.txt")
#df

[['the', 'test', 'was', 'to', 'test', 'the', 'test'], ['we', 'did', 'prepare', 'the', 'exam', 'to', 'test', 'the', 'exam'], ['was', 'the', 'test', 'the', 'exam'], ['the', 'exam', 'we', 'did', 'prepare', 'was', 'to', 'test', 'the', 'test'], ['we', 'were', 'successful', 'so', 'we', 'all', 'passed', 'the', 'exam']]


../collection1.txt,sorted inv. term indices,overall tf,individual tf,df / idf,individual tf-idf
0,all,1,"[0, 0, 0, 0, 1]","[1, 0.6990]","[0.0000, 0.0000, 0.0000, 0.0000, 0.6990]"
1,did,2,"[0, 1, 0, 1, 0]","[2, 0.3979]","[0.0000, 0.3979, 0.0000, 0.3979, 0.0000]"
2,exam,5,"[0, 2, 1, 1, 1]","[4, 0.0969]","[0.0000, 0.1261, 0.0969, 0.0969, 0.0969]"
3,passed,1,"[0, 0, 0, 0, 1]","[1, 0.6990]","[0.0000, 0.0000, 0.0000, 0.0000, 0.6990]"
4,prepare,2,"[0, 1, 0, 1, 0]","[2, 0.3979]","[0.0000, 0.3979, 0.0000, 0.3979, 0.0000]"
5,so,1,"[0, 0, 0, 0, 1]","[1, 0.6990]","[0.0000, 0.0000, 0.0000, 0.0000, 0.6990]"
6,successful,1,"[0, 0, 0, 0, 1]","[1, 0.6990]","[0.0000, 0.0000, 0.0000, 0.0000, 0.6990]"
7,test,7,"[3, 1, 1, 2, 0]","[4, 0.0969]","[0.1431, 0.0969, 0.0969, 0.1261, 0.0000]"
8,the,9,"[2, 2, 2, 2, 1]","[5, 0.0000]","[0.0000, 0.0000, 0.0000, 0.0000, 0.0000]"
9,to,3,"[1, 1, 0, 1, 0]","[3, 0.2218]","[0.2218, 0.2218, 0.0000, 0.2218, 0.0000]"
