In [6]:
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import check_pairwise_arrays
from sklearn.preprocessing import normalize

%run preprocessing_ti.ipynb

#uses index based on order of documents.
#code by Jonas, based on Minying

In [10]:
#create inverted index
def createInvertedIndex(preprocessedCollection:pd.DataFrame, load = False, safe = True, path = "invertedIndex_ori.psave"):
    '''
    Builds the inverted index of a preprocessed collection, or loads a previously saved one.

    The index is a dict mapping every term to its postings list
    [[doc_position, count], ...], where doc_position is the 0-based position of the
    document within the collection (the implicit ordering of the documents, independent
    of the dataframe's index labels) and count is how often the term occurs in that document.
    Postings are ordered by ascending doc_position.
    Terms within the index are not sorted alphabetically, but this should not harm results.

    preprocessedCollection: The collection as a dataframe preprocessed by preprocessing_ori(),
                            i.e. its 'text' column holds one list of words per document.
                            Not accessed when load is true.
    load: if false, inverted index is created, else it is loaded from path
    safe: if false, a newly created inverted index is not saved to file
    path: where to save to/load from

    returns: the inverted index as a dict {term: [[doc_position, count], ...]}
    '''
    if load:
        # NOTE: unpickling can execute arbitrary code - only load index files
        # that were written by this function from a trusted location.
        with open(path,"rb") as f:
            return pickle.load(f)

    print('create the inverted index')
    inverted_index = {}
    # One pass over the documents instead of rescanning the whole collection for
    # every term. enumerate() iterates positionally, so this also works when the
    # dataframe does not carry the default 0..n-1 index.
    for doc_position, words in enumerate(preprocessedCollection['text']):
        # count the occurrences of every term within this document
        counts_in_doc = {}
        for word in words:
            counts_in_doc[word] = counts_in_doc.get(word, 0) + 1
        # documents are visited in order, so each postings list stays sorted by position
        for term, count in counts_in_doc.items():
            inverted_index.setdefault(term, []).append([doc_position, count])

    if safe:
        with open(path,"wb") as f:
            pickle.dump(inverted_index,f)
    return inverted_index

In [None]:
#inserted another save command
## f=open("inverted_index.txt","wb")
## pickle.dump(inverted_index,f)
## f.close()

In [None]:
def preprocessing_ori(collection: pd.DataFrame, 
                      stop_words_file_path='../data/nfcorpus/raw/stopwords.large', 
                      rare_words_file_path='../data/rare_tokens.txt'):
    '''
    modifies collection in place.
    preprocesses every document and replaces it by a list of non-unique words instead of the strings
    
    collection: The collection as provided by docs2df.ipynb; its 'text' column is overwritten
    stop_words_file_path: see documentation of Preprocessor
    rare_words_file_path: see documentation of Preprocessor

    returns: None, the result is written back to collection['text']
    '''
    #init the preprocessor
    p = Preprocessor(stop_words_file_path,rare_words_file_path)

    def to_word_list(text):
        '''preprocesses one document and turns it into a non-unique list of words'''
        processed = p.preprocess_line(text)
        # NOTE(review): preprocess_line is presumed to return the cleaned document as a
        # single string (hence the split) - confirm against Preprocessor. An already
        # tokenised result is passed through as a plain list.
        if isinstance(processed, str):
            return processed.split()
        return list(processed)

    # Assign the whole column at once: the result of apply() must be written back
    # (apply does not modify the series), and element-wise chained assignment
    # (collection['text'][i] = ...) is unreliable and depends on a 0..n-1 index.
    collection['text'] = collection['text'].apply(to_word_list)


In [None]:
def createIDF(inverted_index,NumberOfDocs):
    '''
    computes the inverse document frequency of every term of the inverted index.

    inverted_index: dict mapping each term to its postings list; the length of the
                    postings list is used as the term's document frequency
    NumberOfDocs: total number of documents in the collection

    returns: dict {term: log(NumberOfDocs / document frequency)} (natural logarithm)
    '''
    return {
        term: np.log(NumberOfDocs/len(postings))
        for term, postings in inverted_index.items()
    }

In [None]:
#my own implementation using a bit of optimisation. 
#Has same name as sklearn version, as it relies on their architecture and therefore replaces more smoothly in the code
def cosine_similarity(V,U): #myOwnCosineSimilarity(V,U):
    '''
    computes the pairwise cosine similarity of two dense matrices V,U.

    V, U: matrices with one vector per row; both are validated with
          sklearn's check_pairwise_arrays before use

    returns: matrix whose entry (i, j) is the dot product of the normalised
             i-th row of V and the normalised j-th row of U
    '''
    # let sklearn validate the inputs and make sure the dimensions are compatible
    checked_V, checked_U = check_pairwise_arrays(V,U)
    # scale the rows with sklearn's normalize (default settings), working on copies
    unit_V = normalize(checked_V,copy=True)
    unit_U = normalize(checked_U,copy=True)
    # plain dense dot product - no special handling of sparse matrices
    similarities = np.dot(unit_V,unit_U.T)
    return similarities

In [None]:
def termFrequencies(doc):
    '''outputs the bag of words representation of the document first and then the term frequency vector

    doc: the document as a list of (non-unique) words

    returns: tuple (terms, tf) where terms are the sorted unique words of doc and
             tf[i] = (1 + log(count of terms[i])) / (1 + log(highest count in doc)),
             so the most frequent term gets the weight 1.
             For an empty document both arrays are empty.
    '''
    terms, counts = np.unique(doc,return_counts=True)
    # logarithmically dampened raw counts
    tf = 1+np.log(counts)
    if counts.size == 0:
        # an empty document has no maximum count to normalise by (np.max would raise)
        return (terms,tf)
    # normalise by the dampened count of the most frequent term
    tf = tf/(1 + np.log(np.max(counts)))
    return (terms,tf)