## Collection and vocabulary

In [4]:
from collection_vocabulary import Collection
import os
import pickle

# Build the collection object (exposes the documents and the vocabulary used below).
col = Collection()

# Make sure the output directory exists first: open(..., 'wb') does not create
# missing parent directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs('pickle', exist_ok=True)

# Persist the collection so it can be reloaded without rebuilding it.
with open('pickle/col.pkl', 'wb') as output:
    pickle.dump(col, output)

## Document Term Matrix
The document term matrix is first built as a list of lists and then converted to a Pandas dataframe, which is stored to disk to facilitate debugging and further experimentation.

In [None]:
# One row per document, one column per vocabulary word; each entry is the
# number of times that word is counted in that document.
# NOTE(review): .count() runs once per (document, word) pair; if documents are
# plain token lists, a collections.Counter per document would presumably be
# much faster — confirm the container type before changing this.
doc_term_matrix = [
    [col.collection[doc_id].count(term) for term in col.vocabulary]
    for doc_id in col.collection
]

In [None]:
import pandas as pd
import numpy as np

# Wrap the list of lists in a labelled frame:
# rows are indexed by the collection keys, columns by the vocabulary words.
doc_term_matrix = pd.DataFrame(
    doc_term_matrix,
    index=col.collection.keys(),
    columns=col.vocabulary,
)

# Store the frame on disk so it can be reloaded for debugging and experiments.
doc_term_matrix.to_pickle('pickle/doc_term_matrix.pkl')

In [None]:
# Preview: this is what the first three rows of the doc term matrix look like.
doc_term_matrix.head(n=3)

In [None]:
# Sanity check: the matrix should have 3633 rows (documents)
# and 29052 columns (vocabulary words).
n_docs, n_terms = doc_term_matrix.shape
(n_docs, n_terms)

In [None]:
# Summary statistics of the per-document row sums, for the project report.
# This doubles as a sanity check: a minimum of 0 would reveal an empty document.
term_count_per_doc = doc_term_matrix.sum(axis=1)
term_count_per_doc.describe()

## Inverted Index

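The inverted index is our unified (and in practice memory-efficient) representation of the document term matrix, and it is the one we use in the remainder of this project. It is obtained by transposing the document term matrix, so that each row corresponds to a term and each column to a document.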



In [None]:
# The inverted index is the transpose of the doc term matrix:
# rows become vocabulary words, columns become documents.
inverted_index = doc_term_matrix.T

# Saved for later use (embeddings, queries, ...).
# NOTE(review): this file is written to the working directory, whereas the
# other pickles go under 'pickle/' — confirm this is intentional.
inverted_index.to_pickle('inverted_index.pkl')

In [None]:
# Sanity check 1:
# every term should occur at least once (implied by the way the index is
# constructed), so the smallest per-term total must be >= 1.
term_totals = inverted_index.sum(axis=1)
term_totals.min()