In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import json
import re
import nltk
import math
from nltk.corpus import stopwords
from nltk.stem.porter import *
from collections import Counter,defaultdict
import pickle
from itertools import chain

------
# Combining all indexes into one

## FREQ - CORPUSINFO - TF-IDF

In [None]:
# Load the per-source frequency inverted indexes.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load files that were produced by this project's own indexing step.
freq_index_paths = (
    "data-imdb-invertedIndexFreq.pickle",
    "data-tmz-invertedIndexFreq.pickle",
    "data-mw-invertedIndexFreq.pickle",
    "data-rr-invertedIndexFreq.pickle",
    "data-hl-invertedIndexFreq.pickle",
    "data-rt-invertedIndexFreq.pickle",
)

freq_indexes = []
for path in freq_index_paths:
    # `with` closes each file handle as soon as its index has been read.
    with open(path, "rb") as handle:
        freq_indexes.append(pickle.load(handle))

# Keep the individual per-source names available for any later cell.
(ii_freq_imdb, ii_freq_tmz, ii_freq_mw,
 ii_freq_rr, ii_freq_hl, ii_freq_rt) = freq_indexes

# Initialize final frequency index dict: term -> Counter of per-document counts
ii_freq_merged = defaultdict(Counter)

# Merge indexes. Counter.update ADDS counts, so if the same document key shows
# up under the same term in more than one source its frequencies are summed.
for term, doc_freqs in chain.from_iterable(index.items() for index in freq_indexes):
    ii_freq_merged[term].update(doc_freqs)

# Store the merged frequency index. The file name carries the ".pickle"
# extension because that is the name the tf-idf cell loads it back under.
with open('merged-invertedIndexFreq.pickle', 'wb') as handle:
    pickle.dump(ii_freq_merged, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Load the per-source corpus infos. Each one is indexed below with the keys
# 'num_docs' and 'doc_lengths'.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load files that were produced by this project's own indexing step.
corpus_info_paths = (
    "data-imdb-corpusInfo.pickle",
    "data-tmz-corpusInfo.pickle",
    "data-mw-corpusInfo.pickle",
    "data-rr-corpusInfo.pickle",
    "data-hl-corpusInfo.pickle",
    "data-rt-corpusInfo.pickle",
)

corpus_infos = []
for path in corpus_info_paths:
    # `with` closes each file handle as soon as its corpus info has been read.
    with open(path, "rb") as handle:
        corpus_infos.append(pickle.load(handle))

# Keep the individual per-source names available for any later cell.
ci_imdb, ci_tmz, ci_mw, ci_rr, ci_hl, ci_rt = corpus_infos

# Initialize final corpus info dict.
# NOTE: this is an alias, not a copy -- ci_imdb itself is extended in place by
# the merge below and no longer describes only the IMDB corpus afterwards.
ci_merged = ci_imdb

# Merge the remaining corpus infos into the IMDB one: document counts are
# added up, and each document length is copied over (a document key that
# already exists is overwritten by the later source).
for ci_source in (ci_tmz, ci_mw, ci_rr, ci_hl, ci_rt):
    ci_merged['num_docs'] += ci_source['num_docs']
    # Explicit assignment instead of .update(): if doc_lengths were a Counter,
    # .update() would add the lengths rather than replace them.
    for doc, length in ci_source['doc_lengths'].items():
        ci_merged['doc_lengths'][doc] = length

# Storing the corpus info
with open('merged-corpusInfo.pickle', 'wb') as handle:
    pickle.dump(ci_merged, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
def tfidf(doc_freq, doc_length, n_docs_total, n_docs_containing):
    """Return the tf-idf weight of one term in one document.

    The weight is the term frequency ``doc_freq / doc_length`` multiplied by
    the inverse document frequency
    ``log(n_docs_total / (1 + n_docs_containing))`` (natural logarithm).

    Because of the ``1 +`` in the denominator, the idf part is zero when the
    term occurs in ``n_docs_total - 1`` documents and negative when it occurs
    in all of them.

    Args:
        doc_freq: number of occurrences of the term in the document.
        doc_length: length of the document; must be non-zero.
        n_docs_total: number of documents in the corpus; must be positive.
        n_docs_containing: number of documents that contain the term.

    Raises:
        ZeroDivisionError: if ``doc_length`` is 0.
        ValueError: if the argument of the logarithm is not positive.
    """
    term_frequency = doc_freq / doc_length
    inverse_doc_frequency = math.log(n_docs_total / (1 + n_docs_containing))
    return term_frequency * inverse_doc_frequency

# Load merged files (the merged frequency index and the merged corpus info).
# The file names must match the ones the merge cells write.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load files that were produced by this notebook.
with open("merged-invertedIndexFreq.pickle", "rb") as handle:
    ii_freq_merged = pickle.load(handle)
with open("merged-corpusInfo.pickle", "rb") as handle:
    ci_merged = pickle.load(handle)

# Initialize final weights index dict: term -> Counter of per-document weights
ii_w_merged = defaultdict(Counter)

# Corpus-wide values do not change per term, so look them up once.
n_docs_total = ci_merged['num_docs']
doc_lengths = ci_merged['doc_lengths']

for w, wv in ii_freq_merged.items():
    # Number of documents whose postings mention this term.
    n_docs_containing = len(wv)

    for d_id, doc_freq in wv.items():
        ii_w_merged[w][d_id] = tfidf(doc_freq, doc_lengths[d_id], n_docs_total, n_docs_containing)

# Store the tf-idf weighted inverted index
with open('merged-invertedIndexWeights.pickle', 'wb') as handle:
    pickle.dump(ii_w_merged, handle, protocol=pickle.HIGHEST_PROTOCOL)

## POSITIONS

In [None]:
# Load the per-source positional inverted indexes. Each one is iterated below
# as word -> {document key -> value}; the values are presumably position
# lists (TODO confirm against the indexing step).
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load files that were produced by this project's own indexing step.
pos_index_paths = (
    "data-imdb-invertedIndexPos.pickle",
    "data-tmz-invertedIndexPos.pickle",
    "data-mw-invertedIndexPos.pickle",
    "data-rr-invertedIndexPos.pickle",
    "data-hl-invertedIndexPos.pickle",
    "data-rt-invertedIndexPos.pickle",
)

pos_indexes = []
for path in pos_index_paths:
    # `with` closes each file handle as soon as its index has been read.
    with open(path, "rb") as handle:
        pos_indexes.append(pickle.load(handle))

# Keep the individual per-source names available for any later cell.
(ii_pos_imdb, ii_pos_tmz, ii_pos_mw,
 ii_pos_rr, ii_pos_hl, ii_pos_rt) = pos_indexes

# Initialize final position index dict.
# NOTE: this is an alias, not a copy -- ii_pos_imdb itself is extended in
# place by the merge below.
ii_pos_merged = ii_pos_imdb

# Merge the remaining indexes into the IMDB one. A (word, document) entry that
# already exists is overwritten by the later source.
for pos_index in (ii_pos_tmz, ii_pos_mw, ii_pos_rr, ii_pos_hl, ii_pos_rt):
    for w, dic in pos_index.items():
        for doc, lis in dic.items():
            # setdefault creates the per-word dict when the word is not in the
            # merged index yet, so this also works (instead of raising
            # KeyError) when the loaded index is a plain dict.
            ii_pos_merged.setdefault(w, {})[doc] = lis

# Store the merged positional index
with open('merged-invertedIndexPos.pickle', 'wb') as handle:
    pickle.dump(ii_pos_merged, handle, protocol=pickle.HIGHEST_PROTOCOL)