In [121]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import json
import re
import nltk
import math
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from collections import Counter,defaultdict
import pickle
from itertools import chain

----
# Creating inverted indexes for term frequency and term positions

In [122]:
def tokenize(text):
    """Split `text` into lowercase tokens with punctuation stripped.

    The text is lowercased and split with `nltk.word_tokenize`; every
    character that is neither a word character nor whitespace is then removed
    from each token, and tokens that end up empty are dropped.
    """
    cleaned_tokens = []
    for raw_token in nltk.word_tokenize(text.lower()):
        cleaned = re.sub(r'[^\w\s]', '', raw_token)
        if cleaned != '':
            cleaned_tokens.append(cleaned)
    return cleaned_tokens
   
def remove_stopwords(tokens):
    """Return `tokens` without the words in NLTK's English stop-word list.

    The order of the remaining tokens is preserved.
    """
    blocked = set(stopwords.words('english'))
    return list(filter(lambda word: word not in blocked, tokens))

def stemmer(tokens):
    """Return the Porter stem of every token in `tokens`, in the same order.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to stem.

    Returns
    -------
    list of str
        One stem per input token.
    """
    # Imported here because the notebook's import cell only brings in
    # SnowballStemmer; the bare PorterStemmer name raised NameError on a
    # freshly started kernel.
    from nltk.stem.porter import PorterStemmer

    # Named `porter` so the local no longer shadows this function's own name.
    porter = PorterStemmer()
    return [porter.stem(token) for token in tokens]

In [123]:
# YOU ONLY HAVE TO CHANGE THIS BLOCK OF CODE

# Filename of the source to index; the pickles written further down reuse it
# with the last five characters ('.json') stripped.
source = 'data-mw.json' # CHANGE THIS

# Open json file. The encoding is given explicitly because JSON text is UTF-8,
# whereas open() would otherwise fall back to the platform's default encoding.
with open(source, encoding='utf-8') as f:
    data = json.load(f)

# Make dataframe from data file
df = pd.DataFrame(data) # MAKE SURE YOUR DATAFRAME HAS A COLUMN CALLED 'url'

# Create small subset for debugging (can be commented out for final run)
#df = df.head(100)

# Tokenize content, remove stopwords and stem
# Note : the following line includes the 'title' and 'content' column
# (columns are read with row[...] so a column name can never be confused with
# an attribute of the row object)
df['tokenized'] = df.apply(
    lambda row: stemmer(remove_stopwords(tokenize(row['title'] + ' ' + row['content']))),
    axis=1,
) # CHANGE THIS


In [125]:
# Create inverted index
invertedIndexFreq = defaultdict(Counter)  # term -> Counter {url: number of occurrences}
invertedIndexPos = defaultdict(dict)      # term -> {url: [positions of the term in the document]}
corpusInfo = defaultdict(dict)            # 'num_docs' -> int, 'doc_lengths' -> {url: token count}
corpusInfo['num_docs'] = df.shape[0]

for index, row in df.iterrows():
    url = row['url']
    tokens = row['tokenized']
    corpusInfo['doc_lengths'][url] = len(tokens)

    # Collect the positions of every term in a single pass over the document.
    # (Rescanning the whole token list for each token is quadratic in the
    # document length and yields the same lists.)
    positions_by_term = defaultdict(list)
    for position, term in enumerate(tokens):
        positions_by_term[term].append(position)

    for term, positions in positions_by_term.items():
        # The number of positions is the term's frequency in this document.
        invertedIndexFreq[term][url] += len(positions)
        invertedIndexPos[term][url] = positions

In [126]:
# Persist the frequency inverted index next to the source file.
# source[:-5] drops the last five characters of the file name ('.json').
freq_index_path = source[:-5] + '-invertedIndexFreq.pickle'
with open(freq_index_path, 'wb') as out_file:
    pickle.dump(invertedIndexFreq, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [127]:
# Persist the positional inverted index next to the source file.
# source[:-5] drops the last five characters of the file name ('.json').
pos_index_path = source[:-5] + '-invertedIndexPos.pickle'
with open(pos_index_path, 'wb') as out_file:
    pickle.dump(invertedIndexPos, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [128]:
# Persist the corpus statistics (document count and per-document lengths).
# source[:-5] drops the last five characters of the file name ('.json').
corpus_info_path = source[:-5] + '-corpusInfo.pickle'
with open(corpus_info_path, 'wb') as out_file:
    pickle.dump(corpusInfo, out_file, protocol=pickle.HIGHEST_PROTOCOL)

------
# Combining all indexes into one


### FREQUENCY INDEX

In [51]:
# Load inverted indexes, one per source. Each file is opened in a `with` block
# so its handle is closed as soon as the pickle has been read.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this notebook.
freq_sources = ('imdb', 'tmz', 'mw', 'rr', 'hl', 'rt')
freq_indexes = {}
for site in freq_sources:
    with open("data-" + site + "-invertedIndexFreq.pickle", "rb") as handle:
        freq_indexes[site] = pickle.load(handle)

# Keep the per-source variable names available for any cell that refers to them
ii_freq_imdb, ii_freq_tmz, ii_freq_mw, ii_freq_rr, ii_freq_hl, ii_freq_rt = (
    freq_indexes[site] for site in freq_sources
)

# Initialize final frequency index dict
ii_freq_merged = defaultdict(Counter)

# Merge indexes. Counter.update() ADDS counts, so a document that appears in
# more than one source index has its frequencies summed.
for k, v in chain.from_iterable(freq_indexes[site].items() for site in freq_sources):
    ii_freq_merged[k].update(v)

In [52]:
# Storing the merged frequency index.
# The '.pickle' extension is required: the TF-IDF section reloads this file as
# "merged-invertedIndexFreq.pickle", so writing it without the extension left
# that cell reading a missing (or stale) file.
with open('merged-invertedIndexFreq.pickle', 'wb') as handle:
    pickle.dump(ii_freq_merged, handle, protocol=pickle.HIGHEST_PROTOCOL)

### POSITION INDEX

In [87]:
# Load inverted indexes, one per source. Each file is opened in a `with` block
# so its handle is closed as soon as the pickle has been read.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this notebook.
pos_sources = ('imdb', 'tmz', 'mw', 'rr', 'hl', 'rt')
pos_indexes = {}
for site in pos_sources:
    with open("data-" + site + "-invertedIndexPos.pickle", "rb") as handle:
        pos_indexes[site] = pickle.load(handle)

# Keep the per-source variable names available for any cell that refers to them
ii_pos_imdb, ii_pos_tmz, ii_pos_mw, ii_pos_rr, ii_pos_hl, ii_pos_rt = (
    pos_indexes[site] for site in pos_sources
)

# Initialize final position index dict as a fresh container, so that merging
# does not modify ii_pos_imdb through an alias.
ii_pos_merged = defaultdict(dict)

# Merge indexes in source order: for a (term, document) pair present in several
# sources, the position list of the later source replaces the earlier one.
for site in pos_sources:
    for w, dic in pos_indexes[site].items():
        ii_pos_merged[w].update(dic)

In [73]:
# Persist the merged positional inverted index
merged_pos_path = 'merged-invertedIndexPos.pickle'
with open(merged_pos_path, 'wb') as out_file:
    pickle.dump(ii_pos_merged, out_file, protocol=pickle.HIGHEST_PROTOCOL)

### CORPUS INFO

In [100]:
# Load corpus infos, one per source. Each file is opened in a `with` block so
# its handle is closed as soon as the pickle has been read.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this notebook.
ci_sources = ('imdb', 'tmz', 'mw', 'rr', 'hl', 'rt')
corpus_infos = {}
for site in ci_sources:
    with open("data-" + site + "-corpusInfo.pickle", "rb") as handle:
        corpus_infos[site] = pickle.load(handle)

# Keep the per-source variable names available for any cell that refers to them
ci_imdb, ci_tmz, ci_mw, ci_rr, ci_hl, ci_rt = (
    corpus_infos[site] for site in ci_sources
)

# Initialize final corpus info dict as a fresh container, so that merging does
# not modify ci_imdb through an alias.
ci_merged = defaultdict(dict)
ci_merged['num_docs'] = 0
ci_merged['doc_lengths'] = {}

# Merge in source order: document counts are summed, and for a document listed
# by several sources the length from the later source wins.
for site in ci_sources:
    ci_merged['num_docs'] += corpus_infos[site]['num_docs']
    ci_merged['doc_lengths'].update(corpus_infos[site]['doc_lengths'])

In [101]:
# Persist the merged corpus statistics (document count and document lengths)
merged_corpus_info_path = 'merged-corpusInfo.pickle'
with open(merged_corpus_info_path, 'wb') as out_file:
    pickle.dump(ci_merged, out_file, protocol=pickle.HIGHEST_PROTOCOL)

------
# Computing TF-IDF
We can only compute the TF-IDF weights once we know, for every term, how many documents in the WHOLE corpus (ALL documents) contain it, i.e. only after all sources have been combined.


In [102]:
# Compute tf-idf
def tfidf(doc_freq, doc_length, n_docs_total, n_docs_containing):
    """Return the TF-IDF weight of a term in one document.

    The weight is (doc_freq / doc_length) * ln(n_docs_total / (1 + n_docs_containing)).

    Parameters
    ----------
    doc_freq : number of occurrences of the term in the document.
    doc_length : number of tokens in the document.
    n_docs_total : number of documents in the corpus.
    n_docs_containing : number of documents that contain the term.

    Returns
    -------
    float
        The weight, or 0.0 when `doc_length` or `n_docs_total` is not
        positive (no term frequency or logarithm can be computed then).
        Because of the +1 in the denominator the weight is zero or negative
        for a term contained in `n_docs_total - 1` or more documents.
    """
    # Guard against an empty document or an empty corpus instead of raising
    # ZeroDivisionError / a math domain error.
    if doc_length <= 0 or n_docs_total <= 0:
        return 0.0

    term_frequency = doc_freq / doc_length
    inverse_doc_frequency = math.log(n_docs_total / (1 + n_docs_containing))
    return term_frequency * inverse_doc_frequency

# Load merged files. Each file is opened in a `with` block so its handle is
# closed as soon as the pickle has been read.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this notebook.
with open("merged-invertedIndexFreq.pickle", "rb") as handle:
    ii_freq_merged = pickle.load(handle)
with open("merged-corpusInfo.pickle", "rb") as handle:
    ci_merged = pickle.load(handle)

# Initialize final weights index dict
ii_w_merged = defaultdict(Counter)  # term -> {document: tf-idf weight}

# Corpus-wide values do not change while iterating, so look them up once
n_docs_total = ci_merged['num_docs']
doc_lengths = ci_merged['doc_lengths']

for w, wv in ii_freq_merged.items():
    # Number of documents whose postings contain this term
    n_docs_containing = len(wv)

    for d_id, dv in wv.items():
        ii_w_merged[w][d_id] = tfidf(dv, doc_lengths[d_id], n_docs_total, n_docs_containing)

In [104]:
# Persist the merged TF-IDF weights index
merged_weights_path = 'merged-invertedIndexWeights.pickle'
with open(merged_weights_path, 'wb') as out_file:
    pickle.dump(ii_w_merged, out_file, protocol=pickle.HIGHEST_PROTOCOL)