In [4]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import json
import re
import nltk
import math
from nltk.corpus import stopwords
from nltk.stem.porter import *
from collections import Counter,defaultdict
import pickle

----
# Creating inverted indexes for term frequency and term positions

In [5]:
def tokenize(text):
    """Lower-case ``text``, split it with NLTK and strip punctuation from each token.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The tokens produced by ``nltk.word_tokenize`` on the lower-cased text,
        with every character that is neither a word character nor whitespace
        removed. Tokens that become empty after stripping are dropped.
    """
    punctuation = re.compile(r'[^\w\s]')
    # Clean every token exactly once and filter afterwards, instead of running
    # the same substitution twice per token (once to test, once for the value).
    cleaned = (punctuation.sub('', raw) for raw in nltk.word_tokenize(text.lower()))
    return [token for token in cleaned if token != '']
   
def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        # Remember the set on the function object so later calls skip the reload.
        _english_stopwords._cache = cached
    return cached


def remove_stopwords(tokens):
    """Drop English stop words from a sequence of tokens.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter.

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stop-word list, in their
        original order.
    """
    # The stop-word list is loaded on the first call only; this function is
    # applied once per document, so reloading it every time is wasted work.
    en_stopwords = _english_stopwords()
    return [word for word in tokens if word not in en_stopwords]

def stemmer(tokens):
    """Reduce every token to its Porter stem.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to stem.

    Returns
    -------
    list of str
        The stem of each token, in the same order as the input.
    """
    porter = PorterStemmer()
    stems = []
    for token in tokens:
        stems.append(porter.stem(token))
    return stems

In [6]:
# Filename
source = 'data-hl.json'

# Open json file. JSON is UTF-8 by specification, so the encoding is given
# explicitly instead of relying on the platform's default encoding.
with open(source, encoding='utf-8') as f:
    data = json.load(f)

# Make dataframe from data file
df = pd.DataFrame(data)

# Create small subset for debugging (can be commented out for final run)
#df = df.head(50)

# Tokenize content, remove stopwords and stem
# Note : the indexed text is the 'title' column followed by the 'content' column.
# Missing titles/contents are treated as empty strings so that a single
# incomplete record does not abort the whole run with a TypeError.
text = df['title'].fillna('') + ' ' + df['content'].fillna('')
df['tokenized'] = text.apply(lambda t: stemmer(remove_stopwords(tokenize(t))))

In [7]:
# Check if everything worked as expected
#df.head()

In [None]:
# Create inverted indexes
#   invertedIndexFreq[term][url] -> number of occurrences of `term` in the document at `url`
#   invertedIndexPos[term][url]  -> list of 0-based token positions of `term` in that document
invertedIndexFreq = defaultdict(Counter)
# NOTE(review): the inner mapping here stores position lists, not counts; the
# Counter type is kept unchanged so consumers see the same structure as before.
invertedIndexPos = defaultdict(Counter)

# Corpus-level statistics: total number of documents and token count per url
corpusInfo = defaultdict(dict)
corpusInfo['num_docs'] = df.shape[0]

for url, tokens in zip(df['url'], df['tokenized']):
    corpusInfo['doc_lengths'][url] = len(tokens)

    # Collect the positions of every term in one pass over the document,
    # instead of rescanning the whole token list for each token occurrence
    # (which is quadratic in the document length).
    positions = defaultdict(list)
    for position, term in enumerate(tokens):
        positions[term].append(position)

    for term, term_positions in positions.items():
        # Frequencies accumulate, positions are assigned per document; this
        # matches the previous behaviour, including for a repeated url.
        invertedIndexFreq[term][url] += len(term_positions)
        invertedIndexPos[term][url] = term_positions

In [None]:
# Persist the inverted index with frequencies.
# source[:-5] drops the last five characters of `source` (its '.json' suffix).
freq_pickle_path = source[:-5]+'-invertedIndexFreq.pickle'
with open(freq_pickle_path, 'wb') as handle:
    pickle.dump(invertedIndexFreq, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Storing the inverted index with positions
# (the positions index itself is dumped here; previously the frequency index
# was written to this file by mistake)
with open(source[:-5]+'-invertedIndexPos.pickle', 'wb') as handle:
    pickle.dump(invertedIndexPos, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Storing the corpus info (number of documents and per-document lengths)
# (corpusInfo itself is dumped here; previously the frequency index was
# written to this file by mistake)
with open(source[:-5]+'-corpusInfo.pickle', 'wb') as handle:
    pickle.dump(corpusInfo, handle, protocol=pickle.HIGHEST_PROTOCOL)

------
# Combining all inverted indexes into one


In [None]:
# TODO

------
# Computing TF-IDF
## !! First the separate indexes need to be combined
We can only compute the tf-idf once we know, for every term, in how many documents of the WHOLE corpus (ALL documents) it occurs, i.e. after the indexes of all sources have been combined.


In [None]:
# Compute tf-idf
def tfidf(doc_freq, doc_length, n_docs_total, n_docs_containing):
    """Return the tf-idf weight of a term within one document.

    Parameters
    ----------
    doc_freq : number
        Number of occurrences of the term in the document.
    doc_length : number
        Length of the document; must be non-zero.
    n_docs_total : number
        Number of documents in the corpus.
    n_docs_containing : number
        Number of documents that contain the term.

    Returns
    -------
    float
        ``(doc_freq / doc_length) * ln(n_docs_total / (1 + n_docs_containing))``.
        Because of the ``1 +`` in the denominator the value is zero or negative
        when ``n_docs_containing >= n_docs_total - 1``.
    """
    term_frequency = doc_freq / doc_length
    inverse_doc_frequency = math.log(n_docs_total / (1 + n_docs_containing))
    return term_frequency * inverse_doc_frequency

# NOTE(review): `invertedIndex` is not defined in the visible cells; presumably
# it is produced by the (still TODO) step that combines the per-source indexes.
# This loop assumes every entry has a 'freq' mapping (doc id -> count) and an
# already existing 'tf-idf' mapping to write into - confirm once that step exists.

# The corpus size is the same for every term, so it is looked up only once.
n_docs_total = corpusInfo['num_docs']

for w, wv in invertedIndex.items():
    # Number of documents in which this term occurs
    n_docs_containing = len(wv['freq'])

    for d_id, doc_freq in wv['freq'].items():
        doc_length = corpusInfo['doc_lengths'][d_id]

        # `wv` is the entry stored under invertedIndex[w], so write through it directly
        wv['tf-idf'][d_id] = tfidf(doc_freq, doc_length, n_docs_total, n_docs_containing)