In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import json
import re
import nltk
import math
from nltk.corpus import stopwords
from nltk.stem.porter import *
from collections import Counter,defaultdict
import pickle
from itertools import chain

----
# Creating inverted indexes for term frequency and term positions

In [2]:
def tokenize(text):
    return [re.sub(r'[^\w\s]','',w) for w in nltk.word_tokenize(text.lower()) if re.sub(r'[^\w\s]','',w) != '']
   
def remove_stopwords(tokens):
    en_stopwords = set(stopwords.words('english'))
    return [word for word in tokens if word not in en_stopwords]

def stemmer(tokens):
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens]

In [9]:
# YOU ONLY HAVE TO CHANGE THIS BLOCK OF CODE

# Filename
source = 'data-hl.json' # CHANGE THIS

# Open json file 
with open(source) as f:
    data = json.load(f)

# Make dataframe from data file 
df = pd.DataFrame(data) # MAKE SURE YOUR DATAFRAME HAS A COLUMN CALLED 'url'

# Create small subset for debugging (can be commented out for final run)
#df = df.head(100)

# Tokenize content, remove stopwords and stem
# Note : the following line includes the 'title' and 'content' column
df['tokenized'] = df.apply(lambda row: stemmer(remove_stopwords(tokenize(row.title + ' ' + row.content))), axis=1) # CHANGE THIS


In [47]:
# Check if everything worked as expected
df.shape

(10000, 5)

In [10]:
# Create inverted index
#invertedIndexFreq = defaultdict(Counter)
invertedIndexPos = defaultdict(dict)

#corpusInfo = defaultdict(dict)
#corpusInfo['num_docs'] = df.shape[0]

for index, row in df.iterrows():
    #corpusInfo['doc_lengths'][row['url']] = len(row['tokenized'])
        
    for w in row['tokenized']:
        #invertedIndexFreq[w][row['url']]+=1
        invertedIndexPos[w][row['url']] = [i for i, j in enumerate(row['tokenized']) if w == j]

In [7]:
invertedIndexPos

defaultdict(dict,
            {'khloe': {'https://hollywoodlife.com/2019/02/19/khloe-kardashian-friend-malika-haqq-slams-jordyn-woods-cheating-tristan-thompson/': [0,
               9,
               64,
               155,
               208],
              'https://hollywoodlife.com/2019/02/19/khloe-kardashian-comments-tristan-thompson-jordyn-woods-cheating-kylie-jenner-bff/': [0,
               10,
               79,
               114,
               129,
               143,
               187,
               201,
               263],
              'https://hollywoodlife.com/feature/men-who-have-cheated-on-kardashian-jenner-sisters-unfaithful-boyfriends-pics-eg19/': [109,
               124,
               125,
               159],
              'https://hollywoodlife.com/2019/02/20/khloe-kardashian-first-pic-since-split-tristan-thompson-cheating-scandal-jordyn-woods/': [1,
               12,
               82,
               87,
               127,
               161,
            

In [None]:
# Storing the inverted index with frequences
with open(source[:-5]+'-invertedIndexFreq.pickle', 'wb') as handle:
    pickle.dump(invertedIndexFreq, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [49]:
# Storing the inverted index with positions
with open(source[:-5]+'-invertedIndexPos.pickle', 'wb') as handle:
    pickle.dump(invertedIndexPos, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [50]:
# Storing the corpus info
with open(source[:-5]+'-corpusInfo.pickle', 'wb') as handle:
    pickle.dump(corpusInfo, handle, protocol=pickle.HIGHEST_PROTOCOL)

------
# Combining all indexes into one


### FREQUENCY INDEX

In [11]:
a = invertedIndexPos
b = invertedIndexPos

# Initialize final defaultdict
ii_freq_merged = defaultdict(Counter)

# Merge indexes
for k,v in chain(a.items(), b.items()):
    ii_freq_merged[k].update(v)

In [51]:
# Load inverted indexes
ii_freq_imdb = pickle.load( open( "data-imdb-invertedIndexFreq.pickle", "rb" ) )
ii_freq_tmz = pickle.load( open( "data-tmz-invertedIndexFreq.pickle", "rb" ) )
ii_freq_mw = pickle.load( open( "data-mw-invertedIndexFreq.pickle", "rb" ) )
ii_freq_rr = pickle.load( open( "data-rr-invertedIndexFreq.pickle", "rb" ) )
ii_freq_hl = pickle.load( open( "data-hl-invertedIndexFreq.pickle", "rb" ) )
ii_freq_rt = pickle.load( open( "data-rt-invertedIndexFreq.pickle", "rb" ) )

# Initialize final defaultdict
ii_freq_merged = defaultdict(Counter)

# Merge indexes
for k,v in chain(ii_freq_imdb.items(), ii_freq_tmz.items(), 
                 ii_freq_mw.items(), ii_freq_rr.items(),
                 ii_freq_hl.items(), ii_freq_rt.items()):
    
    ii_freq_merged[k].update(v)

In [52]:
# Storing the corpus info
with open('merged-invertedIndexFreq', 'wb') as handle:
    pickle.dump(ii_freq_merged, handle, protocol=pickle.HIGHEST_PROTOCOL)

### POSITION INDEX

In [66]:
# Load inverted indexes
ii_pos_imdb = pickle.load( open( "data-imdb-invertedIndexPos.pickle", "rb" ) )
ii_pos_tmz = pickle.load( open( "data-tmz-invertedIndexPos.pickle", "rb" ) )
ii_pos_mw = pickle.load( open( "data-mw-invertedIndexPos.pickle", "rb" ) )
ii_pos_rr = pickle.load( open( "data-rr-invertedIndexPos.pickle", "rb" ) )
ii_pos_hl = pickle.load( open( "data-hl-invertedIndexPos.pickle", "rb" ) )
ii_pos_rt = pickle.load( open( "data-rt-invertedIndexPos.pickle", "rb" ) )

# Initialize final defaultdict
ii_pos_merged = defaultdict(Counter)

# Merge indexes
for k,v in chain(dict(ii_pos_hl).items(), dict(ii_pos_mw).items()):
    ii_pos_merged[k].extend(v)

AttributeError: 'Counter' object has no attribute 'extend'

In [None]:
# Storing the corpus info
with open('merged-invertedIndexFreq', 'wb') as handle:
    pickle.dump(ii_freq_merged, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [62]:
for k, v in chain(ii_pos_imdb['soulja'], ii_pos_tmz['soulja']):
    print(k,v)

ValueError: too many values to unpack (expected 2)

Counter({'https://www.tmz.com/2019/02/17/soulja-boy-search-warrant-home-cop-cars-slashed-tires/': [0,
          24,
          62,
          73,
          78,
          86,
          93],
         'https://www.tmz.com/2019/02/15/soulja-boy-blac-chyna-couple-shopping-beverly-hills-valentines-day/': [0,
          27,
          40],
         'https://www.tmz.com/2019/02/13/soulja-boy-blac-chyna-dating-couple-instagram-dm/': [0,
          21,
          98],
         'https://www.tmz.com/2019/02/18/soulja-boy-blac-chyna-dating-tyga-retaliation-relationship/': [0,
          38],
         'https://www.tmz.com/2019/02/04/soulja-boy-kidnapping-investigation-suspect-woman-hostage/': [0,
          20,
          39,
          62,
          124,
          134,
          138,
          148],
         'https://www.tmz.com/2019/02/05/soulja-boy-kidnapping-woman-kayla-liar/': [0,
          28,
          40,
          49,
          68,
          85,
          98,
          102],
         'https://www.tmz

------
# Computing TF-IDF
## !! First the seperate indexes need to be combined
We can only compute the tf-idf if we have the word frequency in the WHOLE corpus (ALL documents), thus when all sources are combined.


In [None]:
# Compute tf-idf
def tfidf(doc_freq, doc_length, n_docs_total, n_docs_containing):
    return (doc_freq / doc_length) * math.log(n_docs_total / (1 + n_docs_containing))

for w,wv in invertedIndex.items():
    n_docs_total = corpusInfo['num_docs']
    n_docs_containing = len(wv['freq'])
    
    for d_id,dv in wv['freq'].items():
        doc_freq = dv
        doc_length = corpusInfo['doc_lengths'][d_id]
        
        invertedIndex[w]['tf-idf'][d_id] = tfidf(doc_freq, doc_length, n_docs_total, n_docs_containing)