Done so far:


*   Lemmatization
*   Stop Words Removal

Verify:

* Normalization - removing accents, etc.
* Dates replaced with strings
* Case-folding
* Removed HTML entity codes



In [1]:
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import re
import wordninja 

####### After importing nltk, run the following only once ######
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('stopwords')
### pip install wordninja ###

In [2]:
def remove_htmlcodes(document):
    """Replace the HTML entity codes listed below with their plain-text equivalent.

    The entities are matched exactly as spelled in the table keys
    (e.g. ``&ampnbsp``); substitutions are applied one after another in the
    order of the table.

    Args:
        document: the text to clean.

    Returns:
        A new string with every listed entity code substituted.
    """
    entity_to_text = {
        "&ampnbsp": ' ',
        "&ampamp": '&',
        "&ampquot": '\'',
        "&ampldquo": '\"',
        "&amprdquo": '\"',
        "&amplsquo": '\'',
        "&amprsquo": '\'',
        "&amphellip": '...',
        "&ampndash": '-',
        "&ampmdash": '-',
    }

    cleaned = document
    for entity, plain_text in entity_to_text.items():
        cleaned = cleaned.replace(entity, plain_text)
    return cleaned

In [3]:
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``; the first letter of
    the resulting tag selects adjective / noun / verb / adverb. Any other tag
    falls back to noun.
    """
    pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    _, tag = nltk.pos_tag([word])[0]
    return pos_by_initial.get(tag[0].upper(), wordnet.NOUN)

def lemma_stop(str):
    """Tokenize a string, lemmatize each token and drop English stop words.

    Args:
        str: the text to process. (The name shadows the builtin; it is kept
            so existing callers are unaffected.)

    Returns:
        A list of lemmatized tokens, in original order and original case,
        excluding tokens whose lower-cased lemma is an NLTK English stop word.
    """
    lemmatizer = WordNetLemmatizer()
    # Raw string: identical pattern, but without the invalid-escape warnings
    # that '\w', '\$', '\d', '\[' and '\S' trigger in a normal string literal.
    # NOTE(review): the 2nd and 3rd alternatives look malformed ('\$]\d\[+',
    # '\S+,-'); they are left unchanged here - confirm what they should match.
    tokenizer = RegexpTokenizer(r'\w+|\$]\d\[+|\S+,-')
    stop_words = set(stopwords.words('english'))

    lemmas = (
        lemmatizer.lemmatize(token, get_wordnet_pos(token))
        for token in tokenizer.tokenize(str)
    )
    return [lemma for lemma in lemmas if lemma.lower() not in stop_words]

In [4]:
# Load data.npy.
# data.npy is a 2D array containing the dataset information as
# data[i][0] : docID of ith document
# data[i][1] : title of ith document
# data[i][4] : content of ith document
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, so only load a data.npy that comes from a trusted source.

data = np.load('data.npy',allow_pickle = True)
# sentence = data[0][4]
# print(sentence)

In [5]:
# Two lookup tables between a row position in `data` and the document's docID:
#   get_docID[i] -> docID stored in row i of data
#   get_index[j] -> row position of the document whose docID is j
# (if a docID occurs more than once, the last row wins)

get_docID = {position: int(row[0]) for position, row in enumerate(data)}
get_index = {doc_id: position for position, doc_id in get_docID.items()}

In [6]:
def is_not_credible(text):
    """Tell whether `text` contains any of the characters ``!@#?&{}()``.

    Returns:
        True when at least one of those characters is present, else False.
    """
    return re.search(r'[!@#?&{}()]', text) is not None

In [7]:
def scrub_words(text):
    """Strip the characters ``!@#?&{}()`` and blank out non-ASCII characters.

    The listed symbols are deleted; every character outside the 7-bit ASCII
    range is then replaced by a single space.
    """
    without_symbols = re.sub('[!@#?&{}()]', '', text)
    return re.sub(r'[^\x00-\x7F]', " ", without_symbols)

In [8]:
def clean_document(document_string):
    """Clean up whitespace-separated tokens that contain suspicious symbols.

    Every token flagged by `is_not_credible` is scrubbed with `scrub_words`
    and segmented with `wordninja.split`. If the segmentation yields more than
    7 pieces the token is removed from the document; otherwise it is replaced
    by the pieces joined with spaces. Each substitution is applied to every
    occurrence of the token text in the document.

    Returns:
        The cleaned document string.
    """
    result = document_string
    for token in document_string.split():
        if not is_not_credible(token):
            continue
        pieces = wordninja.split(scrub_words(token))
        if len(pieces) > 7:
            substitute = ''
        else:
            substitute = ' '.join(pieces)
        result = result.replace(token, substitute)
    return result

In [9]:
# Log of [docID, date_string] pairs; replace_dates appends one entry for every
# date-like match it processes.
count_dates=[]

In [10]:
from datetime import datetime

def replace_dates(documentString, docID):
    """Rewrite slash-separated numeric dates as compact tokens.

    ``M/D/Y`` becomes ``DDMonYYYY`` (e.g. ``12/12/12`` -> ``12Dec2012``) and
    ``M/D`` becomes ``DDMon`` (e.g. ``9/11`` -> ``11Sep``). Two-digit years
    00-19 are read as 20YY, all others as 19YY. Matches that are not valid
    dates (``50/50``, ``24/7``, ...) are left as they are, apart from the
    two-digit-year expansion.

    Every match is also appended to the module-level ``count_dates`` list as
    ``[docID, date]`` (``date`` after year expansion).

    Args:
        documentString: the text to scan.
        docID: identifier recorded next to each match in ``count_dates``.

    Returns:
        The text with every recognised date replaced.
    """
    # regEx = '(([0-9]+(/|\\.|-)[0-9]+(/|\\.|-)[0-9]+)|([0-9]+(/|\\.|-)[0-9]+))'
    regEx = '(([0-9]+(/)[0-9]+(/)[0-9]+)|([0-9]+(/)[0-9]+))'

    def _rewrite(match):
        # Convert one regex match; called once per match, left to right.
        date = match.group(0)

        if date.count('/') == 2:
            if date[-3] == '/':
                # Two-digit year: expand only the trailing year field. The old
                # str.replace(YY, ...) also rewrote matching day/month digits.
                YY = date[-2:]
                century = '20' if int(YY) <= 19 else '19'
                date = date[:-2] + century + YY
            try:
                newDate = datetime.strptime(date, '%m/%d/%Y').strftime('%d %b %Y')
            except ValueError:
                newDate = date
        else:
            try:
                # Parse against a leap year so that 2/29 is accepted; without
                # a year strptime assumes 1900, which has no 29 February.
                newDate = datetime.strptime(date + '/2000', '%m/%d/%Y').strftime('%d %b')
            except ValueError:
                newDate = date

        count_dates.append([docID, date])
        return newDate.replace(' ', '')

    # Substituting per match (instead of str.replace over the whole document)
    # keeps one date from corrupting another that contains it, e.g. '1/2'
    # inside '11/22'.
    return re.sub(regEx, _rewrite, documentString)

In [11]:
# print(replace_dates('12/12/12 to 9/11', -1))

In [12]:
# creating a temporary smaller dataset: the first 1000 documents of `data`
# (or all of them when there are fewer)

subset = [document for document in data[:1000]]

In [13]:
import time
from tqdm import tqdm

start = time.time()

# titles[k] / contents[k] hold the lower-cased, lemmatized, stop-word-free
# tokens of the title / content of subset[k].
titles = []
contents = []
for i in tqdm(range(len(subset))):
    record = subset[i]

    # actually modifying the document
    # NOTE: this rewrites subset[i] in place, so re-running the cell applies
    # the cleaning to already-cleaned text.
    record[4] = remove_htmlcodes(record[4])
    record[1] = remove_htmlcodes(record[1])
    record[4] = clean_document(record[4])
    record[1] = clean_document(record[1])

    # not actually modifying the document
    modifiedContent = lemma_stop(replace_dates(record[4], record[0]))
    modifiedTitle = lemma_stop(replace_dates(record[1], record[0]))

    # case-folding (comprehensions, so the outer loop index `i` is no longer
    # rebound by inner index loops)
    modifiedContent = [token.lower() for token in modifiedContent]
    modifiedTitle = [token.lower() for token in modifiedTitle]

    titles.append(modifiedTitle)
    contents.append(modifiedContent)

print(time.time() - start)  # roughly 110-127 s for 1000 documents in recorded runs

100%|██████████| 1000/1000 [02:06<00:00,  7.90it/s]

126.60539221763611





In [14]:
import unidecode
# NOTE(review): these are aliases of the same list objects, not copies, so
# they are transliterated too - use copy.deepcopy if a backup was intended.
contents_temp = contents

titles_temp = titles
# Transliterate every token to ASCII in place. Iterating over the lists
# themselves (instead of a hard-coded range(1000)) also works when fewer than
# 1000 documents were processed.
for token_lists in (contents, titles):
    for tokens in token_lists:
        for j in range(len(tokens)):
            tokens[j] = unidecode.unidecode(tokens[j])

In [15]:
import pickle

# Persist the preprocessed token lists so later cells can reload them.
for pickle_path, token_lists in (('modified_contents.pickle', contents),
                                 ('modified_titles.pickle', titles)):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(token_lists, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

# To read the data again

# with open('modified_contents.pickle', 'rb') as handle:
#     contents = pickle.load(handle)
# with open('modified_titles.pickle', 'rb') as handle:
#     titles = pickle.load(handle)
    
# print(unserialized_title == titles)

In [16]:
import trie

# Map from the docID of a document to its trie.Node object
# (i.e., the corresponding document trie structure).
# e.g. if the docID of the document is 1,
# getReference[1] gives the object which is the trie structure of docID 1.
# The map starts empty here and is filled where the document roots are created.

getReference = {}

In [17]:
# documentRoot[i] is the trie root of the document at row i of the dataset;
# collection is the corpus-wide trie.
documentRoot = []
collection = trie.CollectionNode()

# initializing the root for (at most) 1000 documents; capped by the number of
# known docIDs so a corpus with fewer rows no longer raises KeyError
for i in range(min(1000, len(get_docID))):
    newDocument = trie.Node()
    documentRoot.append(newDocument)
    getReference[get_docID[i]] = newDocument

In [18]:
# Fill the per-document tries and the collection trie from the pickled tokens.
# max_tf[docID] ends up as the highest count reported by count_words for any
# content word of that document.

max_tf = {}
N = 1000

with open('modified_contents.pickle', 'rb') as handle:
    contents = pickle.load(handle)
with open('modified_titles.pickle', 'rb') as handle:
    titles = pickle.load(handle)

import time
from tqdm import tqdm

start = time.time()
for i in tqdm(range(N)):
    doc_id = get_docID[i]
    doc_trie = documentRoot[i]

    for w in contents[i]:
        collection.add_document(w, 0, doc_id)
        doc_trie.add(w, 0)
        term_count = doc_trie.count_words(w, 0)
        max_tf[doc_id] = max(term_count, max_tf.get(doc_id, term_count))

    for w in titles[i]:
        collection.add_title(w, 0, doc_id)

print(time.time() - start)  #39.19705152511597

100%|██████████| 1000/1000 [00:11<00:00, 84.34it/s] 

11.858062744140625





In [19]:
# Persist the collection trie and the list of per-document tries.
with open('collection.pickle', 'wb') as handle:
    pickle.dump(collection, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('documentRoot.pickle', 'wb') as handle:
    pickle.dump(documentRoot, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
# reading from pickle files
# NOTE: this rebinds collection / documentRoot to freshly unpickled copies,
# while getReference still points at the Node objects created before the dump.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickle files that this notebook produced itself.

with open('collection.pickle', 'rb') as handle:
    collection = pickle.load(handle)
with open('documentRoot.pickle', 'rb') as handle:
    documentRoot = pickle.load(handle)

In [20]:
# import math
# import queue

# documentLength = {}
# N = len(documentRoot)

# for i in tqdm(range(len(documentRoot))):
    
#     docID = get_docID[i]
#     length = 0
#     document = documentRoot[i]
#     q = queue.Queue()
#     q.put([document, ''])

#     while q.qsize() > 0:

#         current = q.get()
#         reference = current[0]
#         word = current[1]

#         if reference.words > 0:
#             df = len(collection.get_doc_list(word, 0))
#             idf = math.log10(N/df)
#             # print(word, reference.words, df)
#             length += (reference.words * idf) ** 2

#         for i in range(256):
#             if reference.children[i] is not None:
#                 new_word = word + chr(i)
#                 q.put([reference.children[i], new_word])

#     # print(length**0.5)
#     documentLength[docID] = length**0.5

In [51]:
# Run the query through the same date rewriting, lemmatization and stop-word
# removal as the documents, then transliterate to ASCII and case-fold.
query = '6/19'
final_query = [
    unidecode.unidecode(token).lower()
    for token in lemma_stop(replace_dates(query, -1))
]
print(final_query)

['19jun']


In [52]:
# tf_query[term] = number of times the term occurs in the processed query
tf_query = {}
for term in final_query:
    tf_query[term] = tf_query.get(term, 0) + 1

***Ranked Retrieval based on TF-IDF Score:***


In [53]:
import queue

# scores[i] accumulates, over the query terms found in document i, the product
# of the query weight (tf * idf) and the document's normalised term frequency
scores = {}
# title_score[docID] accumulates the idf of every query term that occurs in
# the title of that document
title_score = {}

# N is the total number of documents in the corpus
N = len(documentRoot)

# wordsInDoc[i] is a list of [-score, word] pairs kept sorted by bisect.insort,
# where score is the query-term contribution for the (word, <doc i>) pair;
# the score is negated so the highest-scoring word comes first
wordsInDoc = {}

# factor[docID] sums the idf of each query term found in the document; the
# accumulated score is multiplied by it after the loop
factor = {}

import math
import bisect

for query_term in tf_query:
    
    docs_having_query_term = collection.get_doc_list(query_term, 0)
    df = len(docs_having_query_term)
    idf = 0
    
    print('-------------------------------------')
    print('Term in query = ', query_term)
    print()
    
    # a term that occurs in no document gets idf 0 (also avoids dividing by zero)
    if df == 0:
        idf = 0
    else:
        idf = math.log10(N/df)
        
    docs_having_query_term_in_title = collection.get_title_list(query_term,0)
    
    for docID in docs_having_query_term_in_title:
        if docID in title_score:
            title_score[docID] += idf
        else:
            title_score[docID] = idf
        
    print('df = ',df)
    print('idf = ',idf)
    
    # weight of this term on the query side
    tfidf_query = tf_query[query_term] * idf
        
    for docID in docs_having_query_term:
        
        # augmented term frequency: 0.5 + 0.5 * tf / (max tf in the document);
        # note that no idf is applied on the document side
        tf_doc = getReference[docID].count_words(query_term, 0)
        tf_doc = 0.5 + 0.5*tf_doc/max_tf[docID]
        tfidf_doc = (tf_doc)
        
        if docID not in scores:
            scores[docID] = (tfidf_query * tfidf_doc)
            wordsInDoc[docID] = []
            bisect.insort(wordsInDoc[docID], [-tfidf_query * tfidf_doc, query_term])
            factor[docID] = idf
        else:
            scores[docID] += (tfidf_query * tfidf_doc)
            bisect.insort(wordsInDoc[docID], [-tfidf_query * tfidf_doc, query_term])
            factor[docID] += idf
            
print(title_score)

# final score = accumulated score * factor, further boosted by
# (1 + title_score) when a query term occurs in the title
for docID in scores:
    
    #if documentLength[docID] != 0:
    scores[docID] *= factor[docID]
    if docID in title_score:
        scores[docID] *= 1 + title_score[docID]

# (docID, score) pairs, best score first
sorted_scores = sorted(scores.items(), key = lambda kv : kv[1] , reverse = True)

# show at most the 10 best documents
maxshow = min(10, len(scores))

print('\n\n')
print('============================================')

# For each of the top documents print its docID, title, title score, the
# matched query terms, and a snippet of the content around the best term.
for i in range(maxshow):

    print()
    docID, final_score = sorted_scores[i]
    print('doc ID = ', docID)
    print('Keywords:')
    print()
    print(subset[get_index[docID]][1])
    print()
    print('title score = ', title_score.get(docID, 0))

    # one line per matched term: term, negated score, raw count in the document
    for neg_score, term in wordsInDoc[docID]:
        print(term, neg_score, end = ' ')
        print(getReference[docID].count_words(term, 0))
    print()
    print()

    # the snippet is anchored on the best-scoring query term of this document
    top_term = wordsInDoc[docID][0][1]

    count = 0
    found = 0
    words_before = queue.Queue()   # up to 20 words preceding the first match
    at_start = 1                   # stays 1 while no leading word was dropped
    display = ""

    for word in subset[get_index[docID]][4].split():

        # Normalise the word like the indexed tokens before comparing.
        # NOTE: replace_dates also logs any match to count_dates with docID -1.
        check_with = replace_dates(word, -1).lower()
        lemmas = lemma_stop(check_with)   # computed once instead of twice per word
        if len(lemmas) > 0:
            check_with = lemmas[0]
        else:
            check_with = word

        if check_with == top_term:
            found = 1

        if found == 1:
            # from the first match on, collect up to 50 words
            display = display + word + " "
            count += 1
            if count == 50:
                break
        if found == 0:
            words_before.put(word)
            if words_before.qsize() > 20:
                words_before.get()   # drop the oldest word of the context window
                at_start = 0

    if not at_start:
        print('...', end = ' ')
    while words_before.qsize() > 0:
        print(words_before.get(), end = ' ')
    print(display, end = ' ')
    print('...', end = ' ')
    print('\n')
    print('tf-idf score=', final_score)
    print('\n')
    print('============================================')

-------------------------------------
Term in query =  19jun

df =  1
idf =  3.0
{}




doc ID =  6981
Keywords:

Amazon's Fire TV is safer for kids with latest update

title score =  0
19jun -1.6666666666666667 1


... of the boob tube anyway, Amazon is betting that doing it in a pre-set environment will be better for everyone.Update 6/19 3:00PM: Amazon now says that the initial rollout will start within the next week, but not today.  ... 

tf-idf score= 5.0




In [24]:
# Dump every [docID, date] pair logged by replace_dates so far.
print(count_dates)

[[16, '50/50'], [42, '1/2'], [47, '4/4'], [47, '4/5'], [47, '3/4'], [47, '4/5'], [67, '24/7'], [70, '50/50'], [104, '9/11'], [151, '9/11'], [6432, '9/11'], [6440, '9/11'], [6492, '9/11'], [6620, '12/8'], [6701, '24/7'], [6765, '24/7'], [6802, '9/11'], [6811, '2008/8'], [6812, '12/22'], [6919, '1/5000'], [6919, '1/11'], [6919, '1/150'], [6919, '1/750'], [6919, '1/5000'], [6977, '9/11'], [6979, '1/2'], [6981, '6/19'], [7078, '9/11'], [7127, '9/11'], [7313, '24/7'], [-1, '2/5']]


In [44]:
# Show the content field of the document whose docID is 47 (the text reflects
# any in-place cleaning already applied to `subset`).
print(subset[get_index[47]][4])

      The order came from the Sultan of Brunei’s nephew. It was 1994, and Prince Abdul Hakeem, then 20 years old, had inherited two things: access to a $40 billion fortune, and his family’s penchant for spending it.              The Secret Six         A half-dozen groundbreaking Ferraris were built for the Prince of Brunei.Only one made it out of the jungle aliveBy Ryan ZumMallen | Photography by Paul Barshon The order came from the Sultan of Brunei’s nephew. It was 1994, and Prince Abdul Hakeem, then 20 years old, had inherited two things: access to a $40 billion fortune, and his family’s penchant for spending it.In the ‘80s and ‘90s, the monarchs of this tiny, oil-rich nation in the South China Sea bankrolled a fleet of special custom vehicles, ordering the most coveted cars in the world by the half-dozen or more. Their notorious collection — located in nondescript concrete garages, surrounded by razor wire, and patrolled by armed Nepalese soldiers known as gurkhas — numbered in the 

In [26]:
# Scratch check of loop-variable semantics: assigning to an element mutates
# the inner list that `a` holds, while rebinding the loop name does not.
a = [['aaaa'], ['bbbb']]
for inner in a:
    inner[0] = ''   # visible through `a`
    inner = []      # only rebinds the local name
print(a)

[[''], ['']]
