In [167]:
from bs4 import BeautifulSoup
import time
import re
import string
import os
from collections import Counter,OrderedDict
import nltk
from nltk.tokenize import word_tokenize
import glob
import pickle
import math
import operator

In [168]:
# Shared index state, filled in by the indexing cell further down.
inverted_unigram_dict = {}    # term -> list of (document id, count) postings
unigram_termcount = dict()    # document id -> number of unigram tokens in that document
unigram_corpus_count = 0      # running total of unigram tokens over all indexed documents

# Configuration values; neither is referenced by the cells shown here.
path = 'corpus'               # presumably the corpus directory - TODO confirm against users
smoothening_factor = 0.35     # NOTE(review): purpose not visible in this part of the notebook


def casefolding(data):
    """Return ``data`` converted to lower case via ``data.lower()``."""
    lowered = data.lower()
    return lowered

def punctuationHandling(data):
    """Strip punctuation and digits from ``data`` and return the result.

    Three substitutions are applied in order, each replacing matches with "":

    1. any listed punctuation character that is neither directly preceded
       nor directly followed by a digit;
    2. every run of digits;
    3. any remaining character from a second punctuation list (this list
       does not contain '.' or ',', so those can survive where they
       touched a digit in the input).
    """
    away_from_digits = re.compile(r"(?<!\d)[.,;:*!\"\'#$%&()+/<=>?@[\]_^`~{|}∑α](?!\d)")
    digit_runs = re.compile(r'\d+')
    leftover_punctuation = re.compile(r"[;:*!\"\'#$%&()+/<=>?@[\]_^`~{|}]")

    cleaned = data
    for pattern in (away_from_digits, digit_runs, leftover_punctuation):
        cleaned = pattern.sub("", cleaned)
    return cleaned

def removeWhitespace(data):
    """Collapse every whitespace run in ``data`` to one space and trim both ends."""
    words = data.split()
    normalised = ' '.join(words)
    return normalised

def createCorpusFile(heading,maincontent):
    """Write ``maincontent`` to ``corpus/<heading>.txt``, replacing any existing file.

    Args:
        heading: document identifier; converted with ``str()`` to build the file name.
        maincontent: text to store in the file.
    """
    # Create the output directory on demand so a fresh checkout does not fail here.
    os.makedirs("corpus", exist_ok=True)
    # The context manager closes the handle even if write() raises.
    with open("corpus/"+str(heading)+".txt", "w") as fo:
        fo.write(maincontent)

def parseDocs():
    """Convert every ``test-collection/cacm/*.html`` file into a cleaned corpus file.

    For each HTML file the text is extracted with BeautifulSoup (lxml parser),
    passed through ``casefolding``, ``punctuationHandling`` and
    ``removeWhitespace`` in that order, and handed to ``createCorpusFile``
    under the file's base name up to its first '.'.
    """
    for filename in glob.glob("test-collection/cacm/*.html"):
        heading = os.path.basename(filename).split(".")[0]
        # Close each source file as soon as it has been read; the original
        # left one handle open per document.
        with open(filename, "r") as fo:
            data = fo.read()
        maincontent = BeautifulSoup(data, "lxml").text
        maincontent = casefolding(maincontent)
        maincontent = punctuationHandling(maincontent)
        maincontent = removeWhitespace(maincontent)
        createCorpusFile(heading,maincontent)
        
def createIndexDict(file, ngram_dict, inverted_dict):
    """Add one document's term counts to an inverted index.

    Args:
        file: path of the document; its base name up to the first '.' is
            used as the document id.
        ngram_dict: mapping of term -> count for this document.
        inverted_dict: mapping of term -> list of ``(document id, count)``
            tuples. It is updated in place.

    Returns:
        The same ``inverted_dict`` object that was passed in.
    """
    # The document id does not depend on the term, so derive it once
    # instead of twice per term.
    doc_id = os.path.basename(file).split(".")[0]
    for term, count in ngram_dict.items():
        inverted_dict.setdefault(term, []).append((doc_id, count))
    return inverted_dict

def getInvertedListCount(index_list):
    """Pair every posting list with its length.

    Args:
        index_list: mapping of term -> posting list.

    Returns:
        A new dict mapping each term to ``[len(postings), postings]``; the
        posting list itself is referenced, not copied.
    """
    return {term: [len(postings), postings] for term, postings in index_list.items()}

def getDocWordFreq(word,document):
    """Return the count stored for ``word`` in ``document``.

    Looks ``word`` up in the module-level ``unigram_invertedlist_count`` and
    scans its posting list (element 1 of the entry) for postings whose first
    element equals ``document``. Returns 0 when the word has no entry or no
    posting matches; if several postings match, the last one wins.
    """
    entry = unigram_invertedlist_count.get(word)
    if not entry:
        return 0

    frequency = 0
    for posting in entry[1]:
        if posting[0] == document:
            frequency = posting[1]
    return frequency

def getCorpusWordFreq(word):
    """Return element 0 of ``word``'s entry in ``unigram_invertedlist_count``, or 0.

    NOTE(review): ``getInvertedListCount`` stores the posting-list length in
    that slot, i.e. the number of postings for the word rather than a total
    occurrence count - confirm that callers expect this.
    """
    entry = unigram_invertedlist_count.get(word)
    return entry[0] if entry else 0
    
def query_preprocessor(filepath = 'test-collection/cacm.query.txt'):
    """Read the query file and return one cleaned string per ``<DOCNO>`` chunk.

    The file text is split on ``<DOCNO>``. Each chunk has its
    ``</DOCNO>``, ``</DOC>`` and ``<DOC>`` tags removed, newlines replaced by
    spaces and its first character dropped; a leading ``digits + two
    whitespace characters`` prefix is then removed, and the chunk is
    lower-cased and passed through ``punctuationHandling`` and
    ``removeWhitespace``.

    The first list element comes from the text that precedes the first
    ``<DOCNO>`` tag, so it is not a query.
    """
    with open(filepath) as handle:
        raw_text = handle.read()

    substitutions = (('</DOCNO>', ''), ('\n', ' '), ('</DOC>', ''), ('<DOC>', ''))

    cleaned_queries = []
    for chunk in raw_text.split('<DOCNO>'):
        for old, new in substitutions:
            chunk = chunk.replace(old, new)
        chunk = chunk[1:]
        chunk = re.sub(r'^\d*\s\s', '', chunk)
        chunk = chunk.lower()
        chunk = punctuationHandling(chunk)
        cleaned_queries.append(removeWhitespace(chunk))
    return cleaned_queries

In [158]:
def pseudo_relavance(query,bm25_score_dict,k=10):
    """Expand ``query`` using pseudo-relevance feedback from a scored ranking.

    Args:
        query: whitespace-separated query string.
        bm25_score_dict: mapping of document id -> score for this query.
        k: number of top-scoring documents treated as relevant; the rest of
            ``bm25_score_dict`` is treated as non-relevant. Defaults to 10,
            the value that used to be hard-coded.

    Returns:
        The expanded query string produced by ``query_expansion``.
    """
    # Term-frequency vector of the query. split() without an argument skips
    # empty strings, so an empty or double-spaced query no longer adds a
    # bogus '' term the way split(" ") did.
    query_vector = dict(Counter(query.split()))

    # Give every vocabulary term an entry so the vector can be indexed by any
    # term of the index.
    for term in unigram_invertedlist_count:
        query_vector.setdefault(term, 0)

    relevance_vector,relevance_vector_magnitude = generate_relevance_vector(query,bm25_score_dict,k)

    non_relevance_vector,non_relevance_vector_magnitude = generate_non_relevance_vector(query,bm25_score_dict,k)

    expanded_q = query_expansion(query,query_vector,relevance_vector,relevance_vector_magnitude,
                    non_relevance_vector,non_relevance_vector_magnitude)

    return(expanded_q)

In [159]:
def generate_relevance_vector(query,bm25_score_dict,k):
    """Build the term-count vector of the ``k`` highest-scoring documents.

    Args:
        query: unused; kept so existing callers keep working.
        bm25_score_dict: mapping of document id -> score.
        k: number of top-scoring documents to read.

    Returns:
        ``(vector, magnitude)`` where ``vector`` maps each term to its total
        count over the selected ``corpus/<id>.txt`` files (terms are obtained
        by whitespace splitting) and has a 0 entry for every other term of
        ``unigram_invertedlist_count``; ``magnitude`` is the Euclidean norm
        of the vector.
    """
    ranked = sorted(bm25_score_dict.items(), key=operator.itemgetter(1), reverse=True)

    term_counts = Counter()
    for doc_id, _score in ranked[:k]:
        with open ('corpus/'+doc_id+'.txt', 'r') as f:
            term_counts.update(f.read().split())
    relevance_vector = dict(term_counts)

    # Zero-fill once, after all documents are counted. Doing this inside the
    # document loop repeated the work per document and left the vector empty
    # when no document was selected, which breaks vocabulary-wide lookups.
    for token in unigram_invertedlist_count:
        relevance_vector.setdefault(token, 0)

    relevance_vector_magnitude = math.sqrt(sum(float(count**2) for count in relevance_vector.values()))
    return(relevance_vector,relevance_vector_magnitude)

In [160]:
def generate_non_relevance_vector(query,bm25_score_dict,k):
    """Build the term-count vector of every document ranked below the top ``k``.

    Args:
        query: unused; kept so existing callers keep working.
        bm25_score_dict: mapping of document id -> score.
        k: number of top-scoring documents to skip.

    Returns:
        ``(vector, magnitude)`` where ``vector`` maps each term to its total
        count over the remaining ``corpus/<id>.txt`` files (terms are obtained
        by whitespace splitting) and has a 0 entry for every other term of
        ``unigram_invertedlist_count``; ``magnitude`` is the Euclidean norm
        of the vector.
    """
    ranked = sorted(bm25_score_dict.items(), key=operator.itemgetter(1), reverse=True)

    term_counts = Counter()
    for doc_id, _score in ranked[k:]:
        with open ('corpus/'+doc_id+'.txt', 'r') as f:
            term_counts.update(f.read().split())
    non_relevance_vector = dict(term_counts)

    # Zero-fill once, after all documents are counted. Inside the document
    # loop this pass ran once per document, and with k or fewer scored
    # documents it never ran, leaving an empty vector.
    for token in unigram_invertedlist_count:
        non_relevance_vector.setdefault(token, 0)

    non_relevance_vector_magnitude = math.sqrt(sum(float(count**2) for count in non_relevance_vector.values()))
    return(non_relevance_vector,non_relevance_vector_magnitude)

In [161]:
def query_expansion(query,query_vector,relevance_vector,relevance_vector_magnitude,
                    non_relevance_vector,non_relevance_vector_magnitude):
    """Append the best-scoring new vocabulary terms to ``query``.

    Every term of ``unigram_invertedlist_count`` is scored as

        query_count + (0.5 / |R|) * relevant_count - (0.15 / |NR|) * non_relevant_count

    where ``|R|`` and ``|NR|`` are the supplied vector magnitudes. The 15
    highest-scoring terms are considered, and those that are not already a
    word of the query are appended, separated by single spaces.
    (The formula resembles a Rocchio update - TODO confirm the intent.)

    Args:
        query: original whitespace-separated query string.
        query_vector: term -> count in the query.
        relevance_vector: term -> count over the relevant documents.
        relevance_vector_magnitude: norm of ``relevance_vector``.
        non_relevance_vector: term -> count over the non-relevant documents.
        non_relevance_vector_magnitude: norm of ``non_relevance_vector``.

    Returns:
        The query string with the selected terms appended.
    """
    no_extra_query_terms = 15

    # A zero magnitude means the vector is empty or all zeros, so its
    # contribution is 0 rather than a ZeroDivisionError.
    relevance_weight = 0.5/relevance_vector_magnitude if relevance_vector_magnitude else 0.0
    non_relevance_weight = 0.15/non_relevance_vector_magnitude if non_relevance_vector_magnitude else 0.0

    query_expansion_dict = {}
    for term in unigram_invertedlist_count:
        # .get(..., 0): a term missing from a vector counts as 0 instead of
        # raising KeyError.
        query_expansion_dict[term] = query_vector.get(term, 0) + relevance_weight * relevance_vector.get(term, 0) - \
            non_relevance_weight * non_relevance_vector.get(term, 0)

    # Rank once; slicing also copes with vocabularies smaller than the limit.
    top_terms = sorted(query_expansion_dict, key=query_expansion_dict.get, reverse=True)[:no_extra_query_terms]

    # Compare against whole query words. A substring test on the raw string
    # would treat e.g. "a" as already present in the query "data".
    query_terms = set(query.split())

    expanded_query = query
    for term in top_terms:
        if term not in query_terms:
            expanded_query+= " "+term
    return expanded_query

In [None]:
def get_all_new_queries():
    """Expand every query in ``all_queries`` with pseudo-relevance feedback.

    Each query is scored with ``populate_bm25`` and expanded from its own
    ranking. Previously every query was expanded from the module-level
    ``bm25_score_dict``, which after the scoring loop holds only the last
    query's scores.

    Returns:
        The module-level ``expanded_query_list``, to which one expanded query
        per entry of ``all_queries`` has been appended, in order.
    """
    for old_query in all_queries:
        query_scores = populate_bm25(old_query)
        expanded_q = pseudo_relavance(old_query,query_scores)
        expanded_query_list.append(expanded_q)

    return expanded_query_list

In [None]:
def writeToFile(queryid,queryname,lmscore_dict,folder_name,system_name):
    """Write one ranked result list to ``baseline-runs/<folder_name>/Q<queryid>.txt``.

    One tab-separated line is written per entry of ``lmscore_dict``:
    ``queryid, "Q0", document id, rank, score, system_name``. Ranks start at 1
    and follow the dict's iteration order, so the dict must already be in
    ranked order.

    Args:
        queryid: query identifier, used in the file name and in every line.
        queryname: unused; kept so existing callers keep working.
        lmscore_dict: mapping of document id -> score.
        folder_name: sub-directory of ``baseline-runs`` to write into.
        system_name: label written in the last column.
    """
    out_dir = "baseline-runs/"+folder_name
    # Create the run directory on demand instead of failing on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)
    with open(out_dir+"/"+ "Q" + str(queryid) +".txt", "w") as fo:
        # enumerate yields the rank directly; looking each key up in a fresh
        # key list made the loop quadratic.
        for rank, (key, val) in enumerate(lmscore_dict.items(), start=1):
            fo.write(str(queryid)+"\tQ0\t"+str(key)+"\t"+str(rank)+"\t"+str(val)+"\t"+ system_name +"\n")

In [163]:
# --- Build the unigram inverted index from the cleaned corpus files ---------
expanded_query_list = []
corpus_files = glob.glob("corpus/*.txt")
for filename in corpus_files:
    # The context manager closes the file even if reading fails.
    with open(filename, "r") as fo:
        data = fo.read()
    # Tokenize once; the original tokenized every document twice and never
    # used the first result. Both names are kept for later cells.
    tokens = unigramlist = nltk.word_tokenize(data)
    unigram_termcount[os.path.basename(filename).split(".")[0]] = len(unigramlist)
    unigram_corpus_count = unigram_corpus_count + len(unigramlist)
    unigram_dict = Counter(unigramlist)
    inverted_unigram_dict = createIndexDict(filename, unigram_dict, inverted_unigram_dict)

unigram_invertedlist_count = getInvertedListCount(inverted_unigram_dict)
number_of_docs = len(corpus_files)

# --- Persist the index for reuse --------------------------------------------
os.makedirs('reusable_data', exist_ok=True)
with open('reusable_data/unigram_invertedlist_count.pkl', 'wb') as f:
    pickle.dump(unigram_invertedlist_count, f)

with open('reusable_data/inverted_index.pkl', 'wb') as f:
    pickle.dump(inverted_unigram_dict, f)

# The first element returned by query_preprocessor is the text before the
# first <DOCNO> tag, not a query, so it is dropped.
all_queries = query_preprocessor()[1:]

# --- Task 1: baseline BM25 run, one result file per query --------------------
for i in range(len(all_queries)):
    print(all_queries[i])
    bm25_score_dict = populate_bm25(all_queries[i])
    writeToFile(i+1,all_queries[i],bm25_score_dict,"task1-bm25","ccisneu_wordunigram_BM25")

# --- Task 2: BM25 over the pseudo-relevance-expanded queries -----------------
all_expanded_queries = get_all_new_queries()

for i in range(len(all_expanded_queries)):
    bm25_score_dict_expanded = populate_bm25(all_expanded_queries[i])
    writeToFile(i+1,all_queries[i],bm25_score_dict_expanded,"task2-bm25-pseudo-relevance","ccisneu_wordunigram_BM25_PSEUDO_RELEVANCE")

{'CACM-1519': 24.64291001264796, 'CACM-1605': 23.148162233549694, 'CACM-1410': 22.33330316789089, 'CACM-1591': 22.29077525815539, 'CACM-1033': 22.288959392145333, 'CACM-1506': 21.989576069246546, 'CACM-1161': 21.101968184738794, 'CACM-2319': 20.927837849632983, 'CACM-0585': 20.79347097377166, 'CACM-2379': 20.548433949383885, 'CACM-1698': 20.509085403312735, 'CACM-1680': 20.394312018199077, 'CACM-1938': 20.27476212484826, 'CACM-2054': 20.156604992450905, 'CACM-3048': 20.112135059644178, 'CACM-1844': 19.64595985147516, 'CACM-1264': 19.588225124500664, 'CACM-1750': 19.57764677467663, 'CACM-1523': 19.44969520614981, 'CACM-3127': 19.300844835949388, 'CACM-1544': 19.188042794803522, 'CACM-3025': 19.088333428950925, 'CACM-2371': 19.062308896129696, 'CACM-2357': 19.002872646227104, 'CACM-2380': 18.960415721057554, 'CACM-2947': 18.8593754495853, 'CACM-1168': 18.649673825895533, 'CACM-1315': 18.431842043017898, 'CACM-2535': 18.31534295256021, 'CACM-1827': 18.041805239516613, 'CACM-0637': 17.8308

KeyError: 'udo'