In [106]:
from collections import defaultdict
import re
import math
from tqdm import tqdm_notebook as tqdm
from Stemmer import Stemmer
from spacy.lang.en import English
import bisect

In [105]:
fields = ['title','infobox','category','body','ref']
type_keys = ['t','i','c','b','r','l']
inv_index = defaultdict(lambda: defaultdict(lambda : defaultdict(int)))
sec_index = []

In [126]:
sec_index

['', 'collapsiblenav', 'odnbweb']

In [42]:
tf = {}
with open('index/tf.txt') as file:
    lines = file.readlines()
    for line in lines:
        text = line.split("|")                 # change the delimiter
        tf[int(text[0])] = [text[1],int(text[2])]
doc_count = len(tf.keys())


In [None]:
def comp(a):
    return int(a.split('.')[0])
    
def gen_sind(sindex): 
    sec_index.clear()
    with open(sindex,'r') as fil:
        fil_lis = fil.readlines()
    for fil in fil_lis:
        word = fil.split("|")[1]
        sec_index.append(word.strip('\n'))


def get_file(word):
    ind = bisect.bisect_right(sec_index,word) - 1
    return str(ind) + ".txt"

gen_sind('index/sindex.txt')

In [14]:
def parse_dict(coded, key):
    if coded == '\n' :
        return
    typs = re.sub('[0-9]','|',coded)
    typs = typs.split('|')
    typs = [x for x in typs if x]
    
    vals = re.sub('[a-z]','|',coded)
    vals = vals.split('|')
    vals = [int(x) for x in vals if x ]
    doc = vals[0]
    total = 0
    for i in range(1,len(vals)):
        inv_index[key][doc][typs[i-1]] = vals[i]
        total += vals[i]
    inv_index[key][doc]['a'] = total

In [110]:
def parse_index(folder,sindex,wrd):
    file = folder + get_file(wrd)
    with open(file) as fil:
        lines = fil.readlines()
    for line in lines:
        text = line.split('|')
        word = text[0]
        if word != wrd:
            continue
        for coded in text:
            if coded == word:
                continue
            parse_dict(coded,word)
            
def form_index(query):
    for q in query:
        parse_index('index/','sindex.txt',q)


In [29]:
nlp = English()
ps = Stemmer('porter')

def modify_query(search):
    tokens = re.split(r'[^A-Za-z0-9]+',search)
    final = []
    for token in tokens:
        if len(token) == 0 or nlp.vocab[token].is_stop == True:
            continue
        token = ps.stemWord(token.lower())
        final.append(token)
    return final

In [139]:
def search(query,field):
    query = modify_query(query)
    form_index(query)
    docs = []
    for token in query:
        temp = []
        for key in inv_index[token].keys():
            if inv_index[token][key][field] > 0:
                temp.append(int(key))
        docs = list(set(docs).union(set(temp)))
    docs = list(set(docs))
    results = []
    for doc in docs:
        total = 0
        distinct = 0
        title_count = 0 
        for token in query:
            if inv_index[token][doc][field] > 0:
                distinct += 1
            if inv_index[token][doc]['t'] > 0:
                title_count += 1
            total += inv_index[token][doc]['a']
            idf = math.log(doc_count/len(inv_index[token].keys()))
            tfsc = inv_index[token][doc]['a']/tf[doc][1]
        if field == 'a':
            results.append([[distinct,[title_count,tfsc*idf*total]],doc])
        else:
            results.append([[distinct,tfsc*idf*total],doc])
    results = sorted(results,reverse=True)

    if field == 'a':
        final_res = []
        for i in range(min(10,len(results))):
            final_res.append([tf[results[i][1]][0],results[i][0]])
    else:
        final_res = {}
        for i in range(len(results)):
            final_res[results[i][1]] = results[i][0][1]
    return final_res

In [89]:
def exec_fquery(queries):
    qresults = defaultdict(lambda:{})
    docs = set([])
    for token in queries:
        typ,query = token.split(':')
        typ = typ[0]
        qresults[typ] = search(query,typ)
        for d_id,coun in qresults[typ].items():
            docs.add(d_id)
            
    results = []
    for doc in docs:
        distinct = 0
        total = 0
        for typ in qresults.keys():
            if doc in qresults[typ].keys():
                distinct += 1
                total += qresults[typ][doc]
        results.append([[distinct,total],doc])
    results = sorted(results,reverse=True)
    final_res = []
    for i in range(min(10,len(results))):
        final_res.append([tf[results[i][1]][0],results[i][0]]) 
    return final_res

In [72]:
search('new york mayor','a')

[["A Mayor's Life: Governing New York's Gorgeous Mosaic",
  [3, [3, 0.6374749921830389]]],
 ['1957 New York City mayoral election', [3, [3, 0.06094333374225238]]],
 ['1961 New York City mayoral election', [3, [3, 0.059796967693063]]],
 ['1965 New York City mayoral election', [3, [3, 0.05595167765138047]]],
 ['New Orleans school desegregation crisis', [3, [1, 0.0022385426189111095]]],
 ['1931 New Year Honours', [3, [1, 0.00037530049018041976]]],
 ['Wikipedia:Articles for deletion/Kelly McDowell',
  [3, [0, 0.042197336888630914]]],
 ['Moses J. Wentworth', [3, [0, 0.018688558312810785]]],
 ['List of assassinations in Asia', [3, [0, 0.013580336200548809]]],
 ['Center on Global Energy Policy', [3, [0, 0.013175214495391085]]]]

In [78]:
def parse_query(query):
    tokens = query.split(' ')
    ans = []
    cur = ""
    for token in tokens:
        if ':' in token:
            if cur != "":
                ans.append(cur)
            cur = ""
            temp = token.split(':')
            cur += temp[0] +":" +temp[1]
        else:
            cur += " " + token
    ans.append(cur)
    return ans

def is_field(query):
    for token in query:
        text = token.split(":")
        if text[0] not in fields:
            return False
    return True

In [74]:
def start_search(query):
    inv_index.clear()
    query = " ".join(query.split())
    query = parse_query(query)
    if is_field(query):
        return exec_fquery(query)
    else:
        query = " ".join(query)
        return search(query,'a')

In [140]:
start_search('title:gandhi body:arjun infobox:gandhi category:gandhi ref:gandhi')

[['Gandhi Before India', [3, 35.38798129097616]],
 ['File:Gandhi Before India.jpg', [2, 8.150568624817286]],
 ['Great Soul: Mahatma Gandhi and His Struggle With India',
  [1, 1.9010072594326028]],
 ['Shakti Prasad', [1, 1.41664281026259]],
 ['Gandhi Djuna', [1, 1.108920901335685]],
 ['City Montessori School, Aliganj Branch', [1, 0.9248143424266717]],
 ['Kalidas Rangalaya', [1, 0.5287569860673464]],
 ['City Montessori School, Indira Nagar Branch', [1, 0.45711243261165646]],
 ['Adyaksha', [1, 0.4354824167378913]],
 ['Whisky Is Risky', [1, 0.3836179810689022]]]