In [58]:
# importing libraries
import pandas as pd
import numpy as np
import glob
import sys
import spacy
from LinkedList import LinkedList

nlp = spacy.load('en_core_web_sm')
sys.setrecursionlimit(10000)

In [59]:
auto_df = pd.read_csv('/home/majime/programming/github/information-retrieval-assignments/assignment 1/tokenized/auto.csv')
property_df = pd.read_csv('/home/majime/programming/github/information-retrieval-assignments/assignment 1/tokenized/property.csv')

In [60]:
def create_postings_list(x):
    x = str(x)
    posting_list = set()
    for word in x.split():
        posting_list.add(word.lower())
    posting_list = list(posting_list)
    # remove strings with only punctuations
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~=+'''
    for word in posting_list:
        if word in punctuations:
            posting_list.remove(word)
    return sorted((posting_list))

auto_df['posting_list'] = auto_df['tokenized'].apply(create_postings_list)
property_df['posting_list'] = property_df['tokenized'].apply(create_postings_list)

In [61]:
main_df = pd.concat([auto_df, property_df])
main_df = main_df.reset_index(drop=True)
corpus = set()
for l in main_df.posting_list:
    for word in l:
        corpus.add(word)
corpus = sorted(list(corpus))

In [62]:
def create_inverted_list(df):
    inverted_list = {}
    for word in corpus:
        inverted_list[word] = LinkedList()
    for row in df.iterrows():
        l = row[1]["posting_list"]
        for word in l:
            inverted_list[word].append(row[0])
    for word in inverted_list:
        inverted_list[word].sort()
    return inverted_list

inverted_list = create_inverted_list(main_df)

In [63]:
def get_all_rotations(s):
    rotations = []
    for i in range(len(s)):
        rotations.append(s[i:] + s[:i])
    return rotations

In [64]:

def permuterm_indexing(inv_list):
    perm_index = {}
    for word in inv_list:
        word_perm = word + "$"
        rotations = get_all_rotations(word_perm)
        for rotation in rotations:
            q = rotation.split("$")[-1]
            if q not in perm_index:
                perm_index[q] = LinkedList()
            perm_index[q].append(word)
    return perm_index

    
perm_index = permuterm_indexing(inverted_list) 

In [65]:
def reverse_permuterm_indexing(inv_list):
    rev_perm_index = {}
    for word in inv_list:
        word_perm = "$" + word
        word_perm = word_perm[::-1] 
        rotations = get_all_rotations(word_perm)
        for rotation in rotations:
            q = rotation.split("$")[-1]
            if q not in rev_perm_index:
                rev_perm_index[q] = LinkedList()
            rev_perm_index[q].append(word)
    return rev_perm_index

rev_perm_index = reverse_permuterm_indexing(inverted_list)

In [66]:
def left_permuterm_indexing(query, perm_index):
    result = []
    query = query + "$"
    rotations = get_all_rotations(query)
    for rotation in rotations:
        if rotation[0] == "*":
            q = rotation[2:]
            if q in perm_index:
                for word in perm_index[q]:
                    result.append(word.data)
    return result
    
def right_permuterm_indexing(query, rev_perm_index):
    result = []
    query = "$" + query
    query = query[::-1]
    rotations = get_all_rotations(query)
    for rotation in rotations:
        if rotation[0] == "*":
            q = rotation[2:]
            if q in rev_perm_index:
                for word in rev_perm_index[q]:
                    result.append(word.data)
    return result

def query_permuterm_index(query, perm_index, rev_perm_index, inv_list):
    result = []
    if "*" in query:
        if query[-1] == "*":
            result = left_permuterm_indexing(query, perm_index)
        elif query[0] == "*":
            result = right_permuterm_indexing(query, rev_perm_index)
                            
        else:
            halves = query.split("*")
            left_result = left_permuterm_indexing(halves[0] + "*", perm_index)
            right_result = right_permuterm_indexing("*" + halves[-1], rev_perm_index)
            result = list(set(left_result) & set(right_result))
            
    docs = []
    for word in result:
        for id in inv_list[word]:
            docs.append(id.data)    
    return sorted(list(set(docs)))


In [67]:
trial = query_permuterm_index("g*l*e", perm_index, rev_perm_index, inverted_list)

In [68]:
trial

[8,
 10,
 12,
 24,
 29,
 30,
 75,
 96,
 128,
 157,
 236,
 265,
 266,
 267,
 276,
 341,
 471,
 498,
 520,
 590,
 630,
 712,
 744,
 853,
 881,
 889,
 897,
 899,
 900,
 901,
 902,
 903,
 914,
 915,
 919,
 923,
 924,
 932,
 940,
 941,
 947,
 948,
 949,
 950,
 951,
 952,
 958,
 1028,
 1093,
 1108,
 1110,
 1111,
 1129,
 1132,
 1208,
 1210,
 1293,
 1297,
 1298,
 1299,
 1308,
 1315,
 1325,
 1327,
 1332,
 1337,
 1344,
 1351,
 1357,
 1363,
 1364,
 1365,
 1367,
 1370,
 1371,
 1372,
 1377,
 1388,
 1389,
 1390,
 1391,
 1394,
 1401,
 1420,
 1443,
 1461,
 1463,
 1464,
 1466,
 1510,
 1516,
 1519,
 1566,
 1571,
 1647,
 1669,
 1702,
 1731,
 1748,
 1781,
 1794,
 1820,
 1845,
 1847,
 1848,
 1850,
 1854,
 1856,
 1858,
 1859,
 1884,
 1887,
 1908,
 1920,
 1923,
 1931,
 1936,
 1954,
 1973,
 1989,
 2009,
 2256,
 2265,
 2277,
 2280,
 2285,
 2294,
 2295,
 2298,
 2301,
 2310,
 2358,
 2360,
 2366,
 2380,
 2387,
 2394,
 2403,
 2413,
 2414,
 2457,
 2458,
 2460,
 2474,
 2476,
 2479,
 2480,
 2483,
 2487,
 2500,
 2502,

In [69]:
def multi_query(queries, _and=False):
    docs = []
    for query in queries:
        if "*" in query:
            docs.append(query_permuterm_index(query, perm_index, rev_perm_index, inverted_list))
            
        else:
            intermediate_docs = []
            for id in inverted_list[query]:
                intermediate_docs.append(id.data)
            docs.append(intermediate_docs)
    # return docs
    if not _and:   
        # return union of all sublists in docs
        result = []
        for l in docs:
            for id in l:
                if id not in result:
                    result.append(id)
        return sorted(result)
    else:
        result = set(docs[0])
        for l in docs:
            result = result.intersection(set(l))
        return sorted(list(result))

In [70]:
results = multi_query(["g*e", "car"], _and=True)

In [71]:
results

[10,
 914,
 932,
 940,
 941,
 1364,
 1370,
 1391,
 1443,
 1516,
 1989,
 2387,
 2476,
 2487,
 2582,
 2586,
 2634,
 2655,
 2718,
 2791]

In [72]:
def engine_step_one(queries):
    # seperate queries into two lists, and and or lists. and words have double quotes around them
    and_queries = []
    or_queries = []
    for query in queries:
        if query[0] == '"' and query[-1] == '"':
            and_queries.append(query[1:-1])
        else:
            or_queries.append(query)
    if len(and_queries) == 0:
        return multi_query(or_queries)
    if len(or_queries) == 0:
        return multi_query(and_queries, _and=True)
    and_results = multi_query(and_queries, _and=True)
    or_results = multi_query(or_queries)
    return sorted(list(set(and_results) & set(or_results)))

In [73]:
engine_step_one(["\"acci*t\"", "bodily"])

[1,
 3,
 4,
 5,
 8,
 10,
 11,
 14,
 15,
 16,
 22,
 24,
 25,
 27,
 29,
 33,
 67,
 92,
 96,
 101,
 121,
 127,
 128,
 130,
 145,
 287,
 288,
 364,
 469,
 568,
 629,
 634,
 635,
 687,
 707,
 709,
 726,
 733,
 734,
 890,
 911,
 921,
 922,
 923,
 925,
 928,
 936,
 937,
 1032,
 1033,
 1047,
 1094,
 1097,
 1112,
 1130,
 1144,
 1146,
 1163,
 1164,
 1171,
 1292,
 1338,
 1339,
 1350,
 1351,
 1719,
 2271,
 2474,
 2492,
 2495,
 2599,
 2636]

In [74]:
len(main_df)

3241

In [75]:
main_df.iloc[1]

document_name                                        7thEditionPolicy
page_number                                                         1
paragraph_number                                                    0
text                 \nContents     \nIntroduction \n1   \nDefinit...
tokenized           contents introduction definition agreement com...
posting_list        [accident, agreement, auto, bodily, cancellati...
Name: 1, dtype: object

In [76]:
def make_n_word_index(df):
    n_word_index = {}
    for row in df.iterrows():
        text = str(row[1]["tokenized"])
        text = text.split()
        for i in range(len(text) - 1):
            n_word = text[i] + " " + text[i+1]
            if n_word not in n_word_index:
                n_word_index[n_word] = LinkedList()
            n_word_index[n_word].append(row[0])
    return n_word_index

n_word_index = make_n_word_index(main_df)

In [77]:
for key in n_word_index:
    n_word_index[key].sort()

In [78]:
def query_n_word_index(query, n_word_index):
    result = []
    for id in n_word_index[query]:
        result.append(id.data)
    return result

In [79]:
def phrase_query(query, n_word_index):
    words = query.split()
    biwords = []
    for i in range(len(words) - 1):
        biwords.append(words[i] + " " + words[i+1])
    result = []
    for bw in biwords:
        result.append(query_n_word_index(bw, n_word_index))
    final = set(result[0])
    for l in result:
        final = final.intersection(set(l))
    return sorted(list(final))
   

In [80]:
phrase_query("liability policy sue", n_word_index)

[85]

In [135]:
    
def get_term_frequency_scores(df, query, inverted_list, perm_index, rev_perm_index):
    scores = []
    for row in df.iterrows():
        text = str(row[1]["tokenized"])
        text = text.split()
        score = 0
        for q in query:
            if q in text:
                if "*" not in q:
                    doc_freq = len(inverted_list[q]) + 1

                    score += (1 + np.log10(text.count(q)))*(np.log10(len(df)/doc_freq))
                else:
                    if q[-1] == "*":
                        left_result = left_permuterm_indexing(q, perm_index)
                        for word in left_result:
                            doc_freq = len(inverted_list[word]) + 1
                            score += (1 + np.log10(text.count(word)))*(np.log10(len(df)/doc_freq))
                    elif q[0] == "*":
                        right_result = right_permuterm_indexing(q, rev_perm_index)
                        for word in right_result:
                            doc_freq = len(inverted_list[word]) + 1
                            score += (1 + np.log10(text.count(word)))*(np.log10(len(df)/doc_freq))
                    else:
                        halves = q.split("*")
                        left_result = left_permuterm_indexing(halves[0] + "*", perm_index)
                        right_result = right_permuterm_indexing("*" + halves[-1], rev_perm_index)
                        result = list(set(left_result) & set(right_result))
                        for word in result:
                            doc_freq = len(inverted_list[word]) + 1
                            score += (1 + np.log10(text.count(word)))*(np.log10(len(df)/doc_freq))
                
        scores.append((row[0], score))
    return sorted(scores, key=lambda x: x[1], reverse=True)
        

In [129]:
docs = engine_step_one(["\"acci*t\"", "bodily", "harm"])

In [130]:
mid_df = main_df.iloc[docs]

In [131]:
mid_inverted_list = create_inverted_list(mid_df)
mid_perm_index = permuterm_indexing(mid_inverted_list)
mid_rev_perm_index = reverse_permuterm_indexing(mid_perm_index)

In [132]:
mid_df

Unnamed: 0,document_name,page_number,paragraph_number,text,tokenized,posting_list
1,7thEditionPolicy,1,0,\nContents \nIntroduction \n1 \nDefinit...,contents introduction definition agreement com...,"[accident, agreement, auto, bodily, cancellati..."
3,7thEditionPolicy,3,0,\n2 \nDefinitions Throughout this policy: \n...,definition policy refer company issue policy r...,"[accident, additional, arise, ask, atv, auto, ..."
4,7thEditionPolicy,4,0,3 unless such use is incidental to your bus...,use incidental business instal maintain repair...,"[accident, accidental, adoption, agent, agree,..."
5,7thEditionPolicy,5,0,\n4 \nCompulsory \nInsurance \nThere are four...,compulsory insurance part compulsory insurance...,"[access, accident, amount, apply, authorize, a..."
8,7thEditionPolicy,8,0,\nCompulsory Insurance (Continued) \n7\nW...,compulsory insurance continued pay pip benefit...,"[accident, agree, alcohol, arrest, auto, avoid..."
...,...,...,...,...,...,...
2474,eSols Property Owners Commercial Policy Wordin...,1,0,Pen Underwriting Limited is authorised and reg...,pen underwriting limited authorise regulate fi...,"[accident, accordingly, adjuster, admit, advic..."
2492,eSols Property Owners Commercial Policy Wordin...,19,0,Pen Underwriting Limited is authorised and reg...,pen underwriting limited authorise regulate fi...,"[accident, accidental, act, action, agree, aid..."
2495,eSols Property Owners Commercial Policy Wordin...,22,0,Pen Underwriting Limited is authorised and reg...,pen underwriting limited authorise regulate fi...,"[accident, accusation, aid, alleged, alter, am..."
2599,Residential-Property-Owners-Policy-Wording-1910,39,0,39\nArch Residential Property Owners 1910 g. \...,arch residential property owners tool die cut ...,"[accident, accuracy, action, agency, ammonia, ..."


In [144]:
get_term_frequency_scores(mid_df, ["\"acci*t\"", "bodily"], mid_inverted_list, mid_perm_index, mid_rev_perm_index)

[(24, 3.4269605488201202),
 (15, 3.155552199515261),
 (890, 3.155552199515261),
 (1, 2.975558083125821),
 (5, 2.975558083125821),
 (92, 2.975558083125821),
 (922, 2.975558083125821),
 (923, 2.975558083125821),
 (925, 2.975558083125821),
 (1339, 2.975558083125821),
 (1351, 2.975558083125821),
 (2495, 2.975558083125821),
 (14, 2.743505307560158),
 (22, 2.743505307560158),
 (27, 2.743505307560158),
 (128, 2.743505307560158),
 (921, 2.743505307560158),
 (1032, 2.743505307560158),
 (1338, 2.743505307560158),
 (1350, 2.743505307560158),
 (8, 2.4164452897785447),
 (11, 2.4164452897785447),
 (16, 2.4164452897785447),
 (25, 2.4164452897785447),
 (29, 2.4164452897785447),
 (33, 2.4164452897785447),
 (145, 2.4164452897785447),
 (687, 2.4164452897785447),
 (911, 2.4164452897785447),
 (928, 2.4164452897785447),
 (1130, 2.4164452897785447),
 (1163, 2.4164452897785447),
 (1719, 2.4164452897785447),
 (2271, 2.4164452897785447),
 (2474, 2.4164452897785447),
 (2492, 2.4164452897785447),
 (3, 1.857332496

In [142]:
print(main_df.iloc[936].text)

 
Effective (2016-06-01) 
FSCO (1215E.2) 
© Queen's Printer for Ontario, 2016 
(OAP 1) Owner’s Policy 
Page 43   
Example  We will not pay for a tire blow-out in normal driving, but if the tire is destroyed 
in a collision and you have Collision or Upset Coverage, we will cover that loss 
up to the value of your tire at the time of the incident.  We won't pay for loss or damage:   resulting from a dishonest claim of ownership, illegal disposal, or theft of the 
automobile by anyone who has legal possession of it under a written agreement (a 
mortgage, conditional sale, lease or other similar agreement);   resulting from a change in ownership that is agreed to, even if that change was brought 
about by trickery or fraud;  Example  Late one evening at a party, you sell your car to a stranger in return for a 
cheque. A week later the cheque bounces. We will not cover the loss.   caused by radioactive contamination;   to contents of automobiles and trailers, other than their equipment;

In [None]:
print(main_df.iloc[22].text)