In [1]:
# importing libraries
import pandas as pd
import numpy as np
import glob
import sys
import spacy
from LinkedList import LinkedList

nlp = spacy.load('en_core_web_sm')
sys.setrecursionlimit(10000)

In [2]:
auto_df = pd.read_csv('/home/majime/programming/github/information-retrieval-assignments/assignment 1/tokenized/auto.csv')
property_df = pd.read_csv('/home/majime/programming/github/information-retrieval-assignments/assignment 1/tokenized/property.csv')

In [3]:
def create_postings_list(x):
    x = str(x)
    posting_list = set()
    for word in x.split():
        posting_list.add(word.lower())
    posting_list = list(posting_list)
    # remove strings with only punctuations
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~=+'''
    for word in posting_list:
        if word in punctuations:
            posting_list.remove(word)
    return sorted((posting_list))

auto_df['posting_list'] = auto_df['tokenized'].apply(create_postings_list)
property_df['posting_list'] = property_df['tokenized'].apply(create_postings_list)

In [4]:
main_df = pd.concat([auto_df, property_df])
main_df = main_df.reset_index(drop=True)
corpus = set()
for l in main_df.posting_list:
    for word in l:
        corpus.add(word)
corpus = sorted(list(corpus))

In [5]:
def create_inverted_list(df):
    inverted_list = {}
    for word in corpus:
        inverted_list[word] = LinkedList()
    for row in df.iterrows():
        l = row[1]["posting_list"]
        for word in l:
            inverted_list[word].append(row[0])
    for word in inverted_list:
        inverted_list[word].sort()
    return inverted_list

inverted_list = create_inverted_list(main_df)

In [6]:
def get_all_rotations(s):
    rotations = []
    for i in range(len(s)):
        rotations.append(s[i:] + s[:i])
    return rotations

In [7]:

def permuterm_indexing(inv_list):
    perm_index = {}
    for word in inv_list:
        word_perm = word + "$"
        rotations = get_all_rotations(word_perm)
        for rotation in rotations:
            q = rotation.split("$")[-1]
            if q not in perm_index:
                perm_index[q] = LinkedList()
            perm_index[q].append(word)
    return perm_index

    
perm_index = permuterm_indexing(inverted_list) 

In [8]:
def reverse_permuterm_indexing(inv_list):
    rev_perm_index = {}
    for word in inv_list:
        word_perm = "$" + word
        word_perm = word_perm[::-1] 
        rotations = get_all_rotations(word_perm)
        for rotation in rotations:
            q = rotation.split("$")[-1]
            if q not in rev_perm_index:
                rev_perm_index[q] = LinkedList()
            rev_perm_index[q].append(word)
    return rev_perm_index

rev_perm_index = reverse_permuterm_indexing(inverted_list)

In [9]:
def left_permuterm_indexing(query, perm_index):
    result = []
    query = query + "$"
    rotations = get_all_rotations(query)
    for rotation in rotations:
        if rotation[0] == "*":
            q = rotation[2:]
            if q in perm_index:
                for word in perm_index[q]:
                    result.append(word.data)
    return result
    
def right_permuterm_indexing(query, rev_perm_index):
    result = []
    query = "$" + query
    query = query[::-1]
    rotations = get_all_rotations(query)
    for rotation in rotations:
        if rotation[0] == "*":
            q = rotation[2:]
            if q in rev_perm_index:
                for word in rev_perm_index[q]:
                    result.append(word.data)
    return result

def query_permuterm_index(query, perm_index, rev_perm_index, inv_list):
    result = []
    if "*" in query:
        if query[-1] == "*":
            result = left_permuterm_indexing(query, perm_index)
        elif query[0] == "*":
            result = right_permuterm_indexing(query, rev_perm_index)
                            
        else:
            halves = query.split("*")
            left_result = left_permuterm_indexing(halves[0] + "*", perm_index)
            right_result = right_permuterm_indexing("*" + halves[-1], rev_perm_index)
            result = list(set(left_result) & set(right_result))
            
    docs = []
    for word in result:
        for id in inv_list[word]:
            docs.append(id.data)    
    return sorted(list(set(docs)))


In [10]:
trial = query_permuterm_index("g*l*e", perm_index, rev_perm_index, inverted_list)

In [11]:
trial

[37,
 45,
 50,
 87,
 102,
 105,
 126,
 128,
 129,
 131,
 135,
 137,
 138,
 141,
 147,
 148,
 149,
 156,
 157,
 159,
 161,
 165,
 166,
 175,
 212,
 236,
 243,
 245,
 251,
 258,
 271,
 394,
 399,
 453,
 454,
 495,
 498,
 509,
 620,
 724,
 741,
 815,
 817,
 825,
 848,
 858,
 859,
 860,
 861,
 870,
 874,
 878,
 881,
 896,
 899,
 902,
 903,
 904,
 908,
 978,
 979,
 981,
 1023,
 1036,
 1050,
 1160,
 1245,
 1246,
 1353,
 1395,
 1447,
 1523,
 1620,
 1734,
 1812,
 1861,
 1903,
 1908,
 1931,
 1979,
 1984,
 2038,
 2065,
 2083,
 2102,
 2205,
 2206,
 2454,
 2468,
 2475,
 2507,
 2531,
 2609,
 2622,
 2761,
 2763,
 2835,
 2839,
 2842,
 2843,
 2849,
 2850,
 2851,
 2855,
 2856,
 2865,
 2868,
 2873,
 2877,
 2880,
 2883,
 2885,
 2888,
 2890,
 2891,
 2894,
 2897,
 2899,
 2900,
 2901,
 2927,
 2931,
 2938,
 2953,
 3023,
 3026,
 3030,
 3032,
 3033,
 3035,
 3036,
 3039,
 3040,
 3041,
 3042,
 3043,
 3044,
 3048,
 3049,
 3059,
 3060,
 3067,
 3069,
 3093,
 3119,
 3168,
 3172,
 3224,
 3369,
 3386,
 3403,
 3425,
 3

In [12]:
def multi_query(queries, _and=False):
    docs = []
    for query in queries:
        if "*" in query:
            docs.append(query_permuterm_index(query, perm_index, rev_perm_index, inverted_list))
            
        else:
            intermediate_docs = []
            for id in inverted_list[query]:
                intermediate_docs.append(id.data)
            docs.append(intermediate_docs)
    # return docs
    if not _and:   
        # return union of all sublists in docs
        result = []
        for l in docs:
            for id in l:
                if id not in result:
                    result.append(id)
        return sorted(result)
    else:
        result = set(docs[0])
        for l in docs:
            result = result.intersection(set(l))
        return sorted(list(result))

In [13]:
results = multi_query(["g*e", "car"], _and=True)

In [14]:
results

[126,
 128,
 156,
 159,
 394,
 724,
 1908,
 2531,
 2839,
 2850,
 2938,
 3042,
 3067,
 3224,
 3736,
 3824,
 4223,
 4427,
 4681,
 4690]

In [15]:
def engine_step_one(queries):
    # seperate queries into two lists, and and or lists. and words have double quotes around them
    and_queries = []
    or_queries = []
    for query in queries:
        if query[0] == '"' and query[-1] == '"':
            and_queries.append(query[1:-1])
        else:
            or_queries.append(query)
    if len(and_queries) == 0:
        return multi_query(or_queries)
    if len(or_queries) == 0:
        return multi_query(and_queries, _and=True)
    and_results = multi_query(and_queries, _and=True)
    or_results = multi_query(or_queries)
    return sorted(list(set(and_results) & set(or_results)))

In [16]:
engine_step_one(["\"acci*t\"", "bodily"])

[22,
 25,
 28,
 37,
 47,
 54,
 55,
 57,
 60,
 80,
 87,
 88,
 95,
 102,
 125,
 128,
 129,
 130,
 138,
 139,
 141,
 142,
 147,
 148,
 155,
 156,
 157,
 158,
 159,
 160,
 177,
 475,
 478,
 497,
 522,
 676,
 897,
 898,
 899,
 900,
 901,
 902,
 903,
 904,
 905,
 906,
 975,
 1537,
 1723,
 1726,
 2872,
 2876,
 2877,
 2878,
 3028,
 3060,
 3937,
 4710,
 4714,
 4715]

In [17]:
main_df.iloc[3060].text

'USEFUL TELEPHONE NUMBERS \nHow to make a claim \nIf You need to make a legal expenses claim and this section is \nshown as being operative on the Schedule, please refer to the \nLegal Expenses Insurance Section of this Policy for details. For all \nother claims please contact Us by calling the telephone number \nprinted on Your Policy Schedule. \nThe claims handler will take full details of the claim and guide You \nthrough the next steps. Depending on the value and type of claim, \nthe claims handler may seek help from a loss adjuster. Loss \nadjusters are independent claims experts who will visit You or a \nthird party claimant to assist with the assessment of the claim. \nOnce We have been notified of a claim, We will tell Your broker. \nThe notification letter gives Your broker the opportunity to become \ninvolved in the claim if either You or they wish. Once the claim has \nbeen settled, a letter is sent to Your broker confirming settlement \nand the amounts paid. \nDo \n• Have d

In [18]:
main_df.iloc[1]

document_name                                        7thEditionPolicy
page_number                                                         0
paragraph_number                                                    1
text                Please read your policy.  Part of the policy i...
tokenized           read policy policy page mark coverage selectio...
posting_list        [agent, amount, approve, away, buy, check, com...
Name: 1, dtype: object

In [19]:
def make_n_word_index(df):
    n_word_index = {}
    for row in df.iterrows():
        text = str(row[1]["tokenized"])
        text = text.split()
        for i in range(len(text) - 1):
            n_word = text[i] + " " + text[i+1]
            if n_word not in n_word_index:
                n_word_index[n_word] = LinkedList()
            n_word_index[n_word].append(row[0])
    return n_word_index

n_word_index = make_n_word_index(main_df)

In [20]:
for key in n_word_index:
    n_word_index[key].sort()

In [21]:
def query_n_word_index(query, n_word_index):
    result = []
    for id in n_word_index[query]:
        result.append(id.data)
    return result

In [22]:
def phrase_query(query, n_word_index):
    words = query.split()
    biwords = []
    for i in range(len(words) - 1):
        biwords.append(words[i] + " " + words[i+1])
    result = []
    for bw in biwords:
        result.append(query_n_word_index(bw, n_word_index))
    final = set(result[0])
    for l in result:
        final = final.intersection(set(l))
    return sorted(list(final))
   

In [23]:
phrase_query("liability policy sue", n_word_index)

[127]