In [1]:
# importing libraries
import glob
import re
import sys

import numpy as np
import pandas as pd
import spacy

from LinkedList import LinkedList

# Load spaCy's small English pipeline once; the query helpers call it to
# lemmatize query terms.
nlp = spacy.load('en_core_web_sm')
# Raise the interpreter's recursion limit to 10000.
# NOTE(review): presumably required by recursive LinkedList operations — confirm.
sys.setrecursionlimit(10000)

In [2]:
# Directory holding the tokenized CSV exports. Kept in a single constant so the
# notebook can be pointed at another checkout by editing one line.
TOKENIZED_DIR = '/home/majime/programming/github/information-retrieval-assignments/assignment 1/tokenized'

# One DataFrame per document collection; later cells read each frame's
# 'tokenized' column.
auto_df = pd.read_csv(TOKENIZED_DIR + '/auto.csv')
property_df = pd.read_csv(TOKENIZED_DIR + '/property.csv')

In [3]:
def create_postings_list(x):
    """Return the sorted, de-duplicated, lower-cased terms of one document.

    Parameters
    ----------
    x : object
        Whitespace-separated token text. Non-string values are converted with
        ``str``; a missing cell (``None`` or float NaN) yields no terms.

    Returns
    -------
    list of str
        Unique lower-cased tokens in ascending order, excluding tokens that
        consist only of punctuation characters.
    """
    # A missing cell would otherwise be stringified and indexed as the term "nan".
    if x is None or (isinstance(x, float) and x != x):
        return []

    # Same characters as before; the backslash is now escaped explicitly.
    punctuations = set('''!()-[]{};:'"\\,<>./?@#$%^&*_~=+''')

    terms = {token.lower() for token in str(x).split()}

    # Filter into a new sequence instead of removing from a list while
    # iterating it (which skips elements), and test every character so that
    # multi-character punctuation tokens such as "--" or "..." are dropped too.
    return sorted(term for term in terms if not punctuations.issuperset(term))

# Derive each document's term list from its 'tokenized' text and store it in a
# new 'posting_list' column on both collections.
auto_df['posting_list'] = auto_df['tokenized'].apply(create_postings_list)
property_df['posting_list'] = property_df['tokenized'].apply(create_postings_list)

In [4]:
def and_query(df, s: list):
    """Return the rows of ``df`` whose posting list contains every query term.

    Each entry of ``s`` is passed through the module-level spaCy pipeline
    ``nlp``; the lower-cased lemma of its first token is the term looked up in
    the ``"posting_list"`` column. The frame is narrowed once per term, so an
    empty ``s`` returns ``df`` itself.
    """
    matches = df
    for raw_term in s:
        lemma = nlp(raw_term)[0].lemma_.lower()
        contains_lemma = matches["posting_list"].apply(
            lambda postings: lemma in postings
        )
        matches = matches[contains_lemma]
    return matches

In [5]:
def or_query(df, s: list):
    """Return the rows of ``df`` whose posting list contains any query term.

    Each entry of ``s`` is passed through the module-level spaCy pipeline
    ``nlp``; the lower-cased lemma of its first token is the term looked up in
    the ``"posting_list"`` column.

    Every matching row is returned exactly once, in the order it appears in
    ``df``, and the column dtypes of ``df`` are preserved. An empty ``s``
    yields an empty frame with the same columns.
    """
    terms = {nlp(word)[0].lemma_.lower() for word in s}

    # Evaluate all terms in one pass per row. Concatenating one filtered frame
    # per term repeated every document that matched more than one term.
    matches = df["posting_list"].apply(
        lambda postings: not terms.isdisjoint(postings)
    )

    # astype(bool) keeps the mask a valid boolean indexer when df has no rows.
    return df[matches.astype(bool)]

In [6]:
# Stack both collections into one frame. ignore_index=True renumbers the rows
# 0..n-1; without it the two frames keep their own overlapping row labels and
# documents from different collections would share the same ID in the
# inverted index, which uses the row label as the document ID.
main_df = pd.concat([auto_df, property_df], ignore_index=True)

# Vocabulary: every distinct term across all posting lists, in sorted order.
corpus = sorted({term for postings in main_df.posting_list for term in postings})

In [7]:
def create_inverted_list(df):
    """Map every term of the global ``corpus`` to a sorted LinkedList of row labels.

    One empty ``LinkedList`` is created per term in ``corpus``. For each row of
    ``df``, the row's index label is appended to the list of every term in its
    ``"posting_list"``; each list is then sorted with ``LinkedList.sort``.
    A term in ``df`` that is absent from ``corpus`` raises ``KeyError``.
    """
    inverted_list = {term: LinkedList() for term in corpus}

    for doc_id, postings in zip(df.index, df["posting_list"]):
        for term in postings:
            inverted_list[term].append(doc_id)

    for doc_ids in inverted_list.values():
        doc_ids.sort()

    return inverted_list

# Build the term -> row-label index over the combined collection.
inverted_list = create_inverted_list(main_df)

In [10]:
def get_all_rotations(s):
    """Return every cyclic rotation of ``s``, starting with ``s`` itself.

    Rotation ``k`` moves the first ``k`` items of ``s`` to the end, so the
    result has ``len(s)`` entries (none for an empty sequence).
    """
    return [s[start:] + s[:start] for start in range(len(s))]

In [11]:

def permuterm_indexing(inv_list):
    """Build the lookup used for trailing-wildcard queries.

    Every term yielded by iterating ``inv_list`` is stored under each of its
    prefixes, from the empty string up to the whole term. These are the keys
    obtained by rotating ``term + "$"`` and keeping what follows the marker,
    computed directly so that a term which itself contains ``"$"`` is still
    filed under its real prefixes, and only once per key.

    Returns
    -------
    dict
        Maps each prefix to a ``LinkedList`` of the terms that start with it,
        in the iteration order of ``inv_list``.
    """
    perm_index = {}
    for word in inv_list:
        for end in range(len(word) + 1):
            q = word[:end]
            if q not in perm_index:
                perm_index[q] = LinkedList()
            perm_index[q].append(word)
    return perm_index

    
# Build the wildcard lookup from the terms (keys) of the inverted index.
perm_index = permuterm_indexing(inverted_list) 

In [106]:
def reverse_permuterm_indexing(inv_list):
    """Build the lookup used for leading-wildcard queries.

    Every term yielded by iterating ``inv_list`` is stored under each prefix of
    its reversal, i.e. under each of its suffixes written backwards, from the
    empty string up to the whole reversed term. The keys are computed directly
    rather than by splitting rotations on ``"$"``, so a term which itself
    contains ``"$"`` is still filed under its real suffixes, and only once per
    key.

    Returns
    -------
    dict
        Maps each reversed suffix to a ``LinkedList`` of the terms that end
        with it, in the iteration order of ``inv_list``.
    """
    rev_perm_index = {}
    for word in inv_list:
        reversed_word = word[::-1]
        for end in range(len(reversed_word) + 1):
            q = reversed_word[:end]
            if q not in rev_perm_index:
                rev_perm_index[q] = LinkedList()
            rev_perm_index[q].append(word)
    return rev_perm_index

# Build the reversed-term wildcard lookup from the terms (keys) of the inverted index.
rev_perm_index = reverse_permuterm_indexing(inverted_list)

In [121]:
def _indexed_words(index, key):
    """Return the terms stored under ``key`` in ``index``, or [] if the key is absent."""
    if key not in index:
        return []
    # Iterating an index entry yields nodes whose `.data` attribute is the term.
    return [node.data for node in index[key]]


def query_permuterm_index(query, perm_index, rev_perm_index):
    """Return the indexed terms that match ``query``, where ``*`` matches any run of characters.

    Parameters
    ----------
    query : str
        Term pattern. May contain zero, one or several ``*`` wildcards.
    perm_index : dict
        Maps a prefix to an iterable of nodes (each with a ``.data`` term), as
        built by ``permuterm_indexing``.
    rev_perm_index : dict
        Maps a reversed suffix to an iterable of nodes, as built by
        ``reverse_permuterm_indexing``. Only consulted when the pattern has
        text after its last ``*``.

    Returns
    -------
    list of str
        Matching terms in index order. A pattern without ``*`` returns
        ``[query]`` when that exact term is indexed and ``[]`` otherwise
        (previously this case raised ``UnboundLocalError``).
    """
    if "*" not in query:
        return [word for word in _indexed_words(perm_index, query) if word == query]

    parts = query.split("*")
    prefix, suffix = parts[0], parts[-1]

    # Narrow the candidates with the indexes: the fixed text before the first
    # wildcard is a prefix lookup, the fixed text after the last wildcard is a
    # lookup of the reversed suffix.
    if suffix and not prefix:
        candidates = _indexed_words(rev_perm_index, suffix[::-1])
    else:
        candidates = _indexed_words(perm_index, prefix)
        if suffix:
            with_suffix = set(_indexed_words(rev_perm_index, suffix[::-1]))
            candidates = [word for word in candidates if word in with_suffix]

    # Final check against the whole pattern. This rejects terms where prefix
    # and suffix would overlap ("g*g" must not match "g") and enforces any
    # fixed text between several wildcards.
    pattern = re.compile(".*".join(re.escape(part) for part in parts), re.DOTALL)
    return [word for word in candidates if pattern.fullmatch(word)]

In [122]:
# Example wildcard query: terms that start with "g" and end with "ed".
trial = query_permuterm_index("g*ed", perm_index, rev_perm_index)

In [123]:
# Display the terms returned by the example query.
trial

['glazed',
 'garaged',
 'gathered',
 'gained',
 'generated',
 'gaged',
 'governed',
 'granted',
 'guaranteed']