In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

from google.colab import files

!pip install inverted-index

Mounted at /content/gdrive
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting inverted-index
  Downloading inverted_index-0.1.1-py3-none-any.whl (5.0 kB)
Installing collected packages: inverted-index
Successfully installed inverted-index-0.1.1


In [90]:
import pandas as pd
import glob
import os
import re
from collections import OrderedDict

from prompt_toolkit import prompt
from prompt_toolkit.completion import Completer, Completion

In [3]:
# Location of the predicted-locations CSV, relative to the notebook's working
# directory (the Drive mount must already be in place).
directory_name = "gdrive/MyDrive/Colab Notebooks/GESTALT/"
objects_file = "data/output/ownershipAssignment/DBSCAN_PredictedLocations.csv"

# glob returns an empty list when the path does not exist; fail with a message
# that names the path instead of a bare IndexError from `[0]`.
matching_paths = glob.glob(os.path.join(directory_name, objects_file))
if not matching_paths:
    raise FileNotFoundError(
        "No file found at %r; is Google Drive mounted and the path correct?"
        % os.path.join(directory_name, objects_file)
    )
filename = matching_paths[0]

In [6]:
def normalize_object_term(term):
    """Return the canonical form of an object-class query term.

    This is currently an identity placeholder: the term is handed back
    unchanged.
    """
    # TODO implement fuzzy term matching with OSM API
    normalized = term
    return normalized

In [196]:
class InvertedIndex:
    """Inverted index from object name to the locations it was predicted at.

    Built from a CSV that must provide the columns ``name``,
    ``predicted_location`` and ``class_confidence``.

    Attributes:
        df: the raw CSV as a DataFrame (``__make_adj__`` adds the columns
            ``assignment_prob`` and ``prob`` to it).
        ii: dict mapping each object name to the set of its locations.
        ii_counter: dict mapping each object name to ``len(ii[name])``.
        adj: DataFrame with one row per (name, predicted_location) pair and
            the maximum ``prob`` observed for that pair.

    The helper methods keep their original ``__name__``-style spelling because
    code outside the class calls them by that name.
    """
    def __init__(self, filename):
        """Load ``filename`` with ``pd.read_csv`` and build both lookup structures."""
        self.df = pd.read_csv(filename)
        self.objects_df = self.df[['name','predicted_location']]
        # One row per distinct (name, predicted_location) pair plus its row count.
        self.objects_df_grouped = self.objects_df.groupby(self.objects_df.columns.tolist(),as_index=False).size()
        self.objects_df_grouped = self.objects_df_grouped.rename(columns={'size':'num_occurences'})  # this puts object counts in there, but we drop it for now
        # One row per name, holding the list of that name's locations.
        self.objects_df_grouped_list = pd.DataFrame(self.objects_df_grouped.groupby('name')['predicted_location'].apply(list))
        self.__make_ii__()
        self.__make_adj__()

    def __make_ii__(self):
        """Populate ``self.ii`` (name -> set of locations) and ``self.ii_counter``."""
        self.ii = dict()
        self.ii_counter = dict()
        for idx, row in self.objects_df_grouped_list.iterrows():
            locations = set(row['predicted_location'])
            self.ii[idx] = locations
            self.ii_counter[idx] = len(locations)

    def most_discriminative(self, query_terms):
        """Return the query term that occurs at the fewest locations.

        Ties go to the term that appears first in ``query_terms``.

        Raises:
            IndexError: if ``query_terms`` is empty.
            KeyError: if a term is not present in the index.
        """
        if not query_terms:
            raise IndexError("most_discriminative() needs at least one query term")
        # min() keeps the first of several equally small items, which matches a
        # strict "<" scan starting from the first term.
        return min(query_terms, key=lambda qt: self.ii_counter[normalize_object_term(qt)])

    def __make_adj__(self):
        """Build and return ``self.adj``, the per-(name, location) probability table.

        Adds ``assignment_prob`` and ``prob`` columns to ``self.df`` as a side
        effect; calling it again recomputes the same values.
        """
        self.df['assignment_prob'] = 1.0  # TODO pull real one

        self.df['prob'] = self.df['class_confidence'] * self.df['assignment_prob']  # TODO col name
        # Keep the best probability seen for each (name, location) pair.
        self.adj = self.df.groupby(['name','predicted_location'], as_index=False).agg({'prob':'max'})
        return self.adj


    def get_prob(self, Loc : str, Obj : str):
        """Return the stored ``prob`` for object ``Obj`` at location ``Loc``.

        Raises:
            ValueError: (from ``Series.item``) if the pair is not present
                exactly once in ``self.adj``.
        """
        return self.adj[(self.adj['name'] == Obj) & (self.adj['predicted_location'] == Loc)]['prob'].item()


    def rank(self, Locs : set, Objs : set):
        """Order ``Locs`` by the product of ``get_prob(loc, obj)`` over ``Objs``.

        Returns a list of locations, highest product first. Equal products are
        ordered by the location's string form so the result does not depend on
        set iteration order and is reproducible between runs.
        """
        loc_ranks = {}
        for L in Locs:
            p = 1
            for Ob in Objs:
                p *= self.get_prob(L, Ob)
            loc_ranks[L] = p
        return sorted(loc_ranks, key=lambda loc: (-loc_ranks[loc], str(loc)))


    def search(self, query_terms : list):
        """Return the ranked locations that contain every query term.

        Args:
            query_terms: list of object names; a bare string is treated as a
                single term rather than being iterated character by character.

        Returns:
            A list of locations, best match first. The list is empty when
            ``query_terms`` is empty or when any term is not in the index
            (no location can contain an object that was never observed).
        """
        if isinstance(query_terms, str):
            query_terms = [query_terms]
        # Normalize once and use the same terms for lookup and for ranking.
        terms = [normalize_object_term(x) for x in query_terms]
        if not terms or any(t not in self.ii for t in terms):
            return []
        locations = set.intersection(*(self.ii[t] for t in terms))
        return self.rank(locations, terms)

    def fuzzy_search(self, query_terms : list):
        """Search with all terms, relaxing the query if nothing matches.

        If the full query has no result, terms are added back one at a time,
        most discriminative first, for as long as the result stays non-empty;
        the first term that empties the result ends the process. Terms that
        are not in the index are ignored during relaxation.

        The caller's ``query_terms`` list is never modified.

        Returns:
            ``(locations, terms_used)`` where ``terms_used`` is the original
            ``query_terms`` when the full query matched, otherwise the list of
            terms that were kept (possibly empty).
        """
        # Assume inverted index of object_class -> {Locations} exists
        if isinstance(query_terms, str):
            query_terms = [query_terms]
        s = self.search(query_terms)
        if len(s) > 0:
            return s, query_terms

        # Work on a copy restricted to terms that can match at all.
        remaining = [t for t in query_terms if normalize_object_term(t) in self.ii]
        kept = []
        while remaining:
            best_qt = self.most_discriminative(remaining)
            remaining.remove(best_qt)
            candidate = self.search(kept + [best_qt])
            if len(candidate) == 0:
                # went too far, adding one too many search terms
                break
            kept.append(best_qt)
            s = candidate  # already ranked by search()
        return s, kept  # kept may be a reduced subset of the original query terms

In [198]:
inverted_index = InvertedIndex(filename)

# Each case: (banner, terms for search(), terms for fuzzy_search()).
demo_cases = [
    ("TEST APPLE WINE_BARRELl", ['apple','wine_barrell'], ['apple','wine_barrell']),
    ("TEST FORK BENCH BIRD", ['fork','bench','bird'], ['bench','fork','bird']),
    ("TEST BENCH BIRD", ['bench','bird'], ['bench','bird']),
    ("TEST BIRD", ['bird'], ['bird']),
]

for case_number, (banner, exact_terms, fuzzy_terms) in enumerate(demo_cases):
    # Blank separator between cases, but not after the last one.
    if case_number > 0:
        print('\n')
    print(banner)
    print(inverted_index.search(exact_terms))
    print(inverted_index.fuzzy_search(fuzzy_terms))

TEST APPLE WINE_BARRELl
['Lancaster Wines']
(['Lancaster Wines'], ['apple', 'wine_barrell'])


TEST FORK BENCH BIRD
[]
(['MAX Employment Midland'], ['fork'])


TEST BENCH BIRD
['Guildford', 'Sandalford Winery', 'Toilets']
(['Guildford', 'Sandalford Winery', 'Toilets'], ['bench', 'bird'])


TEST BIRD
['Guildford', 'Sandalford Winery', "Nando's", 'Midland', 'Midvale', 'Guildford Old Courthouse', 'Toilets']
(['Guildford', 'Sandalford Winery', "Nando's", 'Midland', 'Midvale', 'Guildford Old Courthouse', 'Toilets'], ['bird'])


In [174]:
# Rebuild the per-(name, predicted_location) probability table and render it.
adjacency_table = inverted_index.__make_adj__()
display(adjacency_table)

Unnamed: 0,name,predicted_location,class_confidence,assignment_prob,prob
0,palm_tree,Ali's Vineyard,1.0,1.0,1.0
1,palm_tree,Ali's Vineyard,1.0,1.0,1.0
2,palm_tree,Ali's Vineyard,1.0,1.0,1.0
3,palm_tree,Ali's Vineyard,1.0,1.0,1.0
4,palm_tree,Ali's Vineyard,1.0,1.0,1.0
...,...,...,...,...,...
3038,crossing,Morrina Australia Pty Ltd,0.7,1.0,0.7
3039,crossing,Morrina Australia Pty Ltd,0.7,1.0,0.7
3040,crossing,,0.7,1.0,0.7
3041,crossing,,0.7,1.0,0.7


Unnamed: 0,name,predicted_location,prob
0,air_conditioner,Ali's Vineyard,1.00
1,air_conitioner,Little River Winery and Café,1.00
2,airplane,Guildford,0.96
3,airplane,Guildford Old Courthouse,0.96
4,airplane,Midland Workshops Museum,0.50
...,...,...,...
365,warehouse,Oakover Grounds,1.00
366,water_tank,Little River Winery and Café,1.00
367,wine glass,MAX Employment Midland,0.28
368,wine glass,Sandalford Winery,0.28


In [115]:
class SearchCompleter(Completer):
    """prompt_toolkit completer that offers matching locations for the typed query."""
    def __init__(self, inverted_index: InvertedIndex):
        """Keep a reference to the index that answers the queries."""
        self.inverted_index = inverted_index

    def get_completions(self, document, complete_event):
        """Yield one ``Completion`` per location returned by the index.

        Completions are produced only when the user explicitly requested them.
        The typed text is split into query terms before searching, because
        ``InvertedIndex.search`` expects a list of terms, not a raw string.
        """
        if not complete_event.completion_requested:
            return

        # NOTE(review): terms are split on commas because object names in the
        # data can contain spaces (e.g. "wine glass") -- confirm this is the
        # intended input syntax.
        query_terms = [piece.strip() for piece in document.text.split(',')]
        query_terms = [piece for piece in query_terms if piece]
        if not query_terms:
            return

        try:
            matches = self.inverted_index.search(query_terms)
        except KeyError:
            # A term that is not in the index has no completions to offer.
            return

        for match in matches:
            # Each completion replaces everything typed before the cursor.
            yield Completion(match.ljust(document.cursor_position), start_position=-document.cursor_position)

In [None]:
# Use the InvertedIndex instance built earlier in the notebook; the name `ii`
# used here previously is not defined by any cell.
user_input = prompt("> ", completer=SearchCompleter(inverted_index), complete_while_typing=False)