In [46]:
# Import libraries
import numpy as np

In [None]:
# Load the datasets 
# Taken From: 
# https://sites.google.com/site/daehpark/Resources/data-set-for-query-auto-completion-sigir-2017 

# @inproceedings{park2017neural,   
#                title={A neural language model for query auto-completion},
#                author={Park, Dae Hoon and Chiba, Rikio},
#                booktitle={Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval},
#                pages={1189--1192},
#                year={2017},
#                organization={ACM} }

def load_dataset(f: str, sep: str='\t') -> dict: 
    """Loads the QAC dataset from a delimited text file into memory.

    Each line is expected to hold a partial query in the first column and the
    full query it belongs to in the second column; any further columns are
    ignored. Lines with fewer than two columns (for example blank lines) are
    skipped instead of aborting the load.

    Args:
        f (str): path of the file to load the dataset from
        sep (str, optional): the separator used to delimit the columns. Defaults to a tab character.

    Returns:
        dict: maps every full query to the list of its partial queries, in file order.
    """
    data = {}  # full query -> list of partial queries (prefixes)
    # Explicit encoding so the result does not depend on the platform locale;
    # a distinct handle name keeps the path argument `f` from being rebound.
    with open(f, 'r', encoding='utf-8') as handle:
        # Iterate lazily rather than materialising the whole file with readlines().
        for line in handle:
            columns = line.split(sep)
            if len(columns) < 2:
                # Blank or malformed line: there is no full query to attach a prefix to.
                continue
            partial, full = columns[0].strip(), columns[1].strip()
            data.setdefault(full, []).append(partial)
    return data

# Locations of the three QAC splits, relative to the notebook's working directory.
train_path = 'qac_data/qac_training.tsv'
val_path = 'qac_data/qac_validation.tsv'
test_path = 'qac_data/qac_test.tsv'

# Each split is loaded into its own {full query: [partial queries]} dictionary.
qac_train = load_dataset(train_path)
qac_val = load_dataset(val_path)
qac_test = load_dataset(test_path)

# Report how many distinct full queries each split contains.
split_summary = (
    'Lengths:',
    '\ntrain', len(qac_train),
    '\nvalidation', len(qac_val),
    '\ntest', len(qac_test),
)
print(*split_summary)

Lengths: 
train 5121 
validation 2169 
test 2227


In [None]:
# Use bag of words to retrieve queries 
# For the bag of words model all the full queries are our corpus 
# Therefore we need to compute the similarity between the words in the partial query and the full corpus to determine 
# which candidate queries to suggest. 
#   - try without removing stopwords

def generate_bag_of_words(dataset: dict) -> dict: 
    """Builds one bag of words per full query from that query's partial queries.

    Every partial query is split on single spaces and each resulting token is
    counted, so tokens shared by many partial queries of the same full query
    end up with the highest counts.

    Args:
        dataset (dict): maps each full query to the list of its partial queries.

    Returns:
        dict: maps each full query to a {token: count} dictionary. A query
              whose list of partial queries is empty gets no entry.
    """
    bags = {}
    for full_query, prefixes in dataset.items():
        for prefix in prefixes:
            # Created on the first prefix only, so queries without prefixes stay absent.
            counts = bags.setdefault(full_query, {})
            for word in prefix.split(" "):
                counts[word] = counts.get(word, 0) + 1
    return bags

# Build the per-query token counts from the training split and preview the first five entries.
bag_of_words = generate_bag_of_words(qac_train)
bag_preview = list(bag_of_words.items())[:5]
print(bag_preview)

[('unique marine', {'unique': 7, 'm': 1, 'ma': 1, 'mar': 1, 'mari': 1, 'marin': 1, 'marine': 1}), ('hampton hotel york pa', {'hampton': 14, 'h': 1, 'ho': 1, 'hot': 1, 'hote': 1, 'hotel': 9, 'y': 1, 'yo': 1, 'yor': 1, 'york': 4, 'p': 1, 'pa': 1}), ('laura banks', {'laura': 6, 'b': 1, 'ba': 1, 'ban': 1, 'bank': 1, 'banks': 1}), ('lightspeed girls barey 19', {'lightspeed': 15, 'g': 1, 'gi': 1, 'gir': 1, 'girl': 1, 'girls': 10, 'b': 1, 'ba': 1, 'bar': 1, 'bare': 1, 'barey': 4, '1': 1, '19': 1}), ('golden nugget las vegas', {'golden': 17, 'n': 1, 'nu': 1, 'nug': 1, 'nugg': 1, 'nugge': 1, 'nugget': 11, 'l': 1, 'la': 1, 'las': 7, 'v': 1, 've': 1, 'veg': 1, 'vega': 1, 'vegas': 1})]


In [None]:
# Now, given the bag of words of every full query, we predict which full queries a partial query should complete to
def predict(partial_q: str, bag_of_words: dict, context:str=None, k:int=5) -> list:
    """Ranks the known full queries by how well they match a partial query.

    The partial query is turned into tokens (its space-separated words plus
    every individual character, because the bags were built from partial
    queries). Each full query is then scored with the sum of tf * idf over
    those tokens, where tf is the token's share of the query's bag and idf is
    log2(number of queries / number of bags containing the token). A token
    repeated in the token list contributes once per repetition.

    Tokens that appear in no bag (for example the space character of a
    multi-word partial query) contribute nothing to any score.

    Args:
        partial_q (str): the partial query inputted
        bag_of_words (dict): maps each full query to its {token: count} bag
        context (str, optional): text provided before the partial query; its words are added to the tokens. Defaults to None.
        k (int, optional): the number of best results to return; None returns all of them. Defaults to 5.

    Returns:
        list : (query, score) tuples for the k best queries to autocomplete to, highest tf-idf score first.
    """
    # The tokens are the full words of the partial query as well as its
    # individual characters, as the bag of words was generated with partial queries.
    tokens = []
    if context is not None:
        tokens += context.strip().split(" ")
    tokens += partial_q.strip().split(' ') + list(partial_q)

    # The idf of a token does not depend on the query being scored, so compute
    # it once per distinct token instead of once per (query, token) pair.
    n_queries = len(bag_of_words)
    idf = {}
    for token in set(tokens):
        doc_freq = sum(1 for bag in bag_of_words.values() if token in bag)
        # A token found in no bag has tf == 0 everywhere; give it a zero idf
        # rather than dividing by zero.
        idf[token] = np.log2(n_queries / doc_freq) if doc_freq else np.float64(0.0)

    scores = {}
    for query, bag in bag_of_words.items():
        tokens_in_query = sum(bag.values())
        score = np.float64(0.0)
        for token in tokens:
            count = bag.get(token, 0)
            if count:
                score += (count / tokens_in_query) * idf[token]
        scores[query] = score

    # Highest score first; the sort is stable, so ties keep the order of bag_of_words.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    if k is not None:
        return ranked[:k]
    return ranked

# Example: the five best completions for the single-word partial query "hotel".
predict(partial_q="hotel", bag_of_words=bag_of_words)

[('hotel jobs', np.float64(3.9960704593355154)),
 ('hotel descondido', np.float64(3.7677235759449146)),
 ('hotel kranenturm', np.float64(3.7677235759449146)),
 ('hotel l europe amsterdam', np.float64(3.3754226184472573)),
 ('h l h o h s d j', np.float64(3.1822798397639285))]