In [6]:
# Import required libraries
import numpy as np


In [7]:
# Load the datasets 
# Taken From: 
# https://sites.google.com/site/daehpark/Resources/data-set-for-query-auto-completion-sigir-2017 

# @inproceedings{park2017neural,   
#                title={A neural language model for query auto-completion},
#                author={Park, Dae Hoon and Chiba, Rikio},
#                booktitle={Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval},
#                pages={1189--1192},
#                year={2017},
#                organization={ACM} }

def load_dataset(f: str, sep: str='\t') -> dict:
    """Loads the dataset from a file into memory.

    Each line is expected to hold a partial query (prefix) followed by the
    full query, delimited by ``sep``; any further fields on the line are
    ignored. Blank lines and lines with fewer than two fields are skipped
    instead of raising ``IndexError``.

    Args:
        f (str): the file to load the dataset from
        sep (str, optional): the separator used to delimit the file. Defaults to '\\t'.

    Returns:
        dict: maps every full query to the list of prefixes recorded for it,
            in file order (duplicate prefixes are kept).
    """
    data = {}  # full query -> list of prefixes
    # A distinct name for the handle keeps the path argument `f` from being shadowed.
    # The encoding is pinned so the result does not depend on the platform default.
    with open(f, 'r', encoding='utf-8') as handle:
        # Iterate the handle lazily rather than materialising every line with readlines().
        for line in handle:
            item = line.split(sep)
            if len(item) < 2:
                # Blank or malformed line: there is no (prefix, full query) pair to record.
                continue
            partial, full = item[0].strip(), item[1].strip()
            data.setdefault(full, []).append(partial)
    return data

# Read the three dataset splits; the paths are relative to the working directory.
qac_train = load_dataset('qac_data/qac_training.tsv')
qac_val = load_dataset('qac_data/qac_validation.tsv')
qac_test = load_dataset('qac_data/qac_test.tsv')

# Report how many distinct full queries each split contains.
print(
    'Lengths:',
    '\ntrain', len(qac_train),
    '\nvalidation', len(qac_val),
    '\ntest', len(qac_test),
)

Lengths: 
train 5121 
validation 2169 
test 2227


In [8]:
# Create X,y to use for training the LGBM model
# X holds, for every full query, the distinct tokens found in any of its
# prefixes; y holds the full query itself (same order as X).
X = []
y = []
for key, prefixes in qac_train.items():
    to_add = set()
    for prefix in prefixes:
        # Update in place instead of rebuilding the set for every prefix.
        to_add.update(prefix.split(' '))
    # Sort the tokens: iterating a set of strings follows hash order, which
    # changes between interpreter runs, so an unsorted list would make the
    # corpus indices below differ from one kernel restart to the next.
    X.append(sorted(to_add))
    y.append(key)

# Generate the corpus to use: token -> column index, assigned in first-seen order
corpus = {}
i = 0
for row in X:
    for element in row:
        if element not in corpus:
            corpus[element] = i
            i += 1

# Encode X as a one hot vector
def encode_X(phi: list, corpus: dict) -> np.ndarray:
    """Encode token lists as a binary bag-of-words matrix.

    Args:
        phi (list): one iterable of tokens per sample.
        corpus (dict): maps each known token to its column index; indices
            are expected to be valid for a row of length ``len(corpus)``.

    Returns:
        np.ndarray: integer array of shape ``(len(phi), len(corpus))`` in
            which entry ``[r, c]`` is 1 when sample ``r`` contains the token
            mapped to column ``c`` and 0 otherwise. Tokens that are not in
            ``corpus`` are ignored. An empty ``phi`` yields a ``(0, len(corpus))``
            array.
    """
    # Allocate the result once instead of building a Python list of lists and
    # converting it afterwards; this also keeps the 2-D shape for empty input.
    one_hot_X = np.zeros((len(phi), len(corpus)), dtype=int)
    for r, row in enumerate(phi):
        for element in row:
            col = corpus.get(element)
            if col is not None:  # unknown tokens have no column
                one_hot_X[r, col] = 1
    return one_hot_X

# One-hot encode the training token lists and show the matrix dimensions
one_hot_X = encode_X(X, corpus)
print(one_hot_X.shape)

# Encode y as a column vector (one label per row)
y = np.array(y).reshape(-1, 1)
print(y.shape)


(5121, 21371)
(5121, 1)


In [11]:
# Now to predict just create a one hot vector of the elements
def predict(partial_q: str, corpus: dict, one_hot_X: np.ndarray, y: np.ndarray, context:str=None, k:int=5) -> list:
    """Rank the stored queries by token overlap with a partial query.

    The partial query (and the optional context) is split on single spaces,
    one-hot encoded with ``encode_X`` and scored against every row of
    ``one_hot_X`` with a dot product, i.e. the number of distinct known
    tokens the row shares with the input.

    Args:
        partial_q (str): the partially typed query.
        corpus (dict): token -> column index mapping used to build ``one_hot_X``.
        one_hot_X (np.ndarray): one encoded row per stored query.
        y (np.ndarray): labels aligned with the rows of ``one_hot_X``; the
            first element of each entry is used as the label.
        context (str, optional): extra text whose tokens are added to those
            of ``partial_q``. Defaults to None.
        k (int, optional): number of results to return; ``None`` returns
            every candidate. Defaults to 5.

    Returns:
        list: ``(label, score)`` tuples ordered by descending score; ties keep
            the row order of ``one_hot_X``.
    """
    # Tokenise with split(' '), the same way the training prefixes were split
    # when the corpus was built, so both sides produce the same tokens.
    tokens = []
    if context is not None:
        tokens += context.strip().split(' ')
    tokens += partial_q.strip().split(' ')

    one_hot_partial = encode_X([tokens], corpus)[0]

    if len(one_hot_X) == 0:
        # Nothing to score against.
        return []

    # A single matrix-vector product scores every row at once, replacing a
    # Python-level loop that called np.dot once per row.
    scores = np.dot(one_hot_X, one_hot_partial)
    results = [(label[0], score) for label, score in zip(y, scores)]

    # list.sort is stable, so equal scores keep their original row order.
    results.sort(key=lambda item: item[1], reverse=True)
    return results if k is None else results[:k]
    

# Example: the 10 best-scoring training queries for the partial query "hotel"
predict("hotel", corpus, one_hot_X, y, k=10)

[('hampton hotel york pa', 1),
 ('hotel auxiliary aid dwarf', 1),
 ('lord nelson hotel halifax', 1),
 ('ritz hotel', 1),
 ('loew s hotels', 1),
 ('marriott hotels', 1),
 ('altanyic city nj hotels', 1),
 ('hotel 71 chicago il', 1),
 ('las vegas scheduled boxing dates orleans casino hotel', 1),
 ('cheap hotels los angeles', 1)]