In [89]:
# Import the required libraries to perform word2vec encodings
from gensim.models import Word2Vec
from gensim.models import Phrases
import multiprocessing

In [90]:
# Load the datasets 
# Taken From: 
# https://sites.google.com/site/daehpark/Resources/data-set-for-query-auto-completion-sigir-2017 

# @inproceedings{park2017neural,   
#                title={A neural language model for query auto-completion},
#                author={Park, Dae Hoon and Chiba, Rikio},
#                booktitle={Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval},
#                pages={1189--1192},
#                year={2017},
#                organization={ACM} }

def load_dataset(f: str, sep: str='\t') -> dict:
    """Loads the dataset from a file into memory.

    Each usable line holds at least two ``sep``-delimited fields: the partial
    query (prefix) first and the full query second. Any further fields are
    ignored. Lines with fewer than two fields (for example blank lines) are
    skipped instead of aborting the whole load.

    Args:
        f (str): the file to load the dataset from
        sep (str, optional): the separator to use to delimit the file. Defaults to '\\t'.

    Returns:
        dict: maps every full query to the list of its prefixes, in file order
            (duplicates are kept).
    """
    data = {}  # full query -> list of prefixes
    # Explicit encoding so decoding does not depend on the platform's locale default.
    with open(f, 'r', encoding='utf-8') as handle:
        # Stream the file line by line rather than materialising it with readlines().
        for line in handle:
            fields = line.split(sep)
            if len(fields) < 2:
                # No separator on this line (blank or malformed): nothing to record.
                continue
            partial, full = fields[0].strip(), fields[1].strip()
            data.setdefault(full, []).append(partial)
    return data

# Read the three QAC splits; each becomes a dict keyed by full query.
qac_train = load_dataset('qac_data/qac_training.tsv')
qac_val = load_dataset('qac_data/qac_validation.tsv')
qac_test = load_dataset('qac_data/qac_test.tsv')

# Report the number of distinct full queries held by each split.
print(
    'Lengths:',
    '\ntrain', len(qac_train),
    '\nvalidation', len(qac_val),
    '\ntest', len(qac_test),
)

Lengths: 
train 5121 
validation 2169 
test 2227


In [None]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

# Leave one core free for the rest of the system, but never ask for fewer than
# one worker: on a single-core machine `cores - 1` is 0, which would leave
# gensim without any training thread.
w2v_model_size100 = Word2Vec(vector_size=100, min_count=1, workers=max(1, cores - 1))  # create a word2vec model with vector size 100

# One training "sentence" per full query: its prefixes, then its individual
# space-separated words, then the whole query as a single token. Keeping the
# whole query as a token is what lets full queries be looked up in the model.
vocab = [prefixes + query.strip().split(' ') + [query] for query, prefixes in qac_train.items()]

w2v_model_size100.build_vocab(vocab)  # build the vocab for the number of sentences
w2v_model_size100.train(vocab, total_examples=len(vocab), epochs=100, report_delay=1)  # train the word2vec model


(9515382, 9698800)

In [314]:
# Now using the word2vec model we can predict autocompletion of words in the vocab
def predict(partial_q: str, model, context:str=None, k:int=5, candidates=None) -> list:
    """Rank full queries by embedding similarity to a partial query.

    Args:
        partial_q (str): the partial query to complete.
        model: a trained model whose ``wv`` attribute supports ``wv[key]``
            lookup and ``wv.similarity(key_a, key_b)``.
        context (str, optional): extra text tried directly before and directly
            after ``partial_q`` (plain concatenation, no space inserted). If a
            combined form is a known key it replaces ``partial_q`` as the
            lookup key; when both forms are known, ``partial_q + context``
            wins. Defaults to None (no context).
        k (int, optional): maximum number of results; None returns every
            candidate. Defaults to 5.
        candidates (iterable of str, optional): the full queries to score.
            Defaults to the keys of the module-level ``qac_train``.

    Returns:
        list: ``(full_query, similarity)`` tuples sorted by descending
            similarity, truncated to ``k`` entries.

    Raises:
        KeyError: propagated from ``model.wv.similarity`` when the lookup key
            or a candidate is not known to the model.
    """
    if candidates is None:
        candidates = qac_train

    # Resolve the lookup key once: it does not depend on the candidate being scored.
    contexted_q = partial_q
    if context is not None:
        for combined in (context + partial_q, partial_q + context):
            try:
                model.wv[combined]
            except KeyError:
                # Unknown combination: keep whatever key was already selected
                # (a failed second lookup must not discard a successful first one).
                continue
            contexted_q = combined

    results = [
        (sentence, model.wv.similarity(sentence, contexted_q))
        for sentence in candidates
    ]

    ranked = sorted(results, key=lambda item: item[1], reverse=True)
    # Slicing with k=None keeps the full ranking.
    return ranked[:k]

# Example: rank the training queries against the partial query 'hotel' (default k=5, no context).
predict('hotel', w2v_model_size100)

[('ritz hotel', 0.9652626),
 ('straithaven hotel', 0.9553758),
 ('iriquois hotel', 0.95321584),
 ('hilton hotel kingston jamaica', 0.94994247),
 ('hotel 71 chicago il', 0.9481164)]