In [15]:
import numpy as np
import fasttext

from nltk import WordPunctTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import KeyedVectors

  return f(*args, **kwds)


In [13]:
def _make_intent(question, paraphrases, label):
    """Build one intent record.

    The record holds the canonical question, its paraphrases, a zero-argument
    ``action`` callable that returns ``label``, and an ``embedding`` slot that
    starts out as ``None``.
    """
    return {
        'question': question,
        'paraphrases': paraphrases,
        'action': lambda: label,
        'embedding': None,
    }


# Intent table: each entry pairs a canonical question (plus paraphrases) with
# the action to run when a user query matches it.
questions = [
    _make_intent(
        'What is the weather today?',
        [
            "I want to know the today's weather!",
            'Tell me the weather for today, please.',
        ],
        'weather',
    ),
    _make_intent(
        'How old are you?',
        [
            'What is your age?',
            'Tell me your age, please.',
        ],
        'age',
    ),
    _make_intent(
        'Hello, I want to change my phone tariff',
        [
            'My phone tariff is not good enough for me',
            "I don't like my current tariff",
        ],
        'tariff',
    ),
    _make_intent(
        'Hi, I want to order a restaurant',
        [
            'Hello, I want to order a taxi',
            'I would want to book a room',
        ],
        'ordering',
    ),
]

In [9]:
# File holding the pre-trained word vectors, read via the word2vec-format loader.
WORD_VECTORS_PATH = 'wiki-news-300d-1M.vec'
model = KeyedVectors.load_word2vec_format(WORD_VECTORS_PATH)

Exception: fastText: Cannot load crawl-300d-2M-subword/crawl-300d-2M-subword.bin due to C++ extension failed to allocate the memory

In [9]:
def tokenize(text):
    """Split ``text`` with NLTK's ``WordPunctTokenizer`` and lower-case every token.

    Returns:
        list[str]: the lower-cased tokens in their original order.
    """
    raw_tokens = WordPunctTokenizer().tokenize(text)
    return [token.lower() for token in raw_tokens]

In [10]:
def preprocess(text):
    """Normalise ``text`` into a single string of lower-cased tokens separated by spaces."""
    tokens = tokenize(text)
    return ' '.join(tokens)

In [16]:
def make_vectorizer(questions):
    """Fit a TF-IDF vectorizer on every known phrasing.

    The corpus is built from each entry's ``'question'`` followed by its
    ``'paraphrases'``, in the order the entries appear in ``questions``.

    Returns:
        The fitted ``TfidfVectorizer`` (English stop words, ``tokenize`` as tokenizer).
    """
    corpus = []
    for entry in questions:
        corpus.append(entry['question'])
        corpus.extend(entry['paraphrases'])
    return TfidfVectorizer(stop_words='english', tokenizer=tokenize).fit(corpus)

In [17]:
# Fit the TF-IDF vectorizer on every question and paraphrase listed in `questions`.
vectorizer = make_vectorizer(questions)

In [18]:
def transform(vectorizer, text):
    """Vectorise a single text with a fitted vectorizer.

    ``text`` is run through ``preprocess`` first; the sparse result for that
    one document is densified and returned as a plain NumPy array.
    """
    cleaned = preprocess(text)
    sparse_row = vectorizer.transform([cleaned])
    dense_row = sparse_row.todense()
    return np.array(dense_row)

In [19]:
def get_word_weights(vectorizer, text):
    """Return one averaging weight per whitespace-separated token of ``text``.

    Every token gets the same weight ``1 / n_tokens``, so the weights sum to 1
    and a weighted sum of word vectors becomes a plain mean.

    Args:
        vectorizer: Accepted for interface compatibility; not used, because
            the weighting is uniform rather than TF-IDF based.
        text: Already-preprocessed text whose tokens are separated by spaces.

    Returns:
        np.ndarray: 1-D float array with one entry per token. It is empty when
        ``text`` contains no tokens.
    """
    n_tokens = len(text.split())
    if n_tokens == 0:
        # No tokens: return an empty weight vector instead of dividing by zero.
        return np.array([])
    return np.full(n_tokens, 1 / n_tokens)

In [20]:
# Sanity check: the weights are uniform, one entry per token of the preprocessed sentence.
get_word_weights(vectorizer, preprocess('Hello, I want to change my phone'))

array([0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])

In [47]:
def get_embedding(vectorizer, model, text):
    """Embed ``text`` as the weighted sum of its word vectors.

    The text is preprocessed, tokens missing from ``model`` are skipped, and
    the remaining word vectors are combined using the weights returned by
    ``get_word_weights``.

    Args:
        vectorizer: Fitted vectorizer, forwarded to ``get_word_weights``.
        model: Word-vector model supporting ``word in model``,
            ``model.get_vector(word)`` and ``model.vector_size``.
        text: Raw input sentence.

    Returns:
        np.ndarray: array of shape ``(1, dim)``. It is all zeros when no token
        of ``text`` is present in ``model`` (including empty input).
    """
    words = preprocess(text).split()
    # ``model.get_vector`` raises KeyError for unknown words, so drop
    # out-of-vocabulary tokens instead of failing on free-form user input.
    known_words = [word for word in words if word in model]
    if not known_words:
        # Nothing to average: return a zero row so callers still get a (1, dim) array.
        return np.zeros((1, model.vector_size))
    # Weights are computed over the surviving tokens so their count matches
    # the number of stacked word vectors.
    weights = get_word_weights(vectorizer, ' '.join(known_words))
    embeddings = np.array([model.get_vector(word) for word in known_words])
    return np.matmul(embeddings.T, weights).reshape(1, -1)

In [48]:
# Embed three probe sentences for the similarity comparison.
vec1, vec2, vec3 = [
    get_embedding(vectorizer, model, sentence)
    for sentence in (
        'Hello I want to change my phone',
        'How can I change my phone',
        'I want to break free',
    )
]

In [49]:
from sklearn.metrics.pairwise import cosine_similarity

In [54]:
# Cosine similarity between the embeddings stored in vec1 and vec2.
cosine_similarity(vec1,  vec2)

array([[0.93526925]])

In [55]:
# Cosine similarity between the embeddings stored in vec1 and vec3.
cosine_similarity(vec1,  vec3)

array([[0.8769267]])

In [66]:
def get_top_answer(vectorizer, model, questions, input_question):
    """Return the action of the known question that best matches ``input_question``.

    Each entry of ``questions`` is scored by the highest cosine similarity
    between the embedding of ``input_question`` and the embeddings of the
    entry's canonical question and its paraphrases. The first entry with the
    highest score wins.

    Args:
        vectorizer: Fitted vectorizer, forwarded to ``get_embedding``.
        model: Word-vector model, forwarded to ``get_embedding``.
        questions: Sequence of dicts with keys ``'question'``, ``'action'``
            and optionally ``'paraphrases'``.
        input_question: Raw user query.

    Returns:
        The ``'action'`` value of the best-matching entry.

    Raises:
        ValueError: if ``questions`` is empty.
    """
    if len(questions) == 0:
        raise ValueError('questions must contain at least one entry')

    emb = get_embedding(vectorizer, model, input_question)
    scores = []
    for question in questions:
        # Treat the canonical question and its paraphrases uniformly; an entry
        # with no paraphrases is scored on the canonical question alone.
        candidates = [question['question'], *(question.get('paraphrases') or ())]
        # cosine_similarity returns a 1x1 array; reduce it to a plain float so
        # every score has the same scalar shape.
        scores.append(max(
            float(cosine_similarity(emb, get_embedding(vectorizer, model, candidate))[0, 0])
            for candidate in candidates
        ))
    return questions[int(np.argmax(scores))]['action']

In [69]:
# get_top_answer returns the matched entry's action callable; call it to get the label.
get_top_answer(vectorizer, model, questions, 'Hi, could you tell me the weather forecast for today?')()

'weather'

In [70]:
# Query sharing no exact wording with the stored age questions.
get_top_answer(vectorizer, model, questions, 'Are you a young person?')()

'age'

In [72]:
# Longer, multi-sentence query; the trailing () invokes the returned action.
get_top_answer(vectorizer, model, questions, 'Hello, are you a phone company? Just want to change my current services')()

'tariff'