#### Load required libs

In [1]:
import json

import nltk

from scipy.sparse import coo_matrix
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

### Load data

Download the data from https://github.com/clinc/oos-eval/

In [2]:
# Read the dataset JSON; the context manager closes the file handle as soon
# as parsing finishes instead of leaving it open for the kernel's lifetime.
with open('../data/data_small.json', encoding='utf-8') as f:
    data = json.load(f)

print(data.keys())

dict_keys(['oos_val', 'val', 'train', 'oos_test', 'test', 'oos_train'])


In [3]:
data

   'oos'],
  ['a show on broadway', 'oos'],
  ['who has the best record in the nfl', 'oos'],
  ['how do i find the area of a circle', 'oos'],
  ['how many onions do i have on hand', 'oos'],
  ['what is the name of the 13th president', 'oos'],
  ['show me recent activity in my backyard', 'oos'],
  ['how long will it take me to pay off my card if i pay an extra $50 a month over the minimum',
   'oos'],
  ['does our bank have free notary', 'oos'],
  ['what were the top stories this week', 'oos'],
  ['can i mix antifreeze with water', 'oos'],
  ['are any earning reports due', 'oos'],
  ['show me the channel guide', 'oos'],
  ['is the pnc bank open', 'oos'],
  ['where does the power steering fluid go', 'oos'],
  ['when was the last time the president visited floridau', 'oos'],
  ['how can i keep my windshield from fogging up in winter', 'oos'],
  ['how do i compute the median of a set of numbers', 'oos'],
  ['what is happening with brexit right nowu', 'oos'],
  ['are there any new companies

In [4]:
len(data['train'])

7500

In [6]:
def get_dataset(data, dataset='train'):
    """Split one partition of ``data`` into parallel input and label lists.

    Args:
        data: Mapping from partition name to a sequence of samples, where
            each sample is indexable and holds the input at position 0 and
            the label at position 1.
        dataset: Key of the partition to extract (defaults to ``'train'``).

    Returns:
        A tuple ``(X, Y)`` of two lists: every sample's element 0 and every
        sample's element 1, in the original order.
    """
    # Single pass over the partition, reading both fields of each sample.
    pairs = [(sample[0], sample[1]) for sample in data[dataset]]

    inputs = [pair[0] for pair in pairs]
    labels = [pair[1] for pair in pairs]
    return inputs, labels

In [7]:
# Partitions to load; the 'oos_*' keys of `data` are not included here.
datasets = ['train', 'val', 'test']

# Raw texts and their labels, each keyed by partition name.
X_raw, Y = {}, {}
for dataset in datasets:
    X_raw[dataset], Y[dataset] = get_dataset(data, dataset=dataset)

### Download Word Embeddings model

In [16]:
import gensim.downloader

print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [20]:
word_embeddings = gensim.downloader.load('word2vec-google-news-300')

In [24]:
word_embeddings['banana']

array([-8.54492188e-02,  4.71191406e-02, -6.93359375e-02,  3.02734375e-01,
       -1.87500000e-01, -3.19824219e-02,  2.95410156e-02, -2.05078125e-01,
       -9.03320312e-02,  2.98828125e-01,  2.10937500e-01, -6.88476562e-02,
        4.76074219e-02,  5.83496094e-02, -2.75390625e-01,  2.42187500e-01,
       -3.22265625e-01,  4.73632812e-02, -1.44531250e-01,  9.37500000e-02,
        6.74438477e-03,  2.17773438e-01,  2.31445312e-01,  1.87500000e-01,
        8.49609375e-02, -5.39550781e-02, -2.65625000e-01,  2.23388672e-02,
        2.24609375e-01,  4.12109375e-01, -2.30712891e-02, -1.67968750e-01,
        1.01928711e-02,  2.10937500e-01,  1.14135742e-02,  2.50000000e-01,
        8.64257812e-02, -2.16796875e-01,  8.93554688e-02,  1.50390625e-01,
       -2.03125000e-01, -2.30468750e-01,  1.42578125e-01,  1.34765625e-01,
       -1.13769531e-01, -1.80664062e-01,  3.58886719e-02, -1.69921875e-01,
        1.55273438e-01,  2.39257812e-01, -2.30468750e-01, -9.96093750e-02,
        1.82617188e-01, -

In [102]:
word_embeddings.most_similar( word_embeddings['banana'] )

[('banana', 0.9999999403953552),
 ('bananas', 0.7523776292800903),
 ('pineapple', 0.6587538123130798),
 ('mango', 0.6365211009979248),
 ('pineapples', 0.6313878297805786),
 ('papaya', 0.6063666343688965),
 ('coconut', 0.6035483479499817),
 ('potato', 0.5747568607330322),
 ('melon', 0.5625775456428528),
 ('cashew', 0.562165379524231)]

In [48]:
word_embeddings['banana'].shape

(300,)

In [103]:
word_embeddings.most_similar(word_embeddings['king'])

[('king', 1.0),
 ('kings', 0.7138045430183411),
 ('queen', 0.6510957479476929),
 ('monarch', 0.6413194537162781),
 ('crown_prince', 0.6204219460487366),
 ('prince', 0.6159993410110474),
 ('sultan', 0.5864824056625366),
 ('ruler', 0.5797566771507263),
 ('princes', 0.5646551847457886),
 ('Prince_Paras', 0.5432944297790527)]

In [93]:
word_embeddings.most_similar(word_embeddings['king']-word_embeddings['man'])

[('king', 0.7256942391395569),
 ('kings', 0.506108820438385),
 ('queen', 0.4603358507156372),
 ('monarch', 0.43495795130729675),
 ('Pansy_Ho_Chiu', 0.4200039803981781),
 ('princes', 0.41165217757225037),
 ('kingdom', 0.4114915728569031),
 ('Savory_aromas_wafted', 0.40622571110725403),
 ('crown_prince', 0.4018521010875702),
 ('sultan', 0.39996951818466187)]

In [104]:
word_embeddings.most_similar(word_embeddings['king']-word_embeddings['man']+word_embeddings['woman'])

[('king', 0.8449392318725586),
 ('queen', 0.7300518155097961),
 ('monarch', 0.645466148853302),
 ('princess', 0.6156251430511475),
 ('crown_prince', 0.5818676948547363),
 ('prince', 0.5777117609977722),
 ('kings', 0.5613664388656616),
 ('sultan', 0.5376776456832886),
 ('Queen_Consort', 0.5344247221946716),
 ('queens', 0.5289887189865112)]

### Convert raw texts to Averaged Word Embeddings

In [66]:
def raw2embeddings(texts, word_embeddings, tokenizer=None):
    """Represent each text as the average of its tokens' word embeddings.

    Args:
        texts: Iterable of raw text strings.
        word_embeddings: Object supporting ``token in word_embeddings`` and
            ``word_embeddings[token]`` (returning a 1-D NumPy vector), such
            as the model returned by ``gensim.downloader.load``. It must
            contain the token ``'the'``, which is looked up once to find the
            embedding dimension and dtype.
        tokenizer: Optional callable mapping a string to a list of tokens.
            Defaults to ``nltk.word_tokenize``.

    Returns:
        A list with one 1-D array per input text. Tokens missing from
        ``word_embeddings`` (out-of-vocabulary, OOV) are ignored; a text
        with no known token at all is represented by a zero vector.
    """
    if tokenizer is None:
        tokenizer = nltk.word_tokenize

    # Probe a common word to learn the vector size and dtype.
    reference = word_embeddings['the']
    dim, = reference.shape

    X = []
    for text in texts:
        vectors = [
            word_embeddings[token]
            for token in tokenizer(text)
            if token in word_embeddings  # ignoring OOV words
        ]

        if vectors:
            X.append(np.mean(vectors, axis=0))
        else:
            # Use the embeddings' dtype: a default float64 zero vector would
            # make the returned list mix dtypes.
            X.append(np.zeros(dim, dtype=reference.dtype))

    return X

In [67]:
# Averaged-embedding features, keyed by partition name.
X = {}

for dataset in datasets:
    print(dataset)  # show which partition is currently being embedded
    X[dataset] = raw2embeddings(texts=X_raw[dataset], word_embeddings=word_embeddings)

train
val
test


In [68]:
X_raw['train'][:10]

['can you walk me through setting up direct deposits to my bank of internet savings account',
 'i want to switch to direct deposit',
 'set up direct deposit for me',
 'how do i go about setting up direct deposit',
 'i need to get my paycheck direct deposited to my chase account',
 'what are the steps to set up direct deposit to my chase account',
 'if i would like to set up direct deposit, how do i do it',
 'how do i direct deposit my check',
 'what do i need to set up direct deposit',
 "i'd like to have my paycheck direct deposited to my chase account"]

In [69]:
X['train'][:10]

[array([ 0.0762855 ,  0.00645011,  0.03146798,  0.14508057, -0.03478241,
         0.00640869, -0.00864083, -0.09283011,  0.11404419,  0.08845956,
        -0.03682382, -0.07444109, -0.04024179,  0.01337542, -0.08944048,
         0.09496199,  0.10791349,  0.08383615,  0.03735024,  0.00890677,
         0.04986572,  0.04255022,  0.02531324,  0.05437905,  0.03842817,
        -0.02931431, -0.1288681 ,  0.04341779,  0.02807508, -0.05023193,
        -0.01401229, -0.03381784, -0.00138991, -0.05483573, -0.08256994,
        -0.12168666,  0.03402492, -0.09135219,  0.05735125,  0.11483329,
         0.06088802, -0.06535994,  0.14005825, -0.04929679, -0.01839338,
        -0.09693255, -0.00856236,  0.06907   , -0.04956273, -0.06101336,
         0.04177856,  0.04563795,  0.00173514,  0.01769911,  0.0176893 ,
         0.01819611, -0.11019353, -0.04416329,  0.00554548, -0.10962786,
        -0.07200841, -0.05707877, -0.01511492,  0.02324568, -0.03879656,
        -0.02351597, -0.12614223,  0.08545794,  0.0

## Train and evaluate classifiers

In [50]:
def train_and_evaluate_classifier(clf, X_train, Y_train, X_test, Y_test):
    """Fit ``clf`` on the training split and print its test accuracy.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place.
        X_train: Training features passed to ``clf.fit``.
        Y_train: Training labels passed to ``clf.fit``.
        X_test: Features to predict on.
        Y_test: Reference labels compared against the predictions.

    Returns:
        None. The accuracy is printed as ``Accuracy: <value>``.
    """
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_test)

    # The former `predict_proba(...)[:, 1]` call was dropped: its result was
    # never used, and it failed for estimators without `predict_proba`.
    print(f'Accuracy: {accuracy_score(Y_test, y_pred)}')

In [70]:
train_and_evaluate_classifier( LogisticRegression(random_state=0), X['train'], Y['train'], X['test'], Y['test'])

Accuracy: 0.8755555555555555


In [105]:
train_and_evaluate_classifier( MLPClassifier(hidden_layer_sizes=(200,) ,random_state=1, max_iter=300, early_stopping=True, verbose=True), X['train'], Y['train'], X['test'], Y['test'])

Iteration 1, loss = 4.93996351
Validation score: 0.138667
Iteration 2, loss = 4.66393496
Validation score: 0.374667
Iteration 3, loss = 4.20909317
Validation score: 0.481333
Iteration 4, loss = 3.60805570
Validation score: 0.544000
Iteration 5, loss = 2.99217190
Validation score: 0.622667
Iteration 6, loss = 2.47321776
Validation score: 0.654667
Iteration 7, loss = 2.06862703
Validation score: 0.701333
Iteration 8, loss = 1.74504226
Validation score: 0.725333
Iteration 9, loss = 1.49439289
Validation score: 0.757333
Iteration 10, loss = 1.29347024
Validation score: 0.778667
Iteration 11, loss = 1.13718279
Validation score: 0.789333
Iteration 12, loss = 1.00602319
Validation score: 0.806667
Iteration 13, loss = 0.90090454
Validation score: 0.808000
Iteration 14, loss = 0.81585263
Validation score: 0.824000
Iteration 15, loss = 0.74218524
Validation score: 0.825333
Iteration 16, loss = 0.68138411
Validation score: 0.834667
Iteration 17, loss = 0.62580209
Validation score: 0.849333
Iterat

### Train and classify new samples

In [72]:
clf = MLPClassifier(random_state=1, max_iter=300, early_stopping=True, verbose=True)

In [73]:
clf.fit( X['train'], Y['train'] )

Iteration 1, loss = 4.97279776
Validation score: 0.060000
Iteration 2, loss = 4.81785782
Validation score: 0.169333
Iteration 3, loss = 4.57240047
Validation score: 0.346667
Iteration 4, loss = 4.22873471
Validation score: 0.452000
Iteration 5, loss = 3.79902547
Validation score: 0.514667
Iteration 6, loss = 3.33886790
Validation score: 0.594667
Iteration 7, loss = 2.91442065
Validation score: 0.652000
Iteration 8, loss = 2.54782953
Validation score: 0.693333
Iteration 9, loss = 2.24328576
Validation score: 0.710667
Iteration 10, loss = 1.98929633
Validation score: 0.733333
Iteration 11, loss = 1.77592341
Validation score: 0.742667
Iteration 12, loss = 1.60141193
Validation score: 0.769333
Iteration 13, loss = 1.44997203
Validation score: 0.778667
Iteration 14, loss = 1.32148081
Validation score: 0.788000
Iteration 15, loss = 1.20929102
Validation score: 0.810667
Iteration 16, loss = 1.11428871
Validation score: 0.804000
Iteration 17, loss = 1.03130386
Validation score: 0.817333
Iterat

In [75]:
data['test'][:10]

[['how would you say fly in italian', 'translate'],
 ["what's the spanish word for pasta", 'translate'],
 ['how would they say butter in zambia', 'translate'],
 ['how do you say fast in spanish', 'translate'],
 ["what's the word for trees in norway", 'translate'],
 ['how does one say wonderful in german', 'translate'],
 ['how do they say tacos in mexico', 'translate'],
 ['how would one say cruiser in china', 'translate'],
 ["what's the french word you use for potato", 'translate'],
 ['what would the word for grass be in finland', 'translate']]

In [79]:
# Alternative queries to try:
#text = 'Could you bring me more info about direct deposits?'
#text = 'how would you say fly in italian'
text = 'tell me about chatgpt'

# Embed the query once and reuse it for both the predicted label and the
# class probabilities, instead of tokenizing and averaging it twice.
text_embedding = raw2embeddings([text], word_embeddings)

print(clf.predict(text_embedding))

# Ten most probable classes for the query, highest probability first.
sorted(zip(clf.classes_, clf.predict_proba(text_embedding)[0]), key=lambda x: x[1], reverse=True)[:10]

['fun_fact']


[('fun_fact', 0.6325321387029152),
 ('tell_joke', 0.3056758032100535),
 ('user_name', 0.040785019882282794),
 ('reminder', 0.0056939171707753354),
 ('balance', 0.0026525470661655625),
 ('reminder_update', 0.0023115669618345583),
 ('meaning_of_life', 0.0017456982759377201),
 ('income', 0.0014962225470690979),
 ('what_can_i_ask_you', 0.0010911273098250657),
 ('bill_balance', 0.0008358374635497117)]