#### Load required libs

In [68]:
import json

import nltk

from scipy.sparse import coo_matrix
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

### Load data

Download the data from https://github.com/clinc/oos-eval/ (this notebook reads `data_small.json`, expected at `../data/data_small.json`).

In [7]:
# Load the dataset JSON. The context manager guarantees the file handle is
# closed even if parsing fails; the encoding is pinned so the result does not
# depend on the platform's default locale.
with open('../data/data_small.json', encoding='utf-8') as f:
    data = json.load(f)

# The top-level keys are the available splits.
print(data.keys())

dict_keys(['oos_val', 'val', 'train', 'oos_test', 'test', 'oos_train'])


In [9]:
# Peek at the first ten training samples; each one is a [text, intent_label] pair.
data['train'][:10]

[['can you walk me through setting up direct deposits to my bank of internet savings account',
  'direct_deposit'],
 ['i want to switch to direct deposit', 'direct_deposit'],
 ['set up direct deposit for me', 'direct_deposit'],
 ['how do i go about setting up direct deposit', 'direct_deposit'],
 ['i need to get my paycheck direct deposited to my chase account',
  'direct_deposit'],
 ['what are the steps to set up direct deposit to my chase account',
  'direct_deposit'],
 ['if i would like to set up direct deposit, how do i do it',
  'direct_deposit'],
 ['how do i direct deposit my check', 'direct_deposit'],
 ['what do i need to set up direct deposit', 'direct_deposit'],
 ["i'd like to have my paycheck direct deposited to my chase account",
  'direct_deposit']]

In [10]:
def get_dataset(data, dataset='train'):
    """Split one partition of the corpus into parallel text and label lists.

    Args:
        data: mapping from split name to a sequence of samples, where each
            sample is indexable and holds the text at position 0 and the
            label at position 1.
        dataset: name of the split to extract (key of ``data``).

    Returns:
        A tuple ``(X, Y)`` of two lists of equal length: the texts and the
        corresponding labels, in the order they appear in ``data[dataset]``.
    """
    # Walk the split once, keeping only the two fields that are used.
    pairs = [(sample[0], sample[1]) for sample in data[dataset]]

    texts = [text for text, _ in pairs]
    labels = [label for _, label in pairs]
    return texts, labels

In [33]:
# Only the in-scope splits are used; the 'oos_*' splits of the file are ignored.
datasets = ['train', 'val', 'test']

# Per-split raw texts and their intent labels, keyed by split name.
X_raw, Y = {}, {}

for dataset in datasets:
    texts, labels = get_dataset(data, dataset)
    X_raw[dataset] = texts
    Y[dataset] = labels

### Build vocabulary

In [17]:
# Token -> integer ID, built from the training split only.
vocabulary = {}

for text in X_raw['train']:
    for token in nltk.word_tokenize(text):
        # setdefault inserts only tokens not seen before. len(vocabulary) is
        # evaluated before the insertion, so a new token gets the next free ID.
        vocabulary.setdefault(token, len(vocabulary))

### Convert raw texts to Bag-of-Words

In [18]:
def raw2ids(texts, vocabulary):
    """Encode raw texts as lists of vocabulary IDs.

    Each text is tokenized with ``nltk.word_tokenize`` and every token that is
    a key of ``vocabulary`` is replaced by its ID. Tokens missing from the
    vocabulary are dropped, so a text can map to an empty list. Token order
    and repetitions are preserved.

    Args:
        texts: iterable of strings.
        vocabulary: mapping from token to integer ID.

    Returns:
        A list with one list of IDs per input text, in input order.
    """
    def encode(text):
        # Keep only tokens that are present in the vocabulary.
        return [vocabulary[tok] for tok in nltk.word_tokenize(text) if tok in vocabulary]

    return [encode(text) for text in texts]

In [22]:
# Encode every split with the vocabulary built from the training split.
X_bow = {split: raw2ids(X_raw[split], vocabulary) for split in datasets}

In [24]:
# Peek at the first ten encoded training samples (lists of vocabulary IDs).
# This must read X_bow: X is only created in the sparse-matrix cell further
# down, so referencing it here fails on a fresh Restart & Run All.
X_bow['train'][:10]

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
 [16, 17, 9, 18, 9, 7, 19],
 [20, 6, 7, 19, 21, 3],
 [22, 23, 16, 24, 25, 5, 6, 7, 19],
 [16, 26, 9, 27, 10, 28, 7, 29, 9, 10, 30, 15],
 [31, 32, 33, 34, 9, 20, 6, 7, 19, 9, 10, 30, 15],
 [35, 16, 36, 37, 9, 20, 6, 7, 19, 38, 22, 23, 16, 23, 39],
 [22, 23, 16, 7, 19, 10, 40],
 [31, 23, 16, 26, 9, 20, 6, 7, 19],
 [16, 41, 37, 9, 42, 10, 28, 7, 29, 9, 10, 30, 15]]

### Convert to a sparse matrix

In [29]:
# One sparse (n_samples x vocabulary_size) matrix per split.
X = {}

for dataset in datasets:
    id_lists = X_bow[dataset]

    # COO triplets: one (row, col, 1) entry per token occurrence. A token that
    # occurs several times in a text produces several entries at the same
    # coordinate.
    rows = [sample_id for sample_id, token_ids in enumerate(id_lists) for _ in token_ids]
    cols = [token_id for token_ids in id_lists for token_id in token_ids]
    bow_data = [1] * len(cols)

    X[dataset] = coo_matrix(
        (bow_data, (rows, cols)),
        shape=(len(id_lists), len(vocabulary)),
    )

## Train and evaluate classifiers

In [45]:
def train_and_evaluate_classifier(clf, X_train, Y_train, X_test, Y_test):
    """Fit ``clf`` on the training data and print its accuracy on the test data.

    Args:
        clf: estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place.
        X_train: training features passed to ``clf.fit``.
        Y_train: training labels passed to ``clf.fit``.
        X_test: features to predict on.
        Y_test: ground-truth labels compared against the predictions.

    Returns:
        None. The accuracy is reported on stdout as ``Accuracy: <value>``.
    """
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_test)

    # The former `predict_proba(X_test)[:, 1]` call was removed: its result
    # was never used, it cost an extra inference pass, and it forced every
    # estimator passed in to implement predict_proba.
    print(f'Accuracy: {accuracy_score(Y_test, y_pred)}')

In [46]:
# Linear baseline: logistic regression on the bag-of-words features.
train_and_evaluate_classifier(
    LogisticRegression(random_state=0),
    X['train'], Y['train'],
    X['test'], Y['test'],
)

Accuracy: 0.8791111111111111


In [50]:
# Neural baseline: an MLP on the same features, trained and scored on the
# same splits as the logistic regression above.
train_and_evaluate_classifier(
    MLPClassifier(
        random_state=1,
        max_iter=300,
        early_stopping=True,
        verbose=True,
    ),
    X['train'], Y['train'],
    X['test'], Y['test'],
)

Iteration 1, loss = 4.92711359
Validation score: 0.244000
Iteration 2, loss = 4.55208122
Validation score: 0.577333
Iteration 3, loss = 3.94025841
Validation score: 0.708000
Iteration 4, loss = 3.13677975
Validation score: 0.785333
Iteration 5, loss = 2.27871093
Validation score: 0.837333
Iteration 6, loss = 1.57632713
Validation score: 0.856000
Iteration 7, loss = 1.10201699
Validation score: 0.868000
Iteration 8, loss = 0.80392620
Validation score: 0.884000
Iteration 9, loss = 0.61165137
Validation score: 0.890667
Iteration 10, loss = 0.48219158
Validation score: 0.889333
Iteration 11, loss = 0.39000155
Validation score: 0.894667
Iteration 12, loss = 0.32163665
Validation score: 0.897333
Iteration 13, loss = 0.27087143
Validation score: 0.902667
Iteration 14, loss = 0.23025345
Validation score: 0.910667
Iteration 15, loss = 0.19826152
Validation score: 0.912000
Iteration 16, loss = 0.17223923
Validation score: 0.912000
Iteration 17, loss = 0.15072341
Validation score: 0.910667
Iterat

### Train and classify new samples

In [51]:
# Classifier kept in a variable so it can be queried on new texts below.
clf = MLPClassifier(
    random_state=1,
    max_iter=300,
    early_stopping=True,
    verbose=True,
)

In [52]:
# Fit on the training split; clf was created with verbose=True, so training
# progress is printed per iteration.
clf.fit( X['train'], Y['train'] )

Iteration 1, loss = 4.92711359
Validation score: 0.244000
Iteration 2, loss = 4.55208122
Validation score: 0.577333
Iteration 3, loss = 3.94025841
Validation score: 0.708000
Iteration 4, loss = 3.13677975
Validation score: 0.785333
Iteration 5, loss = 2.27871093
Validation score: 0.837333
Iteration 6, loss = 1.57632713
Validation score: 0.856000
Iteration 7, loss = 1.10201699
Validation score: 0.868000
Iteration 8, loss = 0.80392620
Validation score: 0.884000
Iteration 9, loss = 0.61165137
Validation score: 0.890667
Iteration 10, loss = 0.48219158
Validation score: 0.889333
Iteration 11, loss = 0.39000155
Validation score: 0.894667
Iteration 12, loss = 0.32163665
Validation score: 0.897333
Iteration 13, loss = 0.27087143
Validation score: 0.902667
Iteration 14, loss = 0.23025345
Validation score: 0.910667
Iteration 15, loss = 0.19826152
Validation score: 0.912000
Iteration 16, loss = 0.17223923
Validation score: 0.912000
Iteration 17, loss = 0.15072341
Validation score: 0.910667
Iterat

In [57]:
def sample2sparsebow(text, vocabulary):
    """Encode a single text as a 1 x len(vocabulary) sparse bag-of-words row.

    The text is converted to vocabulary IDs with ``raw2ids``; each resulting
    ID contributes one entry of value 1 in row 0 at the column given by the ID.

    Args:
        text: the raw string to encode.
        vocabulary: mapping from token to integer column ID.

    Returns:
        A ``coo_matrix`` of shape ``(1, len(vocabulary))``.
    """
    ids = raw2ids([text], vocabulary)[0]
    n_entries = len(ids)

    # Every entry lives in the single row 0 and carries the value 1.
    return coo_matrix(
        ([1] * n_entries, ([0] * n_entries, list(ids))),
        shape=(1, len(vocabulary)),
    )

In [64]:
# Peek at the first ten test samples ([text, intent_label] pairs) for query ideas.
data['test'][:10]

[['how would you say fly in italian', 'translate'],
 ["what's the spanish word for pasta", 'translate'],
 ['how would they say butter in zambia', 'translate'],
 ['how do you say fast in spanish', 'translate'],
 ["what's the word for trees in norway", 'translate'],
 ['how does one say wonderful in german', 'translate'],
 ['how do they say tacos in mexico', 'translate'],
 ['how would one say cruiser in china', 'translate'],
 ["what's the french word you use for potato", 'translate'],
 ['what would the word for grass be in finland', 'translate']]

In [77]:
# Query to classify. Other queries to try:
#   'Could you bring me more info about direct deposits?'
#   'how would you say fly in italian'
text = 'tell me about chatgpt'

# Vectorize once and reuse the row for both the label and the probabilities.
sample_bow = sample2sparsebow(text, vocabulary)

print(clf.predict(sample_bow))

# Ten most probable intents, highest probability first.
sorted(zip(clf.classes_, clf.predict_proba(sample_bow)[0]), key=lambda x: x[1], reverse=True)[:10]

['fun_fact']


[('fun_fact', 0.21351375483378932),
 ('tell_joke', 0.1016862062356891),
 ('nutrition_info', 0.03668497376784672),
 ('what_can_i_ask_you', 0.02393525224901844),
 ('income', 0.02341061493186106),
 ('user_name', 0.021216043636866376),
 ('greeting', 0.018053634707703936),
 ('restaurant_reviews', 0.017634399988995103),
 ('transactions', 0.017045181252974658),
 ('reminder', 0.014927251780542198)]