#### Load required libs

In [1]:
import json

import nltk

from scipy.sparse import coo_matrix
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

### Load data

Download the data from https://github.com/clinc/oos-eval/ and place `data_small.json` under `../data/` (the path the next cell reads from).

In [2]:
# Read the dataset JSON. The context manager guarantees the file handle is
# closed even if parsing fails, and the explicit encoding keeps the result
# independent of the platform's default locale.
with open('../data/data_small.json', encoding='utf-8') as f:
    data = json.load(f)

# Show the top-level keys of the loaded JSON object.
print(data.keys())

FileNotFoundError: ignored

In [None]:
data

In [None]:
len(data['train'])

In [None]:
def get_dataset(data, dataset='train'):
    """Split one partition of ``data`` into two parallel lists.

    Args:
        data: Mapping from partition name to an iterable of samples, where
            each sample is indexable and has at least two elements.
        dataset: Key of the partition to read (defaults to ``'train'``).

    Returns:
        A tuple ``(X, Y)``: ``X`` holds element 0 of every sample and ``Y``
        holds element 1, both in the original sample order. In this notebook
        they are used as the raw texts and their labels respectively.
    """
    # Walk the partition exactly once, keeping only the two fields we need.
    pairs = [(sample[0], sample[1]) for sample in data[dataset]]

    first_items = [pair[0] for pair in pairs]
    second_items = [pair[1] for pair in pairs]
    return first_items, second_items

In [None]:
# Per-split containers, keyed by split name:
#   X_raw[split] -> list of the first element of every sample
#   Y[split]     -> list of the second element of every sample
X_raw = {}
Y = {}

# Splits processed by this notebook; later cells iterate over this list too.
datasets = ['train', 'val', 'test']

for dataset in datasets:
    X_raw[dataset], Y[dataset] = get_dataset(data, dataset)

### Download Word Embeddings model

In [None]:
import gensim.downloader

# Print the keys of the 'models' entry returned by gensim.downloader.info(),
# i.e. the identifiers that can be passed to gensim.downloader.load().
print(list(gensim.downloader.info()['models'].keys()))

In [None]:
word_embeddings = gensim.downloader.load('word2vec-google-news-300')

In [None]:
word_embeddings['banana']

In [None]:
word_embeddings['banana'].shape

In [None]:
word_embeddings.most_similar('banana')

### Convert raw texts to Averaged Word Embeddings

In [None]:
def raw2embeddings(texts, word_embeddings):
    """Represent each text as the average of its word vectors.

    Every text is tokenized with ``nltk.word_tokenize``; tokens that are not
    in ``word_embeddings`` (out-of-vocabulary words) are ignored, and the
    vectors of the remaining tokens are averaged element-wise.

    Args:
        texts: Iterable of strings to embed.
        word_embeddings: Object supporting ``token in word_embeddings`` and
            ``word_embeddings[token]``, returning a 1-D array per token. It
            must contain the token ``'the'``, which is used to discover the
            vector dimensionality.

    Returns:
        A list with one 1-D array per input text, in input order. A text with
        no known token is represented by an all-zero vector.
    """
    # Probe a single entry to learn the dimensionality; the one-element
    # unpacking also enforces that the vectors are one-dimensional.
    (dim,) = word_embeddings['the'].shape

    averaged = []
    for text in texts:
        known_vectors = [
            word_embeddings[token]
            for token in nltk.word_tokenize(text)
            if token in word_embeddings
        ]

        if not known_vectors:
            # Nothing recognisable in this text: fall back to a zero vector
            # so that every text still yields a row of the same length.
            averaged.append(np.zeros((dim,)))
            continue

        averaged.append(np.mean(known_vectors, axis=0))

    return averaged

In [None]:
# X[split] -> list of averaged word-embedding vectors, one per text in
# X_raw[split], as produced by raw2embeddings.
X = {}

for dataset in datasets:
    # Print the split name so progress is visible while embedding.
    print(dataset)
    X[dataset] = raw2embeddings(X_raw[dataset], word_embeddings)

In [None]:
X_raw['train'][:10]

In [None]:
X['train'][:10]

## Train and evaluate classifiers

In [None]:
def train_and_evaluate_classifier(clf, X_train, Y_train, X_test, Y_test):
    """Fit ``clf`` on the training data and report its test accuracy.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place.
        X_train: Training features, passed to ``clf.fit``.
        Y_train: Training labels, passed to ``clf.fit``.
        X_test: Test features, passed to ``clf.predict``.
        Y_test: True test labels, compared against the predictions.

    Returns:
        The accuracy on the test set, as computed by ``accuracy_score``. The
        same value is printed as ``Accuracy: <value>``.
    """
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_test)

    # The previous version also computed ``clf.predict_proba(X_test)[:, 1]``
    # and discarded it. That cost an extra inference pass, failed for
    # estimators without ``predict_proba``, and selected a single arbitrary
    # class column, so it has been removed.
    accuracy = accuracy_score(Y_test, y_pred)
    print(f'Accuracy: {accuracy}')

    return accuracy

In [None]:
train_and_evaluate_classifier( LogisticRegression(random_state=0), X['train'], Y['train'], X['test'], Y['test'])

In [None]:
train_and_evaluate_classifier( MLPClassifier(random_state=1, max_iter=300, early_stopping=True, verbose=True), X['train'], Y['train'], X['test'], Y['test'])

### Train and classify new samples

In [None]:
clf = MLPClassifier(random_state=1, max_iter=300, early_stopping=True, verbose=True)

In [None]:
clf.fit( X['train'], Y['train'] )

In [None]:
data['test'][:10]

In [None]:
# Example queries; swap which line is active to try a different input.
#text = 'Could you bring me more info about direct deposits?'
#text = 'how would you say fly in italian'
text = 'tell me about chatgpt'

# Embed the query once and reuse the result for both the predicted label and
# the probability ranking (previously the text was tokenized and averaged
# twice).
text_features = raw2embeddings([text], word_embeddings)

print(clf.predict(text_features))

# Ten (class, probability) pairs with the highest predicted probability,
# most probable first. Left as the last expression so the notebook shows it.
sorted(
    zip(clf.classes_, clf.predict_proba(text_features)[0]),
    key=lambda x: x[1],
    reverse=True,
)[:10]