## Notebook to perform intent classification on the data from https://github.com/clinc/oos-eval
### Uses the data_full.json file

### Imports

In [10]:
import json
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import joblib

# The cleaning step calls word_tokenize and WordNetLemmatizer, which need the
# 'punkt' and 'wordnet' resources in addition to the stop-word list. Fetch all
# three so the notebook also runs with a fresh nltk_data directory.
# NOTE(review): recent NLTK releases may ask for 'punkt_tab' instead of
# 'punkt' - confirm against the installed version.
nltk.download( 'punkt' )
nltk.download( 'wordnet' )
nltk.download( 'stopwords' )

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nikunjkotecha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Load the data

In [11]:
class TextData:
    '''
    reader for a json file that maps a split name ('train', 'val',
    'test', ...) to a list of rows, each row holding the feature
    value(s) followed by the target value
    '''
    def __init__( self, file ):
        # path of the json file; it is re-read on every call, nothing is cached
        self.file = file

    def read_json( self, set='train', cols=['sent'], target='intent' ):
        '''
        read the json file and return one split as a dataframe

        set    : key of the split to load from the json dict
        cols   : names given to the feature column(s) of each row
        target : name given to the last (label) column
        returns a dataframe whose columns are cols + [target]
        '''
        # JSON is UTF-8 by specification; do not depend on the platform default
        with open( self.file, 'r', encoding='utf-8' ) as f:
            data = json.load( f )
        # list() builds a fresh list, so the shared default `cols` is never mutated
        return pd.DataFrame( data[set], columns=list( cols ) + [target] )

    def random_labels( self, target='intent', n=20, seed=0 ):
        '''
        choose 'n' random intent classes from the 'train' split

        returns a dict { class name: integer id } with ids 0..n-1 assigned
        in the order the classes were drawn
        '''
        df = self.read_json( target=target )
        # A private RandomState draws the same sample as seeding the global
        # generator with `seed`, without reseeding numpy for the whole session.
        rng = np.random.RandomState( seed )
        chosen = rng.choice( df[target].unique(), size=n, replace=False )
        return { val: idx for idx, val in enumerate( chosen ) }

    def get_set( self, set, labels, target='intent' ):
        '''
        load the split `set` and keep only the rows whose target value
        is one of the keys of `labels`; the index is renumbered from 0
        '''
        # forward `target` so the column that is filtered on actually exists
        df = self.read_json( set=set, target=target )
        keep = df[target].isin( list( labels ) )
        return df[keep].reset_index( drop=True )
    
# one reader over the full data file; every split is pulled from it
file = 'data_full.json'
data = TextData( file )

# draw the random subset of intent classes (default n and seed)
labels = data.random_labels()

# keep only the rows of the chosen intents in the training and validation splits
train, val = ( data.get_set( split, labels ) for split in ( 'train', 'val' ) )
train.head()

Unnamed: 0,sent,intent
0,does ireland have any travel alerts i should b...,travel_alert
1,does north korea have any travel alerts i shou...,travel_alert
2,are there any travel alerts for russia,travel_alert
3,does spain have any travel alerts i should be ...,travel_alert
4,are there any travel alerts for north korea,travel_alert


### Data Cleaning

In [12]:
class TextCleaning:
    '''
    sentence normaliser: drops punctuation tokens and english stop words,
    lemmatizes what is left and removes repeated words
    '''
    def __init__( self ):
        # punctuations, stop word, lemmatizer
        self.punctuations = string.punctuation
        # a set gives constant-time membership tests; the list returned by
        # nltk would be scanned once per token
        self.stop_words = set( stopwords.words( 'english' ) )
        self.lemmatizer = WordNetLemmatizer()

    def process( self, sentence ):
        '''
        remove punctuations and stop words from a given sentence,
        then lemmatize each remaining word

        every lemma is kept once, at the position of its first occurrence;
        returns the lemmas joined by single spaces
        '''
        # dict keys keep insertion order and are unique, which gives the
        # "first occurrence only" behaviour without scanning a list
        kept = {}
        for token in word_tokenize( sentence ):
            if token in self.punctuations or token in self.stop_words:
                continue
            kept.setdefault( self.lemmatizer.lemmatize( token ), None )
        return ' '.join( kept )

    def preprocess( self, df, col='sent' ):
        '''
        accept a dataframe and preprocess its sentences

        df  : dataframe holding the raw sentences; modified in place
        col : name of the column with the raw sentences
        returns the same dataframe with an added 'clean' column
        '''
        # Apply on the column rather than row by row: it avoids building a
        # Series per row and still yields a column when the frame is empty.
        df['clean'] = df[col].apply( self.process )
        return df

# a single cleaner instance handles both splits; each frame gains a 'clean' column
data_cleaning = TextCleaning()
train, val = ( data_cleaning.preprocess( frame ) for frame in ( train, val ) )
train.head()

Unnamed: 0,sent,intent,clean
0,does ireland have any travel alerts i should b...,travel_alert,ireland travel alert aware
1,does north korea have any travel alerts i shou...,travel_alert,north korea travel alert aware
2,are there any travel alerts for russia,travel_alert,travel alert russia
3,does spain have any travel alerts i should be ...,travel_alert,spain travel alert aware
4,are there any travel alerts for north korea,travel_alert,travel alert north korea


### Performing feature extraction

In [13]:
class FeatureExtraction:
    '''
    This data contains textual data. In order to train the model,
    we need numeric features for every sentence. They are obtained
    with the tfidf approach; the intent names are mapped to integers.
    '''

    def tfidf_fit( self, df, col='clean' ):
        '''
        fit the tfidf to the training data

        df  : dataframe with the cleaned sentences
        col : name of the column to fit on
        returns the fitted TfidfVectorizer
        '''
        vectorizer = TfidfVectorizer()
        vectorizer.fit( df[col] )
        return vectorizer

    def tfidf_extract( self, df, vectorizer, col='clean' ):
        '''
        extract tfidf features on the data

        vectorizer : an already fitted vectorizer (see tfidf_fit)
        returns a dense array with one row per sentence of df[col]
        '''
        # transform() yields a sparse matrix; it is densified for the caller
        return vectorizer.transform( df[col] ).toarray()

    def categorical_to_int( self, df, labels, target='intent' ):
        '''
        convert the categorical intent classes to int

        labels : dict { class name: integer id }
        returns a Series aligned with df holding the id of each row's class
        (None where the class is not a key of `labels`)
        '''
        # Map the column itself rather than applying row by row: it is
        # faster and always returns a Series, even for an empty frame.
        return df[target].map( labels.get )

feature_extract = FeatureExtraction()
# the vectorizer is fitted on the training sentences only
vectorizer = feature_extract.tfidf_fit( train )

# tfidf features and integer targets for the train and validation set,
# both produced with the vectorizer fitted above
train_X, train_y = (
    feature_extract.tfidf_extract( train, vectorizer ),
    feature_extract.categorical_to_int( train, labels ),
)
val_X, val_y = (
    feature_extract.tfidf_extract( val, vectorizer ),
    feature_extract.categorical_to_int( val, labels ),
)

train_X.shape, train_y.shape

((2000, 1038), (2000,))

### Training on train set and evaluating the model on validation set

In [14]:
class TextModel:
    '''
    helpers to grid-search a random forest, persist the best model
    together with its vectorizer and label map, and evaluate it
    '''
    def __init__( self ):
        pass

    def train( self, X_train, y_train, params, n_jobs=-1, cv=3, random_state=0 ):
        '''
        grid-search a RandomForestClassifier over `params`

        params       : parameter grid handed to GridSearchCV
        n_jobs       : parallelism used by both the forest and the grid search
        cv           : cross-validation setting handed to GridSearchCV
        random_state : seed of the random forest
        returns the fitted GridSearchCV object
        '''
        # honour the arguments; previously the default values were hard-coded
        rf = RandomForestClassifier( n_jobs=n_jobs, random_state=random_state )
        grid = GridSearchCV( rf, params, cv=cv, n_jobs=n_jobs )
        grid.fit( X_train, y_train )
        return grid

    def save_model( self, pickle_file, grid, labels, vectorizer, compress=3 ):
        '''
        dump the best estimator of `grid` with the vectorizer and the
        label map into `pickle_file`, so inference needs only that file

        compress : compression setting handed to joblib.dump
        '''
        # save the model
        obj = {
            'model': grid.best_estimator_,
            'vectorizer': vectorizer,
            'labels': labels
        }
        joblib.dump( obj, pickle_file, compress=compress )
        return

    def load_model( self, pickle_file ):
        '''
        load the dict written by save_model
        (keys: 'model', 'vectorizer', 'labels')
        '''
        # SECURITY: joblib.load unpickles the file and can therefore run
        # arbitrary code; only load files that come from a trusted source.
        obj = joblib.load( pickle_file )
        return obj

    def inference( self, model, X_test, y_test ):
        '''
        predict on X_test and return the classification report
        of the predictions against y_test
        '''
        y_pred = model.predict( X_test )
        # evaluate the model
        report = classification_report( y_test, y_pred )
        return report

pickle_file = 'my_model.pkl'
text_model = TextModel()

# The parameters of the random forest can be adjusted, and grid search can be
# used to find the best combination. A random forest has several kinds of
# parameters that can be tuned; here we tune the number of trees and the
# maximum number of features.
params = dict(
    n_estimators=[100, 300],
    max_features=['sqrt', 0.5, 1.0],
)

# fit the grid search on the training set, then persist the best model
grid = text_model.train( train_X, train_y, params )
text_model.save_model( pickle_file, grid, labels, vectorizer )

# reload the saved model and score it on the validation set
obj = text_model.load_model( pickle_file )
report = text_model.inference( obj['model'], val_X, val_y )
print( report )

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        20
           1       1.00      0.80      0.89        20
           2       1.00      1.00      1.00        20
           3       0.92      0.60      0.73        20
           4       0.95      0.95      0.95        20
           5       0.94      0.75      0.83        20
           6       1.00      0.95      0.97        20
           7       0.86      0.95      0.90        20
           8       0.62      1.00      0.77        20
           9       1.00      0.70      0.82        20
          10       1.00      0.95      0.97        20
          11       0.83      1.00      0.91        20
          12       1.00      1.00      1.00        20
          13       0.95      1.00      0.98        20
          14       1.00      0.90      0.95        20
          15       1.00      1.00      1.00        20
          16       1.00      1.00      1.00        20
          17       0.94    

### Reporting results on the test set from the trained model

In [15]:
# obtain the test set, preprocess it and extract the features
# load the model and evaluate on the test set
pickle_file = 'my_model.pkl'
text_model = TextModel()
obj = text_model.load_model( pickle_file )

file = 'data_full.json'
data = TextData( file )
test = data.get_set( 'test', obj['labels'] )

data_cleaning = TextCleaning()
test = data_cleaning.preprocess( test )

feature_extract = FeatureExtraction()
test_X = feature_extract.tfidf_extract( test, obj['vectorizer'] )
# Encode the targets with the label map stored next to the model, not the
# global `labels`. This keeps the ids consistent with what the model was
# trained on and lets the cell run in a fresh session.
test_y = feature_extract.categorical_to_int( test, obj['labels'] )
# evaluate the model on the test set
report = text_model.inference( obj['model'], test_X, test_y )
print( report )

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       0.96      0.77      0.85        30
           2       1.00      0.97      0.98        30
           3       1.00      0.87      0.93        30
           4       0.97      0.97      0.97        30
           5       0.91      0.97      0.94        30
           6       1.00      1.00      1.00        30
           7       0.90      0.93      0.92        30
           8       0.68      1.00      0.81        30
           9       1.00      0.90      0.95        30
          10       0.97      0.97      0.97        30
          11       0.97      1.00      0.98        30
          12       1.00      0.97      0.98        30
          13       1.00      1.00      1.00        30
          14       1.00      0.97      0.98        30
          15       1.00      0.97      0.98        30
          16       1.00      0.97      0.98        30
          17       0.92    