In [22]:
# import packages
import re
import os
import numpy as np 
import pandas as pd 
from sklearn.svm import SVC
import plotly.express as px
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import recall_score,precision_score,make_scorer
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from imblearn.ensemble import BalancedRandomForestClassifier,EasyEnsembleClassifier,RUSBoostClassifier
pd.set_option('display.max_rows', 700)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [23]:
# load datasets
# in-scope (is_*) and out-of-scope (oos_*) splits are stored as separate JSON files
train = pd.read_json('/content/is_train.json')
val = pd.read_json('/content/is_val.json')
test = pd.read_json('/content/is_test.json')
oos_train = pd.read_json('/content/oos_train.json')
oos_val = pd.read_json('/content/oos_val.json')
oos_test = pd.read_json('/content/oos_test.json')
files = [(train,'train'),(val,'val'),(test,'test'),(oos_train,'oos_train'),(oos_val,'oos_val'),(oos_test,'oos_test')]
for file,name in files:
    # give every split the same two-column schema used by the rest of the notebook
    file.columns = ['text','intent']
    # count nulls in the split being reported (not always in `train`)
    print(f'{name} shape:{file.shape}, {name} has {file.isna().sum().sum()} null values')
# independent copy of the in-scope training split, used for the in-scope experiments
in_train = train.copy()

train shape:(15000, 2), train has 0 null values
val shape:(3000, 2), val has 0 null values
test shape:(4500, 2), test has 0 null values
oos_train shape:(100, 2), oos_train has 0 null values
oos_val shape:(100, 2), oos_val has 0 null values
oos_test shape:(1000, 2), oos_test has 0 null values


In [24]:
# functions for preprocessing and evaluation

def binarize(df):
    """Return a copy of ``df`` whose ``intent`` column is a 0/1 out-of-scope flag.

    Rows whose intent equals the string ``'oos'`` become 1; every other row
    becomes 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``intent`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame. The caller's frame is left untouched, so re-running a
        cell that binarizes the same input does not re-binarize labels that
        were already converted (which would turn every label into 0).
    """
    return df.assign(intent=np.where(df.intent != 'oos', 0, 1))

def vectorizer(X):
    """Fit a unigram + bigram CountVectorizer on ``X`` and vectorize ``X``.

    Returns the fitted vectorizer followed by the transformed document-term
    matrix, so the same vocabulary can be reused on other splits.
    """
    bag_of_words = CountVectorizer(ngram_range=(1, 2), min_df=1)
    encoded = bag_of_words.fit_transform(X)
    return bag_of_words, encoded

def labelencoder(y):
    """Fit a LabelEncoder on ``y`` and encode ``y`` with it.

    Returns the fitted encoder followed by the encoded labels.
    """
    encoder = LabelEncoder()
    encoder.fit(y)
    return encoder, encoder.transform(y)

def preprocess(train):
    """Fit the label encoder and vectorizer on a training frame.

    Reads the ``text`` and ``intent`` columns of ``train`` and returns
    ``(X, y, cv, le)``: the vectorized text, the encoded labels, the fitted
    vectorizer and the fitted label encoder.
    """
    texts, intents = train.text, train.intent
    le, encoded_labels = labelencoder(intents)
    cv, encoded_texts = vectorizer(texts)
    return encoded_texts, encoded_labels, cv, le

def process_non_train(df,cv,le):
    """Encode a non-training split with already-fitted transformers.

    ``cv.transform`` is applied to ``df.text`` and ``le.transform`` to
    ``df.intent``; neither transformer is re-fitted here.

    Returns the pair ``(X, y)``.
    """
    texts, intents = df.text, df.intent
    return cv.transform(texts), le.transform(intents)

def get_score(clf,binary=0):
    """Fit ``clf`` on the current training split and score it on validation and test.

    The data is read from the notebook-level variables ``X_train``/``y_train``,
    ``X_val``/``y_val`` and ``X_test``/``y_test``, so the matching
    pre-processing cell must have been run before calling this.

    Parameters
    ----------
    clf : estimator exposing ``fit``, ``score`` and ``predict``
        Fitted in place.
    binary : {0, 1}, default 0
        0 returns the two ``clf.score`` values only; 1 additionally returns
        recall and precision computed on the test split.

    Returns
    -------
    tuple
        ``(clf, val_score, test_score)`` when ``binary == 0``;
        ``(clf, val_score, test_score, test_recall, test_precision)`` when
        ``binary == 1``.

    Raises
    ------
    ValueError
        If ``binary`` is neither 0 nor 1. Checked before fitting so that an
        invalid flag fails immediately instead of training the model and then
        silently returning ``None``.
    """
    if binary not in (0, 1):
        raise ValueError(f'binary must be 0 or 1, got {binary!r}')
    clf.fit(X_train,y_train)
    val_score = clf.score(X_val,y_val)
    test_score = clf.score(X_test,y_test)
    if binary == 0:
        return clf,val_score,test_score
    # recall/precision need the explicit test-set predictions
    y_pred = clf.predict(X_test)
    return clf,val_score,test_score,recall_score(y_test,y_pred),precision_score(y_test,y_pred)

In [25]:
# In-scope classification
# pre-process data: fit the vectorizer and label encoder on the training split,
# then reuse those fitted objects to encode the validation and test splits
X_train, y_train, cv, le = preprocess(in_train)
(X_val, y_val), (X_test, y_test) = (
    process_non_train(split, cv, le) for split in (val, test)
)

In [26]:
# evaluation using different classification models
# Random Forest and SGD are stochastic; they are seeded with the same seed the
# binary (in-scope vs. out-of-scope) models in this notebook use, so re-running
# the notebook reproduces the same table.
validation_accuracy = []
test_accuracy = []
names = []

models = [(MultinomialNB(),'Multinomial Naive Bayes classifier'),
          (RandomForestClassifier(random_state=111),'Random Forest classifier'),
          (SVC(kernel='linear'),'Linear SVC'),
          (SGDClassifier(random_state=111), 'SGD classifier')]

for model,name in models:
    # get_score fits on the current X_train/y_train and scores on val and test
    clf,score,test_score = get_score(model,0)
    names.append(name)
    # scores are fractions; store them as percentages
    validation_accuracy.append(score*100)
    test_accuracy.append(test_score*100)
pd.DataFrame(data=zip(validation_accuracy,test_accuracy),index=names,columns=['validation_accuracy','test_accuracy']).style.background_gradient()

Unnamed: 0,validation_accuracy,test_accuracy
Multinomial Naive Bayes classifier,84.833333,85.488889
Random Forest classifier,86.233333,87.022222
Linear SVC,87.933333,87.266667
SGD classifier,91.166667,90.933333


In [27]:
# for in-scope and out-scope classification (binary)
# each split = in-scope rows followed by out-of-scope rows, re-indexed from 0,
# with the intent column turned into a 0/1 flag by binarize
oos_plus_train, oos_plus_val, oos_plus_test = (
    binarize(pd.concat([in_scope, out_of_scope], ignore_index=True))
    for in_scope, out_of_scope in ((in_train, oos_train), (val, oos_val), (test, oos_test))
)

In [28]:
# class balance of the binary training split
oos_count = oos_plus_train["intent"].value_counts()
oos_count

0    15000
1      100
Name: intent, dtype: int64

In [29]:
# pre-processing for in-scope and out-scope classification:
# fit the vectorizer and label encoder on the binary training split, then
# reuse those fitted objects to encode the validation and test splits
X_train, y_train, cv, le = preprocess(oos_plus_train)
(X_val, y_val), (X_test, y_test) = (
    process_non_train(split, cv, le) for split in (oos_plus_val, oos_plus_test)
)

In [30]:
# evaluation using different classification model
# both ensembles wrap LogisticRegression and share the same sampling strategy and seed
models = [
    (
        RUSBoostClassifier(base_estimator=LogisticRegression(),
                           sampling_strategy='not minority',
                           random_state=111),
        'Random Undersampling + Adaboost classifier',
    ),
    (
        EasyEnsembleClassifier(n_estimators=30,
                               base_estimator=LogisticRegression(),
                               replacement=True,
                               sampling_strategy='not minority',
                               random_state=111),
        'Easy Ensemble classifier',
    ),
]

# one list per reported column, filled in model order
val_accuracy, test_accuracy, recall, names, precision = [], [], [], [], []

for model, name in models:
    _, score, test_score, recall_sc, precision_sc = get_score(model, binary=1)
    names += [name]
    # metrics come back as fractions; the table shows percentages
    val_accuracy += [score * 100]
    test_accuracy += [test_score * 100]
    recall += [recall_sc * 100]
    precision += [precision_sc * 100]

pd.DataFrame(
    {
        'val_accuracy': val_accuracy,
        'test_accuracy': test_accuracy,
        'recall_score': recall,
        'precision_score': precision,
    },
    index=names,
).style.background_gradient()

Unnamed: 0,val_accuracy,test_accuracy,recall_score,precision_score
Random Undersampling + Adaboost classifier,76.612903,71.472727,47.6,31.295201
Easy Ensemble classifier,77.645161,72.690909,49.8,33.244326
