## Traditional SML: Supervised Machine Learning models for PNR query detection


In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import joblib
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
stopwords = stopwords.words('dutch')
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
stemmer = SnowballStemmer('dutch')

In [None]:
import matplotlib.pyplot as plt

# Train/test splits, cross validation, gridsearch
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold

# class weights for NB
from sklearn.utils.class_weight import compute_class_weight

# vectorizers
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# pipeline stuff
from sklearn.pipeline import make_pipeline, Pipeline

# Different models 
from sklearn.naive_bayes import MultinomialNB 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC 
#from sklearn.ensemble import RandomForestClassifier

# model evaluation
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, cohen_kappa_score, make_scorer, f1_score, accuracy_score, precision_score, recall_score

In [None]:
## preprocessing functions

def transform_lowercase(x):
    """Return ``x`` converted to lowercase via its ``lower()`` method."""
    lowered = x.lower()
    return lowered

def remove_punctuation(x):
    """Strip punctuation from ``x``.

    Every character that is neither a word character nor whitespace is
    deleted; underscores are deleted as well, since the regex would
    otherwise treat them as word characters.
    """
    punctuation = re.compile(r'[^\w\s]|_')
    return punctuation.sub('', x)

def remove_numbers(x):
    """Delete every run of digits from ``x``; all other text is kept as is."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', x)

def remove_links(x):
    """Remove links from ``x``.

    Any occurrence of 'http' followed by one or more non-whitespace
    characters is deleted; the surrounding whitespace is left in place.
    """
    link = re.compile(r'http\S+')
    return link.sub('', x)

def remove_linebreaks(x):
    """Replace line breaks with spaces and collapse repeated whitespace.

    Splitting on any whitespace and re-joining with single spaces removes
    newlines (including ``\\r\\n``), leading/trailing whitespace and runs of
    consecutive whitespace in one pass. The previous implementation only
    trimmed the ends, so "a\\n\\nb" still contained a double space.

    Returns the cleaned string; an empty or whitespace-only input gives "".
    """
    return " ".join(x.split())

def remove_stopwords(x):
    """Drop stop words and empty tokens from a space-separated string.

    ``x`` is split on single spaces; tokens that are empty or that occur in
    the module-level ``stopwords`` collection are discarded and the rest is
    re-joined with single spaces.
    """
    # Logical `and` (not bitwise `&`) is the correct operator for combining
    # the two conditions; the result is the same as before.
    kept = [w for w in x.split(" ") if w and w not in stopwords]
    return " ".join(kept)

def list_of_words(x):
    """Split ``x`` on single space characters and return the pieces as a list.

    Consecutive spaces produce empty strings in the result.
    """
    separator = " "
    return x.split(separator)

def tokenize(text):
    """Lowercase ``text``, tokenize it and return the stemmed tokens as a list.

    Tokenisation uses ``word_tokenize``; each token is stemmed with the
    module-level ``stemmer``.
    """
    # NOTE(review): word_tokenize is called without a language argument while
    # the stemmer is Dutch - confirm the default tokenizer language is intended.
    lowered = text.lower()
    return [stemmer.stem(token) for token in word_tokenize(lowered)]

def preprocess(x):
    """Lowercase ``x``, then strip punctuation, then strip digits.

    The steps are applied in exactly that order and the cleaned string is
    returned.
    """
    for step in (transform_lowercase, remove_punctuation, remove_numbers):
        x = step(x)
    return x

In [None]:
def make_table(model_list, names, y_test, X_test):
    '''
    Build one table with the metrics of class '1' for every model.

    For each (model, name) pair the model predicts on X_test, a
    classification report is computed against y_test, and only the entry
    stored under the key '1' is kept. The name is added under 'model'.

    Returns a DataFrame with one row per model, indexed by 'model'.
    '''
    rows = []
    for fitted, label in zip(model_list, names):
        full_report = classification_report(y_test, fitted.predict(X_test), output_dict=True)
        positive = full_report['1']
        positive['model'] = label
        rows.append(positive)
    return pd.DataFrame(rows).set_index('model')

In [None]:
# identical test-train split to BERT
# The split is read from pre-saved .npy files rather than recomputed here.
# NOTE(review): allow_pickle=True makes np.load unpickle object arrays, which
# can execute arbitrary code - only load these files from a trusted source.
X_train = np.load("data/train_test/X_train.npy", allow_pickle=True)
X_test = np.load("data/train_test/X_test.npy", allow_pickle=True)
y_train = np.load("data/train_test/y_train.npy", allow_pickle=True)
y_test = np.load("data/train_test/y_test.npy", allow_pickle=True)

In [None]:
# class balance in test and train data
# Print the per-class sample counts (np.bincount) of each split.
for _caption, _labels in (('test data:', y_test), ('train data:', y_train)):
    print(_caption, np.bincount(_labels))

In [None]:
# Scorers
# F1 score computed for the positive class (label 1); passed as `scoring`
# to every GridSearchCV in this notebook.
f1_scorer = make_scorer(f1_score, pos_label=1)

In [None]:
# Weight each class by one minus its relative frequency in the training
# labels, so the rarer class gets the larger weight. This is the complement
# of the class frequency, not a true inverse (1 / frequency).
# dict(enumerate(...)) maps class id -> weight for however many classes
# np.bincount finds, instead of hard-coding classes 0 and 1.
class_weights1 = dict(enumerate(1 - np.bincount(y_train) / len(y_train)))
class_weights1

# Logistic Regression with TfidfVectorizer

In [None]:
# Logistic Regression with Tfidf
# Two-step pipeline: TF-IDF vectorizer, then logistic regression (lbfgs solver).
# The step names 'vectorizer' and 'classifier' are the prefixes used in the
# parameter grids.
pipeline_tfidf = Pipeline(steps=[('vectorizer', TfidfVectorizer()), ('classifier', LogisticRegression(solver='lbfgs'))])

In [None]:
# Options that take effect whichever analyzer is used.
grid_shared = {
    'vectorizer__max_df':[0.5, 1.0], # exclude terms in more than 50% or 100% of the docs
    'vectorizer__min_df':[1, 5], # keep terms that occur in at least 1 or 5 documents
    'classifier__class_weight':[None, 'balanced', class_weights1], # class weights
}
# A callable analyzer (tokenize) receives the raw document and replaces the
# vectorizer's own preprocessing, stop-word and n-gram handling. Varying
# stop_words / ngram_range / preprocessor together with tokenize therefore
# refits identical models (8 duplicates per remaining combination), so those
# options are only searched for the built-in 'word' analyzer.
grid = [
    {**grid_shared,
     'vectorizer__analyzer':['word'], # default word-level analyzer
     'vectorizer__stop_words':[None, stopwords], # stopword removal
     'vectorizer__ngram_range':[(1,1), (1,2)], # consider unigrams, and both unigrams and bigrams
     'vectorizer__preprocessor':[None, preprocess], # lowercase, delete punct and numbers
    },
    {**grid_shared,
     'vectorizer__analyzer':[tokenize], # tokenize = lowercase + word_tokenize + stemming
    },
]

In [None]:
# Grid search over `grid` for the TF-IDF + logistic regression pipeline.
search_LR_T = GridSearchCV(estimator=pipeline_tfidf, # first vectorizer, then classifier
                      param_grid=grid, # test these parameters
                      scoring=f1_scorer, # use f1 scorer for label==1
                      cv=5, # 5-fold cross validation
                      n_jobs=-1, # use all CPUs
                      #verbose=2, # print output
                      error_score='raise' # raise if a fit fails instead of recording a score
                      )

In [None]:
search_LR_T.fit(X_train, y_train)
print(f'Using these hyperparameters {search_LR_T.best_params_}, we get the best performance.')

In [None]:
# Predict on the held-out test set once and reuse the predictions for both
# the dict-form report (kept in LR_T) and the printed text report, instead
# of running the whole pipeline over X_test twice.
y_pred_LR_T = search_LR_T.predict(X_test)
LR_T = classification_report(y_test, y_pred_LR_T, output_dict=True)
print(classification_report(y_test, y_pred_LR_T))

In [None]:
# save model
joblib.dump(search_LR_T.best_estimator_, 'SML/LR_T.pkl')

# Logistic Regression with CountVectorizer

In [None]:
# Two-step pipeline: raw term counts (CountVectorizer), then logistic
# regression (lbfgs solver).
pipeline_count = Pipeline(steps=[('vectorizer', CountVectorizer()), 
                           ('classifier', LogisticRegression(solver='lbfgs'))])

In [None]:
# Grid search over `grid` for the count-vectorizer + logistic regression pipeline.
search_LR_C = GridSearchCV(estimator=pipeline_count, # first vectorizer, then classifier
                      param_grid=grid, # test these parameters
                      scoring=f1_scorer, # use f1 scorer for label==1
                      cv=5, # 5-fold cross validation
                      n_jobs=-1, # use all CPUs
                      #verbose=2, # print output
                      error_score='raise' # raise if a fit fails instead of recording a score
                      )

In [None]:
search_LR_C.fit(X_train, y_train)
print(f'Using these hyperparameters {search_LR_C.best_params_}, we get the best performance.')

In [None]:
# Predict on the test set once; reuse the predictions for the dict-form
# report (kept in LR_C) and for the printed text report.
y_pred_LR_C = search_LR_C.predict(X_test)
LR_C = classification_report(y_test, y_pred_LR_C, output_dict=True)
print(classification_report(y_test, y_pred_LR_C))

In [None]:
# save model
joblib.dump(search_LR_C.best_estimator_, 'SML/LR_C.pkl')

# Naive Bayes with Tfidf

In [None]:
# Relative frequency of each class in the training labels; offered to
# MultinomialNB as `class_prior` in gridNB.
pp = np.bincount(y_train)/len(y_train)
print(pp)

In [None]:
# Two-step pipeline: TF-IDF vectorizer, then multinomial Naive Bayes.
# NOTE: this rebinds pipeline_tfidf, which earlier held the logistic
# regression pipeline.
pipeline_tfidf = Pipeline(steps=[('vectorizer', TfidfVectorizer()), 
                           ('classifier', MultinomialNB())])

In [None]:
# Options that take effect whichever analyzer is used.
# MultinomialNB has no class_weight parameter, so the class prior is varied
# instead.
# NOTE(review): with class_prior=None the priors are presumably estimated from
# the training labels, which would make the `pp` option near-equivalent - confirm.
gridNB_shared = {
    'vectorizer__max_df':[0.5, 1.0], # exclude terms in more than 50% or 100% of the docs
    'vectorizer__min_df':[1, 5], # keep terms that occur in at least 1 or 5 documents
    'classifier__class_prior':[None, pp], # default priors, or the training class frequencies
}
# A callable analyzer (tokenize) receives the raw document and replaces the
# vectorizer's own preprocessing, stop-word and n-gram handling. Varying
# stop_words / ngram_range / preprocessor together with tokenize therefore
# refits identical models, so those options are only searched for the
# built-in 'word' analyzer.
gridNB = [
    {**gridNB_shared,
     'vectorizer__analyzer':['word'], # default word-level analyzer
     'vectorizer__stop_words':[None, stopwords], # stopword removal
     'vectorizer__ngram_range':[(1,1), (1,2)], # consider unigrams, and both unigrams and bigrams
     'vectorizer__preprocessor':[None, preprocess], # lowercase, delete punct and numbers
    },
    {**gridNB_shared,
     'vectorizer__analyzer':[tokenize], # tokenize = lowercase + word_tokenize + stemming
    },
]

In [None]:
# Grid search over `gridNB` for the TF-IDF + Naive Bayes pipeline.
search_NB_T = GridSearchCV(estimator=pipeline_tfidf, # first vectorizer, then classifier
                      param_grid=gridNB, # test these parameters
                      scoring=f1_scorer, # use f1 scorer for label==1
                      cv=5, # 5-fold cross validation
                      n_jobs=-1, # use all CPUs
                      #verbose=2, # print output
                      error_score='raise' # raise if a fit fails instead of recording a score
                      )

In [None]:
search_NB_T.fit(X_train, y_train)
print(f'Using these hyperparameters {search_NB_T.best_params_}, we get the best performance.')

In [None]:
# Predict on the test set once; reuse the predictions for the dict-form
# report (kept in NB_T) and for the printed text report.
y_pred_NB_T = search_NB_T.predict(X_test)
NB_T = classification_report(y_test, y_pred_NB_T, output_dict=True)
print(classification_report(y_test, y_pred_NB_T))

In [None]:
# save model
joblib.dump(search_NB_T.best_estimator_, 'SML/NB_T.pkl')

# Naive Bayes with Count

In [None]:
# Two-step pipeline: raw term counts (CountVectorizer), then multinomial
# Naive Bayes.
# NOTE: this rebinds pipeline_count, which earlier held the logistic
# regression pipeline.
pipeline_count = Pipeline(steps=[('vectorizer', CountVectorizer()), 
                           ('classifier', MultinomialNB())])

In [None]:
# Grid search over `gridNB` for the count-vectorizer + Naive Bayes pipeline.
search_NB_C = GridSearchCV(estimator=pipeline_count, # first vectorizer, then classifier
                      param_grid=gridNB, # test these parameters
                      scoring=f1_scorer, # use f1 scorer for label==1
                      cv=5, # 5-fold cross validation
                      n_jobs=-1, # use all CPUs
                      #verbose=2, # print output
                      error_score='raise' # raise if a fit fails instead of recording a score
                      )

In [None]:
search_NB_C.fit(X_train, y_train)
print(f'Using these hyperparameters {search_NB_C.best_params_}, we get the best performance.')

In [None]:
# Predict on the test set once; reuse the predictions for the dict-form
# report (kept in NB_C) and for the printed text report.
y_pred_NB_C = search_NB_C.predict(X_test)
NB_C = classification_report(y_test, y_pred_NB_C, output_dict=True)
print(classification_report(y_test, y_pred_NB_C))

In [None]:
# save model
joblib.dump(search_NB_C.best_estimator_, 'SML/NB_C.pkl')

# LinearSVC with Tfidf

In [None]:
# Two-step pipeline: TF-IDF vectorizer, then a linear support vector classifier.
# NOTE: this rebinds pipeline_tfidf, which earlier held the Naive Bayes pipeline.
pipeline_tfidf = Pipeline(steps=[('vectorizer', TfidfVectorizer()), 
                           ('classifier', LinearSVC())])

In [None]:
# The LinearSVC grid adds the regularization strength C on top of the options
# used for the other models, so redundant combinations are cut to keep the
# search tractable.
# Options that take effect whichever analyzer is used.
gridLSVC_shared = {
    'vectorizer__max_df':[0.5, 1.0], # exclude terms in more than 50% or 100% of the docs
    'vectorizer__min_df':[1, 5], # keep terms that occur in at least 1 or 5 documents
    'classifier__class_weight':[None, 'balanced', class_weights1], # class weights
    'classifier__C':[0.01, 1, 100], # regularization parameter
}
# A callable analyzer (tokenize) receives the raw document and replaces the
# vectorizer's own preprocessing, stop-word and n-gram handling. Varying
# stop_words / ngram_range / preprocessor together with tokenize therefore
# refits identical models, so those options are only searched for the
# built-in 'word' analyzer.
gridLSVC = [
    {**gridLSVC_shared,
     'vectorizer__analyzer':['word'], # default word-level analyzer
     'vectorizer__stop_words':[None, stopwords], # stopword removal
     'vectorizer__ngram_range':[(1,1), (1,2)], # consider unigrams, and both unigrams and bigrams
     'vectorizer__preprocessor':[None, preprocess], # lowercase, delete punct and numbers
    },
    {**gridLSVC_shared,
     'vectorizer__analyzer':[tokenize], # tokenize = lowercase + word_tokenize + stemming
    },
]

In [None]:
# Grid search over `gridLSVC` for the TF-IDF + LinearSVC pipeline.
search_LSVC_T = GridSearchCV(estimator=pipeline_tfidf, # first vectorizer, then classifier
                      param_grid=gridLSVC, # test these parameters
                      scoring=f1_scorer, # use f1 scorer for label==1
                      cv=5, # 5-fold cross validation
                      n_jobs=-1, # use all CPUs
                      #verbose=10, # print output
                      error_score='raise' # raise if a fit fails instead of recording a score
                      )

In [None]:
search_LSVC_T.fit(X_train, y_train)

In [None]:
print(f'Using these hyperparameters {search_LSVC_T.best_params_}, we get the best performance.')

In [None]:
# Predict on the test set once; reuse the predictions for the dict-form
# report (kept in LSVC_T) and for the printed text report.
y_pred_LSVC_T = search_LSVC_T.predict(X_test)
LSVC_T = classification_report(y_test, y_pred_LSVC_T, output_dict=True)
print(classification_report(y_test, y_pred_LSVC_T))

In [None]:
# save model
joblib.dump(search_LSVC_T.best_estimator_, 'SML/LSVC_T.pkl')

# LinearSVC with Count

In [None]:
# Two-step pipeline: raw term counts (CountVectorizer), then a linear support
# vector classifier.
# NOTE: this rebinds pipeline_count, which earlier held the Naive Bayes pipeline.
pipeline_count = Pipeline(steps=[('vectorizer', CountVectorizer()), 
                           ('classifier', LinearSVC())])

In [None]:
# Grid search over `gridLSVC` for the count-vectorizer + LinearSVC pipeline.
search_LSVC_C = GridSearchCV(estimator=pipeline_count, # first vectorizer, then classifier
                      param_grid=gridLSVC, # test these parameters
                      scoring=f1_scorer, # use f1 scorer for label==1
                      cv=5, # 5-fold cross validation
                      n_jobs=-1, # use all CPUs
                      #verbose=10, # print output
                      error_score='raise' # raise if a fit fails instead of recording a score
                      )

In [None]:
search_LSVC_C.fit(X_train, y_train)

In [None]:
print(f'Using these hyperparameters {search_LSVC_C.best_params_}, we get the best performance.')

In [None]:
# Predict on the test set once; reuse the predictions for the dict-form
# report (kept in LSVC_C) and for the printed text report.
y_pred_LSVC_C = search_LSVC_C.predict(X_test)
LSVC_C = classification_report(y_test, y_pred_LSVC_C, output_dict=True)
print(classification_report(y_test, y_pred_LSVC_C))

In [None]:
# save model
joblib.dump(search_LSVC_C.best_estimator_, 'SML/LSVC_C.pkl')

# Report after hyperparameter optimization

In [None]:
# load models
# Reload the estimators that the earlier cells saved with joblib.dump.
# NOTE: this rebinds LR_C, LR_T, NB_C, NB_T, LSVC_C and LSVC_T, which the
# earlier cells used for classification-report dicts; from here on the names
# refer to the loaded models.
# NOTE(review): joblib.load unpickles the files - only load .pkl files from a
# trusted source.
LR_C = joblib.load('SML/LR_C.pkl')
LR_T = joblib.load('SML/LR_T.pkl')
NB_C = joblib.load('SML/NB_C.pkl')
NB_T = joblib.load('SML/NB_T.pkl')
LSVC_C = joblib.load('SML/LSVC_C.pkl')
LSVC_T = joblib.load('SML/LSVC_T.pkl')

In [None]:
# Models and their display names, in matching order. The names become the row
# labels of the report table and part of the per-model result file names.
models = [LR_C, LR_T, NB_C, NB_T, LSVC_C, LSVC_T]
names = [
    "Logistic Regression with Count",
    "Logistic Regression with Tfidf",
    "Naive Bayes with Count",
    "Naive Bayes with Tfidf",
    # was "Linear Vector Classification with Count" ("Support" missing)
    "Linear Support Vector Classification with Count",
    "Linear Support Vector Classification with Tfidf",
]

In [None]:
report = make_table(models, names, y_test, X_test)
report

In [None]:
report.round(2).to_latex('SML/report_SML.txt')

In [None]:
confusion_matrix(y_test, LSVC_T.predict(X_test))

In [None]:
pd.DataFrame(classification_report(y_test, LSVC_T.predict(X_test), output_dict=True))

In [None]:
# full report
# For every model: print its text classification report and write the same
# report (rounded to 2 decimals) as a LaTeX table to SML/results_<name>.txt.
for model, name in zip(models, names):
    # Predict once per model. The saved table previously always used
    # LSVC_T's predictions, so every results file held the same report.
    y_pred = model.predict(X_test)
    print(name)
    print(classification_report(y_test, y_pred))
    print('\n')
    cr = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))
    cr.round(2).to_latex(f'SML/results_{name}.txt')