In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

# models
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

# metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Data

In [2]:
# Read the train and test tables from the embeddings/ directory.
train_df = pd.read_csv('embeddings/full_train_df.csv')
test_df = pd.read_csv('embeddings/full_test_df.csv')

# The 'target' column is the label; all remaining columns are passed to the models.
train_y, y_true = train_df['target'], test_df['target']
train_x = train_df.drop('target', axis=1)
test_x = test_df.drop('target', axis=1)

# Classifiers

In [3]:
# Parallel result lists filled by get_metrics(), one entry per evaluated model:
# accuracy, precision, recall, and the model's display name.
accs, prcs, recs, nams = [], [], [], []

def get_metrics(model, model_name=''):
    """Fit `model` on the training split, score it on the test split, record the results.

    Fitting uses the notebook-level `train_x` / `train_y`; evaluation uses
    `test_x` / `y_true`. Accuracy, precision and recall are printed and stored
    in the notebook-level lists `accs`, `prcs` and `recs`, with the label in `nams`.

    Re-running a cell with a non-empty `model_name` that is already recorded
    overwrites that model's entry instead of appending a duplicate, so the four
    lists keep exactly one aligned row per named model. Calls with an empty
    `model_name` are always appended.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place on the training split.
    model_name : str, optional
        Label used as the prefix of the printed lines and stored in `nams`.

    Returns
    -------
    None
        Kept as None so that a trailing call in a cell produces no extra output.
    """
    model.fit(train_x, train_y)
    y_pred = model.predict(test_x)

    scores = (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
    )
    for label, value in zip(('accuracy', 'precision', 'recall'), scores):
        print(model_name + ' ' + label + ': ' + str(value))

    stores = (accs, prcs, recs)
    if model_name and model_name in nams:
        # Same model evaluated again (e.g. the cell was re-run): refresh its row
        # rather than growing the lists with a duplicate.
        idx = nams.index(model_name)
        for store, value in zip(stores, scores):
            store[idx] = value
    else:
        for store, value in zip(stores, scores):
            store.append(value)
        nams.append(model_name)

## SVM

In [4]:
mnm = 'SVM'
# Bind the estimator to its own name. Assigning it to `svm` would overwrite the
# imported `sklearn.svm` module, and a second run of this cell would then fail
# with AttributeError because an SVC instance has no `.SVC` attribute.
svc = svm.SVC()
get_metrics(svc, mnm)

SVM accuracy: 0.8628
SVM precision: 0.8593615185504746
SVM recall: 0.9915380786460926


## Naive Bayes

In [5]:
# Gaussian Naive Bayes with library-default settings.
gnb = GaussianNB()
get_metrics(model=gnb, model_name='Naive Bayes')

Naive Bayes accuracy: 0.7052
Naive Bayes precision: 0.8192771084337349
Naive Bayes recall: 0.812344449975112


## LightGBM

In [6]:
# LightGBM with a binary objective and a fixed seed; other settings are left unset.
lgbm = LGBMClassifier(
    objective='binary',
    random_state=33,
)
get_metrics(model=lgbm, model_name='LightGBM')

LightGBM accuracy: 0.8524
LightGBM precision: 0.8568320278503047
LightGBM recall: 0.9800895968143355


## Multi-layer Perceptron (DNN)

In [7]:
# MLP with a single hidden layer of 15 units, the L-BFGS solver and a fixed seed.
dnn = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(15,),
    random_state=33,
)
get_metrics(model=dnn, model_name='DNN')

DNN accuracy: 0.7008
DNN precision: 0.8129032258064516
DNN recall: 0.8153310104529616


# Inference with LightGBM

In [8]:
# Fit a LightGBM classifier while tracking AUC on the train and test splits.
# NOTE(review): this cell was labelled "LightGBM parameters found by Bayesian
# optimization", but only `objective` and `random_state` are passed here; the
# printed repr shows every other hyperparameter at its default. Confirm whether
# tuned parameters were meant to be supplied.
clf = LGBMClassifier(objective='binary', random_state=33)
# NOTE(review): the second eval_set entry is (test_x, y_true), i.e. the same split
# used for the reported metrics, so early stopping is monitored on the test data.
# NOTE(review): the repr shows n_estimators=100, so a patience of 300 rounds can
# never trigger (the log reports "Did not meet early stopping").
# NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords may not be
# accepted by newer LightGBM releases - confirm against the installed version.
clf.fit(train_x, train_y, 
        eval_set=[(train_x, train_y), (test_x, y_true)], 
        eval_metric= 'auc', verbose= 1000, early_stopping_rounds= 300)

Training until validation scores don't improve for 300 rounds
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.99999	training's binary_logloss: 0.106224	valid_1's auc: 0.843416	valid_1's binary_logloss: 0.356579


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective='binary',
               random_state=33, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)