In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.naive_bayes import MultinomialNB, BernoulliNB

# Modeling

In [4]:
# Load the modeling table; the first CSV column becomes the row index.
df = pd.read_csv('target_features.csv', index_col=0)

In [10]:
# Sanity check: (rows, columns) of the loaded table.
df.shape

(7276, 136)

In [8]:
# Column 0 is taken as the target; every remaining column is a feature.
target, features = df.iloc[:, 0], df.iloc[:, 1:]

In [16]:
# Show both shapes (one per line) to confirm the row counts match.
print(target.shape, features.shape, sep="\n")

(7276,)
(7276, 135)


Create a holdout set of 20% that we can test our best models on at the end (in order to verify that our model generalizes).

In [17]:
# Pin the shuffle seed so the 80/20 holdout split -- and every score computed
# from it -- is identical after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=.2, random_state=42
)

In [38]:
# One unfitted instance of each candidate scaler; they are fit on X_train in the next cell.
robust_scaler = RobustScaler()
minmax_scaler = MinMaxScaler()
standard_scaler = StandardScaler()

In [39]:
# A fitted scaler has to be kept around so the same transform can be applied
# to X_test later. A dedicated MinMaxScaler handles the raw features so that
# `minmax_scaler` is fit exactly once (on the robust-scaled data) and neither
# fit overwrites the other.
raw_minmax_scaler = MinMaxScaler()

outlier_scaled_X_train = robust_scaler.fit_transform(X_train)
minmax_scaled_X_train = raw_minmax_scaler.fit_transform(X_train)
standard_scaled_X_train = standard_scaler.fit_transform(X_train)
# Robust scaling followed by min-max scaling; `minmax_scaler` holds this fit.
minmax_outlier_scaled_X_train = minmax_scaler.fit_transform(outlier_scaled_X_train)

# NOTE(review): every scaler here is fit on all of X_train, so cross-validation
# run on these matrices scales each validation fold with statistics that
# include its own rows. Consider wrapping scaler + model in a Pipeline.

## Naive Bayes Classifier

In [34]:
# Two Naive Bayes variants, both with default hyper-parameters.
mn_nb_clf = MultinomialNB()
bn_nb_clf = BernoulliNB()

In [43]:
# MultinomialNB rejects negative feature values, so it is scored on the
# robust-then-min-max scaled matrix. Displays the mean score over 5 folds.
cross_val_score(
    mn_nb_clf,
    minmax_outlier_scaled_X_train,
    y_train,
    cv=5,
    n_jobs=-1,
    verbose=1,
).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.5s finished


0.7513658605238794

In [44]:
# BernoulliNB binarizes each feature at its default threshold (binarize=0.0),
# so it is scored on the robust-scaled matrix. Displays the mean over 5 folds.
cross_val_score(
    bn_nb_clf,
    outlier_scaled_X_train,
    y_train,
    cv=5,
    n_jobs=-1,
    verbose=1,
).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.9s finished


0.7444924137804569

## Extra-trees Classifier

In [45]:
# Extra-trees model. `random_state` pins the randomized tree construction so
# the cross-validated score is repeatable; max_features is written as 0.75
# (the previous 0.7500000000000001 was float noise and selects the same
# number of features).
et_clf = ExtraTreesClassifier(
    bootstrap=False,
    criterion="entropy",
    max_features=0.75,
    min_samples_leaf=1,
    min_samples_split=19,
    n_estimators=100,
    random_state=42,
)

In [47]:
# Tree ensembles do not need feature scaling, so the raw training features
# are used. Displays the mean score over 5 folds.
cross_val_score(
    et_clf,
    X_train,
    y_train,
    cv=5,
    n_jobs=-1,
    verbose=True,
).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   27.0s finished


0.9484522504368751