In [None]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import ComplementNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import BorderlineSMOTE, ADASYN
from collections import defaultdict
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import balanced_accuracy_score, classification_report, roc_auc_score
from yellowbrick.model_selection import ValidationCurve
from yellowbrick.classifier import ROCAUC
import warnings
# Single seed passed to the train/test splitter and to every resampler below,
# so repeated runs of the notebook produce the same split and synthetic samples.
random_state=42

Using TensorFlow backend.


In [None]:
#data = pd.read_csv("/Users/max/Quick Jupyter Notebooks/MMAI/MMAI 823 - Finance/Bankruptcy_data_Final.csv", header=0)

In [None]:
# Load the cleaned dataset from a project-relative path.
# (Replaces the retired in-notebook step `data_noNa = data.dropna()`; presumably
# the CSV already has its NaN rows dropped - confirm against how it was produced.)
data_noNa = pd.read_csv('data/cleaned_data.csv')

In [None]:
# Pairwise correlation matrix of the loaded columns (pandas default method:
# Pearson); it is visualised as a heatmap in the next cell.
corr = data_noNa.corr()

In [None]:
# Render the correlation matrix as a square heatmap on a fixed [-1, 1] colour
# scale centred on 0, so positive and negative correlations get opposite hues.
diverging_cmap = sns.diverging_palette(20, 220, n=200)
ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0, cmap=diverging_cmap, square=True)

# Tilt the x-axis labels and anchor them on their right edge so long column
# names stay readable.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')

In [None]:
# Scale a deep copy so the frame loaded from disk is left untouched.
scaled_data_noNan = data_noNa.copy(deep=True)

# Both scalers are instantiated here; the scaling cell below only applies the
# min-max one.
minmax_scaler = MinMaxScaler()
std_scaler = StandardScaler()

In [None]:
# Min-max scale every column into [0, 1], then split into train/test.
#
# Only MinMaxScaler is used: the Naive Bayes model fitted later requires
# non-negative feature values, which rules out StandardScaler.

# The first 13 columns are the predictors; the last column is the target.
X = scaled_data_noNan.iloc[:, :13]
y = scaled_data_noNan.iloc[:, -1:]

# One stratified 80/20 shuffle split. The splitter's default of 10 splits was
# previously generated in full with only the last one kept; asking for a single
# split is cheaper and explicit. NOTE: the resulting partition therefore differs
# from the one earlier runs of this notebook used.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=random_state)
train_idx, test_idx = next(sss.split(X, y))

# Learn each column's min/max from the TRAINING rows only, so no information
# about the test rows leaks into preprocessing. Test values that fall outside
# the training range are clipped back into [0, 1] to keep every feature
# non-negative for the classifier.
# `DataFrame.items()` replaces `iteritems()`, which was removed in pandas 2.0.
for (columnName, columnData) in scaled_data_noNan.items():
    column = columnData.values.reshape(-1, 1)
    minmax_scaler.fit(column[train_idx])
    scaled_data_noNan[columnName] = minmax_scaler.transform(column).clip(0, 1)

# Re-slice now that the frame holds scaled values.
X = scaled_data_noNan.iloc[:, :13]
y = scaled_data_noNan.iloc[:, -1:]

# Stratified Shuffle Split into Train/Test.
# X_* stay DataFrames; y_* are (n, 1) NumPy arrays, as before.
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx].values, y.iloc[test_idx].values

In [None]:
# Resampling
#
# Build one rebalanced copy of the training split per strategy. Each entry of
# `resampled` is the resampled feature frame with the resampled target appended
# as a trailing "BK" column, keyed by the strategy name.

resamplers = {
    'SMOTEENN': SMOTEENN(random_state=random_state),                # SMOTE + Edited Nearest Neighbours
    'SMOTETomek': SMOTETomek(random_state=random_state),            # SMOTE + Tomek links
    'BorderlineSMOTE': BorderlineSMOTE(random_state=random_state),
    'ADASYN': ADASYN(random_state=random_state),
}

# A plain dict: the previous factory-less defaultdict() behaved identically.
resampled = {}

for resampler_name, resampler in resamplers.items():
    # Flatten the (n, 1) target to 1-D, the shape the samplers expect; this
    # avoids a DataConversionWarning on every call.
    X_rs, y_rs = resampler.fit_resample(X_train, y_train.ravel())
    # Relies on fit_resample returning a DataFrame when X_train is one
    # (true for recent imbalanced-learn releases - confirm for the pinned version).
    X_rs["BK"] = y_rs
    resampled[resampler_name] = X_rs


## Assumptions
 - Naive Bayes classifiers assume that the predictors are conditionally independent given the class and contribute equally to the outcome; however, we are fairly confident that this does not hold for our data.

In [None]:
# corr_normalized = scaled_data_noNan.corr()

# ax = sns.heatmap(
#     corr_normalized, 
#     vmin=-1, vmax=1, center=0,
#     cmap=sns.diverging_palette(20, 220, n=200),
#     square=True
# )
# ax.set_xticklabels(
#     ax.get_xticklabels(),
#     rotation=45,
#     horizontalalignment='right'
# )

In [None]:
# Complement Naive Bayes classifier with default hyper-parameters. This single
# instance is re-fitted for the baseline and again for each resampled training set.
model = ComplementNB()

In [None]:
# Baseline Model - No Sampling
#
# Fit on the training split as-is and score on the held-out test split.
# (The `rf_` prefix on the score variables is kept unchanged so any later cell
# that reads them keeps working.)

# Flatten the (n, 1) targets to 1-D, the shape scikit-learn expects.
model.fit(X_train, y_train.ravel())
y_true = y_test.ravel()

predictions = model.predict(X_test)
# Score for the positive class, i.e. the second entry of model.classes_.
positive_scores = model.predict_proba(X_test)[:, 1]

# Balanced Accuracy Score (computed from hard class predictions)
rf_score = balanced_accuracy_score(y_true, predictions)
# AUC Score - must be computed from continuous scores, not hard labels;
# with 0/1 predictions ROC AUC collapses to the balanced accuracy above.
rf_auc = roc_auc_score(y_true, positive_scores)

print('Benchmark Balanced Accuracy: ' + str(rf_score))
print('Benchmark AUC Score:' + str(rf_auc))
print(classification_report(y_true, predictions, digits=3))


In [None]:
# Re-fit the classifier on each resampled training set and score it on the
# same, untouched test split used for the baseline.

# Registered once (not on every iteration). This silences all warnings for the
# rest of the session, exactly as before.
warnings.filterwarnings("ignore")

y_true = y_test.ravel()

# `sampler_name` replaces the old loop variable `set`, which shadowed the
# builtin and broke any later `set(...)` call in the notebook.
for sampler_name, resampled_train in resampled.items():
    # Local names on purpose: X_train / y_train are no longer overwritten, so
    # re-running the baseline cell afterwards still uses the original split.
    X_resampled = resampled_train.iloc[:, :13]
    y_resampled = resampled_train.iloc[:, -1].values  # 1-D target ("BK" column)

    model.fit(X_resampled, y_resampled)
    predictions = model.predict(X_test)
    # Score for the positive class, i.e. the second entry of model.classes_.
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Balanced Accuracy Score (computed from hard class predictions)
    rf_score = balanced_accuracy_score(y_true, predictions)
    # AUC Score - computed from continuous scores; hard 0/1 predictions would
    # just reproduce the balanced accuracy.
    rf_auc = roc_auc_score(y_true, positive_scores)

    print('Resampled Using ' + str(sampler_name))
    print('Benchmark Balanced Accuracy: ' + str(rf_score))
    print('Benchmark AUC Score:' + str(rf_auc))
    print(classification_report(y_true, predictions, digits=3))