## Matthew Kearney - Final Project - Group 3 

### SECTION3 : Train Model-Stacking Method


imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression


In [2]:
df = pd.read_csv('Section1&2(Clusters).csv')

Splitting the data

In [3]:
X = df.iloc[:,:-3]
y = df.iloc[:,-2]

In [4]:
X4 = X[df.iloc[:,-1] == 4] 
X5 = X[df.iloc[:,-1] == 5]

y4 = y[df.iloc[:,-1] == 4]
y5 = y[df.iloc[:,-1] == 5]


print("Cluster 4 size:", len(X4), "\n\tNumber bankrupt: ", y4.sum(), "\nCluster 5 size", len(X5), "\n\tNumber bankrupt: ", y5.sum())

Cluster 4 size: 702 
	Number bankrupt:  137 
Cluster 5 size 1215 
	Number bankrupt:  0


# Cluster 4

### KNN model

Any number of neighbors (reasonably high) produces high accuracy. Distance weighting should also be used opposed to uniform. Uniform weights will not be able to place importance by the closeness of points to its neighbors, and thus is generally regarded as less strong. This produces our accuracy up to 1.0

In [5]:
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')  # n_neighbors and weights
knn.fit(X4, y4)
knn_pred = knn.predict(X4)
knn_accuracy = accuracy_score(y4, knn_pred)
knn_conf = confusion_matrix(y4, knn_pred )

print(f"{knn_accuracy}")
print(f"{knn_conf}")

1.0
[[565   0]
 [  0 137]]


KNN: <br>
TT / (TT + TF) = 0.8048433

### Random Forest Classification

Using 64-128 decision trees is typically suggested, but 'max_depth' was found as a more useful hyperparameter to tune which we increased to 12 and reached 100% classification accuracy.

In [6]:
rf = RandomForestClassifier(n_estimators=64, max_depth=12, random_state=42)  # number of trees and maximum depth
rf.fit(X4, y4)
rf_pred = rf.predict(X4)
rf_accuracy = accuracy_score(y4, rf_pred)
rf_conf = confusion_matrix(y4, rf_pred )

print(f"{rf_accuracy}")
print(f"{rf_conf}")

1.0
[[565   0]
 [  0 137]]


RF: <br>
TT / (TT + TF) = 0.8048433

### Gradient Boosting

Max depth at 5 allows for full accuracy with gradient boosting and the learning rate is found at 0.1 as a solid tradeoff in overfit-underfit. Number of trees should be relatively high to achieve high accuracy.

In [7]:
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)  # number of trees, learning rate, and maximum depth
gb.fit(X4, y4)
gb_pred = gb.predict(X4)
gb_accuracy = accuracy_score(y4, gb_pred)
gb_conf = confusion_matrix(y4, gb_pred )

print(f"{gb_accuracy}")
print(f"{gb_conf}")

1.0
[[565   0]
 [  0 137]]


GB: <br>
TT / (TT + TF) = 0.8048433

Using the same clusters results in the same confusion matrix as we get 100% accuracy

Let's store the newly obtained data

In [8]:
df4 = pd.DataFrame()
df4['knn']=knn_pred
df4['rf']=rf_pred
df4['gb']=gb_pred

meta_learner = LogisticRegression(class_weight='balanced', random_state=42)
meta_learner.fit(df4, y4)
meta_learner_pred = meta_learner.predict(df4)

meta_learner_cm = confusion_matrix(y4, meta_learner_pred)
print("Meta learner Confusion Matrix:\n",meta_learner_cm)
accuracy_score_meta = accuracy_score(y4, meta_learner_pred)
print("Meta learner Accuracy:\n", accuracy_score_meta)

Meta learner Confusion Matrix:
 [[565   0]
 [  0 137]]
Meta learner Accuracy:
 1.0


We have accurately classified 137 bankrupt companies and 565 non-bankrupt companies.

In [9]:
# # Save the meta learner
# from joblib import dump
# dump(meta_learner, 'meta_learner1.joblib')

# Cluster 5

### KNN

In [10]:
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')  # n_neighbors and weights
knn.fit(X5, y5)
knn_pred = knn.predict(X5)
knn_accuracy = accuracy_score(y5, knn_pred)
knn_conf = confusion_matrix(y5, knn_pred )

print(f"{knn_accuracy}")
print(f"{knn_conf}")

1.0
[[1215]]


### Random Forest 

In [11]:
rf = RandomForestClassifier(n_estimators=64, max_depth=12, random_state=42)  # number of trees and maximum depth
rf.fit(X5, y5)
rf_pred = rf.predict(X5)
rf_accuracy = accuracy_score(y5, rf_pred)
rf_conf = confusion_matrix(y5, rf_pred )

print(f"{rf_accuracy}")
print(f"{rf_conf}")

1.0
[[1215]]


### Multilayer Perceptron

In [12]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# MLP Classifier
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.0001, max_iter=1000, random_state=42)  # Set hidden layer sizes, activation function, solver, regularization parameter alpha, and maximum number of iterations
mlp.fit(X5, y5)
mlp_pred = mlp.predict(X5)
mlp_accuracy = accuracy_score(y5, mlp_pred)
mlp_conf = confusion_matrix(y5, mlp_pred)

print(mlp_accuracy)
print("Confusion Matrix:")
print(mlp_conf)


1.0
Confusion Matrix:
[[1215]]


There are no bankrupt companies in this cluster, so we can not attempt classification.

### SECTION4 : K_Fold Cross Validation on the Whole Dataset

In [13]:
df = pd.read_csv('Section1&2(Clusters).csv')
df.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,Cluster,Bankrupt?,Predicted_Cluster
0,2.65821,-1.40073,-0.381088,0.452337,0.858035,-0.012784,-0.381166,0.117564,0.221323,1,0,1
1,-1.428676,0.461506,-1.030601,-0.234852,-0.029953,-0.033227,-0.566027,0.076135,0.44051,5,0,5
2,-2.775424,1.442919,-0.711948,0.144689,-0.152528,0.16408,0.258936,-0.555779,-0.286176,5,0,5
3,2.403655,4.045684,0.326334,-1.356795,-0.638412,-0.146502,-0.058012,0.546713,-0.348422,0,0,0
4,0.865429,0.283041,0.526507,-0.195726,-0.437119,-0.979682,-0.145639,-0.450059,0.824183,0,0,0


In [14]:
X = df.iloc[:,0:9]
y = df.iloc[:,10]  # ~ bankrupt  

In [15]:
y.sum()/ len(y)

0.0340967797485793

Reminder to the proportion of our data that is bankrupt. K (K-folds/ n_splits) is set to 29 as m = 5807 / t = df[y].sum() = 198 is about 29

In [16]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier

# clf = DecisionTreeClassifier(random_state=42)
clf = RandomForestClassifier(n_estimators=64, random_state=42)

k_folds = KFold(n_splits = 29)

scores = cross_val_score(clf, X, y, cv = k_folds)

print("Cross Validation Scores: ", scores)
print("Average CV Score: ", scores.mean())
print("Number of CV Scores used in Average: ", len(scores))

Cross Validation Scores:  [0.97512438 0.97014925 0.97014925 0.960199   0.95024876 0.9800995
 0.95024876 0.98       0.96       0.955      0.97       0.965
 0.965      0.94       0.975      0.99       0.945      0.975
 0.97       0.99       0.97       0.96       0.965      0.96
 0.95       0.98       0.965      0.975      0.965     ]
Average CV Score:  0.9664213415680221
Number of CV Scores used in Average:  29


We attain an average cross-validation score of 0.966 which is sufficient for model evaluation using k learning experiments.