In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

In [11]:
df = pd.read_csv('/Users/Bren/Coding/Bankruptcy prediction/TEJ updated2.csv')
pd.set_option('display.max_columns', None)

In [12]:
independant = df.drop(["Flag"], axis=1)
dependant = df["Flag"]
y = dependant
X = independant
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2)

In [13]:
sm = SMOTETomek(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

dummy_model = DummyClassifier(random_state=11)
dummy_model.fit(X_res, y_res)
dy_pred = dummy_model.predict(X_test)


X_test.shape, X_train.shape , y_test.shape, y_train.shape , dy_pred.shape, X_res.shape , y_res.shape

((1364, 95), (5455, 95), (1364,), (5455,), (1364,), (10576, 95), (10576,))

In [14]:
f1score = f1_score(y_test, dy_pred, average='macro')
print(f1score)

c_matrix = confusion_matrix(y_test, dy_pred)
print(c_matrix)

print(classification_report(y_pred = dy_pred, y_true = y_test, zero_division = 1))

0.4900934579439252
[[1311    0]
 [  53    0]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1311
           1       1.00      0.00      0.00        53

    accuracy                           0.96      1364
   macro avg       0.98      0.50      0.49      1364
weighted avg       0.96      0.96      0.94      1364



In [15]:
logisticclf = LogisticRegression(random_state=0, max_iter = 7600).fit(X_res,y_res)
lr_pred = logisticclf.predict(X_test)
f1score = f1_score(y_test, lr_pred, average='macro')
print(f1score)
lrcm = confusion_matrix(y_test, lr_pred)
print(lrcm)

0.6440206493994234
[[1155  156]
 [   8   45]]


In [16]:
rcf = RandomForestClassifier(max_depth=2, random_state=0)
rcf.fit(X_res,y_res)
rcf_pred = rcf.predict(X_test)
f1score = f1_score(y_test, rcf_pred, average='macro')
print(f1score)

c_matrix = confusion_matrix(y_test, rcf_pred)
print(c_matrix)
print(classification_report(y_pred = rcf_pred, y_true = y_test, zero_division = 1))

0.622564935064935
[[1142  169]
 [  11   42]]
              precision    recall  f1-score   support

           0       0.99      0.87      0.93      1311
           1       0.20      0.79      0.32        53

    accuracy                           0.87      1364
   macro avg       0.59      0.83      0.62      1364
weighted avg       0.96      0.87      0.90      1364



In [17]:
gbc = GradientBoostingClassifier(n_estimators=100 ,learning_rate=1.0 ,max_depth=1 ,random_state=0 ).fit(X_res,y_res)
gbc.fit(X_res,y_res)
gbc_pred = gbc.predict(X_test)
f1score = f1_score(y_test, gbc_pred, average='macro')
print(f1score)

c_matrix = confusion_matrix(y_test, gbc_pred)
print(c_matrix)
print(classification_report(y_pred = gbc_pred, y_true = y_test, zero_division = 1))

0.6490879982231929
[[1187  124]
 [  15   38]]
              precision    recall  f1-score   support

           0       0.99      0.91      0.94      1311
           1       0.23      0.72      0.35        53

    accuracy                           0.90      1364
   macro avg       0.61      0.81      0.65      1364
weighted avg       0.96      0.90      0.92      1364



In [18]:
importance = pd.DataFrame()
importance["column"] = X_train.columns
importance["importance"] = gbc.feature_importances_
importance_sort = importance.sort_values(by = "importance", ascending = False)
importance_sort.head(15)

Unnamed: 0,column,importance
9,Recurring Interest Rate (After Tax),0.49739
48,Fixed Assets Number Of Turnovers,0.240513
39,Borrowing Dependence,0.152103
8,Industry External Income And Expenses/Revenue,0.026732
85,Net Income To Total Assets,0.021058
91,Degree Of Financial Leverage (Dfl),0.016555
40,Contingent Liabilities/Net Value,0.011092
35,Total Liabilities/Total Net Worth,0.006793
46,Average Collection Days,0.005717
80,Cash Flow To Liability,0.005684


In [19]:
from sklearn.model_selection import GridSearchCV


In [29]:
param_grid = {'n_estimators' : [5,10,25,50,100],
              'max_depth' :  [1,2,3,4,5],
              'max_features' : [9,11,13,15,17]
}

grid = GridSearchCV(GradientBoostingClassifier(),param_grid, scoring = "precision", n_jobs= 4, cv = 5)
grid.fit(X_res,y_res)

In [30]:
print(grid.best_params_)

{'max_depth': 5, 'max_features': 15, 'n_estimators': 100}


In [31]:
gbc_best = GradientBoostingClassifier(n_estimators=100 ,learning_rate=1.0 ,max_depth=5 ,max_features= 15,random_state=0 ).fit(X_res,y_res)
gbc_best.fit(X_res,y_res)
gbc_best_pred = gbc_best.predict(X_test)
f1score_best = f1_score(y_test, gbc_best_pred, average='macro')
print(f1score_best)

c_matrix_best = confusion_matrix(y_test, gbc_best_pred)
print(c_matrix_best)
print(classification_report(y_pred = gbc_best_pred, y_true = y_test, zero_division = 1))

0.7252864782276547
[[1280   31]
 [  27   26]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1311
           1       0.46      0.49      0.47        53

    accuracy                           0.96      1364
   macro avg       0.72      0.73      0.73      1364
weighted avg       0.96      0.96      0.96      1364

