***SECTION 3: Train Model 1: Stacking Method***

In [95]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [96]:
# Inputs for the cluster classifier: the selected feature columns of `data`
# and the cluster label stored in its 'Cluster_ID' column.
X = data[fifty_features]
y_cluster = data['Cluster_ID']

# Hold out 20% of the rows (fixed seed) for evaluating the group model.
(
    X_train_cluster,
    X_test_cluster,
    y_train_cluster,
    y_test_cluster,
) = train_test_split(X, y_cluster, test_size=0.2, random_state=42)

In [97]:
# Random forest that predicts each row's cluster label from the feature matrix.
group_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
group_model.fit(X=X_train_cluster, y=y_train_cluster)

In [98]:
# Evaluate the group (cluster) model on the held-out split.
# confusion_matrix is imported in this cell because this is the first cell
# that calls it; the section's import cell only brings in
# classification_report and accuracy_score, so a fresh
# "Restart Kernel -> Run All" would otherwise hit a NameError here.
from sklearn.metrics import confusion_matrix

y_pred_cluster = group_model.predict(X_test_cluster)
print("Accuracy of group prediction model:", accuracy_score(y_test_cluster, y_pred_cluster))
print("Classification Report for Base Model:")
# zero_division=0 reports 0.00 (instead of warning) for a class that the
# model never predicts.
print(classification_report(y_test_cluster, y_pred_cluster, zero_division=0))
print("Confusion Matrix for Base Model:")
print(confusion_matrix(y_test_cluster, y_pred_cluster))

Accuracy of group prediction model: 0.9432013769363167
Classification Report for Base Model:
              precision    recall  f1-score   support

           0       0.95      0.93      0.94       321
           1       0.93      0.94      0.93       242
           2       0.95      0.95      0.95       598
           4       0.00      0.00      0.00         1

    accuracy                           0.94      1162
   macro avg       0.71      0.71      0.71      1162
weighted avg       0.94      0.94      0.94      1162

Confusion Matrix for Base Model:
[[298   3  20   0]
 [  3 228  11   0]
 [ 13  15 570   0]
 [  1   0   0   0]]


In [99]:
from sklearn.ensemble import StackingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Target for the bankruptcy classifier.
y_bankruptcy = data['Bankrupt?']

# Reuses the same feature matrix X, test fraction and seed as the cluster
# split, so both models should be evaluated on the same held-out rows.
(
    X_train,
    X_test,
    y_train,
    y_test,
) = train_test_split(X, y_bankruptcy, test_size=0.2, random_state=42)

In [100]:
# Level-0 learners for the stacking ensemble, given as (name, estimator) pairs.
# Every estimator is seeded so repeated runs build the same models.
base_models = [
    (
        'rf',
        RandomForestClassifier(random_state=42, n_estimators=100),
    ),
    (
        'gb',
        GradientBoostingClassifier(random_state=42, n_estimators=100),
    ),
    (
        'dt',
        DecisionTreeClassifier(random_state=42),
    ),
]

In [101]:
# Meta-model: passed to the stacking classifier as its final estimator.
# Constructed with LogisticRegression's default settings (no arguments given).
meta_model = LogisticRegression()

In [102]:
# Stack the base models under the logistic-regression meta-model (cv=5),
# then fit the whole ensemble on the bankruptcy training split.
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
)
stacking_model.fit(X_train, y_train)

In [103]:
# Score the fitted stacking ensemble on the held-out bankruptcy split.
y_pred = stacking_model.predict(X_test)
print("Accuracy of stacking model:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy of stacking model: 0.96815834767642
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1124
           1       0.53      0.21      0.30        38

    accuracy                           0.97      1162
   macro avg       0.75      0.60      0.64      1162
weighted avg       0.96      0.97      0.96      1162



In [104]:
# Hold-out accuracy of the two models trained in this section, one per line.
print("Final Report:")
for _report_label, _report_truth, _report_pred in (
    ("Group Prediction Model Accuracy:", y_test_cluster, y_pred_cluster),
    ("Bankruptcy Prediction Model Accuracy:", y_test, y_pred),
):
    print(_report_label, accuracy_score(_report_truth, _report_pred))

Final Report:
Group Prediction Model Accuracy: 0.9432013769363167
Bankruptcy Prediction Model Accuracy: 0.96815834767642


In [105]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the stacking model's hold-out predictions
# (rows: actual class, columns: predicted class).
bankruptcy_conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix for Meta Model:")
print(bankruptcy_conf_mat)

Confusion Matrix for Meta Model:
[[1117    7]
 [  30    8]]


TN = 1117: The model correctly predicted 1117 companies as not going bankrupt.
FP = 7: The model incorrectly predicted 7 companies as going bankrupt when they did not.
FN = 30: The model incorrectly predicted 30 companies as not going bankrupt when they actually did.
TP = 8: The model correctly predicted 8 companies as going bankrupt.

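High TN, low TP: the model is very conservative in predicting bankruptcy. It identifies only 8 of the 38 actual bankruptcies in the test set (recall ≈ 0.21), so the 97% accuracy mostly reflects the dominant non-bankrupt class.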

***SECTION 4: Train Model 2: k-fold Cross Validation***

In [108]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as IMBPipeline

In [109]:
# Display the current contents of `fifty_features` (the feature-name index).
fifty_features

Index([' Net profit before tax/Paid-in capital',
       ' Net Income to Stockholder's Equity', ' Net worth/Assets',
       ' Equity to Liability', ' Liability to Equity', ' Cash/Total Assets',
       ' Net Value Per Share (C)', ' Interest Expense Ratio',
       ' Working Capital/Equity',
       ' Interest Coverage Ratio (Interest expense to EBIT)',
       ' Retained Earnings to Total Assets',
       ' Degree of Financial Leverage (DFL)',
       ' Continuous interest rate (after tax)', ' Net Income to Total Assets',
       ' Non-industry income and expenditure/revenue',
       ' Working Capital to Total Assets',
       ' ROA(B) before interest and depreciation after tax',
       ' Total income/Total expense', ' Operating profit/Paid-in capital',
       ' Operating profit per person', ' Current Liability to Current Assets',
       ' Equity to Long-term Liability', ' Inventory/Working Capital',
       ' Current Liability to Assets', ' No-credit Interval',
       ' Gross Profit to Sales', 

In [110]:
# NOTE(review): this rebinds `fifty_features` to a positional slice of
# `data.columns` (positions 1 through 34, i.e. 34 names), replacing the
# selection that was used to build X earlier. Re-running earlier cells after
# this one would therefore use a different feature set — confirm this is intended.
fifty_features = data.columns[1:35]  # Assumes these positions hold the intended feature columns — TODO confirm
y = data['Bankrupt?']  # Bankruptcy label; same column as y_bankruptcy

In [111]:
# Setup pipeline with SMOTE and scaling.
# SMOTE is seeded so the synthetic minority samples, and therefore the
# cross-validation scores and confusion matrix computed from this pipeline,
# are reproducible from run to run (it was previously unseeded).
smote = SMOTE(random_state=42)
scaler = StandardScaler()
model = LogisticRegression(max_iter=1000, random_state=42)  # Increased max_iter

# The imblearn Pipeline is used (rather than sklearn's) because it accepts a
# sampler step; resampling is applied when fitting, not when predicting.
# NOTE(review): SMOTE runs before the scaler here, so its nearest-neighbour
# search operates on unscaled features — confirm this ordering is intended.
pipeline = IMBPipeline(steps=[
    ('smote', smote),
    ('scaler', scaler),  # Standardise features after oversampling
    ('classifier', model)
])

In [112]:
# Stratified 5-fold cross-validation (shuffled, fixed seed) of the pipeline,
# scored by accuracy on each fold.
# NOTE(review): `selected_features` is not defined in this section and differs
# from the `fifty_features` name used elsewhere — verify which feature set is
# intended here.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_val_score(
    pipeline,
    data[selected_features],
    y,
    cv=kfold,
    scoring=make_scorer(accuracy_score),
)


In [113]:
# Report the per-fold and mean cross-validation accuracy.
print("Cross-validation accuracy scores:", cv_results)
print("Mean CV accuracy:", cv_results.mean())
# cross_val_score does not leave `pipeline` fitted, so fit it explicitly on the
# training split before predicting.
# NOTE(review): X_train/X_test come from the earlier train_test_split on X,
# whereas the cross-validation above used data[selected_features] — confirm
# both refer to the same feature set.
pipeline.fit(X_train, y_train)  # Ensure fitting before prediction if not already done
y_pred_cv = pipeline.predict(X_test)
# The matrix below is computed on the single hold-out split (y_test), not
# aggregated over the CV folds, despite the "K-Fold CV" label.
print("Confusion Matrix for K-Fold CV [TT(TF)]:")
print(confusion_matrix(y_test, y_pred_cv))

Cross-validation accuracy scores: [0.8777969  0.87693632 0.86907838 0.89750215 0.87252369]
Mean CV accuracy: 0.8787674878176419
Confusion Matrix for K-Fold CV [TT(TF)]:
[[986 138]
 [  6  32]]
