# Company Bankruptcy — Classification Task
This notebook compares two algorithms, Logistic Regression and a Random Forest classifier, and addresses class imbalance via class weights.

In [None]:
from pathlib import Path

# Input data lives in a sibling `data` directory, one level above the notebook.
DATA_PATH = Path('..') / 'data'
CSV_FILE = DATA_PATH.joinpath('classification.csv')
CSV_FILE

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import (mean_squared_error, r2_score, classification_report,
                             confusion_matrix, roc_auc_score, RocCurveDisplay, PrecisionRecallDisplay)

from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.utils import compute_class_weight

from pathlib import Path
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides solver convergence warnings; confirm that is intended.
warnings.simplefilter("ignore")
# Let wide frames display up to 200 columns before pandas truncates them.
pd.options.display.max_columns = 200


## Load & Inspect

In [None]:

# Load the raw table. The label column is expected to be 'Y' (0/1) and the
# predictors mostly numeric (X1..X95).
target = 'Y'
df = pd.read_csv(CSV_FILE)
X = df.drop(columns=[target])
y = df[target].astype(int)

# Partition the predictor columns by dtype: numeric vs. everything else.
numeric_cols = X.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X.select_dtypes(exclude=np.number).columns.tolist()

# Quick sanity check: table size and share of positive labels.
print('Shape:', df.shape)
positive_rate = y.mean().round(4)
print('Positive rate:', positive_rate)
df.head()


## Train/Test Split & Preprocessing

In [None]:

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42,
)

# Numeric columns: median imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Non-numeric columns: most-frequent imputation, then one-hot encoding that
# ignores categories not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its own sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])


## Model 1 — Logistic Regression (with class_weight)

In [None]:

# Balanced class weights, computed once from the training labels and passed
# to the model as an explicit {class: weight} mapping.
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight = dict(zip(classes, weights))

logreg = Pipeline([
    ('prep', preprocessor),
    ('model', LogisticRegression(max_iter=1000, class_weight=class_weight, solver='lbfgs')),
])

# Tune the regularisation parameter C with 5-fold CV, ranked by ROC-AUC.
grid = {'model__C': [0.1, 1, 10]}
log_cv = GridSearchCV(logreg, grid, scoring='roc_auc', cv=5, n_jobs=-1)
log_cv.fit(X_train, y_train)
log_best = log_cv.best_estimator_
print('Best params:', log_cv.best_params_)

# Held-out evaluation: hard labels for the report, column 1 of predict_proba
# (the second class) as the score for the ranking metrics and curves.
y_pred = log_best.predict(X_test)
y_prob = log_best.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred, digits=3))
# Built-in round() works whether the metric comes back as a NumPy scalar or a
# plain Python float; the `.round()` method exists only on the former.
print('ROC-AUC:', round(roc_auc_score(y_test, y_prob), 3))
RocCurveDisplay.from_predictions(y_test, y_prob)
plt.show()
PrecisionRecallDisplay.from_predictions(y_test, y_prob)
plt.show()


## Model 2 — Random Forest Classifier (with class_weight='balanced')

In [None]:

# Random forest on top of the shared preprocessor; class imbalance is handled
# by the estimator's own class_weight='balanced' option.
rf = Pipeline([
    ('prep', preprocessor),
    ('model', RandomForestClassifier(class_weight='balanced', random_state=42)),
])

# 2 x 3 x 2 = 12 candidate settings, each scored by ROC-AUC with 5-fold CV.
grid = {
    'model__n_estimators': [400, 700],
    'model__max_depth': [None, 12, 20],
    'model__min_samples_split': [2, 5],
}
rf_cv = GridSearchCV(rf, grid, scoring='roc_auc', cv=5, n_jobs=-1)
rf_cv.fit(X_train, y_train)
rf_best = rf_cv.best_estimator_
print('Best params:', rf_cv.best_params_)

# Held-out evaluation: hard labels for the report, column 1 of predict_proba
# (the second class) as the score for the ranking metrics and curves.
y_pred = rf_best.predict(X_test)
y_prob = rf_best.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred, digits=3))
# Built-in round() works whether the metric comes back as a NumPy scalar or a
# plain Python float; the `.round()` method exists only on the former.
print('ROC-AUC:', round(roc_auc_score(y_test, y_prob), 3))
RocCurveDisplay.from_predictions(y_test, y_prob)
plt.show()
PrecisionRecallDisplay.from_predictions(y_test, y_prob)
plt.show()


## Confusion Matrices

In [None]:

def plot_cm(y_true, y_pred, title):
    """Draw a confusion matrix as an annotated heatmap and show it.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, in the same order as ``y_true``.
    title : str
        Title placed above the plot.
    """
    # Pin the (sorted) label set explicitly so the tick labels below are
    # guaranteed to line up with the matrix rows and columns.
    labels = np.union1d(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    fig, ax = plt.subplots()
    ax.imshow(cm)
    ax.set_title(title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

    # imshow's default ticks fall on fractional positions; put exactly one
    # tick per class and label it with the class value.
    ticks = np.arange(len(labels))
    ax.set_xticks(ticks)
    ax.set_xticklabels(labels)
    ax.set_yticks(ticks)
    ax.set_yticklabels(labels)

    # Write each count in the centre of its cell (x is the column, y the row).
    for (i, j), z in np.ndenumerate(cm):
        ax.text(j, i, str(z), ha='center', va='center')
    plt.show()

# One confusion matrix per tuned model, evaluated on the held-out split.
for fitted_model, cm_title in (
    (log_best, 'Logistic Regression — CM'),
    (rf_best, 'RandomForest — CM'),
):
    plot_cm(y_test, fitted_model.predict(X_test), cm_title)


## Feature Importance (RF) & Save outputs

In [None]:

# Rebuild the post-preprocessing column names: the numeric columns first,
# then the one-hot expanded categorical columns (the 'num', 'cat' order of
# the ColumnTransformer).
prep = rf_best.named_steps['prep']
ohe = prep.named_transformers_['cat'].named_steps.get('onehot', None)
# Only ask the encoder for names when there are categorical columns at all.
if ohe is not None and len(categorical_cols):
    cat_names = ohe.get_feature_names_out(categorical_cols)
else:
    cat_names = []
feature_names = list(numeric_cols) + list(cat_names)

# Keep the 20 features with the largest importance in the fitted forest.
importances = rf_best.named_steps['model'].feature_importances_
fi = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values('importance', ascending=False)
    .head(20)
)
fi.plot(kind='barh', x='feature', y='importance')
plt.gca().invert_yaxis()  # largest importance at the top
plt.title('Top 20 Feature Importances — RF')
plt.show()

# to_csv does not create missing directories, so make sure the target exists.
reports_dir = Path('../reports')
reports_dir.mkdir(parents=True, exist_ok=True)
fi.to_csv(reports_dir / 'feature_importance_bankruptcy.csv', index=False)

# Side-by-side hard predictions from both tuned models on the held-out split.
out = pd.DataFrame({'y_true': y_test,
                    'logreg_pred': log_best.predict(X_test),
                    'rf_pred': rf_best.predict(X_test)})

# to_csv does not create missing directories, so make sure the target exists.
reports_dir = Path('../reports')
reports_dir.mkdir(parents=True, exist_ok=True)
out.to_csv(reports_dir / 'bankruptcy_predictions.csv', index=False)
out.head()
