In [1]:
# 📘 Assignment 1: Fintech Firm AI Transition - Full Pipeline Code

In [77]:
# Step 1: Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, mean_squared_error, silhouette_score
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
import joblib
import warnings
warnings.filterwarnings("ignore")

In [78]:
# ✅ Step 2: Load Dataset
df = pd.read_csv("/content/data.csv")
print("Dataset Shape:",df.shape)
display(df.head())

Dataset Shape: (6819, 96)


Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,1,0.370594,0.424389,0.40575,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,...,0.716845,0.009219,0.622879,0.601453,0.82789,0.290202,0.026601,0.56405,1,0.016469
1,1,0.464291,0.538214,0.51673,0.610235,0.610235,0.998946,0.79738,0.809301,0.303556,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,1,0.426071,0.499019,0.472295,0.60145,0.601364,0.998857,0.796403,0.808388,0.302035,...,0.77467,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474
3,1,0.399844,0.451265,0.457733,0.583541,0.583541,0.9987,0.796967,0.808966,0.30335,...,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1,0.023982
4,1,0.465022,0.538432,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,...,0.795016,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1,0.03549


In [79]:
# Step 3: Preprocessing & Cleaning
print("\nData Overview")
print(df.info())
print(df.describe())
print("\nMissing Values:\n", df.isnull().sum())

# Impute missing values
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)


Data Overview
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6819 entries, 0 to 6818
Data columns (total 96 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Bankrupt?                                                 6819 non-null   int64  
 1    ROA(C) before interest and depreciation before interest  6819 non-null   float64
 2    ROA(A) before interest and % after tax                   6819 non-null   float64
 3    ROA(B) before interest and depreciation after tax        6819 non-null   float64
 4    Operating Gross Margin                                   6819 non-null   float64
 5    Realized Sales Gross Margin                              6819 non-null   float64
 6    Operating Profit Rate                                    6819 non-null   float64
 7    Pre-tax net Interest Rate                                6819 non-null   float64
 8    Af

In [80]:
# Step 4: Business Strategy Formation (Segmentation)
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_imputed.drop(columns=['Bankrupt?']))

kmeans = KMeans(n_clusters=3, random_state=42)
df_imputed['Cluster'] = kmeans.fit_predict(df_scaled)

cluster_means = df_imputed.groupby('Cluster').mean()
print("\nCluster Profiles:\n")
display(cluster_means)


Cluster Profiles:



Unnamed: 0_level_0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.000828,0.554429,0.609342,0.601845,0.615563,0.615504,0.999091,0.797757,0.809632,0.303911,...,0.836318,8318841.0,0.62428,0.615561,0.842779,0.277921,0.026817,0.565272,1.0,0.06509
1,0.048864,0.478305,0.530974,0.527262,0.60378,0.603784,0.99857,0.796878,0.808783,0.303465,...,0.792223,24305450.0,0.623714,0.603779,0.839513,0.28127,0.02794,0.565406,1.0,0.038001
2,0.75,0.333021,0.354217,0.378272,0.59493,0.594949,0.998769,0.796842,0.808817,0.302986,...,0.656909,0.001658319,0.623967,0.594929,0.38395,0.760348,0.026719,0.564768,1.0,0.010446


In [81]:
# Step 5: Feature Engineering
# Add domain-specific features
# Ensure column name is stripped of whitespaces to prevent KeyError
col_map = {col.strip(): col for col in df_imputed.columns}

df_imputed['High_Debt_Equity'] = (df_imputed[col_map['Liability to Equity']] > 0.5).astype(int)
df_imputed['Low_Profit_Margin'] = (df_imputed[col_map['Gross Profit to Sales']] < 0.3).astype(int)
df_imputed['High_Risk_Flag'] = ((df_imputed['High_Debt_Equity'] == 1) & (df_imputed['Low_Profit_Margin'] == 1)).astype(int)


In [82]:
# Step 6: Classification Task (Target = 'Bankrupt?')
if 'Bankrupt?' in df.columns:
    print("\nStarting Classification Task")
    X_cls = df_imputed.drop(['Bankrupt?', 'Cluster'], axis=1)
    y_cls = df_imputed['Bankrupt?']

    X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
        X_cls, y_cls, test_size=0.2, stratify=y_cls, random_state=42)

    smote = SMOTE(random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train_cls, y_train_cls)

    # Grid Search for best Random Forest
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5]
    }
    grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, scoring='f1', cv=3)
    grid.fit(X_train_res, y_train_res)

    best_clf = grid.best_estimator_
    y_pred_cls = best_clf.predict(X_test_cls)

    print("\nClassification Report (Random Forest):")
    print(classification_report(y_test_cls, y_pred_cls))

    # Save the trained model
    joblib.dump(best_clf, "bankruptcy_rf_model.pkl")
    print("\nRandom Forest model saved as 'bankruptcy_rf_model.pkl'")

    # Additional: Compare with Logistic Regression
    logreg = LogisticRegression(max_iter=1000)
    logreg.fit(X_train_res, y_train_res)
    y_logreg = logreg.predict(X_test_cls)

    print("\nClassification Report (Logistic Regression):")
    print(classification_report(y_test_cls, y_logreg))


Starting Classification Task

Classification Report (Random Forest):
              precision    recall  f1-score   support

         0.0       0.99      0.97      0.98      1320
         1.0       0.43      0.59      0.50        44

    accuracy                           0.96      1364
   macro avg       0.71      0.78      0.74      1364
weighted avg       0.97      0.96      0.96      1364


Random Forest model saved as 'bankruptcy_rf_model.pkl'

Classification Report (Logistic Regression):
              precision    recall  f1-score   support

         0.0       0.97      0.77      0.86      1320
         1.0       0.05      0.34      0.08        44

    accuracy                           0.76      1364
   macro avg       0.51      0.56      0.47      1364
weighted avg       0.94      0.76      0.84      1364



In [83]:
# Step 7: Regression Task
# Skipped - no continuous target variable (e.g. Revenue, Profit Amount) available in dataset
# You can implement this step if future data contains financial amounts for prediction


In [84]:
# Step 8: Clustering Evaluation
sil_score = silhouette_score(df_scaled, df_imputed['Cluster'])
print("\nSilhouette Score (Clustering Quality):", sil_score)
if sil_score < 0.2:
    print("\nThe Silhouette Score is quite low, suggesting that the clusters may not be well separated. Consider re-evaluating the number of clusters or features used.")

    print("\nTesting different cluster counts for better Silhouette Score:")
    for k in range(2, 6):
        km = KMeans(n_clusters=k, random_state=42)
        labels = km.fit_predict(df_scaled)
        score = silhouette_score(df_scaled, labels)
        print(f"Clusters: {k}, Silhouette Score: {score}")

    # Try PCA before clustering
    pca = PCA(n_components=10)
    df_pca = pca.fit_transform(df_scaled)

    kmeans_pca = KMeans(n_clusters=3, random_state=42)
    pca_clusters = kmeans_pca.fit_predict(df_pca)

    print("\nSilhouette Score after PCA:", silhouette_score(df_pca, pca_clusters))
else:
    print("\nThe Silhouette Score indicates decent clustering structure.")



Silhouette Score (Clustering Quality): 0.11513178412820485

The Silhouette Score is quite low, suggesting that the clusters may not be well separated. Consider re-evaluating the number of clusters or features used.

Testing different cluster counts for better Silhouette Score:
Clusters: 2, Silhouette Score: 0.10895855518535555
Clusters: 3, Silhouette Score: 0.11513178412820485
Clusters: 4, Silhouette Score: 0.10815158949870422
Clusters: 5, Silhouette Score: 0.06944693758830818

Silhouette Score after PCA: 0.18582446459288757


In [85]:
# Step 9: Save Cleaned Dataset for Dashboard
final_data = df_imputed.copy()
final_data.to_csv("cleaned_fintech_data.csv", index=False)

print("\nAssignment 1 pipeline completed successfully.")



Assignment 1 pipeline completed successfully.
