# Using PyCaret for banrkuptcy classification from [kaggle dataset](https://www.kaggle.com/datasets/fedesoriano/company-bankruptcy-prediction/data):

**Let's install all the required packages**

In [1]:
!pip install pycaret
#!pip install optuna==3.0.0
!pip install fastapi==0.100.0
import scipy.stats
import numpy as np
import pandas as pd
from pycaret.classification import *
from imblearn.combine import SMOTETomek



**Dowloading dataset and applying PyCaret**

In [2]:
# load data
dataset = pd.read_csv('/content/data.csv')

**Firstly, we will adjust our column names**

In [3]:
# Adjust column names
def clean_col_names(col_name):
    col_name = (
        col_name.strip()
        .replace("?", "_")
        .replace("(", "_")
        .replace(")", "_")
        .replace(" ", "_")
        .replace("/", "_")
        .replace("-", "_")
        .replace("__", "_")
        .replace("'", "")
        .lower()
    )
    return col_name

In [4]:
features_columns = list(dataset.columns)
features_columns = [clean_col_names(col_name) for col_name in features_columns]
dataset.columns = features_columns
display(dataset.columns)

Index(['bankrupt_', 'roa_c_before_interest_and_depreciation_before_interest',
       'roa_a_before_interest_and_%_after_tax',
       'roa_b_before_interest_and_depreciation_after_tax',
       'operating_gross_margin', 'realized_sales_gross_margin',
       'operating_profit_rate', 'pre_tax_net_interest_rate',
       'after_tax_net_interest_rate',
       'non_industry_income_and_expenditure_revenue',
       'continuous_interest_rate_after_tax_', 'operating_expense_rate',
       'research_and_development_expense_rate', 'cash_flow_rate',
       'interest_bearing_debt_interest_rate', 'tax_rate_a_',
       'net_value_per_share_b_', 'net_value_per_share_a_',
       'net_value_per_share_c_', 'persistent_eps_in_the_last_four_seasons',
       'cash_flow_per_share', 'revenue_per_share_yuan_¥_',
       'operating_profit_per_share_yuan_¥_',
       'per_share_net_profit_before_tax_yuan_¥_',
       'realized_sales_gross_profit_growth_rate',
       'operating_profit_growth_rate', 'after_tax_net_profit

**Now we will remove outliers (25th and 75th quartiles)**

In [5]:
# Outliers removal

def outliers_removal(feature,feature_name,dataset):

    # Identify 25th & 75th quartiles

    q25, q75 = np.percentile(feature, 25), np.percentile(feature, 75)
    print('Quartile 25: {} | Quartile 75: {}'.format(q25, q75))
    feat_iqr = q75 - q25
    print('iqr: {}'.format(feat_iqr))

    feat_cut_off = feat_iqr * 1.5
    feat_lower, feat_upper = q25 - feat_cut_off, q75 + feat_cut_off
    print('Cut Off: {}'.format(feat_cut_off))
    print(feature_name +' Lower: {}'.format(feat_lower))
    print(feature_name +' Upper: {}'.format(feat_upper))

    outliers = [x for x in feature if x < feat_lower or x > feat_upper]
    print(feature_name + ' outliers for close to bankruptcy cases: {}'.format(len(outliers)))
    #print(feature_name + ' outliers:{}'.format(outliers))

    dataset = dataset.drop(dataset[(dataset[feature_name] > feat_upper) | (dataset[feature_name] < feat_lower)].index)
    print('-' * 50)

    return dataset

for col in dataset:
    new_df = outliers_removal(dataset[col],str(col),dataset)

Quartile 25: 0.0 | Quartile 75: 0.0
iqr: 0.0
Cut Off: 0.0
bankrupt_ Lower: 0.0
bankrupt_ Upper: 0.0
bankrupt_ outliers for close to bankruptcy cases: 220
--------------------------------------------------
Quartile 25: 0.476527080388047 | Quartile 75: 0.535562813825379
iqr: 0.05903573343733198
Cut Off: 0.08855360015599797
roa_c_before_interest_and_depreciation_before_interest Lower: 0.38797348023204903
roa_c_before_interest_and_depreciation_before_interest Upper: 0.624116413981377
roa_c_before_interest_and_depreciation_before_interest outliers for close to bankruptcy cases: 391
--------------------------------------------------
Quartile 25: 0.53554295682512 | Quartile 75: 0.58915721761884
iqr: 0.05361426079372
Cut Off: 0.08042139119058
roa_a_before_interest_and_%_after_tax Lower: 0.45512156563454
roa_a_before_interest_and_%_after_tax Upper: 0.66957860880942
roa_a_before_interest_and_%_after_tax outliers for close to bankruptcy cases: 561
-------------------------------------------------

In [None]:
new_df.head()

**We will find skewness of dataset and remove in case it is here**

In [6]:
skew_df = pd.DataFrame(dataset.select_dtypes(np.number).columns, columns = ['Feature'])

skew_df['Skew'] = skew_df['Feature'].apply(lambda feature: scipy.stats.skew(dataset[feature]))

skew_df['Absolute Skew'] = skew_df['Skew'].apply(abs)  # to obtain magnitude of skew irrelevant of direction
display(skew_df)

Unnamed: 0,Feature,Skew,Absolute Skew
0,bankrupt_,5.294223,5.294223
1,roa_c_before_interest_and_depreciation_before_...,-0.323870,0.323870
2,roa_a_before_interest_and_%_after_tax,-1.033499,1.033499
3,roa_b_before_interest_and_depreciation_after_tax,-0.763396,0.763396
4,operating_gross_margin,-8.041599,8.041599
...,...,...,...
91,liability_to_equity,27.453426,27.453426
92,degree_of_financial_leverage_dfl_,45.714138,45.714138
93,interest_coverage_ratio_interest_expense_to_ebit_,-13.936461,13.936461
94,net_income_flag,0.000000,0.000000


In [7]:
X = new_df.drop(['bankrupt_'], axis = 1)
Y = new_df['bankrupt_']

def log_trans(data):
    for col in data:
        skew = data[col].skew()
        if skew>=0.5 or skew<=0.5:
            data[col] = np.log1p(data[col])
        else:
            continue
    return data

X_log = log_trans(X)
X_log.head()

Unnamed: 0,roa_c_before_interest_and_depreciation_before_interest,roa_a_before_interest_and_%_after_tax,roa_b_before_interest_and_depreciation_after_tax,operating_gross_margin,realized_sales_gross_margin,operating_profit_rate,pre_tax_net_interest_rate,after_tax_net_interest_rate,non_industry_income_and_expenditure_revenue,continuous_interest_rate_after_tax_,...,net_income_to_total_assets,total_assets_to_gnp_price,no_credit_interval,gross_profit_to_sales,net_income_to_stockholders_equity,liability_to_equity,degree_of_financial_leverage_dfl_,interest_coverage_ratio_interest_expense_to_ebit_,net_income_flag,equity_to_liability
0,0.315244,0.353743,0.340571,0.470914,0.470914,0.692632,0.586056,0.592669,0.264398,0.577166,...,0.540489,0.009177,0.484202,0.470912,0.603162,0.254799,0.026253,0.447279,0.693147,0.016335
1,0.381371,0.430622,0.416557,0.47638,0.47638,0.69262,0.58633,0.59294,0.265096,0.577459,...,0.585171,0.008289,0.484678,0.476381,0.609749,0.24986,0.234738,0.451187,0.693147,0.020581
2,0.354923,0.404811,0.386822,0.470909,0.470855,0.692576,0.585787,0.592436,0.263929,0.576773,...,0.573614,0.039223,0.484794,0.470909,0.608011,0.254788,0.026208,0.447059,0.693147,0.01634
3,0.336361,0.372435,0.376883,0.459664,0.459664,0.692497,0.5861,0.592755,0.264938,0.57731,...,0.553629,0.003247,0.484232,0.459661,0.606879,0.248204,0.026346,0.447671,0.693147,0.023699
4,0.38187,0.430764,0.420221,0.469243,0.469243,0.692634,0.586322,0.592942,0.265034,0.577484,...,0.585014,0.00387,0.484597,0.469242,0.609751,0.245698,0.02445,0.454647,0.693147,0.034875


**Our data is highly imbalanced towards more bankruptcy cases. However, we can change it with SMOTETomek (you can read about it in sklearn package)**

In [8]:
smotetomek = SMOTETomek(random_state=42)
X_train_over , y_train_over = smotetomek.fit_resample(X_log , Y)
print(X_train_over.shape)
X_train_over.head()

(12054, 95)


Unnamed: 0,roa_c_before_interest_and_depreciation_before_interest,roa_a_before_interest_and_%_after_tax,roa_b_before_interest_and_depreciation_after_tax,operating_gross_margin,realized_sales_gross_margin,operating_profit_rate,pre_tax_net_interest_rate,after_tax_net_interest_rate,non_industry_income_and_expenditure_revenue,continuous_interest_rate_after_tax_,...,net_income_to_total_assets,total_assets_to_gnp_price,no_credit_interval,gross_profit_to_sales,net_income_to_stockholders_equity,liability_to_equity,degree_of_financial_leverage_dfl_,interest_coverage_ratio_interest_expense_to_ebit_,net_income_flag,equity_to_liability
0,0.315244,0.353743,0.340571,0.470914,0.470914,0.692632,0.586056,0.592669,0.264398,0.577166,...,0.540489,0.009177,0.484202,0.470912,0.603162,0.254799,0.026253,0.447279,0.693147,0.016335
1,0.381371,0.430622,0.416557,0.47638,0.47638,0.69262,0.58633,0.59294,0.265096,0.577459,...,0.585171,0.008289,0.484678,0.476381,0.609749,0.24986,0.234738,0.451187,0.693147,0.020581
2,0.354923,0.404811,0.386822,0.470909,0.470855,0.692576,0.585787,0.592436,0.263929,0.576773,...,0.573614,0.039223,0.484794,0.470909,0.608011,0.254788,0.026208,0.447059,0.693147,0.01634
3,0.336361,0.372435,0.376883,0.459664,0.459664,0.692497,0.5861,0.592755,0.264938,0.57731,...,0.553629,0.003247,0.484232,0.459661,0.606879,0.248204,0.026346,0.447671,0.693147,0.023699
4,0.38187,0.430764,0.420221,0.469243,0.469243,0.692634,0.586322,0.592942,0.265034,0.577484,...,0.585014,0.00387,0.484597,0.469242,0.609751,0.245698,0.02445,0.454647,0.693147,0.034875


**Finally, it is time to use pycaret package to get the best models for our dataset**

In [9]:
clf1 = setup(data = X_train_over, target = y_train_over, session_id = 42)
best_model = compare_models()

Unnamed: 0,Description,Value
0,Session id,42
1,Target,bankrupt_
2,Target type,Binary
3,Original data shape,"(12054, 96)"
4,Transformed data shape,"(12054, 96)"
5,Transformed train set shape,"(8437, 96)"
6,Transformed test set shape,"(3617, 96)"
7,Numeric features,95
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.9821,0.9984,0.995,0.97,0.9823,0.9642,0.9646,1.555
lightgbm,Light Gradient Boosting Machine,0.9797,0.9982,0.9957,0.9649,0.9801,0.9595,0.96,8.088
et,Extra Trees Classifier,0.9788,0.9991,0.9967,0.9623,0.9792,0.9576,0.9582,1.065
rf,Random Forest Classifier,0.9733,0.9975,0.9922,0.9563,0.9739,0.9467,0.9474,6.808
gbc,Gradient Boosting Classifier,0.9554,0.9898,0.9735,0.9397,0.9562,0.9109,0.9116,22.664
dt,Decision Tree Classifier,0.9531,0.9531,0.9661,0.9416,0.9537,0.9061,0.9065,1.705
ada,Ada Boost Classifier,0.9364,0.9812,0.9486,0.9262,0.9372,0.8727,0.8732,4.37
knn,K Neighbors Classifier,0.9175,0.967,0.9765,0.8737,0.9222,0.835,0.8411,0.207
lr,Logistic Regression,0.8934,0.9533,0.9111,0.8802,0.8954,0.7869,0.7875,2.504
lda,Linear Discriminant Analysis,0.8917,0.9534,0.9161,0.8738,0.8943,0.7833,0.7845,0.188


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

**Now we will tune our model and check whether it will yield higher results**

In [10]:
model = create_model(best_model)
tuned_model = tune_model(model) #tuned_model = tune_model(model, n_iter = 50, search_library = 'optuna')
final_model = finalize_model(tuned_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9751,0.9989,0.9953,0.9567,0.9756,0.9502,0.951
1,0.9834,0.9994,0.9953,0.9722,0.9836,0.9668,0.9671
2,0.9787,0.9994,0.9976,0.9612,0.9791,0.9573,0.958
3,0.9822,0.9947,0.9929,0.9722,0.9824,0.9645,0.9647
4,0.9846,0.9993,0.9882,0.9812,0.9847,0.9692,0.9692
5,0.9787,0.9959,0.9929,0.9654,0.979,0.9573,0.9577
6,0.987,0.9994,1.0,0.9746,0.9871,0.9739,0.9743
7,0.9881,0.9995,0.9952,0.9813,0.9882,0.9763,0.9764
8,0.9834,0.9998,1.0,0.9679,0.9837,0.9668,0.9673
9,0.9798,0.9982,0.9929,0.9677,0.9801,0.9597,0.96


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9585,0.9967,1.0,0.9234,0.9602,0.9171,0.9202
1,0.9739,0.9994,0.9953,0.9545,0.9745,0.9479,0.9487
2,0.9751,0.9986,1.0,0.9526,0.9757,0.9502,0.9514
3,0.9739,0.9969,0.9976,0.9525,0.9745,0.9479,0.9489
4,0.9716,0.9983,0.9953,0.9502,0.9722,0.9431,0.9442
5,0.955,0.9964,0.9976,0.9192,0.9568,0.91,0.9133
6,0.9692,0.9989,1.0,0.942,0.9701,0.9384,0.9402
7,0.9763,0.998,0.9976,0.9567,0.9767,0.9526,0.9534
8,0.9692,0.9993,1.0,0.942,0.9701,0.9383,0.9401
9,0.968,0.9977,0.9953,0.9438,0.9689,0.9359,0.9373


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


**Original model was better than the tuned model, hence not tuned model will be used**

**Let's make api for the future datasets**

In [15]:
create_api(final_model, 'bankruptcy_api')

API successfully created. This function only creates a POST API, it doesn't run it automatically. To run your API, please run this command --> !python bankruptcy_api.py


**And make docker image if needed**

In [18]:
create_docker('bankruptcy_api')

Writing requirements.txt
Writing Dockerfile
Dockerfile and requirements.txt successfully created.
    To build image you have to run --> !docker image build -f "Dockerfile" -t IMAGE_NAME:IMAGE_TAG .
            


In [19]:
!docker image build -f "Dockerfile" -t bankruptcy:latest

/bin/bash: line 1: docker: command not found


*We built our model to be as accurate as possible, but that may not be what we need in the real world. When we analyze the risks associated with certain companies, we often want to get the probability of default, not accurately predict whether a company will go bankrupt. To do this, we need to calibrate our probabilities or use models that are already calibrated. However, since we have eliminated the imbalance of the dataset, and our best performing model is xgboost, it would be wise to simply use an already calibrated model with the highest accuracy, namely, in our case, logistic regression*

# Now let's build model for prediction of probability of bankruptcy

**Let's tune our logistic regression model**

In [None]:
prob_model = create_model('lr')
prob_tuned_model = tune_model(prob_model) #prob_tuned_model = tune_model(prob_model, n_iter = 50, search_library = 'optuna')
prob_final_model = finalize_model(prob_tuned_model)

**And, finally, make docker image**

In [None]:
create_api(prob_final_model, 'prob_bankruptcy_api')

In [None]:
create_docker('prob_bankruptcy_api')