In [1]:
import pandas as pd

DATA = '/kaggle/input/predict-bankruptcy-in-poland/data.csv'

df = pd.read_csv(filepath_or_buffer=DATA)
df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A57,A58,A59,A60,A61,A62,A63,A64,class,year
0,0.20055,0.37951,0.39641,2.0472,32.351,0.38825,0.24976,1.3305,1.1389,0.50494,...,0.39718,0.87804,0.001924,8.416,5.1372,82.658,4.4158,7.4277,0,1
1,0.20912,0.49988,0.47225,1.9447,14.786,0.0,0.25834,0.99601,1.6996,0.49788,...,0.42002,0.853,0.0,4.1486,3.2732,107.35,3.4,60.987,0,1
2,0.24866,0.69592,0.26713,1.5548,-1.1523,0.0,0.30906,0.43695,1.309,0.30408,...,0.81774,0.76599,0.69484,4.9909,3.951,134.27,2.7185,5.2078,0,1
3,0.081483,0.30734,0.45879,2.4928,51.952,0.14988,0.092704,1.8661,1.0571,0.57353,...,0.14207,0.94598,0.0,4.5746,3.6147,86.435,4.2228,5.5497,0,1
4,0.18732,0.61323,0.2296,1.4063,-7.3128,0.18732,0.18732,0.6307,1.1559,0.38677,...,0.48431,0.86515,0.12444,6.3985,4.3158,127.21,2.8692,7.898,0,1


We expect bankruptcies to be rare, so we expect our target class to be unbalanced.

In [2]:
df['class'].value_counts(normalize=True).to_dict()

{0: 0.9518258265176823, 1: 0.04817417348231771}

Only about 5% of our instances are bankruptcies.

Let's look at the distribution of cases across the years.

In [3]:
from plotly import express
import warnings

warnings.filterwarnings(action='ignore', category=FutureWarning)
express.histogram(data_frame=df, x='year', color='class', facet_col='class')

Our cases are more normally than unformly distributed across the years.

Let's do some dimension reduction as part of our EDA; we want to see if bankruptcies are randomly distributed or if they are clustered in a way a dimension reduction model can easily find.

In [4]:
import arrow
from umap import UMAP

COLUMNS = [column for column in df.columns if column.startswith('A')] + ['year']
TARGET = 'class'
time_start = arrow.now()
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=500)
df[['x', 'y']] = umap.fit_transform(X=df[COLUMNS].fillna(value=0))
print('done with UMAP in {}'.format(arrow.now() - time_start))

2024-07-22 14:52:19.524720: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-22 14:52:19.524858: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-22 14:52:19.766566: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=500, n_jobs=1, random_state=2024, verbose=True)
Mon Jul 22 14:52:33 2024 Construct fuzzy simplicial set
Mon Jul 22 14:52:33 2024 Finding Nearest Neighbors
Mon Jul 22 14:52:33 2024 Building RP forest with 15 trees
Mon Jul 22 14:52:39 2024 NN descent for 15 iterations
	 1  /  15
	 2  /  15
	 3  /  15
	 4  /  15
	Stopping threshold met -- exiting after 4 iterations
Mon Jul 22 14:53:03 2024 Finished Nearest Neighbor Search
Mon Jul 22 14:53:07 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Mon Jul 22 14:54:32 2024 Finished embedding
done with UMAP in 0:02:00.049281


In [5]:
import warnings
from plotly import express

warnings.filterwarnings(action='ignore', category=FutureWarning)
express.scatter(data_frame=df.sample(n=3000, random_state=2024), x='x', y='y', color=TARGET, facet_col=TARGET)

This is not entirely encouraging; there are places where non-bankruptcies are denser, but by and large bankruptcies appear to be more or less evenly or randomly distributed across our UMAP plot.

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(df[COLUMNS].fillna(value=0), df[TARGET], test_size=0.2, random_state=2024, stratify=df[TARGET])
model = LogisticRegression(max_iter=10000, tol=1e-4).fit(X_train, y_train)
print('model fit in {} iterations'.format(model.n_iter_[0]))

print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=model.predict(X=X_test))))

model fit in 10000 iterations
accuracy: 0.9512



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



Our model has what appears to be very good accuracy, but did it find any of the bankruptcies?

In [7]:
from sklearn.metrics import classification_report

print(classification_report(zero_division=0 , y_true=y_test, y_pred=model.predict(X=X_test)))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      8263
           1       0.29      0.01      0.02       418

    accuracy                           0.95      8681
   macro avg       0.62      0.50      0.50      8681
weighted avg       0.92      0.95      0.93      8681



No not really. Our model did very little better than a dummy model that always chose the majority (non-bankruptcy) class.

In [8]:
express.histogram(x=COLUMNS, y=model.coef_[0])

If we look at the regression coefficients they mostly look like noise. Let's try a more complicated model.

In [9]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(max_depth=5, random_state=2024).fit(X=X_train, y=y_train)
print(classification_report(zero_division=0, y_true=y_test, y_pred=tree.predict(X=X_test)))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      8263
           1       0.90      0.34      0.49       418

    accuracy                           0.97      8681
   macro avg       0.93      0.67      0.74      8681
weighted avg       0.96      0.97      0.96      8681



In [10]:
from sklearn.metrics import f1_score
print('f1: {:5.4f}'.format(f1_score(average='binary', y_true=y_test, y_pred=tree.predict(X=X_test))))

f1: 0.4904


We can do better but still not especially well with our tree model.

In [11]:
from plotly import express

express.histogram(x=tree.feature_names_in_, y=tree.feature_importances_)