### Import necessary libraries



In [9]:
import sys
import pandas as pd
import numpy as np
import seaborn as sb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.utils.vis_utils import plot_model
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


### Extract data from the CSV file

In [10]:
# Read the bankruptcy dataset from disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer="bankruptcy.csv")
# Preview up to the first 400 rows; the rendered table is shortened by pandas' display limits.
df.head(n=400)

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,1,0.370594,0.424389,0.405750,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,...,0.716845,0.009219,0.622879,0.601453,0.827890,0.290202,0.026601,0.564050,1,0.016469
1,1,0.464291,0.538214,0.516730,0.610235,0.610235,0.998946,0.797380,0.809301,0.303556,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,1,0.426071,0.499019,0.472295,0.601450,0.601364,0.998857,0.796403,0.808388,0.302035,...,0.774670,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474
3,1,0.399844,0.451265,0.457733,0.583541,0.583541,0.998700,0.796967,0.808966,0.303350,...,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1,0.023982
4,1,0.465022,0.538432,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,...,0.795016,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1,0.035490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,0,0.481743,0.512702,0.534772,0.600001,0.600001,0.998974,0.797319,0.809246,0.303391,...,0.782129,0.002719,0.624481,0.599996,0.839156,0.277289,0.026478,0.563039,1,0.047357
396,0,0.470238,0.533962,0.522833,0.598286,0.598286,0.998982,0.797385,0.809310,0.303490,...,0.796491,0.004123,0.623464,0.598281,0.840077,0.277855,0.029211,0.568666,1,0.040730
397,0,0.507337,0.552987,0.562182,0.599937,0.599937,0.999022,0.797444,0.809362,0.303508,...,0.807084,0.012759,0.624148,0.599935,0.840887,0.279029,0.026973,0.565900,1,0.032497
398,0,0.526057,0.601068,0.575673,0.609024,0.609089,0.999132,0.797810,0.809665,0.303919,...,0.837170,0.012565,0.623716,0.609020,0.842199,0.275613,0.026802,0.565211,1,0.114083


In [11]:
# Summarize the raw frame: entry count, column names, non-null counts, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6819 entries, 0 to 6818
Data columns (total 96 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Bankrupt?                                                 6819 non-null   int64  
 1    ROA(C) before interest and depreciation before interest  6819 non-null   float64
 2    ROA(A) before interest and % after tax                   6819 non-null   float64
 3    ROA(B) before interest and depreciation after tax        6819 non-null   float64
 4    Operating Gross Margin                                   6819 non-null   float64
 5    Realized Sales Gross Margin                              6819 non-null   float64
 6    Operating Profit Rate                                    6819 non-null   float64
 7    Pre-tax net Interest Rate                                6819 non-null   float64
 8    After-tax net Int

### Data Cleaning

Remove any rows that contain NULL or missing values from the DataFrame, then verify that none remain.

In [12]:
# Drop every row that contains at least one missing value.
# dropna() returns a new DataFrame and leaves `df` unmodified, so an explicit
# df.copy() beforehand is unnecessary.
cleaned_df = df.dropna()

In [13]:
# Sanity check: prints True if any cell in the cleaned frame is still missing, False otherwise.
print(cleaned_df.isna().to_numpy().any())

False


In [14]:
# Re-inspect the cleaned frame: entry count, non-null counts, and dtypes after dropna().
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6819 entries, 0 to 6818
Data columns (total 96 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Bankrupt?                                                 6819 non-null   int64  
 1    ROA(C) before interest and depreciation before interest  6819 non-null   float64
 2    ROA(A) before interest and % after tax                   6819 non-null   float64
 3    ROA(B) before interest and depreciation after tax        6819 non-null   float64
 4    Operating Gross Margin                                   6819 non-null   float64
 5    Realized Sales Gross Margin                              6819 non-null   float64
 6    Operating Profit Rate                                    6819 non-null   float64
 7    Pre-tax net Interest Rate                                6819 non-null   float64
 8    After-tax net Int

I am working with a dataset that requires a high level of precision in its floating-point values, so I keep the numeric columns as float64 (the dtype shown by `info()` above) to ensure that the calculations and analysis performed on the data are accurate. float32 would require less memory, but the loss of precision is not worth that saving for this particular dataset.

In [15]:
# Persist the cleaned frame to disk; index=False leaves the row index out of the file.
cleaned_df.to_csv(path_or_buf='cleaned_bankruptcy.csv', index=False)