# IMPORTING LIBRARIES

In [2]:
# General-purpose libraries for operating system interactions, JSON data manipulation, and date/time handling.
import os  # Interact with the operating system
import json  # Work with JSON data
from random import randint  # Generate random integers
from time import sleep  # Introduce delays in code execution

from datetime import datetime, timedelta  # Manipulate dates and times


# Fundamental scientific computing libraries like NumPy and Pandas.
import numpy as np  # Fundamental package for scientific computing
import pandas as pd  # Data manipulation library


# Specialized libraries for scientific computing, namely SciPy
import scipy  # Scientific computing and technical computing library
import scipy.stats as st

# Web-related tasks, such as making HTTP requests, parsing HTML/XML data, and web scraping
import requests as re  # Perform HTTP requests
from bs4 import BeautifulSoup  # Parse HTML and XML data


# Pretty-print Python data structures
import pprint


# Visualization oriented
import matplotlib.pyplot as plt  # Create static visualizations
import seaborn as sns  # Statistical data visualization
from plotly import express as px  # Create interactive plots and charts


# Machine learning : preprocessing, dimensionality reduction, one-hot encoding, and clustering
import sklearn  # Machine learning library
from sklearn.preprocessing import StandardScaler  # Standardize features
from sklearn.decomposition import PCA  # Perform dimensionality reduction
from sklearn.preprocessing import OneHotEncoder  # One-hot encode categorical features
from sklearn.cluster import KMeans  # Perform clustering

from model_scoring import scoring


#Handling Imbalanced Datasets
import imblearn  # Handle imbalanced datasets in machine learning


#Web Scraping
import scrapy  # Web scraping framework


#Deep Learning
# import pytorch  # Deep learning library

# IMPORTING DATA

In [3]:
cleaned_data = pd.read_csv("cleaned_data.csv")

In [4]:
df = cleaned_data
df

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,Operating Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,Non-industry income and expenditure/revenue,Operating Expense Rate,Research and development expense rate,Cash flow rate,...,Cash Flow to Equity,Current Liability to Current Assets,Liability-Assets Flag,Total assets to GNP price,No-credit Interval,Net Income to Stockholder's Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,1,0.370594,0.424389,0.601457,0.998969,0.796887,0.302646,1.256969e-04,0.000000e+00,0.458143,...,0.312905,0.118250,0,0.009219,0.622879,0.827890,0.026601,0.564050,1,0.016469
1,1,0.464291,0.538214,0.610235,0.998946,0.797380,0.303556,2.897851e-04,0.000000e+00,0.461867,...,0.314163,0.047775,0,0.008323,0.623652,0.839969,0.264577,0.570175,1,0.020794
2,1,0.426071,0.499019,0.601450,0.998857,0.796403,0.302035,2.361297e-04,2.550000e+07,0.458521,...,0.314515,0.025346,0,0.040003,0.623841,0.836774,0.026555,0.563706,1,0.016474
3,1,0.399844,0.451265,0.583541,0.998700,0.796967,0.303350,1.078888e-04,0.000000e+00,0.465705,...,0.302382,0.067250,0,0.003252,0.622929,0.834697,0.026697,0.564663,1,0.023982
4,1,0.465022,0.538432,0.598783,0.998973,0.797366,0.303475,7.890000e+09,0.000000e+00,0.462746,...,0.311567,0.047725,0,0.003878,0.623521,0.839973,0.024752,0.575617,1,0.035490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6814,0,0.493687,0.539468,0.604455,0.998992,0.797409,0.303510,1.510213e-04,4.500000e+09,0.463734,...,0.314063,0.027951,0,0.000466,0.623620,0.840359,0.027064,0.566193,1,0.029890
6815,0,0.475162,0.538269,0.598308,0.998992,0.797414,0.303520,5.220000e+09,1.440000e+09,0.461978,...,0.314446,0.031470,0,0.001959,0.623931,0.840306,0.027009,0.566018,1,0.038284
6816,0,0.472725,0.533744,0.610444,0.998984,0.797401,0.303512,2.509312e-04,1.039086e-04,0.472189,...,0.313353,0.007542,0,0.002840,0.624156,0.840138,0.026791,0.565158,1,0.097649
6817,0,0.506264,0.559911,0.607850,0.999074,0.797500,0.303498,1.236154e-04,2.510000e+09,0.476123,...,0.320118,0.022916,0,0.002837,0.623957,0.841084,0.026822,0.565302,1,0.044009


In [5]:
column_names = list(df.columns)
print(column_names)

['Bankrupt?', ' ROA(C) before interest and depreciation before interest', ' ROA(A) before interest and % after tax', ' Operating Gross Margin', ' Operating Profit Rate', ' Pre-tax net Interest Rate', ' Non-industry income and expenditure/revenue', ' Operating Expense Rate', ' Research and development expense rate', ' Cash flow rate', ' Interest-bearing debt interest rate', ' Tax rate (A)', ' Net Value Per Share (B)', ' Persistent EPS in the Last Four Seasons', ' Cash Flow Per Share', ' Revenue Per Share (Yuan ¥)', ' Operating Profit Per Share (Yuan ¥)', ' Realized Sales Gross Profit Growth Rate', ' Operating Profit Growth Rate', ' After-tax Net Profit Growth Rate', ' Continuous Net Profit Growth Rate', ' Total Asset Growth Rate', ' Net Value Growth Rate', ' Total Asset Return Growth Rate Ratio', ' Cash Reinvestment %', ' Current Ratio', ' Quick Ratio', ' Interest Expense Ratio', ' Total debt/Total net worth', ' Debt ratio %', ' Long-term fund suitability ratio (A)', ' Borrowing depende

Step 2: Prepare your data
Next, you'll need to prepare your data for training the DecisionTreeClassifier. Ensure that your data is in the correct format and split it into features (X) and labels (y).

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Extract features (X) and labels (y) from your DataFrame
X = df.drop('Bankrupt?', axis=1)  # Replace 'Bankrupt?' with the actual column name
y = df['Bankrupt?']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
df

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,Operating Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,Non-industry income and expenditure/revenue,Operating Expense Rate,Research and development expense rate,Cash flow rate,...,Cash Flow to Equity,Current Liability to Current Assets,Liability-Assets Flag,Total assets to GNP price,No-credit Interval,Net Income to Stockholder's Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,1,0.370594,0.424389,0.601457,0.998969,0.796887,0.302646,1.256969e-04,0.000000e+00,0.458143,...,0.312905,0.118250,0,0.009219,0.622879,0.827890,0.026601,0.564050,1,0.016469
1,1,0.464291,0.538214,0.610235,0.998946,0.797380,0.303556,2.897851e-04,0.000000e+00,0.461867,...,0.314163,0.047775,0,0.008323,0.623652,0.839969,0.264577,0.570175,1,0.020794
2,1,0.426071,0.499019,0.601450,0.998857,0.796403,0.302035,2.361297e-04,2.550000e+07,0.458521,...,0.314515,0.025346,0,0.040003,0.623841,0.836774,0.026555,0.563706,1,0.016474
3,1,0.399844,0.451265,0.583541,0.998700,0.796967,0.303350,1.078888e-04,0.000000e+00,0.465705,...,0.302382,0.067250,0,0.003252,0.622929,0.834697,0.026697,0.564663,1,0.023982
4,1,0.465022,0.538432,0.598783,0.998973,0.797366,0.303475,7.890000e+09,0.000000e+00,0.462746,...,0.311567,0.047725,0,0.003878,0.623521,0.839973,0.024752,0.575617,1,0.035490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6814,0,0.493687,0.539468,0.604455,0.998992,0.797409,0.303510,1.510213e-04,4.500000e+09,0.463734,...,0.314063,0.027951,0,0.000466,0.623620,0.840359,0.027064,0.566193,1,0.029890
6815,0,0.475162,0.538269,0.598308,0.998992,0.797414,0.303520,5.220000e+09,1.440000e+09,0.461978,...,0.314446,0.031470,0,0.001959,0.623931,0.840306,0.027009,0.566018,1,0.038284
6816,0,0.472725,0.533744,0.610444,0.998984,0.797401,0.303512,2.509312e-04,1.039086e-04,0.472189,...,0.313353,0.007542,0,0.002840,0.624156,0.840138,0.026791,0.565158,1,0.097649
6817,0,0.506264,0.559911,0.607850,0.999074,0.797500,0.303498,1.236154e-04,2.510000e+09,0.476123,...,0.320118,0.022916,0,0.002837,0.623957,0.841084,0.026822,0.565302,1,0.044009


In [8]:
X

Unnamed: 0,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,Operating Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,Non-industry income and expenditure/revenue,Operating Expense Rate,Research and development expense rate,Cash flow rate,Interest-bearing debt interest rate,...,Cash Flow to Equity,Current Liability to Current Assets,Liability-Assets Flag,Total assets to GNP price,No-credit Interval,Net Income to Stockholder's Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,0.370594,0.424389,0.601457,0.998969,0.796887,0.302646,1.256969e-04,0.000000e+00,0.458143,7.250725e-04,...,0.312905,0.118250,0,0.009219,0.622879,0.827890,0.026601,0.564050,1,0.016469
1,0.464291,0.538214,0.610235,0.998946,0.797380,0.303556,2.897851e-04,0.000000e+00,0.461867,6.470647e-04,...,0.314163,0.047775,0,0.008323,0.623652,0.839969,0.264577,0.570175,1,0.020794
2,0.426071,0.499019,0.601450,0.998857,0.796403,0.302035,2.361297e-04,2.550000e+07,0.458521,7.900790e-04,...,0.314515,0.025346,0,0.040003,0.623841,0.836774,0.026555,0.563706,1,0.016474
3,0.399844,0.451265,0.583541,0.998700,0.796967,0.303350,1.078888e-04,0.000000e+00,0.465705,4.490449e-04,...,0.302382,0.067250,0,0.003252,0.622929,0.834697,0.026697,0.564663,1,0.023982
4,0.465022,0.538432,0.598783,0.998973,0.797366,0.303475,7.890000e+09,0.000000e+00,0.462746,6.860686e-04,...,0.311567,0.047725,0,0.003878,0.623521,0.839973,0.024752,0.575617,1,0.035490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6814,0.493687,0.539468,0.604455,0.998992,0.797409,0.303510,1.510213e-04,4.500000e+09,0.463734,1.790179e-04,...,0.314063,0.027951,0,0.000466,0.623620,0.840359,0.027064,0.566193,1,0.029890
6815,0.475162,0.538269,0.598308,0.998992,0.797414,0.303520,5.220000e+09,1.440000e+09,0.461978,2.370237e-04,...,0.314446,0.031470,0,0.001959,0.623931,0.840306,0.027009,0.566018,1,0.038284
6816,0.472725,0.533744,0.610444,0.998984,0.797401,0.303512,2.509312e-04,1.039086e-04,0.472189,0.000000e+00,...,0.313353,0.007542,0,0.002840,0.624156,0.840138,0.026791,0.565158,1,0.097649
6817,0.506264,0.559911,0.607850,0.999074,0.797500,0.303498,1.236154e-04,2.510000e+09,0.476123,2.110211e-04,...,0.320118,0.022916,0,0.002837,0.623957,0.841084,0.026822,0.565302,1,0.044009


In [9]:
y

0       1
1       1
2       1
3       1
4       1
       ..
6814    0
6815    0
6816    0
6817    0
6818    0
Name: Bankrupt?, Length: 6819, dtype: int64

Step 3: Create and train the DecisionTreeClassifier
Now, you can create an instance of the DecisionTreeClassifier and train it using your training data.

In [21]:
# # Create an instance of DecisionTreeClassifier
# clf_DTC = DecisionTreeClassifier()
# # Train the classifier using the training data
# clf_DTC.fit(X_train, y_train)

# # Create an instance of ExtraTreesClassifier
# clf_ETC = ExtraTreesClassifier()
# # Train the classifier using the training data
# clf_ETC.fit(X_train, y_train)

In [22]:
from model_scoring import scoring

In [23]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Create instances of DecisionTreeClassifier and ExtraTreesClassifier
DTC_model = DecisionTreeClassifier()
ETC_model = ExtraTreesClassifier()

DTC_model.fit(X_train, y_train)
ETC_model.fit(X_train, y_train)

# Make predictions on train and test sets
DTC_train_prediction = DTC_model.predict(X_train)
DTC_test_prediction = DTC_model.predict(X_test)

ETC_train_prediction = ETC_model.predict(X_train)
ETC_test_prediction = ETC_model.predict(X_test)

# Evaluation metrics for DecisionTreeClassifier
DTC_train_accuracy = accuracy_score(y_train, DTC_train_prediction)
DTC_train_precision = precision_score(y_train, DTC_train_prediction)
DTC_train_recall = recall_score(y_train, DTC_train_prediction)

DTC_test_accuracy = accuracy_score(y_test, DTC_test_prediction)
DTC_test_precision = precision_score(y_test, DTC_test_prediction)
DTC_test_recall = recall_score(y_test, DTC_test_prediction)


# Evaluation metrics for ExtraTreesClassifier
ETC_train_accuracy = accuracy_score(y_train, ETC_train_prediction)
ETC_train_precision = precision_score(y_train, ETC_train_prediction)
ETC_train_recall = recall_score(y_train, ETC_train_prediction)

ETC_test_accuracy = accuracy_score(y_test, ETC_test_prediction)
ETC_test_precision = precision_score(y_test, ETC_test_prediction)
ETC_test_recall = recall_score(y_test, ETC_test_prediction)



print("Decision Tree Classifier AKA DTC:")
print("***")
print("Train Accuracy:", DTC_train_accuracy)
print("Train Precision:", DTC_train_precision)
print("Train Recall:", DTC_train_recall)
print("***")
print("Test Accuracy:", DTC_test_accuracy)
print("Test Precision:", DTC_test_precision)
print("Test Recall:", DTC_test_recall)
print()
print()

print("Logistic Extra Trees Classifier AKA ETC:")
print("***")
print("Train Accuracy:", ETC_train_accuracy)
print("Train Precision:", ETC_train_precision)
print("Train Recall:", ETC_train_recall)
print("***")
print("Test Accuracy:", ETC_test_accuracy)
print("Test Precision:", ETC_test_precision)
print("Test Recall:", ETC_test_recall)

Decision Tree Classifier AKA DTC:
***
Train Accuracy: 1.0
Train Precision: 1.0
Train Recall: 1.0
***
Test Accuracy: 0.9479472140762464
Test Precision: 0.3
Test Recall: 0.29411764705882354


Logistic Extra Trees Classifier AKA ETC:
***
Train Accuracy: 1.0
Train Precision: 1.0
Train Recall: 1.0
***
Test Accuracy: 0.966275659824047
Test Precision: 1.0
Test Recall: 0.09803921568627451


Step 4: Make predictions
Once the classifier is trained, you can use it to make predictions on new, unseen data.

In [24]:
# Make predictions on the test data
y_pred_DTC = DTC_model.predict(X_test)
y_pred_ETC = ETC_model.predict(X_test)

In [25]:
y_pred_DTC

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [26]:
y_pred_ETC

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

#### REMINDER
#### ROC = RECEIVER OPERATING CHARACTERISTICS
#### AUC = AREA UNDER CURVE

Step 5: Evaluate the model
To evaluate the performance of your DecisionTreeClassifier, you can compare the predicted labels with the true labels from your test data.

In [27]:
# tree_model = DecisionTreeClassifier()

# tree_model.fit(X_train, y_train)

# # Make predictions on train and test sets
# tree_train_pred = tree_model.predict(X_train)
# tree_test_pred = tree_model.predict(X_test)

# # Evaluation metrics for decision tree classifier
# tree_train_accuracy = accuracy_score(y_train, tree_train_pred)
# tree_train_precision = precision_score(y_train, tree_train_pred)
# tree_train_recall = recall_score(y_train, tree_train_pred)

# tree_test_accuracy = accuracy_score(y_test, tree_test_pred)
# tree_test_precision = precision_score(y_test, tree_test_pred)
# tree_test_recall = recall_score(y_test, tree_test_pred)

# print("Decision Tree Classifier:")
# print("Train Accuracy:", tree_train_accuracy)
# print("Train Precision:", tree_train_precision)
# print("Train Recall:", tree_train_recall)
# print("Test Accuracy:", tree_test_accuracy)
# print("Test Precision:", tree_test_precision)
# print("Test Recall:", tree_test_recall)

Step 6: Adjust hyperparameters (optional)
Decision trees have various hyperparameters that can be tuned to optimize performance. You can experiment with different values for hyperparameters such as max_depth, min_samples_split, and min_samples_leaf to see if they improve the model's accuracy. You can also explore techniques like pruning or using a random forest ensemble.