In [53]:
# pip install ucimlrepo

from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
taiwanese_bankruptcy_prediction = fetch_ucirepo(id=572) 
  
# data (as pandas dataframes) 
X = taiwanese_bankruptcy_prediction.data.features 
y = taiwanese_bankruptcy_prediction.data.targets 
  
# metadata 
print(taiwanese_bankruptcy_prediction.metadata) 
  
# variable information 
print(taiwanese_bankruptcy_prediction.variables) 


{'uci_id': 572, 'name': 'Taiwanese Bankruptcy Prediction', 'repository_url': 'https://archive.ics.uci.edu/dataset/572/taiwanese+bankruptcy+prediction', 'data_url': 'https://archive.ics.uci.edu/static/public/572/data.csv', 'abstract': 'The data were collected from the Taiwan Economic Journal  for the years 1999 to 2009. Company bankruptcy was defined based on the business regulations of the Taiwan Stock Exchange.', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 6819, 'num_features': 95, 'feature_types': ['Integer'], 'demographics': [], 'target_col': ['Bankrupt?'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2020, 'last_updated': 'Fri Mar 15 2024', 'dataset_doi': '10.24432/C5004D', 'creators': [], 'intro_paper': None, 'additional_info': {'summary': None, 'purpose': None, 'funded_by': None, 'instances_represent': None, 'recommended_data_splits': None, 'sensitive_data': Non

In [54]:
y.value_counts()

Bankrupt?
0            6599
1             220
Name: count, dtype: int64

In [55]:
X.head()



Unnamed: 0,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,Continuous interest rate (after tax),...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,0.370594,0.424389,0.40575,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,0.780985,...,0.716845,0.009219,0.622879,0.601453,0.82789,0.290202,0.026601,0.56405,1,0.016469
1,0.464291,0.538214,0.51673,0.610235,0.610235,0.998946,0.79738,0.809301,0.303556,0.781506,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,0.426071,0.499019,0.472295,0.60145,0.601364,0.998857,0.796403,0.808388,0.302035,0.780284,...,0.77467,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474
3,0.399844,0.451265,0.457733,0.583541,0.583541,0.9987,0.796967,0.808966,0.30335,0.781241,...,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1,0.023982
4,0.465022,0.538432,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,0.78155,...,0.795016,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1,0.03549


In [56]:
# Import Module
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Import RandomUnderSampler from imblearn
from imblearn.under_sampling import RandomUnderSampler

# Instantiate the RandomUnderSampler instance
rus = RandomUnderSampler(random_state=1)

# Fit the data to the model
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

# Count distinct resampled values
y_resampled.value_counts()

Bankrupt?
0            166
1            166
Name: count, dtype: int64

In [57]:


# Import `LogisticRegression` from sklearn
from sklearn.linear_model import LogisticRegression

# Create a `LogisticRegression` function and assign it 
# to a variable named `logistic_regression_model`.
logistic_regression_model = LogisticRegression()

logistic_regression_model_undersampled = LogisticRegression()

# Fit the model
logistic_regression_model.fit(X_train, y_train)

logistic_regression_model_undersampled.fit(X_resampled, y_resampled)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)


In [58]:
y_pred_resampled = logistic_regression_model_undersampled.predict(X_test)

In [59]:
# Score the model
print(f"Training Data Score: {logistic_regression_model.score(X_train, y_train)}")
print(f"Testing Data Score: {logistic_regression_model.score(X_test, y_test)}")

Training Data Score: 0.9622604614782949
Testing Data Score: 0.9612903225806452


In [60]:
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score, roc_auc_score

# Make predictions on the test data
predictions = logistic_regression_model.predict(X)

# Create a confusion matrix
print(confusion_matrix(y, predictions, labels = [1,0]))

[[   5  215]
 [  44 6555]]


In [61]:
# Create a classification report
print(classification_report(y, predictions, labels = [1, 0]))

              precision    recall  f1-score   support

           1       0.10      0.02      0.04       220
           0       0.97      0.99      0.98      6599

    accuracy                           0.96      6819
   macro avg       0.54      0.51      0.51      6819
weighted avg       0.94      0.96      0.95      6819



In [62]:
print(classification_report(y_test, y_pred_resampled))

              precision    recall  f1-score   support

           0       0.98      0.72      0.83      1651
           1       0.05      0.46      0.09        54

    accuracy                           0.71      1705
   macro avg       0.51      0.59      0.46      1705
weighted avg       0.95      0.71      0.80      1705



In [63]:
# Calculate the balanced accuracy score
print(balanced_accuracy_score(y, predictions))

0.5080297979032636


In [64]:
# 1- Run each selected model with the data as is to see qhich one has the highest accuracy (make  a Pipeline to re-do each model and analyze it)
# 2- Once you choose a model, run oversampling and undersampling  because the data set is unblanced
# 3- Explain the confusion matrix for each


# Logistic Regression (Ghalia)
# SVM                 (Andrew)
# Random Forest ***    (Parra)
# KNN                  (Syed)
#Decision Trees
