<a href="https://colab.research.google.com/github/orimejia/Proyecto-AI/blob/main/03_Preprocesado_de_los_datos_ENTREGAFINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectFromModel, mutual_info_classif, f_classif, SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [2]:
# Location of the raw CSV (simulated_data.csv in the TaiwaneseBankruptcyPrediction repo).
url = 'https://raw.githubusercontent.com/HernanAC/TaiwaneseBankruptcyPrediction/main/simulated_data.csv'
# Download and parse the file into the working DataFrame used by every later cell.
data = pd.read_csv(filepath_or_buffer=url)

# **03 - Preprocesado de los datos**


In [3]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,1,MEDIUM,MEDIUM,MEDIUM,,0.601457,0.998969,0.796887,0.808809,LOW,...,HIGH,0.009219,0.622879,0.601453,0.82789,0.290202,0.026601,0.56405,1,0.016469
1,1,MEDIUM,MEDIUM,MEDIUM,0.610235,0.610235,0.998946,0.79738,0.809301,LOW,...,HIGH,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,1,MEDIUM,MEDIUM,MEDIUM,0.60145,0.601364,0.998857,0.796403,0.808388,LOW,...,HIGH,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474
3,1,MEDIUM,MEDIUM,MEDIUM,0.583541,0.583541,0.9987,0.796967,0.808966,LOW,...,HIGH,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1,0.023982
4,1,MEDIUM,MEDIUM,MEDIUM,0.598783,0.598783,0.998973,0.797366,0.809304,LOW,...,HIGH,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1,0.03549


## Total de datos nulos en el dataset

In [4]:
# Number of missing values in each column.
data.isna().sum(axis=0)

Bankrupt?                                                      0
 ROA(C) before interest and depreciation before interest       0
 ROA(A) before interest and % after tax                        0
 ROA(B) before interest and depreciation after tax             0
 Operating Gross Margin                                     1364
                                                            ... 
 Liability to Equity                                           0
 Degree of Financial Leverage (DFL)                            0
 Interest Coverage Ratio (Interest expense to EBIT)            0
 Net Income Flag                                               0
 Equity to Liability                                           0
Length: 96, dtype: int64

In [5]:
# Per-column missing-value counts, then only the columns that actually have some.
nulls = data.isnull().sum()
nulls.loc[nulls.ne(0)]

 Operating Gross Margin         1364
 Cash Turnover Rate             1364
 Current Liability to Equity    1364
dtype: int64

## Llenando los datos nulos con el promedio

In [6]:
# Replace the missing entries of each listed column with that column's mean,
# computed over the non-missing values of the whole dataset.
columns_with_nulls = [
    ' Operating Gross Margin',
    ' Cash Turnover Rate',
    ' Current Liability to Equity',
]
for column in columns_with_nulls:
    column_mean = data[column].mean()
    data[column] = data[column].fillna(column_mean)

In [7]:
# Total number of missing cells across the whole frame.
total_missing = data.isna().sum().sum()
print(total_missing)

0


## One hot encoding para columnas categóricas

In [8]:
# Columns stored with the generic `object` dtype are the ones to encode.
column_dtypes = data.dtypes
categorical_features2 = column_dtypes.loc[column_dtypes == 'object'].index
data.loc[:, categorical_features2].columns.tolist()

[' ROA(C) before interest and depreciation before interest',
 ' ROA(A) before interest and % after tax',
 ' ROA(B) before interest and depreciation after tax',
 ' Non-industry income and expenditure/revenue',
 ' Inventory Turnover Rate (times)',
 ' Working Capital/Equity',
 ' Cash Flow to Sales',
 ' Net Income to Total Assets']

In [9]:
# One-hot encode every categorical column in a single pass.
#
# The indicator columns are built once from the categorical columns and then
# attached to `data` by row index. Encoding eight separate copies of the frame
# and re-joining them with `pd.merge` (no `on=`) would join on *every* shared
# column value: rows that are exact duplicates of each other get multiplied at
# each merge, and the full frame is copied many times along the way.

# Categorical column -> prefix used for its indicator columns.
dummy_prefixes = {
    ' ROA(C) before interest and depreciation before interest': 'is_ROAC',
    ' ROA(A) before interest and % after tax': 'is_ROAA',
    ' ROA(B) before interest and depreciation after tax': 'is_ROAB',
    ' Non-industry income and expenditure/revenue': 'is_NIIER',
    ' Inventory Turnover Rate (times)': 'is_ITR',
    ' Working Capital/Equity': 'is_WCE',
    ' Cash Flow to Sales': 'is_CFS',
    ' Net Income to Total Assets': 'is_NITA',
}
categorical_columns = list(dummy_prefixes)

# Indicator columns only; same row index as `data`.
dummies = pd.get_dummies(
    data[categorical_columns],
    columns=categorical_columns,
    prefix=dummy_prefixes,
)

# The source categorical columns are deliberately kept at this point; they are
# removed in a separate step.
data = pd.concat([data, dummies], axis=1)
print(data.columns)

Index(['Bankrupt?', ' ROA(A) before interest and % after tax',
       ' ROA(B) before interest and depreciation after tax',
       ' Operating Gross Margin', ' Realized Sales Gross Margin',
       ' Operating Profit Rate', ' Pre-tax net Interest Rate',
       ' After-tax net Interest Rate',
       ' Non-industry income and expenditure/revenue',
       ' Continuous interest rate (after tax)',
       ...
       'is_ITR_MEDIUM', 'is_WCE_HIGH', 'is_WCE_LOW', 'is_WCE_MEDIUM',
       'is_CFS_HIGH', 'is_CFS_LOW', 'is_CFS_MEDIUM', 'is_NITA_HIGH',
       'is_NITA_LOW', 'is_NITA_MEDIUM'],
      dtype='object', length=120)


In [10]:
# Remove the raw categorical columns now that their indicator columns exist.
encoded_source_columns = [
    ' ROA(C) before interest and depreciation before interest',
    ' ROA(A) before interest and % after tax',
    ' ROA(B) before interest and depreciation after tax',
    ' Non-industry income and expenditure/revenue',
    ' Inventory Turnover Rate (times)',
    ' Working Capital/Equity',
    ' Cash Flow to Sales',
    ' Net Income to Total Assets',
]
data = data.drop(labels=encoded_source_columns, axis='columns')

In [11]:
# Re-list the object-dtype columns to check whether any are still present.
is_object_column = data.dtypes == 'object'
categorical_features2 = data.dtypes[is_object_column].index
data[categorical_features2].columns.to_list()

[]

In [12]:
# Show the dtype of every column in the current frame.
data.dtypes

Bankrupt?                         int64
 Operating Gross Margin         float64
 Realized Sales Gross Margin    float64
 Operating Profit Rate          float64
 Pre-tax net Interest Rate      float64
                                 ...   
is_CFS_LOW                        uint8
is_CFS_MEDIUM                     uint8
is_NITA_HIGH                      uint8
is_NITA_LOW                       uint8
is_NITA_MEDIUM                    uint8
Length: 112, dtype: object

In [13]:
# (rows, columns) of the current frame.
data.shape

(6819, 112)

In [15]:
# Display the current frame as the cell output.
data

Unnamed: 0,Bankrupt?,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Continuous interest rate (after tax),Operating Expense Rate,Research and development expense rate,Cash flow rate,...,is_ITR_MEDIUM,is_WCE_HIGH,is_WCE_LOW,is_WCE_MEDIUM,is_CFS_HIGH,is_CFS_LOW,is_CFS_MEDIUM,is_NITA_HIGH,is_NITA_LOW,is_NITA_MEDIUM
0,1,0.608066,0.601457,0.998969,0.796887,0.808809,0.780985,1.256969e-04,0.000000e+00,0.458143,...,0,1,0,0,1,0,0,1,0,0
1,1,0.610235,0.610235,0.998946,0.797380,0.809301,0.781506,2.897851e-04,0.000000e+00,0.461867,...,0,1,0,0,1,0,0,1,0,0
2,1,0.601450,0.601364,0.998857,0.796403,0.808388,0.780284,2.361297e-04,2.550000e+07,0.458521,...,0,1,0,0,1,0,0,1,0,0
3,1,0.583541,0.583541,0.998700,0.796967,0.808966,0.781241,1.078888e-04,0.000000e+00,0.465705,...,0,1,0,0,1,0,0,1,0,0
4,1,0.598783,0.598783,0.998973,0.797366,0.809304,0.781550,7.890000e+09,0.000000e+00,0.462746,...,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6814,0,0.604455,0.604462,0.998992,0.797409,0.809331,0.781588,1.510213e-04,4.500000e+09,0.463734,...,0,1,0,0,1,0,0,1,0,0
6815,0,0.608066,0.598308,0.998992,0.797414,0.809327,0.781586,5.220000e+09,1.440000e+09,0.461978,...,0,1,0,0,1,0,0,1,0,0
6816,0,0.610444,0.610213,0.998984,0.797401,0.809317,0.781546,2.509312e-04,1.039086e-04,0.472189,...,0,1,0,0,1,0,0,1,0,0
6817,0,0.607850,0.607850,0.999074,0.797500,0.809399,0.781663,1.236154e-04,2.510000e+09,0.476123,...,0,1,0,0,1,0,0,1,0,0


In [16]:
# Write the processed frame to disk; the row index is not saved as a column.
data.to_csv(path_or_buf='processed_data.csv', index=False)