## UCI Machine Learning Repository: Default of Credit Card Clients Data Set

source: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients

In [1]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

# Lift pandas' column display limit so the wide frames shown below are not truncated.
pd.set_option("display.max_columns", None)

### Train/test split

In [2]:
# Load the raw UCI spreadsheet; header=1 takes the column labels from the
# second row of the sheet.
# The path is assembled with pathlib instead of a "Data\\..." literal so the
# notebook also runs on systems that do not use "\" as the path separator.
raw_path = Path("Data") / "data_original" / "default of credit card clients.xls"
df = pd.read_excel(raw_path, header=1)

# Invert the target: GOOD is 1 where "default payment next month" is 0.
df["GOOD"] = 1 - df["default payment next month"]
# Shift SEX down by one (presumably from a 1/2 coding to 0/1 — TODO confirm
# against the data set description).
df["SEX"] = df["SEX"] - 1
# The row identifier and the original target are no longer needed.
df = df.drop(columns=["ID", "default payment next month"])

df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,GOOD
0,20000,1,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,0
1,120000,1,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,0
2,90000,1,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,1
3,50000,1,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,1
4,50000,0,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,1


In [3]:
# Separate the features from the GOOD target and hold out 10,000 rows as the
# test set. stratify=y keeps the class ratio equal in both parts and
# random_state pins the shuffle so the split is reproducible.
X = df.drop(columns=["GOOD"])
y = df["GOOD"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=10000, random_state=42, stratify=y
)

# Export the four parts as CSV. The output folder is created first because
# to_csv does not create missing directories, and the path is built with
# pathlib so it is not tied to Windows "\" separators.
out_dir = Path("Data") / "data_preprocessed" / "taiwan_data"
out_dir.mkdir(parents=True, exist_ok=True)
for name, part in [
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
]:
    part.to_csv(out_dir / f"{name}.csv", index=False)

### Train/test split with binning

In [4]:
# Equal-frequency (quantile) binning of the continuous columns.
# Keys are column names, values the number of quantile bins; the bins of a
# column are labelled 1..n.
# NOTE(review): the bin edges are computed on the full data set, i.e. before
# the train/test split performed further down, so test rows influence the
# boundaries — confirm this is acceptable for the downstream experiments.
bins_per_column = {
    "LIMIT_BAL": 10,
    "AGE": 10,
    **{f"BILL_AMT{i}": 10 for i in range(1, 7)},
    **{f"PAY_AMT{i}": 5 for i in range(1, 4)},
    **{f"PAY_AMT{i}": 4 for i in range(4, 7)},
}

# Work on a copy so the unbinned frame stays available; every column is read
# from the original frame and written to the copy.
df_binned = df.copy()
for column, n_bins in bins_per_column.items():
    df_binned[column] = pd.qcut(df[column], n_bins, labels=range(1, n_bins + 1))

df_binned.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,GOOD
0,1,1,2,1,1,2,2,-1,-1,-2,-2,3,3,2,1,1,1,1,2,1,1,1,1,0
1,5,1,2,2,2,-1,2,0,0,0,2,3,3,3,3,3,4,1,2,2,2,1,3,0
2,4,1,2,2,5,0,0,0,0,0,0,6,5,5,5,5,5,2,2,2,2,2,4,1
3,2,1,2,1,6,0,0,0,0,0,0,7,7,8,6,7,7,3,3,2,2,2,2,1
4,2,0,2,1,10,-1,0,-1,0,0,0,4,4,7,6,6,6,3,5,5,4,2,2,1


In [5]:
# Same split as for the unbinned data, applied to the binned frame: 10,000
# test rows, stratified on GOOD, with the same random_state.
X = df_binned.drop(columns=["GOOD"])
y = df_binned["GOOD"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=10000, random_state=42, stratify=y
)

# Export the four parts as CSV. The output folder is created first because
# to_csv does not create missing directories, and the path is built with
# pathlib so it is not tied to Windows "\" separators.
out_dir = Path("Data") / "data_preprocessed" / "taiwan_data_binned"
out_dir.mkdir(parents=True, exist_ok=True)
for name, part in [
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
]:
    part.to_csv(out_dir / f"{name}.csv", index=False)