In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier


In [2]:
df = pd.read_csv('train_data.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23999 entries, 0 to 23998
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   ID                          23999 non-null  int64
 1   LIMIT_BAL                   23999 non-null  int64
 2   SEX                         23999 non-null  int64
 3   EDUCATION                   23999 non-null  int64
 4   MARRIAGE                    23999 non-null  int64
 5   AGE                         23999 non-null  int64
 6   PAY_0                       23999 non-null  int64
 7   PAY_2                       23999 non-null  int64
 8   PAY_3                       23999 non-null  int64
 9   PAY_4                       23999 non-null  int64
 10  PAY_5                       23999 non-null  int64
 11  PAY_6                       23999 non-null  int64
 12  BILL_AMT1                   23999 non-null  int64
 13  BILL_AMT2                   23999 non-null  int64
 14  BILL_A

In [4]:
SEX={1:'male', 2:'female'}

In [5]:
df['SEX'] = df.SEX.replace(SEX)

In [6]:
df.SEX.value_counts()

female    14486
male       9513
Name: SEX, dtype: int64

In [7]:
EDUCATION={1:'graduate school', 2:'university', 3:'high school', 4:'others', 5:'unknown', 6:'unknown'}
df['EDUCATION'] = df.EDUCATION.replace(EDUCATION)
df.EDUCATION.value_counts()

university         11188
graduate school     8442
high school         3994
unknown              264
others               100
0                     11
Name: EDUCATION, dtype: int64

In [8]:
MARRIAGE={1:'married', 2:'single', 3:'others'}
df['MARRIAGE'] = df.MARRIAGE.replace(MARRIAGE)
df.MARRIAGE.value_counts()

single     12735
married    10958
others       263
0             43
Name: MARRIAGE, dtype: int64

In [9]:
df = df[df['MARRIAGE']!=0]
df = df[df['SEX']!=0]
df = df[df['EDUCATION']!=0]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23945 entries, 0 to 23998
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   ID                          23945 non-null  int64 
 1   LIMIT_BAL                   23945 non-null  int64 
 2   SEX                         23945 non-null  object
 3   EDUCATION                   23945 non-null  object
 4   MARRIAGE                    23945 non-null  object
 5   AGE                         23945 non-null  int64 
 6   PAY_0                       23945 non-null  int64 
 7   PAY_2                       23945 non-null  int64 
 8   PAY_3                       23945 non-null  int64 
 9   PAY_4                       23945 non-null  int64 
 10  PAY_5                       23945 non-null  int64 
 11  PAY_6                       23945 non-null  int64 
 12  BILL_AMT1                   23945 non-null  int64 
 13  BILL_AMT2                   23945 non-null  in

In [10]:
df['PAY_0'] = df['PAY_0'].map(str)
df['PAY_2'] = df['PAY_2'].map(str)
df['PAY_3'] = df['PAY_3'].map(str)
df['PAY_4'] = df['PAY_4'].map(str)
df['PAY_5'] = df['PAY_5'].map(str)
df['PAY_6'] = df['PAY_6'].map(str)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23945 entries, 0 to 23998
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   ID                          23945 non-null  int64 
 1   LIMIT_BAL                   23945 non-null  int64 
 2   SEX                         23945 non-null  object
 3   EDUCATION                   23945 non-null  object
 4   MARRIAGE                    23945 non-null  object
 5   AGE                         23945 non-null  int64 
 6   PAY_0                       23945 non-null  object
 7   PAY_2                       23945 non-null  object
 8   PAY_3                       23945 non-null  object
 9   PAY_4                       23945 non-null  object
 10  PAY_5                       23945 non-null  object
 11  PAY_6                       23945 non-null  object
 12  BILL_AMT1                   23945 non-null  int64 
 13  BILL_AMT2                   23945 non-null  in

In [12]:
X = df.drop(axis=1, labels=['ID', 'default payment next month'])

In [13]:
y =df['default payment next month']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [15]:
ohe_features = ['SEX', 'EDUCATION', 'MARRIAGE', 
                'PAY_0', 'PAY_2', 'PAY_3', 
                'PAY_4', 'PAY_5', 'PAY_6']

cont_features = ['LIMIT_BAL', 'AGE', 'BILL_AMT1',
                 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4',
                 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
                 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4',
                 'PAY_AMT5', 'PAY_AMT6']

X_train_ohe = X_train[ohe_features]
X_test_ohe = X_test[ohe_features]
X_train_cont = X_train[cont_features].astype(float)
X_test_cont = X_test[cont_features].astype(float)

X_train_index = X_train.index
X_test_index = X_test.index

ohe = OneHotEncoder(handle_unknown='ignore')
ss = StandardScaler()
X_train_encoded = ohe.fit_transform(X_train_ohe)
X_test_encoded = ohe.transform(X_test_ohe)
X_train_scaled = pd.DataFrame(ss.fit_transform(X_train_cont), columns=X_train[cont_features].columns, index=X_train_index)
X_test_scaled = pd.DataFrame(ss.transform(X_test_cont), columns=X_test[cont_features].columns, index=X_test_index)

train_columns = ohe.get_feature_names(input_features=X_train_ohe.columns)
test_columns = ohe.get_feature_names(input_features=X_test_ohe.columns)
X_train_processed = pd.DataFrame(X_train_encoded.todense(), columns=train_columns, index=X_train_index)
X_test_processed = pd.DataFrame(X_test_encoded.todense(), columns=test_columns, index=X_test_index)

X_train_all = pd.concat([X_train_scaled, X_train_processed], axis=1)
X_test_all = pd.concat([X_test_scaled, X_test_processed], axis=1)

In [16]:
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train_all, y_train)

RandomForestClassifier(n_estimators=500)

In [17]:
y_hat_train = rf.predict(X_train_all)
y_hat_test = rf.predict(X_test_all)

In [18]:
f1_score(y_test, y_hat_test)

0.4664750957854406

In [19]:
f1_score(y_train, y_hat_train)

0.9992443324937028

### Check for class imbalance

In [20]:
df['default payment next month'].value_counts()

0    18640
1     5305
Name: default payment next month, dtype: int64

In [21]:
from imblearn.over_sampling import SMOTE

In [22]:
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_sample(X_train_all, y_train)

In [24]:
rf_2 = RandomForestClassifier(n_estimators=500)
rf_2.fit(X_smote, y_smote)

RandomForestClassifier(n_estimators=500)

In [25]:
y_hat_train = rf_2.predict(X_smote)
y_hat_test = rf_2.predict(X_test_all)

In [26]:
f1_score(y_test, y_hat_test)

0.5117982099267697

In [27]:
f1_score(y_smote, y_hat_train)

0.9997854691075515