In [36]:
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt

## Reading the data file
credit_card = pd.read_excel('default of credit card clients.xls', skiprows = 1)
credit_card.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [37]:
## Splitting the data into train and test
train = credit_card.groupby('default payment next month', group_keys = False).apply(lambda x: x.sample(frac = 0.8))
test = credit_card[~np.isin(credit_card['ID'], train['ID'])]

## Dropping ID 
train = train.drop(columns = 'ID', axis = 1)
test = test.drop(columns = 'ID', axis = 1)

test_target = test['default payment next month']
test = test.drop(columns = 'default payment next month', axis = 1)

## Writing the data to csv 
train.to_csv('train.csv', index = False)
test.to_csv('test.csv', index = False)
test_target.to_csv('test_target.csv', index = False)

## Answering Questions

In [1]:
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier

## Reading data files
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## Chaning labels
train['SEX'] = np.where(train['SEX'] == 1, 'male', 'female')
train['EDUCATION'] = np.where(train['EDUCATION'] == 1, 'graduate_school', 
                              np.where(train['EDUCATION'] == 2, 'university',
                                       np.where(train['EDUCATION'] == 3), 'high_school', 'other_unknown'))
train['MARRIAGE'] = np.where(train['MARRIAGE'] == 1, 'married', 
                             np.where(train['MARRIAGE'] == 2, 'single', 'other'))

## Creating ID columns
train['ID'] = list(range(1, train.shape[0] + 1))

## Splitting the data into train and test
training = train.groupby('default payment next month', group_keys = False).apply(lambda x: x.sample(frac = 0.8))
testing = train[~np.isin(train['ID'], training['ID'])]

## Dropping ID
training = training.drop(columns = 'ID', axis = 1)
testing = testing.drop(columns = 'ID', axis = 1)

In [2]:
training.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
16540,150000,female,2,3,41,1,-2,-2,-1,-1,-1,0,0,0,3355,962,821,0,0,4026,962,821,380,0
3497,360000,female,1,2,27,0,0,0,0,0,0,119138,100553,96679,79703,71682,24230,20006,25006,20200,40000,0,0,0
4625,200000,female,1,2,29,1,-1,-1,-1,-1,-1,55788,8360,4897,5065,19468,0,8394,4927,5089,19561,0,125,0
7197,230000,female,1,2,24,1,2,0,0,2,0,20887,18705,17295,18335,9320,1858,0,1200,3000,0,254,1637,0
18127,50000,male,5,1,30,0,0,0,0,0,0,47311,44136,17865,6431,6939,4509,2900,1035,500,508,410,0,0


In [3]:
train['EDUCATION']

0        1
1        2
2        2
3        2
4        1
        ..
23995    2
23996    1
23997    2
23998    1
23999    3
Name: EDUCATION, Length: 24000, dtype: int64

In [5]:
## Chaning labels


## Changing labels to dummies 
training = pd.concat([training.drop(columns = 'SEX', axis = 1), pd.get_dummies(training['SEX'])], axis = 1)
training.head()

Unnamed: 0,LIMIT_BAL,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,1,2
802,20000,3,1,51,2,0,0,0,0,0,14815,15837,16852,17188,17546,17895,1264,1279,615,635,639,664,0,1,0
9908,50000,2,1,45,0,0,0,0,0,0,49073,12630,9215,9215,8011,5615,2022,2000,0,1000,1904,1664,0,0,1
483,20000,2,2,36,0,0,2,2,3,2,11423,13976,13455,15227,14699,14313,2749,0,2000,0,0,1230,0,1,0
2566,160000,1,1,53,0,0,0,0,0,0,52607,53548,56214,57246,58443,59883,2834,3500,2500,2207,2500,2300,0,0,1
11838,70000,2,1,34,0,0,0,0,0,0,50488,47018,46871,41821,41344,41376,2200,2015,2000,1500,1700,2000,0,0,1


In [None]:
## Strong heredity