In [36]:
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt

## Reading the data file
credit_card = pd.read_excel('default of credit card clients.xls', skiprows = 1)
credit_card.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [37]:
## Splitting the data into train and test
train = credit_card.groupby('default payment next month', group_keys = False).apply(lambda x: x.sample(frac = 0.8))
test = credit_card[~np.isin(credit_card['ID'], train['ID'])]

## Dropping ID 
train = train.drop(columns = 'ID', axis = 1)
test = test.drop(columns = 'ID', axis = 1)

test_target = test['default payment next month']
test = test.drop(columns = 'default payment next month', axis = 1)

## Writing the data to csv 
train.to_csv('train.csv', index = False)
test.to_csv('test.csv', index = False)
test_target.to_csv('test_target.csv', index = False)

## Answering Questions

In [1]:
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier

## Reading data files
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# ## Chaning labels
train['SEX'] = np.where(train['SEX'] == 1, 'male', 'female')
train['EDUCATION'] = np.where(train['EDUCATION'] == 1, 'graduate_school', 
                              np.where(train['EDUCATION'] == 2, 'university',
                                       np.where(train['EDUCATION'] == 3, 'high_school', 'other_unknown')))
train['MARRIAGE'] = np.where(train['MARRIAGE'] == 1, 'married', 
                             np.where(train['MARRIAGE'] == 2, 'single', 'other'))


test['SEX'] = np.where(test['SEX'] == 1, 'male', 'female')
test['EDUCATION'] = np.where(test['EDUCATION'] == 1, 'graduate_school', 
                              np.where(test['EDUCATION'] == 2, 'university',
                                       np.where(test['EDUCATION'] == 3, 'high_school', 'other_unknown')))
test['MARRIAGE'] = np.where(test['MARRIAGE'] == 1, 'married', 
                             np.where(test['MARRIAGE'] == 2, 'single', 'other'))

## Creating ID columns
train['ID'] = list(range(1, train.shape[0] + 1))

## Splitting the data into train and test
training = train.groupby('default payment next month', group_keys = False).apply(lambda x: x.sample(frac = 0.8))
testing = train[~np.isin(train['ID'], training['ID'])]

## Dropping ID
training = training.drop(columns = 'ID', axis = 1)
testing = testing.drop(columns = 'ID', axis = 1)

In [2]:
X = train.drop(columns = 'default payment next month', axis = 1)
Y = train['default payment next month']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

In [3]:
X_train.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,ID
16417,80000,male,graduate_school,single,40,-1,-1,0,0,0,0,1068,37102,38112,40079,41040,42994,37200,2000,3000,2000,3000,2000,16418
2934,250000,male,university,single,26,0,0,0,0,0,0,115497,114716,117212,112623,110178,108176,3835,4068,3859,3597,3822,3825,2935
1834,30000,male,university,married,45,-1,-1,-1,-1,-1,-1,390,390,390,390,0,780,390,390,390,0,780,0,1835
23311,20000,female,university,single,25,0,0,0,0,0,0,15389,16397,17195,17499,16236,10400,1270,1341,1160,556,600,0,23312
21399,50000,female,high_school,other,53,0,0,0,0,0,0,49326,50302,50036,48816,19587,18406,1803,2100,1500,1000,700,600,21400


In [4]:
X_train['PAY_6'].value_counts() / X_train.shape[0]

 0    0.541927
-1    0.192396
-2    0.164323
 2    0.091094
 3    0.005781
 4    0.001823
 7    0.001510
 6    0.000677
 5    0.000417
 8    0.000052
Name: PAY_6, dtype: float64

In [41]:
X_test['PAY_6'].value_counts() / X_test.shape[0]

 0    0.552708
-1    0.191458
-2    0.161042
 2    0.083542
 3    0.006250
 7    0.002083
 4    0.001875
 5    0.000833
 6    0.000208
Name: PAY_6, dtype: float64

In [5]:
training['SEX'].value_counts()

female    11667
male       7533
Name: SEX, dtype: int64

In [14]:
training.head(10)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
16864,20000,female,graduate_school,single,24,-1,-1,-1,-1,-1,0,4687,9060,5064,14745,6805,6805,15000,6004,14776,10000,0,7436,0
5177,50000,female,high_school,married,58,0,0,0,0,0,0,50019,18458,19318,15949,14986,19401,1400,1229,687,2000,5000,1000,0
1203,20000,male,university,single,54,0,0,0,0,0,0,37720,12266,8999,8810,10167,11710,1200,1200,1600,1500,1700,1000,0
17522,230000,female,graduate_school,married,46,-2,-2,-2,-2,-2,-2,4957,9463,7697,5308,2048,6283,9467,7697,5308,2048,6283,18157,0
11631,100000,male,graduate_school,single,32,0,0,0,0,0,0,26650,26127,26608,24689,25699,26809,2000,2000,2000,1700,1500,1500,0
14136,50000,female,university,single,48,1,4,3,2,2,2,10024,9723,9427,9075,10037,12503,0,0,0,1000,0,264,0
1743,200000,female,graduate_school,married,39,-1,-1,-1,-1,0,0,11399,395,5094,9204,9403,4602,400,9500,9250,5000,0,5000,0
10309,290000,male,university,married,38,-1,-1,-1,-1,-1,0,3137,7999,2948,-6097,11037,17834,8042,2980,2777,17941,10002,10000,0
3800,170000,female,graduate_school,single,39,0,0,0,0,0,-1,60514,33711,21660,23307,15135,3527,5011,5000,6538,4167,3527,7521,0
12365,110000,male,university,single,29,0,0,0,0,0,0,31327,31495,27682,28418,29442,30213,2000,1800,1500,1500,1400,5002,0


In [9]:
training['PAY_0'].value_counts() / training.shape[0]

 0    0.488281
-1    0.193021
 1    0.121302
-2    0.093281
 2    0.088854
 3    0.010729
 4    0.002344
 5    0.000937
 8    0.000573
 6    0.000365
 7    0.000313
Name: PAY_0, dtype: float64

In [10]:
testing['PAY_0'].value_counts() / testing.shape[0]

 0    0.498542
-1    0.181250
 1    0.124167
-2    0.094167
 2    0.088542
 3    0.009375
 4    0.002708
 8    0.000625
 5    0.000417
 6    0.000208
Name: PAY_0, dtype: float64

In [13]:
test['PAY_0'].value_counts() / test.shape[0]

 0    0.494833
-1    0.185000
 1    0.127167
 2    0.089333
-2    0.086000
 3    0.011833
 4    0.003000
 5    0.001000
 8    0.000833
 7    0.000500
 6    0.000500
Name: PAY_0, dtype: float64

In [7]:
testing.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,400000,male,graduate_school,single,32,0,0,0,0,0,0,55773,55917,51389,48272,49478,51242,3028,3023,3000,3000,3000,38662,0
3,280000,female,university,married,27,0,0,0,0,0,0,280913,283222,273160,257689,193231,191143,11052,9563,15017,5374,5420,6021,0
15,60000,female,graduate_school,single,24,0,0,0,0,0,0,57802,58960,60862,59279,59176,58128,2960,2866,2279,2276,2128,2237,0
27,230000,female,graduate_school,single,25,0,0,0,0,0,-2,32915,21726,12299,5860,0,0,4794,3116,5029,0,0,0,0
38,200000,female,graduate_school,single,28,0,0,0,0,0,0,52935,51971,51341,49090,50009,51076,2500,2200,2500,3500,3000,3000,0


In [12]:
training['SEX'].value_counts() / training.shape[0]

2    0.606094
1    0.393906
Name: SEX, dtype: float64

In [14]:
testing['SEX'].value_counts() / testing.shape[0]

2    0.609167
1    0.390833
Name: SEX, dtype: float64

In [16]:
training['EDUCATION'].value_counts() / training.shape[0]

2    0.470729
1    0.349844
3    0.163854
5    0.009479
4    0.003750
6    0.001719
0    0.000625
Name: EDUCATION, dtype: float64

In [17]:
testing['EDUCATION'].value_counts() / testing.shape[0]

2    0.466042
1    0.359583
3    0.161042
5    0.008542
4    0.003542
6    0.001042
0    0.000208
Name: EDUCATION, dtype: float64

In [18]:
training['MARRIAGE'].value_counts() / training.shape[0]

2    0.527760
1    0.459375
3    0.011146
0    0.001719
Name: MARRIAGE, dtype: float64

In [19]:
testing['MARRIAGE'].value_counts() / testing.shape[0]

2    0.542500
1    0.445208
3    0.010625
0    0.001667
Name: MARRIAGE, dtype: float64

In [21]:
training['PAY_2'].value_counts() / training.shape[0]

 0    0.525104
-1    0.202396
 2    0.131823
-2    0.123802
 3    0.011198
 4    0.003385
 1    0.000885
 7    0.000625
 5    0.000469
 6    0.000313
Name: PAY_2, dtype: float64

In [22]:
testing['PAY_2'].value_counts() / testing.shape[0]

 0    0.518125
-1    0.206042
-2    0.138542
 2    0.122708
 3    0.009167
 4    0.002083
 5    0.001667
 1    0.000625
 7    0.000417
 6    0.000417
 8    0.000208
Name: PAY_2, dtype: float64

In [24]:
train['PAY_0'].value_counts()

 0    11768
-1     4576
 1     2925
-2     2243
 2     2131
 3      251
 4       58
 5       20
 8       14
 6        8
 7        6
Name: PAY_0, dtype: int64

In [5]:
## Chaning labels


## Changing labels to dummies 
training = pd.concat([training.drop(columns = 'SEX', axis = 1), pd.get_dummies(training['SEX'])], axis = 1)
training.head()

Unnamed: 0,LIMIT_BAL,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,1,2
802,20000,3,1,51,2,0,0,0,0,0,14815,15837,16852,17188,17546,17895,1264,1279,615,635,639,664,0,1,0
9908,50000,2,1,45,0,0,0,0,0,0,49073,12630,9215,9215,8011,5615,2022,2000,0,1000,1904,1664,0,0,1
483,20000,2,2,36,0,0,2,2,3,2,11423,13976,13455,15227,14699,14313,2749,0,2000,0,0,1230,0,1,0
2566,160000,1,1,53,0,0,0,0,0,0,52607,53548,56214,57246,58443,59883,2834,3500,2500,2207,2500,2300,0,0,1
11838,70000,2,1,34,0,0,0,0,0,0,50488,47018,46871,41821,41344,41376,2200,2015,2000,1500,1700,2000,0,0,1


In [None]:
## Strong heredity