In [36]:
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt

## Reading the data file
credit_card = pd.read_excel('default of credit card clients.xls', skiprows = 1)
credit_card.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [37]:
## Splitting the data into train and test
train = credit_card.groupby('default payment next month', group_keys = False).apply(lambda x: x.sample(frac = 0.8))
test = credit_card[~np.isin(credit_card['ID'], train['ID'])]

## Dropping ID 
train = train.drop(columns = 'ID', axis = 1)
test = test.drop(columns = 'ID', axis = 1)

test_target = test['default payment next month']
test = test.drop(columns = 'default payment next month', axis = 1)

## Writing the data to csv 
train.to_csv('train.csv', index = False)
test.to_csv('test.csv', index = False)
test_target.to_csv('test_target.csv', index = False)

## Answering Questions

In [1]:
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier

## Reading data files
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# ## Chaning labels
train['SEX'] = np.where(train['SEX'] == 1, 'male', 'female')
train['EDUCATION'] = np.where(train['EDUCATION'] == 1, 'graduate_school', 
                              np.where(train['EDUCATION'] == 2, 'university',
                                       np.where(train['EDUCATION'] == 3, 'high_school', 'other_unknown')))
train['MARRIAGE'] = np.where(train['MARRIAGE'] == 1, 'married', 
                             np.where(train['MARRIAGE'] == 2, 'single', 'other'))


test['SEX'] = np.where(test['SEX'] == 1, 'male', 'female')
test['EDUCATION'] = np.where(test['EDUCATION'] == 1, 'graduate_school', 
                              np.where(test['EDUCATION'] == 2, 'university',
                                       np.where(test['EDUCATION'] == 3, 'high_school', 'other_unknown')))
test['MARRIAGE'] = np.where(test['MARRIAGE'] == 1, 'married', 
                             np.where(test['MARRIAGE'] == 2, 'single', 'other'))

## Creating ID columns
train['ID'] = list(range(1, train.shape[0] + 1))

## Splitting the data into train and test
training = train.groupby('default payment next month', group_keys = False).apply(lambda x: x.sample(frac = 0.8))
testing = train[~np.isin(train['ID'], training['ID'])]

## Dropping ID
training = training.drop(columns = 'ID', axis = 1)
testing = testing.drop(columns = 'ID', axis = 1)

In [2]:
X = train.drop(columns = 'default payment next month', axis = 1)
Y = train['default payment next month']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

In [3]:
X_train.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,ID
16417,80000,male,graduate_school,single,40,-1,-1,0,0,0,0,1068,37102,38112,40079,41040,42994,37200,2000,3000,2000,3000,2000,16418
2934,250000,male,university,single,26,0,0,0,0,0,0,115497,114716,117212,112623,110178,108176,3835,4068,3859,3597,3822,3825,2935
1834,30000,male,university,married,45,-1,-1,-1,-1,-1,-1,390,390,390,390,0,780,390,390,390,0,780,0,1835
23311,20000,female,university,single,25,0,0,0,0,0,0,15389,16397,17195,17499,16236,10400,1270,1341,1160,556,600,0,23312
21399,50000,female,high_school,other,53,0,0,0,0,0,0,49326,50302,50036,48816,19587,18406,1803,2100,1500,1000,700,600,21400


In [4]:
X_train['PAY_6'].value_counts() / X_train.shape[0]

 0    0.541927
-1    0.192396
-2    0.164323
 2    0.091094
 3    0.005781
 4    0.001823
 7    0.001510
 6    0.000677
 5    0.000417
 8    0.000052
Name: PAY_6, dtype: float64

In [5]:
X_test['PAY_6'].value_counts() / X_test.shape[0]

 0    0.542083
-1    0.188333
-2    0.165417
 2    0.092917
 3    0.005833
 4    0.002292
 7    0.001458
 6    0.001042
 5    0.000417
 8    0.000208
Name: PAY_6, dtype: float64

In [6]:
training['SEX'].value_counts()

female    11683
male       7517
Name: SEX, dtype: int64

In [7]:
training.head(10)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
5074,80000,female,university,married,45,-1,-1,-1,-1,-1,-1,390,390,390,390,390,390,390,390,390,390,390,390,0
3012,20000,male,university,single,22,1,-2,-2,-1,0,0,-50,-440,-830,780,780,0,0,0,2000,0,0,0,0
15890,100000,male,university,single,38,0,0,0,0,0,0,95933,96772,97240,98159,97481,97574,4499,4578,4606,3817,3998,3695,0
15931,50000,male,university,married,40,0,0,0,0,0,0,50536,9853,11353,12143,11753,11922,1200,4000,2000,2000,1000,1000,0
5480,70000,female,high_school,single,24,0,0,0,-1,0,0,8391,10242,11026,2342,2390,0,2000,1000,2342,48,0,0,0
121,80000,male,high_school,single,33,0,0,0,0,0,0,55424,56598,57724,58892,61991,63224,2068,2081,2113,5000,2264,2342,0
8080,190000,female,high_school,married,42,0,0,0,0,0,0,113516,116157,115639,115849,118313,119973,6000,5581,4237,4423,5000,8888,0
4304,360000,female,graduate_school,married,33,1,-2,-2,-1,0,0,-200,-200,0,13467,17800,0,0,200,13467,5300,0,0,0
18079,400000,female,university,married,45,-2,-2,-1,-1,-1,-1,10985,2441,16046,2483,10719,1722,2441,16046,2483,10719,1722,2836,0
10187,500000,male,graduate_school,single,31,1,-2,-1,0,0,0,0,0,399,35528,10389,0,0,399,35129,0,0,1307,0


In [8]:
training['PAY_0'].value_counts() / training.shape[0]

 0    0.489479
-1    0.191667
 1    0.123594
-2    0.094427
 2    0.086146
 3    0.010729
 4    0.002292
 5    0.000625
 8    0.000417
 7    0.000313
 6    0.000313
Name: PAY_0, dtype: float64

In [9]:
testing['PAY_0'].value_counts() / testing.shape[0]

 0    0.493750
-1    0.186667
 1    0.115000
 2    0.099375
-2    0.089583
 3    0.009375
 4    0.002917
 5    0.001667
 8    0.001250
 6    0.000417
Name: PAY_0, dtype: float64

In [10]:
test['PAY_0'].value_counts() / test.shape[0]

 0    0.494833
-1    0.185000
 1    0.127167
 2    0.089333
-2    0.086000
 3    0.011833
 4    0.003000
 5    0.001000
 8    0.000833
 7    0.000500
 6    0.000500
Name: PAY_0, dtype: float64

In [11]:
testing.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
2,270000,female,university,single,32,0,0,0,0,0,0,59710,49986,104390,94856,86461,83650,1808,69563,2891,2689,3012,2771,0
5,50000,female,university,married,34,1,2,0,0,0,0,49378,48052,48573,28902,28205,30388,0,2033,1119,985,2000,4000,0
9,100000,female,graduate_school,single,31,-2,-2,-2,-2,-2,-2,1500,3794,-1678,-1478,1058,5556,3794,0,200,4058,5738,21173,0
10,50000,female,university,single,24,0,0,0,0,0,0,50870,49831,47162,47696,36476,32654,1985,2316,1850,1500,1200,5196,0
11,50000,female,university,single,39,0,0,-1,-1,0,0,25254,26366,510,16121,17539,19088,2000,510,16121,2000,2000,998,0


In [12]:
training['SEX'].value_counts() / training.shape[0]

female    0.60849
male      0.39151
Name: SEX, dtype: float64

In [13]:
testing['SEX'].value_counts() / testing.shape[0]

female    0.599583
male      0.400417
Name: SEX, dtype: float64

In [14]:
training['EDUCATION'].value_counts() / training.shape[0]

university         0.470677
graduate_school    0.352969
high_school        0.161094
other_unknown      0.015260
Name: EDUCATION, dtype: float64

In [15]:
testing['EDUCATION'].value_counts() / testing.shape[0]

university         0.466250
graduate_school    0.347083
high_school        0.172083
other_unknown      0.014583
Name: EDUCATION, dtype: float64

In [16]:
training['MARRIAGE'].value_counts() / training.shape[0]

single     0.533177
married    0.453802
other      0.013021
Name: MARRIAGE, dtype: float64

In [17]:
testing['MARRIAGE'].value_counts() / testing.shape[0]

single     0.520833
married    0.467500
other      0.011667
Name: MARRIAGE, dtype: float64

In [18]:
training['PAY_2'].value_counts() / training.shape[0]

 0    0.522760
-1    0.202604
 2    0.130625
-2    0.128802
 3    0.009896
 4    0.002760
 1    0.000990
 5    0.000677
 7    0.000417
 6    0.000417
 8    0.000052
Name: PAY_2, dtype: float64

In [19]:
testing['PAY_2'].value_counts() / testing.shape[0]

 0    0.527500
-1    0.205208
 2    0.127500
-2    0.118542
 3    0.014375
 4    0.004583
 7    0.001250
 5    0.000833
 1    0.000208
Name: PAY_2, dtype: float64

In [20]:
train['PAY_0'].value_counts()

 0    11768
-1     4576
 1     2925
-2     2243
 2     2131
 3      251
 4       58
 5       20
 8       14
 6        8
 7        6
Name: PAY_0, dtype: int64

In [5]:
## Chaning labels


## Changing labels to dummies 
training = pd.concat([training.drop(columns = 'SEX', axis = 1), pd.get_dummies(training['SEX'])], axis = 1)
training.head()

Unnamed: 0,LIMIT_BAL,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,1,2
802,20000,3,1,51,2,0,0,0,0,0,14815,15837,16852,17188,17546,17895,1264,1279,615,635,639,664,0,1,0
9908,50000,2,1,45,0,0,0,0,0,0,49073,12630,9215,9215,8011,5615,2022,2000,0,1000,1904,1664,0,0,1
483,20000,2,2,36,0,0,2,2,3,2,11423,13976,13455,15227,14699,14313,2749,0,2000,0,0,1230,0,1,0
2566,160000,1,1,53,0,0,0,0,0,0,52607,53548,56214,57246,58443,59883,2834,3500,2500,2207,2500,2300,0,0,1
11838,70000,2,1,34,0,0,0,0,0,0,50488,47018,46871,41821,41344,41376,2200,2015,2000,1500,1700,2000,0,0,1


In [None]:
## Strong heredity