## EDA/Cleaning 
(at least four)
(1) distplot/histogram
(2) dtypes
(3) isnull
(4) get_dummies
(5) map
(6) labelencoder

In [1]:
import pandas as pd
# Location of the raw credit-default CSV.
# NOTE(review): this is an absolute, machine-specific path; point DATA_PATH at your own copy of the file.
DATA_PATH = '/users/nick/desktop/CreditDefault.csv'
credit = pd.read_csv(DATA_PATH)

In [2]:
credit.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2.0,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2.0,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2.0,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2.0,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2.0,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [3]:
# Use the customer ID as the row index so it is not treated as a feature.
credit = credit.set_index('ID')

* 1 Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit. 
* 2 Gender (1 = male; 2 = female). 
* 3 Education (1 = graduate school; 2 = university; 3 = high school; 4 = others). 
* 4 Marital status (1 = married; 2 = single; 3 = others). 
* 5 Age (year). 
* 6 = the repayment status in September, 2005
* 7:11 = the repayment status in August, 2005; . . .;X11 = the repayment status in April, 2005. The measurement scale for the repayment status is: -1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above. 
* 12 = amount of bill statement in September, 2005; 
* 13 = amount of bill statement in August, 2005; . . .; X17 = amount of bill statement in April, 2005. 
* 18 = amount paid in September, 2005
* 19 = amount paid in August, 2005
* 20 = amount paid in July, 2005
* 21 = amount paid in June, 2005
* 22 = amount paid in May, 2005
* 23 = amount paid in April, 2005

In [4]:
# The raw coding is female = 2, male = 1. Recode to a 0/1 indicator (female = 1, anything else = 0) so the
# column reads as a category flag rather than an ordinal scale.
# Run this cell once only: after the recode no value equals 2, so a second run would set every row to 0.
credit['SEX'] = (credit['SEX'] == 2).astype('int64')

In [5]:
# Shorten the unwieldy target column name to 'DEFAULT'.

credit = credit.rename(columns={'default payment next month': 'DEFAULT'})

In [6]:
# Frankly, this output is too monstrous to really parse. But the key is to look at the last column.
# Defaulting is strongly correlated with payment status (PAY_0 through PAY_6). 

credit.corr()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT
LIMIT_BAL,1.0,0.024755,-0.21865,-0.108139,0.144713,-0.271214,-0.296382,-0.286123,-0.26746,-0.249411,...,0.293988,0.295562,0.290389,0.195236,0.178408,0.210167,0.203242,0.217202,0.219595,-0.15352
SEX,0.024755,1.0,0.017863,-0.031389,-0.090874,-0.057643,-0.070771,-0.066096,-0.060173,-0.055064,...,-0.02188,-0.017005,-0.016733,-0.000242,-0.001391,-0.008597,-0.002229,-0.001667,-0.002766,-0.039961
EDUCATION,-0.21865,0.017863,1.0,-0.14761,0.18024,0.117955,0.113306,0.100012,0.09442,0.08414,...,-0.016039,-0.02327,-0.024548,-0.046238,-0.037953,-0.048034,-0.046838,-0.047481,-0.043411,0.036872
MARRIAGE,-0.108139,-0.031389,-0.14761,1.0,-0.41417,0.019917,0.024199,0.032688,0.033122,0.035629,...,-0.023344,-0.025393,-0.021207,-0.005979,-0.008093,-0.003541,-0.012659,-0.001205,-0.006641,-0.024339
AGE,0.144713,-0.090874,0.18024,-0.41417,1.0,-0.039447,-0.050148,-0.053048,-0.049722,-0.053826,...,0.051353,0.049345,0.047613,0.026147,0.021785,0.029247,0.021379,0.02285,0.019478,0.01389
PAY_0,-0.271214,-0.057643,0.117955,0.019917,-0.039447,1.0,0.672164,0.574245,0.538841,0.509426,...,0.179125,0.180635,0.17698,-0.079269,-0.070101,-0.070561,-0.064005,-0.05819,-0.058673,0.324794
PAY_2,-0.296382,-0.070771,0.113306,0.024199,-0.050148,0.672164,1.0,0.766552,0.662067,0.62278,...,0.222237,0.221348,0.219403,-0.080701,-0.05899,-0.055901,-0.046858,-0.037093,-0.0365,0.263551
PAY_3,-0.286123,-0.066096,0.100012,0.032688,-0.053048,0.574245,0.766552,1.0,0.777359,0.686775,...,0.227202,0.225145,0.222327,0.001295,-0.066793,-0.053311,-0.046067,-0.035863,-0.035861,0.235253
PAY_4,-0.26746,-0.060173,0.09442,0.033122,-0.049722,0.538841,0.662067,0.777359,1.0,0.819835,...,0.245917,0.242902,0.239154,-0.009362,-0.001944,-0.069235,-0.043461,-0.03359,-0.026565,0.216614
PAY_5,-0.249411,-0.055064,0.08414,0.035629,-0.053826,0.509426,0.62278,0.686775,0.819835,1.0,...,0.271915,0.269783,0.262509,-0.006089,-0.003191,0.009062,-0.058299,-0.033337,-0.023027,0.204149


In [7]:
credit.EDUCATION.value_counts()

2.0    12939
1.0     9110
3.0     4500
5.0      256
4.0      103
6.0       43
0.0       13
Name: EDUCATION, dtype: int64

In [None]:
'''Re: education values, the codebook detailed (1 = graduate school; 2 = university; 3 = high school; 4 = others).
That leaves me in the dark re: 0, 5, 6. I'm going to lump them with 4 ('others'), which will, if nothing else, 
aid interpretability.'''

In [8]:
# Default rate per education level: DEFAULT is 0/1, so the group mean is the share of defaulters.
credit.groupby('EDUCATION')['DEFAULT'].mean()

EDUCATION
0.0    0.000000
1.0    0.182547
2.0    0.237035
3.0    0.251333
4.0    0.048544
5.0    0.070312
6.0    0.162791
Name: DEFAULT, dtype: float64

In [9]:
# Fold the undocumented education codes (0, 5, 6) into 4 ('others'); all other values, including missing
# ones, are left unchanged.
credit['EDUCATION'] = credit['EDUCATION'].replace([0, 5, 6], 4)

In [10]:
credit.isnull().sum()

LIMIT_BAL       0
SEX             0
EDUCATION    3036
MARRIAGE        0
AGE             0
PAY_0           0
PAY_2           0
PAY_3           0
PAY_4           0
PAY_5           0
PAY_6           0
BILL_AMT1       0
BILL_AMT2    2543
BILL_AMT3       0
BILL_AMT4       0
BILL_AMT5       0
BILL_AMT6       0
PAY_AMT1        0
PAY_AMT2        0
PAY_AMT3        0
PAY_AMT4        0
PAY_AMT5        0
PAY_AMT6        0
DEFAULT         0
dtype: int64

In [11]:
# Rows with a missing EDUCATION are dropped rather than imputed. The dataset is unbalanced (only about a
# quarter of observations defaulted), and the worry is that filling gaps with the column median would pull
# the predictions for those rows towards not defaulting.

credit = credit.dropna(subset=['EDUCATION'], how='any')

In [12]:
# Same treatment for the other column with gaps: drop rows whose BILL_AMT2 is missing.
credit = credit.dropna(subset=['BILL_AMT2'], how='any')


In [13]:
# Default rate per education level after recoding and dropping rows with missing values.

credit.groupby('EDUCATION')['DEFAULT'].mean()

EDUCATION
1.0    0.182553
2.0    0.239032
3.0    0.253434
4.0    0.075758
Name: DEFAULT, dtype: float64

In [14]:
# Overall default rate: the mean of the 0/1 DEFAULT column, as a plain Python float.

float(credit['DEFAULT'].mean())

0.22016516228154406

## Get Dummies

In [15]:
credit.MARRIAGE.value_counts()

2    13922
1    11777
3      292
0       44
Name: MARRIAGE, dtype: int64

In [16]:
# Default rate by marital status (1 = married, 2 = single, 3 = others, plus an undocumented 0).

credit.groupby('MARRIAGE')['DEFAULT'].mean()

MARRIAGE
0    0.113636
1    0.232997
2    0.208806
3    0.260274
Name: DEFAULT, dtype: float64

In [17]:
# The codebook does not say what 0 means, so it is lumped into 3 ('others').
# Category 0 is small, so this is a relatively minor manipulation.

credit['MARRIAGE'] = credit['MARRIAGE'].replace(0, 3)

In [18]:
# Default rate by marital status after folding code 0 into 3.
credit.groupby('MARRIAGE')['DEFAULT'].mean()

MARRIAGE
1    0.232997
2    0.208806
3    0.241071
Name: DEFAULT, dtype: float64

In [19]:
# One-hot encode the two multi-level categorical columns; every other column passes through unchanged.

categorical_columns = ['EDUCATION', 'MARRIAGE']
credit = pd.get_dummies(credit, columns=categorical_columns)

In [20]:
# Drop one dummy per encoded variable so the remaining indicators are read relative to a reference level.
# Education 2 and Marriage 2 are used as the reference levels because they had the greatest number of
# observations.

credit = credit.drop(['EDUCATION_2.0', 'MARRIAGE_2'], axis=1)

In [21]:
# Goal: make the did-they-default column the last column of the frame. A newly added column is appended at
# the end, so copy the target under a new name here; the original column is removed in the next cell.

credit = credit.assign(default=credit['DEFAULT'])

In [22]:
# The target now exists as 'default', so the original 'DEFAULT' column is redundant.

del credit['DEFAULT']

In [23]:
credit.head()

Unnamed: 0_level_0,LIMIT_BAL,SEX,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,...,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,EDUCATION_1.0,EDUCATION_3.0,EDUCATION_4.0,MARRIAGE_1,MARRIAGE_3,default
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,20000,1,24,2,2,-1,-1,-2,-2,3913,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,1
2,120000,1,26,-1,2,0,0,0,2,2682,...,1000,1000,0,2000,0.0,0.0,0.0,0.0,0.0,1
3,90000,1,34,0,0,0,0,0,0,29239,...,1000,1000,1000,5000,0.0,0.0,0.0,0.0,0.0,0
4,50000,1,37,0,0,0,0,0,0,46990,...,1200,1100,1069,1000,0.0,0.0,0.0,1.0,0.0,0
5,50000,0,57,-1,0,-1,0,0,0,8617,...,10000,9000,689,679,0.0,0.0,0.0,1.0,0.0,0


## Model Prep
(three)
(1) Train, Test Split (Required)
(2) Regularization
(3) Standardization
(4) Feature Scaling
(5) Feature Engineering (making new features)

## KNN

In [29]:
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn import metrics

In [25]:
features = list(credit.columns)

In [26]:
features.remove('default')

In [27]:
# Feature matrix (every column named in `features`) and the binary target.
X = credit.loc[:, features]
y = credit['default']

In [28]:
# Scale your data! (I'm not scaling y since y is binary.) I know that some X values are also binary, but I read here
# http://stats.stackexchange.com/questions/37511/normalization-of-categorical-factor-variables
# that while unnecessary, "Normalizing categorical variables ought not have any real effect."
#
# NOTE(review): the scaling is applied to all of X before the train/test split in the next cell, so the
# scaling statistics are computed from rows that later end up in the test set. Consider splitting first and
# fitting the scaler on the training rows only. After this line X is the array returned by
# preprocessing.scale, no longer the DataFrame built above.

X = preprocessing.scale(X)

In [30]:
# The dataset is unbalanced, so the split is stratified on y: the training and test sets each keep a
# proportional number of 0s (didn't default) and 1s (did default). random_state pins the shuffle.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1)

In [31]:
# Set n_neighbors equal to one, just to get this thing up and running.

knn = KNeighborsClassifier(n_neighbors=1)

In [32]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [33]:
# Baseline to beat: guessing that nobody ever defaults would already score about 78% accuracy.

y_pred_class = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred_class))

0.734675065294


In [34]:
# Sweep the number of neighbors over k = 1 .. 24 (range(1, 25) stops before 25; the ceiling is arbitrary)
# and record each model's accuracy on the test split, keyed by k.
# NOTE(review): k is chosen using X_test / y_test, the same split used to report accuracy afterwards, so the
# score reported for the chosen k is optimistic. Consider cross-validating on the training split instead.

friendly_neighbors = {}

for k in range(1, 25):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred_class = knn.predict(X_test)
    friendly_neighbors[k] = metrics.accuracy_score(y_test, y_pred_class)

# The k with the highest recorded accuracy.
max(friendly_neighbors, key=lambda k: friendly_neighbors[k])

20

In [35]:
# Run the model again with the best number of neighbors found by the sweep. The value is read back from
# `friendly_neighbors` (same selection rule as the sweep cell) instead of being typed in by hand, so this
# cell stays correct if the data or the split changes. Notice the improved score.

best_k = max(friendly_neighbors, key=friendly_neighbors.get)
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
y_pred_class = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred_class))

0.81364264864


## Logistic Regression

In [36]:
# Trying all this again, this time with logistic regression. This will be a bare-bones logistic regression.

from sklearn.linear_model import LogisticRegression

In [37]:
# Bare-bones logistic regression with default settings (seed fixed), fit on the training split and used to
# predict a class for every test row.
logreg = LogisticRegression(random_state=1).fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [39]:
# Predicted class probabilities for the test rows, one column per class.
class_probabilities = logreg.predict_proba(X_test)
y_pp = pd.DataFrame(class_probabilities, columns=['class_0_pp', 'class_1_pp'])
print(y_pp.head())

   class_0_pp  class_1_pp
0    0.930533    0.069467
1    0.955002    0.044998
2    0.810401    0.189599
3    0.880879    0.119121
4    0.927735    0.072265


In [40]:
y_pp['pred_class'] = y_pred
y_pp.pred_class.value_counts()

0    5877
1     632
Name: pred_class, dtype: int64

In [None]:
# In my testing set, I predicted that 632 of 6509 (9.7%) would default. Not very good ... remember, the overall 
# average is around 22%.

In [119]:
from sklearn.metrics import classification_report, confusion_matrix

In [45]:
print classification_report(y_test, y_pred, target_names=['Did Not Default', 'Defaulted'])

                 precision    recall  f1-score   support

Did Not Default       0.83      0.96      0.89      5076
      Defaulted       0.69      0.31      0.42      1433

    avg / total       0.80      0.82      0.79      6509



In [55]:
# Confusion matrix with the positive class (1 = defaulted) listed first: rows are the actual outcome,
# columns are the predicted outcome.
# The counts and the labelled table get their own names, and the function is called through the `metrics`
# module. Assigning the result to `confusion_matrix` would replace the imported function with a DataFrame
# and make any later call to confusion_matrix(...) fail.
default_counts = metrics.confusion_matrix(y_test, y_pred, labels=[1, 0])
confusion_df = pd.DataFrame(default_counts, index=['Defaulted', 'Did Not Default'], columns=['Predicted Default', 'Did Not Predict Default'])
print(confusion_df)

                 Predicted Default  Did Not Predict Default
Defaulted                      438                      995
Did Not Default                194                     4882


## Logistic Regression Cross-Validation

In [None]:
# Cross-validating to get an ideal C score ... C being the inverse of alpha, so a small C means you're 
# more harshly regularizing your model. I'm trying Lasso because I want some feature selection. 
# (The previous logistic regression used Ridge regression, which is the default setting.)

In [56]:
from sklearn.linear_model import LogisticRegressionCV
lr_lasso_cv = LogisticRegressionCV(solver='liblinear', penalty='l1')
lr_lasso_cv.fit(X_train, y_train)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
           refit=True, scoring=None, solver='liblinear', tol=0.0001,
           verbose=0)

In [57]:
# This is the selected C: the candidate value whose score, averaged across the cross-validation folds, was best (it is one of the grid values, not an average of C values).
lr_lasso_cv.C_

array([ 21.5443469])

In [58]:
# These are the feature coefficients of the final model, refit on the training data at the selected C (refit=True).
# Unfortunately -- no features were eliminated!
lr_lasso_cv.coef_

array([[-0.18224111, -0.06163715,  0.01372483,  0.68381342,  0.09208398,
         0.11412805,  0.02395349,  0.07121473,  0.02273099, -0.30982495,
         0.0565092 ,  0.19578602,  0.02834424, -0.01459704, -0.00686008,
        -0.15316131, -0.21262588, -0.00532253, -0.03612815, -0.00472619,
        -0.02214601,  0.04179621, -0.01783446, -0.12332364,  0.12583269,
         0.02028048]])

In [59]:
# These are the candidate C values that were tried; the C selected above is one of them.
lr_lasso_cv.Cs_

array([  1.00000000e-04,   7.74263683e-04,   5.99484250e-03,
         4.64158883e-02,   3.59381366e-01,   2.78255940e+00,
         2.15443469e+01,   1.66810054e+02,   1.29154967e+03,
         1.00000000e+04])

In [60]:
# These are the coefficients fit on each cross-validation fold for each candidate C (keyed by class label).
lr_lasso_cv.coefs_paths_

{1: array([[[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
            0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
            0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
            0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
            0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
            0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
            0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
            0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
            0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
         [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
            2.39500875e-01,   0.00000000e+00,   0.00000000e+00,
            0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
            0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
            0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
            0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
            0.00000000e+00,   0.0000

In [64]:
# Comparable to the KNN score! 
lr_lasso_cv.scores_

{1: array([[ 0.77984329,  0.78137963,  0.81333538,  0.81871255,  0.81963435,
          0.81963435,  0.81978799,  0.81978799,  0.81978799,  0.81978799],
        [ 0.77984329,  0.78137963,  0.81333538,  0.82040252,  0.81948072,
          0.81932709,  0.81932709,  0.81932709,  0.81932709,  0.81932709],
        [ 0.77980947,  0.78149969,  0.81484327,  0.81883835,  0.81929932,
          0.81960664,  0.81960664,  0.81960664,  0.81960664,  0.81960664]])}

In [66]:
# Refitting the logistic regression with an L1 (Lasso) penalty and the cross-validated C (C = 1/alpha).
lr_lasso = LogisticRegression(solver='liblinear', penalty='l1', C=lr_lasso_cv.C_[0])

In [67]:
from sklearn.cross_validation import cross_val_score

In [68]:
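# Mean average precision over 5 folds. A no-skill classifier scores roughly the positive-class rate (about 0.22 here) on this metric, so this beats a blind guess but is still modest.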
scores = cross_val_score(lr_lasso, X_train, y_train, cv=5, scoring='average_precision')
print np.mean(scores)

0.528905191852


In [70]:
# I want to see how many defaults this (amended) model will predict. This is a rehash of steps
# already seen: fit on the training split, predict the test split, and tabulate the predicted classes.

lr_lasso.fit(X_train, y_train)
y_pred = lr_lasso.predict(X_test)
lasso_probabilities = lr_lasso.predict_proba(X_test)
y_pp = pd.DataFrame(lasso_probabilities, columns=['class_0_pp', 'class_1_pp'])
y_pp['pred_class'] = y_pred
y_pp['pred_class'].value_counts()

0    5877
1     632
Name: pred_class, dtype: int64

In [None]:
# The result is the same -- the model predicts a default rate of 9.7% in the testing set. 

## Optimization 
(two)
(1) GridSearch
(2) One univariate feature selection method
(3) one recursive feature selection method

In [81]:
from sklearn.grid_search import GridSearchCV
logreg = LogisticRegression(random_state=1)

In [83]:
# Search grid: both penalty types crossed with a spread of C values (including the C found by
# LogisticRegressionCV, 21.5443469).
penalties = ['l1', 'l2']
C_vals = [0.0001, 0.001, 0.01, 0.1, .15, .25, .275, .33, 0.5, .66, 0.75, 1.0, 2.5, 5.0, 10.0, 21.5443469, 100.0, 1000.0]
param_grid = {'penalty': penalties, 'C': C_vals}
gs = GridSearchCV(logreg, param_grid)
gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.0001, 0.001, 0.01, 0.1, 0.15, 0.25, 0.275, 0.33, 0.5, 0.66, 0.75, 1.0, 2.5, 5.0, 10.0, 21.5443469, 100.0, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [88]:
gs.best_estimator_

LogisticRegression(C=0.15, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [89]:
gs.best_score_

0.81962511523097403

In [90]:
# This recommends the default penalty -- L2 (Ridge). It also recommended a different C value than the ~21 recommended
# from LogisticRegressionCV -- a much lower one, meaning harsher regularization.
gs.best_params_

{'C': 0.15, 'penalty': 'l2'}

In [98]:
# Once more, I want to know the number of predicted defaults in my test set to see if there's any major change.
# The penalty and C are read from the grid search result (`gs.best_params_`) rather than retyped, so this
# model always matches whatever the search selected.
lr_optimized = LogisticRegression(solver='liblinear', **gs.best_params_)
lr_optimized.fit(X_train, y_train)
y_pred = lr_optimized.predict(X_test)
y_pp = pd.DataFrame(lr_optimized.predict_proba(X_test), columns=['class_0_pp','class_1_pp'])
y_pp['pred_class'] = y_pred
y_pp.head()

Unnamed: 0,class_0_pp,class_1_pp,pred_class
0,0.929967,0.070033,0
1,0.954129,0.045871,0
2,0.80973,0.19027,0
3,0.880239,0.119761,0
4,0.92744,0.07256,0


In [99]:
# Frustratingly similar! 
y_pp.pred_class.value_counts()

0    5879
1     630
Name: pred_class, dtype: int64

## RFECV

In [100]:
# One last way to select features. 
from sklearn.feature_selection import RFECV

In [101]:
# Recursive feature elimination with cross-validation, wrapping the tuned logistic regression and
# removing one feature per step (step=1).
rfecv = RFECV(lr_optimized, step=1)
# `model` is whatever fit() returns -- presumably the fitted selector itself (scikit-learn convention;
# TODO confirm). The cells below read n_features_, ranking_, grid_scores_ and estimator_ from it.
model = rfecv.fit(X_train, y_train)

In [102]:
# Surprisingly, this tells me I should use only one feature! 
model.n_features_

1

In [104]:
model.ranking_

array([ 7, 12, 23,  1, 11,  3, 16, 10, 17,  4, 13,  5, 20, 25, 21,  6,  2,
       24, 15, 26, 18, 14, 22,  8,  9, 19])

In [105]:
model.grid_scores_

array([ 0.82653908,  0.82479764,  0.81645022,  0.81803773,  0.82131519,
        0.82172493,  0.82187858,  0.82105897,  0.82003474,  0.81988111,
        0.81860088,  0.81926663,  0.81885691,  0.81942027,  0.81911296,
        0.81901053,  0.81906174,  0.81942024,  0.81962511,  0.8195739 ,
        0.81967633,  0.81957389,  0.81936904,  0.81947147,  0.81957389,
        0.81962511])

In [106]:
model.estimator_

LogisticRegression(C=0.15, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [108]:
# Keep only the one feature RFECV ranked 1: the fourth entry of model.ranking_, which lines up with the
# fourth feature column, PAY_0.
# NOTE(review): the name is hard-coded from the output above, and this rebinds `features` and `X`, so the
# earlier full-feature versions are no longer available after this cell.
features = ['PAY_0']
X = credit[features]

In [109]:
X = preprocessing.scale(X)
y = credit.default

In [110]:
# Same pipeline as before, now on the single selected feature: stratified split, refit, then collect the
# predicted classes and class probabilities for the test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1)
lr_optimized.fit(X_train, y_train)
y_pred = lr_optimized.predict(X_test)
single_feature_probabilities = lr_optimized.predict_proba(X_test)
y_pp = pd.DataFrame(single_feature_probabilities, columns=['class_0_pp', 'class_1_pp'])
y_pp['pred_class'] = y_pred

In [111]:
y_pp.pred_class.value_counts()

0    5754
1     755
Name: pred_class, dtype: int64

In [None]:
# This resulted in a higher predicted default rate, 11.6%. Still far from perfect. 

## Results
(three)
(1) classification report
(2) confusion matrix
(3) accuracy score
(4) roc/auc
(5) precision/recall
(6) 'cross_val_score'
(7) error

In [114]:
print classification_report(y_test, y_pred, target_names=['Did Not Default', 'Defaulted'])

                 precision    recall  f1-score   support

Did Not Default       0.84      0.96      0.90      5076
      Defaulted       0.70      0.37      0.48      1433

    avg / total       0.81      0.83      0.80      6509



In [120]:
# Confusion matrix for the single-feature model, positive class (1 = defaulted) first: rows are the actual
# outcome, columns are the predicted outcome.
# The function is called through the `metrics` module and the result is stored under its own name, so this
# cell does not depend on what the bare name `confusion_matrix` is currently bound to and can be re-run.
single_feature_counts = metrics.confusion_matrix(y_test, y_pred, labels=[1, 0])
single_feature_confusion_df = pd.DataFrame(single_feature_counts, index=['Defaulted', 'Did Not Default'], columns=['Predicted Default', 'Did Not Predict Default'])
print(single_feature_confusion_df)

                 Predicted Default  Did Not Predict Default
Defaulted                      527                      906
Did Not Default                228                     4848
