In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import seaborn as sns
from sklearn import linear_model
from sklearn import preprocessing
from matplotlib import rcParams

%matplotlib inline
sns.set_style('darkgrid')
rcParams['figure.figsize'] = 10, 7

In [2]:
# load the data again. Keep air quality data, drop the index column and any missing data columns

df = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/credit-card-defaults.csv').iloc[:,1:].dropna()

In [3]:
df.head()

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.138947
3,No,No,529.250605,35704.493935
4,No,No,785.655883,38463.495879


In [4]:
# Recode strings to numeric.
df['default'] = np.where(df['default'] == 'Yes', 1, 0)
df['student'] = np.where(df['student'] == 'Yes', 1, 0)
names = df.columns
df = pd.DataFrame(preprocessing.scale(df), columns=names)

  """


In [5]:
df.head()

Unnamed: 0,default,student,balance,income
0,-0.185599,-0.645936,-0.218835,0.813187
1,-0.185599,1.548141,-0.037616,-1.605496
2,-0.185599,-0.645936,0.49241,-0.131212
3,-0.185599,-0.645936,-0.632893,0.164031
4,-0.185599,-0.645936,-0.102791,0.370915


In [6]:
# Define the training and test sizes.
trainsize = int(df.shape[0] / 2)
df_test = df.iloc[trainsize:, :].copy()
df_train = df.iloc[:trainsize, :].copy()

Y_train = df_train['income'].values.reshape(-1, 1)
X_train = df_train.loc[:, ~(df_train.columns).isin(['income'])]

In [7]:
# Make some new features to capture potential quadratic and cubic relationshps
df_train['balance_student'] = df_train['balance'] * df['student']
df_train['balance_default'] = df_train['balance'] * df['default']
df_train['student_default'] = df['student'] * df['default']
df_train['balance_sqrt'] = (df['balance'] + 100) * .5
df_train['balance2'] = (df['balance'] + 100) ** 2
df_train['balance3'] = (df['balance'] + 100) ** 3

In [8]:
X_train2 = df_train.loc[:, ~(df_train.columns).isin(['income'])]

In [9]:
# Test the simpler mdoel with smaller coefficients
Y_test = df_test['income'].values.reshape(-1, 1)
X_test = df_test.loc[:, ~(df_test.columns).isin(['income'])]



In [10]:
df_test['balance_student'] = df_test['balance'] * df_test['student']
df_test['balance_default'] = df_test['balance'] * df_test['default']
df_test['student_default'] = df_test['student'] * df_test['default']
df_test['balance_sqrt'] = (df_test['balance'] + 100) ** .5
df_test['balance2'] = (df_test['balance'] + 100) ** 2
df_test['balance3'] = (df_test['balance'] + 100) ** 3
X_test2 = df_test.loc[:, ~(df_test.columns).isin(['income'])]

In [11]:
# Small number of parameters.
lass = linear_model.Lasso(alpha=.35)
lassfit = lass.fit(X_train, Y_train)
print('R² for the model with few features:')
print(lass.score(X_train, Y_train))
origparams = np.append(lassfit.coef_, lassfit.intercept_)
print('\nParameter estimates for the model with few features:')
print(origparams)

# Large number of parameters.
lassBig = linear_model.Lasso(alpha=.35)
lassBig.fit(X_train2, Y_train)
print('\nR² for the model with many features:')
print(lassBig.score(X_train2, Y_train))
origparams = np.append(lassBig.coef_, lassBig.intercept_)
print('\nParameter estimates for the model with many features:')
print(origparams)

R² for the model with few features:
0.4500625793011847

Parameter estimates for the model with few features:
[-0.         -0.40657726 -0.          0.00114596]

R² for the model with many features:
0.44363376712897057

Parameter estimates for the model with many features:
[ 0.00000000e+00 -3.89351238e-01  0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00  0.00000000e+00 -2.77688887e-04
 -7.09158792e-07  3.48711577e+00]


In [12]:
print(lass.score(X_test, Y_test))

print(lassBig.score(X_test2, Y_test))

0.44553225151184195
0.4380466345914473


In [15]:
# store estimates
#estimates = np.append(ridg)

for lambd in range(1, 50, 2):
    ridge = linear_model.Ridge(alpha=lambd, fit_intercept=False)
    lasso = linear_model.Lasso(alpha=lambd/50, fit_intercept=False)
    ridge.fit(X_train2, Y_train)
    lasso.fit(X_train2, Y_train)
    print('Ridge Regression with lambda of {}:'.format(lambd))
    print(ridge.score(X_test2, Y_test))
    print('Lasso Regression with lambda of {}:'.format(lambd/50))
    print(lasso.score(X_test2,Y_test))
    print()



Ridge Regression with lambda of 1:
-2.8309454262006724
Lasso Regression with lambda of 0.02:
0.5632640522994256





Ridge Regression with lambda of 3:
-0.6947304657561983
Lasso Regression with lambda of 0.06:
0.5607762796171338





Ridge Regression with lambda of 5:
-0.08419838160290949
Lasso Regression with lambda of 0.1:
0.5550404902689986





Ridge Regression with lambda of 7:
0.17066471989873966
Lasso Regression with lambda of 0.14:
0.5460450917255846





Ridge Regression with lambda of 9:
0.30058290609959026
Lasso Regression with lambda of 0.18:
0.533790083833302





Ridge Regression with lambda of 11:
0.3755743579649563
Lasso Regression with lambda of 0.22:
0.518275469357292





Ridge Regression with lambda of 13:
0.4227052205801725
Lasso Regression with lambda of 0.26:
0.49950124598939566





Ridge Regression with lambda of 15:
0.4542210575661558
Lasso Regression with lambda of 0.3:
0.4774674137722105





Ridge Regression with lambda of 17:
0.47631482682561654
Lasso Regression with lambda of 0.34:
0.4521739661286972





Ridge Regression with lambda of 19:
0.49239058411352143
Lasso Regression with lambda of 0.38:
0.4236209081062016





Ridge Regression with lambda of 21:
0.5044447183187497
Lasso Regression with lambda of 0.42:
0.39180823970472467





Ridge Regression with lambda of 23:
0.5137104786112433
Lasso Regression with lambda of 0.46:
0.35673596369469374





Ridge Regression with lambda of 25:
0.5209828385529729
Lasso Regression with lambda of 0.5:
0.31840412256479267





Ridge Regression with lambda of 27:
0.5267928170065537
Lasso Regression with lambda of 0.54:
0.27681263192516115





Ridge Regression with lambda of 29:
0.53150598954985
Lasso Regression with lambda of 0.58:
0.2319615316228171





Ridge Regression with lambda of 31:
0.5353806454263735
Lasso Regression with lambda of 0.62:
0.18385082165775257





Ridge Regression with lambda of 33:
0.5386034323002917
Lasso Regression with lambda of 0.66:
0.1324805020299753





Ridge Regression with lambda of 35:
0.5413119200034826
Lasso Regression with lambda of 0.7:
0.07785057273948115





Ridge Regression with lambda of 37:
0.5436092918807767
Lasso Regression with lambda of 0.74:
0.019961033786270233





Ridge Regression with lambda of 39:
0.5455741538091633
Lasso Regression with lambda of 0.78:
0.003965926190190094





Ridge Regression with lambda of 41:
0.5472672296089427
Lasso Regression with lambda of 0.82:
0.003956285849836871





Ridge Regression with lambda of 43:
0.548736023083988
Lasso Regression with lambda of 0.86:
0.0039466425518782655





Ridge Regression with lambda of 45:
0.5500181215068853
Lasso Regression with lambda of 0.9:
0.003936996296315387





Ridge Regression with lambda of 47:
0.5511435765126644
Lasso Regression with lambda of 0.94:
0.003927347083147348

Ridge Regression with lambda of 49:
0.55213664366349
Lasso Regression with lambda of 0.98:
0.003917694912374925



