In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

In [5]:
# Load the data again. Keep air quality data, drop the index column
# and any missing data columns.
df = pd.read_csv(
    'https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/credit-card-defaults.csv'
).iloc[:,1:].dropna()

# Recode strings to numeric.
df['default'] = np.where(df['default']=='Yes', 1, 0)
df['student'] = np.where(df['student']=='Yes', 1, 0)
names = df.columns
df = pd.DataFrame(preprocessing.scale(df), columns=names)

# Define the training and test sizes.
trainsize = int(df.shape[0] / 2)
df_test = df.iloc[trainsize:, :].copy()
df_train = df.iloc[:trainsize, :].copy()

Y_train = df_train['income'].values.reshape(-1, 1)
X_train = df_train.loc[:, ~(df_train.columns).isin(['income'])]

# Make some new features to capture potential quadratic and cubic
# relationships between solar radiation and day or temperature.
df_train['balance_student'] = df_train['balance'] * df_train['student']
df_train['balance_default'] = df_train['balance'] * df_train['default']
df_train['student_default'] = df_train['student'] * df_train['default']
df_train['balance_sqrt'] = (df_train['balance'] + 100) ** .5
df_train['balance2'] = (df_train['balance'] + 100) ** 2
df_train['balance3'] = (df_train['balance'] + 100) ** 3

X_train2 = df_train.loc[:, ~(df_train.columns).isin(['income'])]

# Test the simpler model with smaller coefficients.
Y_test = df_test['income'].values.reshape(-1, 1)
X_test = df_test.loc[:, ~(df_test.columns).isin(['income'])]

# Test the more complex model with larger coefficients.
df_test['balance_student'] = df_test['balance'] * df_test['student']
df_test['balance_default'] = df_test['balance'] * df_test['default']
df_test['student_default'] = df_test['student'] * df_test['default']
df_test['balance_sqrt'] = (df_test['balance'] + 100) ** .5
df_test['balance2'] = (df_test['balance'] + 100) ** 2
df_test['balance3'] = (df_test['balance'] + 100) ** 3
X_test2 = df_test.loc[:, ~(df_test.columns).isin(['income'])]

In [14]:
print('Features in X_train: {0}'.format(len(X_train.columns)))
print('Features in X_train2: {0}'.format(len(X_train2.columns)))

Features in X_train: 3
Features in X_train2: 9


# Testing differnt values lambda

In [47]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score
from itertools import product

alphas = (0, 0.01, 0.1, 0.5, 1, 5, 10)
for i in alphas:
    lassBig = Lasso(alpha=i)
    lassBig.fit(X_train2, Y_train)
    print('\nModel R² for lambda={0}:'.format(i))
    print(lassBig.score(X_train2, Y_train))
    print('Test R² for lambda={0}'.format(i))
    print(lassBig.score(X_test2, Y_test))
    origparams = np.append(lassBig.coef_, lassBig.intercept_)
    print('Parameter estimates:')
    print(origparams)

  
  positive)
  positive)



Model R² for lambda=0:
0.5739444483155421
Test R² for lambda=0
0.5631593020584263
Parameter estimates:
[-2.06111494e-03 -7.58818422e-01  6.04810006e-02 -3.53546655e-03
  9.26788633e-03 -3.79292001e-03  1.76197742e-01 -2.05761745e-04
 -9.11532536e-07  1.20391468e+00]

Model R² for lambda=0.01:
0.5737681044618193
Test R² for lambda=0.01
0.5633389141249676
Parameter estimates:
[ 0.00000000e+00 -7.49175475e-01  0.00000000e+00 -0.00000000e+00
  4.64868670e-03 -0.00000000e+00  0.00000000e+00 -0.00000000e+00
 -1.45217294e-09 -1.52452354e-03]

Model R² for lambda=0.1:
0.5630894936716484
Test R² for lambda=0.1
0.5546727927200348
Parameter estimates:
[ 0.00000000e+00 -6.54093117e-01  0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00  0.00000000e+00 -8.12596514e-05
 -6.65147234e-08  8.78456768e-01]

Model R² for lambda=0.5:
0.3082496073309633
Test R² for lambda=0.5
0.3027330888299672
Parameter estimates:
[ 0.00000000e+00 -2.30499742e-01  0.00000000e+00 -0.00000000e+00
  0.00000000e

# Conclusion
It looks like a very small value for lambda is preferable for this particular dataset. In fact, the highest test R2 was achieved with lambda=0.