## Question 5

In [20]:
import numpy as np
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize,
                         poly)
from sklearn.model_selection import train_test_split
from functools import partial
from sklearn.model_selection import \
     (cross_validate,
      KFold,
      ShuffleSplit)
from sklearn.base import clone
from ISLP.models import sklearn_sm
import pandas as pd

In [21]:
Default = load_data('Default')

In [22]:
Default.columns

Index(['default', 'student', 'balance', 'income'], dtype='object')

In [23]:
vars = Default.columns.drop(['default', 'student'])
design = MS(vars)
X = design.fit_transform(Default)
y = Default.default == 'Yes'
glm = sm.GLM(y,
             X,
             family = sm.families.Binomial())
results = glm.fit()
summarize(results)

Unnamed: 0,coef,std err,z,P>|z|
intercept,-11.5405,0.435,-26.544,0.0
balance,0.0056,0.0,24.835,0.0
income,2.1e-05,5e-06,4.174,0.0


In [24]:
seed = np.random.seed(0)

In [25]:
Default_train, Default_valid = train_test_split(Default, test_size = 5000, random_state=0)

In [26]:
m_train = MS(vars)
X_train = m_train.fit_transform(Default_train)
y_train = Default_train.default == 'Yes'
model = sm.GLM(y_train, X_train, family = sm.families.Binomial())
results = model.fit()

In [27]:
X_valid = m_train.fit_transform(Default_valid)
y_valid = Default_train.default == 'Yes'
valid_pred = results.predict(X_valid)
labels = np.array(['No']*5000)
labels[valid_pred>0.5] = 'Yes'
np.mean(labels != Default_valid.default)

0.0398

In [31]:
Default_train_numeric = pd.get_dummies(Default_train, columns = ['student'])
vars_2 = Default_train_numeric.columns.drop(['default'])
m_train = MS(vars_2)
X_train = m_train.fit_transform(Default_train_numeric)
y_train = Default_train.default == 'Yes'
model = sm.GLM(y_train, X_train, family = sm.families.Binomial())
results = model.fit()
summarize(results)

Unnamed: 0,coef,std err,z,P>|z|
intercept,-7.2943,0.437,-16.704,0.0
balance,0.0057,0.0,16.74,0.0
income,-9e-06,1.2e-05,-0.732,0.464
student_No,-3.2012,0.304,-10.522,0.0
student_Yes,-4.093,0.249,-16.435,0.0


In [34]:
Default_valid_numeric = pd.get_dummies(Default_valid, columns = ['student'])
X_valid = m_train.fit_transform(Default_valid_numeric)
y_valid = Default_train_numeric.default == 'Yes'
valid_pred = results.predict(X_valid)
labels = np.array(['No']*5000)
labels[valid_pred>0.5] = 'Yes'
np.mean(labels != Default_valid_numeric.default)

0.0398

## Question 6

In [35]:
del vars, design, X, y, glm, results

In [100]:
Default_numeric['default'] = (Default['default'] == 'Yes').astype(int)
Default_numeric['student'] = (Default['student'] == 'Yes').astype(int)
Default_numeric

Unnamed: 0,default,balance,income,student_Yes,student
0,0,729.526495,44361.625074,0,0
1,0,817.180407,12106.134700,1,1
2,0,1073.549164,31767.138947,0,0
3,0,529.250605,35704.493935,0,0
4,0,785.655883,38463.495879,0,0
...,...,...,...,...,...
9995,0,711.555020,52992.378914,0,0
9996,0,757.962918,19660.721768,0,0
9997,0,845.411989,58636.156984,0,0
9998,0,1569.009053,36669.112365,0,0


In [91]:
vars = Default_numeric.columns.drop(['default', 'student', 'student_Yes'])
design = MS(vars)
X = design.fit_transform(Default_numeric)
y = Default.default == 'Yes'
glm = sm.GLM(y,
             X,
             family = sm.families.Binomial())
results = glm.fit()
summarize(results)

Unnamed: 0,coef,std err,z,P>|z|
intercept,-11.5405,0.435,-26.544,0.0
balance,0.0056,0.0,24.835,0.0
income,2.1e-05,5e-06,4.174,0.0


In [92]:
def boot_fn(model_matrix, response, D, idx):
    D_ = D.loc[idx]
    Y_ = D_[response]
    X_ = clone(model_matrix).fit_transform(D_)
    return sm.GLM(Y_, X_, family = sm.families.Binomial()).fit().params

In [93]:
hp_func = partial(boot_fn, MS(vars), 'default')

In [94]:
rng = np.random.default_rng(0)
np.array([hp_func(Default_numeric,
          rng.choice(10000,
                     10000,
                     replace=True)) for _ in range(10)])

array([[-1.16416373e+01,  5.73877605e-03,  1.87775777e-05],
       [-1.27619965e+01,  6.16200434e-03,  3.20594655e-05],
       [-1.12850364e+01,  5.61832222e-03,  1.59221870e-05],
       [-1.09975828e+01,  5.41168597e-03,  1.40723398e-05],
       [-1.13173469e+01,  5.70216361e-03,  1.12728778e-05],
       [-1.17516107e+01,  5.83443562e-03,  1.85974460e-05],
       [-1.12884834e+01,  5.53172383e-03,  1.52822182e-05],
       [-1.13883312e+01,  5.70192972e-03,  1.73720495e-05],
       [-1.11098351e+01,  5.28010522e-03,  2.33921172e-05],
       [-1.10505563e+01,  5.46083916e-03,  1.50937413e-05]])

In [95]:
hp_se = boot_SE(hp_func,
                Default_numeric,
                B=1000,
                seed=10)
hp_se

intercept    0.425280
balance      0.000227
income       0.000005
dtype: float64

In [105]:
Default = load_data('Default')
Default_numeric = pd.get_dummies(Default, columns=['student'], drop_first=True)
Default_numeric['default'] = (Default['default'] == 'Yes').astype(int)
Default_numeric

Unnamed: 0,default,balance,income,student_Yes
0,0,729.526495,44361.625074,0
1,0,817.180407,12106.134700,1
2,0,1073.549164,31767.138947,0
3,0,529.250605,35704.493935,0
4,0,785.655883,38463.495879,0
...,...,...,...,...
9995,0,711.555020,52992.378914,0
9996,0,757.962918,19660.721768,0
9997,0,845.411989,58636.156984,0
9998,0,1569.009053,36669.112365,0


## Question 7