# Lab 7 by Nicholas Fong

In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from sklearn import linear_model, cross_validation
from sklearn import grid_search

pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

%matplotlib inline
plt.style.use('ggplot')

In [2]:
# Read the bank-marketing data; the relative path means the CSV is looked up in the working directory.
df = pd.read_csv('bank-marketing.csv')

In [3]:
df.head()

Unnamed: 0,age,job,marital,education,default,...,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,...,1,-1,0,unknown,no
1,33,services,married,secondary,no,...,1,339,4,failure,no
2,35,management,single,tertiary,no,...,1,330,1,failure,no
3,30,management,married,tertiary,no,...,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,...,1,-1,0,unknown,no


The dataset is related to the direct marketing campaigns (by phone) of a Portuguese banking institution.  The classification goal is to predict whether the client will subscribe to a term deposit (variable `y`).

Attribute Information:

- Input variables:
  - [Bank client data]
    - `age` (numeric)
    - `job`: type of job (categorical)
    - `marital`: marital status (categorical)
      - Note: `divorced` means divorced or widowed
    - `education` (categorical)
    - `default`: has credit in default? (categorical)
    - `balance`: bank account balance (\$)
    - `housing`: has housing loan? (categorical)
    - `loan`: has personal loan? (categorical)
  - [Data related with the last contact of the current campaign]
    - `contact`: contact communication type (categorical) 
    - `month`: last contact month of year (categorical)
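    - `day`: last contact day (in this CSV the column is named `day`, not `day_of_week`, and takes 31 distinct values, i.e. it is the day of the month; it is treated as categorical below)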
    - `duration`: last contact duration, in seconds (numeric)
      - Important note: this attribute highly affects the output target (e.g., if `duration = 0` then `y = 'no'`). Yet, the duration is not known before a call is performed.  Also, after the end of the call y is obviously known.  Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.    
  - [Other attributes]
    - `campaign`: number of contacts performed during this campaign and for this client (numeric)
    - `pdays`: number of days that passed by after the client was last contacted from a previous campaign (numeric)
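      - 999 means client was not previously contacted (note: the rows shown by `df.head()` above have `pdays = -1` wherever `previous = 0`, so this CSV appears to use -1 for that case)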
    - `previous`: number of contacts performed before this campaign and for this client (numeric)
    - `poutcome`: outcome of the previous marketing campaign (categorical)

- Output variable (desired target):
  - `y`: has the client subscribed a term deposit? (binary)

## Our goal is to develop a model that best predicts the outcome `y`, the success of the marketing campaign

## Question 1: Remove the categorical variables with the most number of distinct values

In [4]:
df.columns #the column name is day, not day_of_week -_-

Index([u'age', u'job', u'marital', u'education', u'default', u'balance',
       u'housing', u'loan', u'contact', u'day', u'month', u'duration',
       u'campaign', u'pdays', u'previous', u'poutcome', u'y'],
      dtype='object')

In [5]:
for x in ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day', 'poutcome']:
    print x, len(df[x].value_counts())

job 12
marital 3
education 4
default 2
housing 2
loan 2
contact 3
month 12
day 31
poutcome 4


In [6]:
# `day` has the most distinct values (31), so remove that column and list what is left.
df = df.drop('day', axis = 1)
df.columns

Index([u'age', u'job', u'marital', u'education', u'default', u'balance',
       u'housing', u'loan', u'contact', u'month', u'duration', u'campaign',
       u'pdays', u'previous', u'poutcome', u'y'],
      dtype='object')

Answer: day has the most values among the categorical variables at 31, so we removed it

## Question 2: Recode all `yes`/`no` categorical variables with `0` as the most frequent value (then also append `"_no"` to the variable name), and `1` for the other (then leave the name unchanged)

In [7]:
# Recode each yes/no column so that its most frequent answer becomes 0 and the
# other answer becomes 1, then add a complementary `<name>_no` column.
for x in ['default', 'housing', 'loan', 'y']:
    counts = df[x].value_counts()
    # On a tie, 'no' is treated as the most frequent answer.
    if counts['no'] >= counts['yes']:
        codes = {'no': 0, 'yes': 1}
    else:
        codes = {'no': 1, 'yes': 0}
    # Assign the mapped column back rather than calling replace(..., inplace=True)
    # on df[x]: an in-place call on a column selection may act on a temporary copy
    # and leave df unchanged.
    df[x] = df[x].map(codes)
    df[x+'_no'] = 1 - df[x]

# Not fully sure what the question is asking, but I'm treating it as saying that the most common element is mapped
# to a 0, while the other element is mapped to a 1, and then the _no column is the opposite. This essentially creates
# dummy variables, but no other interpretation of this makes any sense.

In [8]:
df

Unnamed: 0,age,job,marital,education,default,...,y,default_no,housing_no,loan_no,y_no
0,30,unemployed,married,primary,0,...,0,1,0,1,1
1,33,services,married,secondary,0,...,0,1,1,0,1
2,35,management,single,tertiary,0,...,0,1,1,1,1
3,30,management,married,tertiary,0,...,0,1,1,0,1
4,59,blue-collar,married,secondary,0,...,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,0,...,0,1,1,1,1
4517,57,self-employed,married,tertiary,1,...,0,0,1,0,1
4518,57,technician,married,secondary,0,...,0,1,0,1,1
4519,28,blue-collar,married,secondary,0,...,0,1,0,1,1


## Question 3: Create dummy variables for the other categorical variables

In [9]:
# One-hot encode the remaining string-valued columns (job, marital, education,
# contact, month, poutcome); each is replaced by one 0/1 column per level.
df = pd.get_dummies(df)
df

Unnamed: 0,age,default,balance,housing,loan,...,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,30,0,1787,1,0,...,0.0,0.0,0.0,0.0,1.0
1,33,0,4789,0,1,...,0.0,1.0,0.0,0.0,0.0
2,35,0,1350,0,0,...,0.0,1.0,0.0,0.0,0.0
3,30,0,1476,0,1,...,0.0,0.0,0.0,0.0,1.0
4,59,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
4516,33,0,-333,0,0,...,0.0,0.0,0.0,0.0,1.0
4517,57,1,-3313,0,1,...,0.0,0.0,0.0,0.0,1.0
4518,57,0,295,1,0,...,0.0,0.0,0.0,0.0,1.0
4519,28,0,1137,1,0,...,0.0,0.0,1.0,0.0,0.0


## Question 4: What should be your baseline for these dummy variables (namely, which dummy variables should you not include in your model)?

In [10]:
df.columns

Index([u'age', u'default', u'balance', u'housing', u'loan', u'duration',
       u'campaign', u'pdays', u'previous', u'y', u'default_no', u'housing_no',
       u'loan_no', u'y_no', u'job_admin.', u'job_blue-collar',
       u'job_entrepreneur', u'job_housemaid', u'job_management',
       u'job_retired', u'job_self-employed', u'job_services', u'job_student',
       u'job_technician', u'job_unemployed', u'job_unknown',
       u'marital_divorced', u'marital_married', u'marital_single',
       u'education_primary', u'education_secondary', u'education_tertiary',
       u'education_unknown', u'contact_cellular', u'contact_telephone',
       u'contact_unknown', u'month_apr', u'month_aug', u'month_dec',
       u'month_feb', u'month_jan', u'month_jul', u'month_jun', u'month_mar',
       u'month_may', u'month_nov', u'month_oct', u'month_sep',
       u'poutcome_failure', u'poutcome_other', u'poutcome_success',
       u'poutcome_unknown'],
      dtype='object')

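Answer: For each set of dummy variables, one level has to be left out of the model as the baseline, because it is fully determined by the others (keeping all of them makes the set perfectly collinear with the intercept). `pd.get_dummies` has already replaced the original categorical columns, so we only need to drop one dummy per set, e.g. the first one in each (`job_admin.`, `marital_divorced`, `education_primary`, `contact_cellular`, `month_apr`, `poutcome_failure`), and likewise use only one column from each `x` / `x_no` pair. Separately, the month dummy variables probably have little impact on results, since there is no obvious reason why the month should affect `y`.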

## Question 5: What input variable in the dataset seems to predict the outcome quite well.  Why?

In [11]:
# Single-predictor least-squares fit of the 0/1 outcome y on age.
modelAge = smf.ols('y ~ age', data = df).fit()
modelAge.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.002
Model:,OLS,Adj. R-squared:,0.002
Method:,Least Squares,F-statistic:,9.207
Date:,"Wed, 08 Jun 2016",Prob (F-statistic):,0.00242
Time:,21:15:22,Log-Likelihood:,-1249.3
No. Observations:,4521,AIC:,2503.0
Df Residuals:,4519,BIC:,2515.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,0.0592,0.019,3.103,0.002,0.022 0.097
age,0.0014,0.000,3.034,0.002,0.000 0.002

0,1,2,3
Omnibus:,2038.978,Durbin-Watson:,1.949
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7070.73
Skew:,2.404,Prob(JB):,0.0
Kurtosis:,6.798,Cond. No.,171.0


In [12]:
# Single-predictor least-squares fit of y on default. Stored under its own
# name so the variable no longer claims to be the age model.
modelDefault = smf.ols(formula = 'y ~ default', data = df).fit()
modelDefault.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.007668
Date:,"Wed, 08 Jun 2016",Prob (F-statistic):,0.93
Time:,21:15:23,Log-Likelihood:,-1253.9
No. Observations:,4521,AIC:,2512.0
Df Residuals:,4519,BIC:,2525.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,0.1152,0.005,24.045,0.000,0.106 0.125
default,0.0032,0.037,0.088,0.930,-0.069 0.076

0,1,2,3
Omnibus:,2044.343,Durbin-Watson:,1.952
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7107.435
Skew:,2.41,Prob(JB):,0.0
Kurtosis:,6.808,Cond. No.,7.78


In [13]:
# Single-predictor least-squares fit of y on balance. Stored under its own
# name so the variable no longer claims to be the age model.
modelBalance = smf.ols(formula = 'y ~ balance', data = df).fit()
modelBalance.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,1.449
Date:,"Wed, 08 Jun 2016",Prob (F-statistic):,0.229
Time:,21:15:23,Log-Likelihood:,-1253.2
No. Observations:,4521,AIC:,2510.0
Df Residuals:,4519,BIC:,2523.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,0.1125,0.005,21.423,0.000,0.102 0.123
balance,1.9e-06,1.58e-06,1.204,0.229,-1.19e-06 4.99e-06

0,1,2,3
Omnibus:,2043.134,Durbin-Watson:,1.952
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7098.447
Skew:,2.409,Prob(JB):,0.0
Kurtosis:,6.805,Cond. No.,3680.0


In [14]:
# Single-predictor least-squares fit of y on loan. Stored under its own
# name so the variable no longer claims to be the age model.
modelLoan = smf.ols(formula = 'y ~ loan', data = df).fit()
modelLoan.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.005
Model:,OLS,Adj. R-squared:,0.005
Method:,Least Squares,F-statistic:,22.58
Date:,"Wed, 08 Jun 2016",Prob (F-statistic):,2.07e-06
Time:,21:15:24,Log-Likelihood:,-1242.6
No. Observations:,4521,AIC:,2489.0
Df Residuals:,4519,BIC:,2502.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,0.1248,0.005,24.244,0.000,0.115 0.135
loan,-0.0626,0.013,-4.752,0.000,-0.088 -0.037

0,1,2,3
Omnibus:,2025.876,Durbin-Watson:,1.951
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6967.602
Skew:,2.39,Prob(JB):,0.0
Kurtosis:,6.759,Cond. No.,2.85


In [15]:
# Single-predictor least-squares fit of y on duration. Stored under its own
# name so the variable no longer claims to be the age model.
modelDuration = smf.ols(formula = 'y ~ duration', data = df).fit()
modelDuration.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.161
Model:,OLS,Adj. R-squared:,0.161
Method:,Least Squares,F-statistic:,866.5
Date:,"Wed, 08 Jun 2016",Prob (F-statistic):,2.15e-174
Time:,21:15:24,Log-Likelihood:,-857.36
No. Observations:,4521,AIC:,1719.0
Df Residuals:,4519,BIC:,1732.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-0.0149,0.006,-2.399,0.016,-0.027 -0.003
duration,0.0005,1.67e-05,29.436,0.000,0.000 0.001

0,1,2,3
Omnibus:,1707.891,Durbin-Watson:,1.997
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5646.209
Skew:,1.953,Prob(JB):,0.0
Kurtosis:,6.836,Cond. No.,528.0


In [16]:
# Single-predictor least-squares fit of y on campaign. Stored under its own
# name so the variable no longer claims to be the age model.
modelCampaign = smf.ols(formula = 'y ~ campaign', data = df).fit()
modelCampaign.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.004
Model:,OLS,Adj. R-squared:,0.004
Method:,Least Squares,F-statistic:,16.96
Date:,"Wed, 08 Jun 2016",Prob (F-statistic):,3.89e-05
Time:,21:15:25,Log-Likelihood:,-1245.4
No. Observations:,4521,AIC:,2495.0
Df Residuals:,4519,BIC:,2508.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,0.1328,0.006,20.833,0.000,0.120 0.145
campaign,-0.0063,0.002,-4.118,0.000,-0.009 -0.003

0,1,2,3
Omnibus:,2030.203,Durbin-Watson:,1.951
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6997.906
Skew:,2.395,Prob(JB):,0.0
Kurtosis:,6.768,Cond. No.,5.77


In [17]:
# Single-predictor least-squares fit of y on pdays. Stored under its own
# name so the variable no longer claims to be the age model.
modelPdays = smf.ols(formula = 'y ~ pdays', data = df).fit()
modelPdays.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.011
Model:,OLS,Adj. R-squared:,0.011
Method:,Least Squares,F-statistic:,49.5
Date:,"Wed, 08 Jun 2016",Prob (F-statistic):,2.29e-12
Time:,21:15:25,Log-Likelihood:,-1229.3
No. Observations:,4521,AIC:,2463.0
Df Residuals:,4519,BIC:,2475.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,0.1020,0.005,20.073,0.000,0.092 0.112
pdays,0.0003,4.72e-05,7.035,0.000,0.000 0.000

0,1,2,3
Omnibus:,2013.13,Durbin-Watson:,1.955
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6897.309
Skew:,2.374,Prob(JB):,0.0
Kurtosis:,6.752,Cond. No.,116.0


In [18]:
# Single-predictor least-squares fit of y on previous. Stored under its own
# name so the variable no longer claims to be the age model.
modelPrevious = smf.ols(formula = 'y ~ previous', data = df).fit()
modelPrevious.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.014
Model:,OLS,Adj. R-squared:,0.013
Method:,Least Squares,F-statistic:,62.41
Date:,"Wed, 08 Jun 2016",Prob (F-statistic):,3.48e-15
Time:,21:15:26,Log-Likelihood:,-1222.9
No. Observations:,4521,AIC:,2450.0
Df Residuals:,4519,BIC:,2463.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,0.1033,0.005,20.853,0.000,0.094 0.113
previous,0.0220,0.003,7.900,0.000,0.017 0.027

0,1,2,3
Omnibus:,2002.222,Durbin-Watson:,1.956
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6842.626
Skew:,2.359,Prob(JB):,0.0
Kurtosis:,6.751,Cond. No.,1.94


In [19]:
# Joint least-squares fit of y on the six predictors that were individually
# significant above. Named for what it holds rather than reusing `modelAge`.
modelCombined = smf.ols(formula = 'y ~ age + loan + duration + campaign + pdays + previous', data = df).fit()
modelCombined.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.182
Model:,OLS,Adj. R-squared:,0.181
Method:,Least Squares,F-statistic:,167.1
Date:,"Wed, 08 Jun 2016",Prob (F-statistic):,1.95e-192
Time:,21:15:27,Log-Likelihood:,-800.42
No. Observations:,4521,AIC:,1615.0
Df Residuals:,4514,BIC:,1660.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-0.0700,0.019,-3.767,0.000,-0.106 -0.034
age,0.0014,0.000,3.411,0.001,0.001 0.002
loan,-0.0571,0.012,-4.774,0.000,-0.081 -0.034
duration,0.0005,1.66e-05,29.447,0.000,0.000 0.001
campaign,-0.0023,0.001,-1.666,0.096,-0.005 0.000
pdays,0.0002,5.27e-05,3.142,0.002,6.23e-05 0.000
previous,0.0145,0.003,4.651,0.000,0.008 0.021

0,1,2,3
Omnibus:,1595.136,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4977.825
Skew:,1.836,Prob(JB):,0.0
Kurtosis:,6.598,Cond. No.,1620.0


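Answer: Age, loan, duration, campaign, pdays, and previous are all individually statistically significant in predicting the outcome. When modeled together, all of these except campaign are still significant. `duration` is by far the strongest single predictor: on its own it gives an R-squared of 0.161, versus at most 0.014 for any other variable. This agrees with the attribute notes above: the duration of a call is only known once the call is over, at which point the outcome is known as well, so it predicts `y` well but should only be used as a benchmark.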

## Question 6: Split the dataset into a training set (60%) and a testing set (the rest)

In [20]:
# Draw a reproducible 60% sample of the rows for training, then put the rows
# back in index order. sort_index() replaces DataFrame.sort(), which is
# deprecated (it emitted a warning here) and is absent from later pandas releases.
train_df = df.sample(frac = .6, random_state = 0).sort_index()
train_df

  if __name__ == '__main__':


Unnamed: 0,age,default,balance,housing,loan,...,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
1,33,0,4789,0,1,...,0.0,1.0,0.0,0.0,0.0
2,35,0,1350,0,0,...,0.0,1.0,0.0,0.0,0.0
4,59,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0
5,35,0,747,1,0,...,0.0,1.0,0.0,0.0,0.0
6,36,0,307,0,0,...,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
4513,49,0,322,1,0,...,0.0,0.0,0.0,0.0,1.0
4515,32,0,473,0,0,...,0.0,0.0,0.0,0.0,1.0
4516,33,0,-333,0,0,...,0.0,0.0,0.0,0.0,1.0
4517,57,1,-3313,0,1,...,0.0,0.0,0.0,0.0,1.0


In [21]:
# Every row whose index label was not sampled into train_df goes to the test set.
test_df = df[~df.index.isin(train_df.index)]
test_df

Unnamed: 0,age,default,balance,housing,loan,...,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,30,0,1787,1,0,...,0.0,0.0,0.0,0.0,1.0
3,30,0,1476,0,1,...,0.0,0.0,0.0,0.0,1.0
7,39,0,147,0,0,...,0.0,0.0,0.0,0.0,1.0
12,36,0,1109,1,0,...,0.0,0.0,0.0,0.0,1.0
16,56,0,4073,1,0,...,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
4508,42,0,642,0,1,...,0.0,0.0,0.0,0.0,1.0
4509,51,0,2506,1,0,...,0.0,0.0,0.0,0.0,1.0
4514,38,0,1205,0,0,...,0.0,1.0,0.0,0.0,0.0
4518,57,0,295,1,0,...,0.0,0.0,0.0,0.0,1.0


## Question 7: Run a logistic regression with `age`, `marital` (the dummies), `default`, `balance`, `housing`, `loan`, `campaign`, `pdays`, `previous`?

In [22]:
X = train_df[['age', 'marital_divorced', 'marital_married', 'marital_single', 'default', 'balance', 'housing', 'loan', 'campaign', 'pdays', 'previous']]
model = linear_model.LogisticRegression()
model.fit(X, train_df.y)
print model.coef_
print model.intercept_
print 2.72**model.coef_

[[  1.14103273e-02  -4.89231027e-01  -1.01308413e+00  -4.78056638e-01
   -2.76511247e-02   2.42033210e-05   6.66138684e-01  -7.17604536e-01
   -5.87307447e-02   1.66121869e-03   7.67156289e-02]]
[-1.9803718]
[[ 1.01148297  0.61290817  0.36286505  0.61979983  0.97271067  1.00002422
   1.94752552  0.48769846  0.94292564  1.00166365  1.07978733]]


In [23]:
# For a classifier, score() is the mean accuracy on the given data, so the
# training error is 1 minus this value.
model.score(X, train_df.y)

0.88315517876889049

## Question 8: What is your training error?  What is your generalization error?  Does it make sense?

In [24]:
# Score the fitted model on the held-out rows, with the predictors in the same order as in training.
testX = test_df.loc[:, ['age', 'marital_divorced', 'marital_married', 'marital_single', 'default', 'balance', 'housing', 'loan', 'campaign', 'pdays', 'previous']]
model.score(testX, test_df.y)

0.88550884955752207

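Answer: `score` returns accuracy, so the training error is about 11.7% (1 − 0.883) and the generalization (test) error is about 11.4% (1 − 0.886). It makes sense that the two numbers match very well, since our test data was selected randomly from a large data set. Note, however, that only roughly 11–12% of clients have `y = 1` (see the intercepts of the single-variable models above), so always predicting 0 would give a similar error rate.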

## Question 9: Interpret your coefficients. (At least `marital_single`, `campaign`, and `default`).  Does your interpretation  make sense?

Answer: $e^{\beta_j}$ is the factor by which the odds of `y` being 1 are multiplied for a 1-unit increase in $x_j$, holding the other variables fixed. This means that the odds of `y` being 1 go down by 38% when the person is single. The odds of `y` being 1 go down by 5.7% for each additional contact made during this campaign. The odds of `y` being 1 go down by 2.8% if the person has credit in default.

## Question 10: What is your prediction for a 30 years old single female, a homeowner with a \$1,000 balance in the bank, without a loan, who has never been contacted before, and who has never defaulted.

In [25]:
unknown = {'age' : 30, 'marital_divorced': 0, 'marital_married': 0, 'marital_single' : 1, 'default' : 0, 'balance' : 1000, 'housing': 1, 'loan': 0, 'campaign': 0, 'pdays' : 999, 'previous': 0}
unknown = pd.DataFrame.from_dict(data = unknown, orient = 'index').transpose() #Create a dataframe of the test data
print 'predicted class of y:',model.predict(unknown)

predicted class of y: [0]


Answer: 0, i.e. the model predicts that this client will not subscribe (`y = 0` encodes `no`).

## Question 11: Normalize your variables.  (You can reuse the function from the previous lab)

In [26]:
def normalize(x, frame = None):
    """Min-max scale column `x` to the range [0, 1], in place.

    Parameters
    ----------
    x : column label to rescale.
    frame : DataFrame holding the column; defaults to the notebook-level `df`,
        so existing `normalize(col)` calls behave as before.

    A column whose minimum equals its maximum is set to 0.0 rather than being
    divided by zero (which would fill it with NaN).
    """
    if frame is None:
        frame = df
    lowest = frame[x].min()
    span = frame[x].max() - lowest
    if span == 0:
        frame[x] = 0.0
    else:
        frame[x] = (frame[x] - lowest) / span
# Min-max scale every column of df.
for i in df.columns:
    normalize(i)
# train_df and test_df were split off before scaling and are separate frames,
# so rebuild them from the scaled df (same rows); otherwise the later models
# would still be fitted on the raw values.
train_df = df.loc[train_df.index]
test_df = df.loc[test_df.index]
df

Unnamed: 0,age,default,balance,housing,loan,...,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,0.161765,0.0,0.068455,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0
1,0.205882,0.0,0.108750,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0
2,0.235294,0.0,0.062590,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0
3,0.161765,0.0,0.064281,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0
4,0.588235,0.0,0.044469,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
4516,0.205882,0.0,0.039999,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0
4517,0.558824,1.0,0.000000,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0
4518,0.558824,0.0,0.048429,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0
4519,0.132353,0.0,0.059731,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0


## Question 12: Let's do some regularization.  Use 10-fold cross validation to find the best tuning parameter `c`

(Hint: check the documentation here: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression)

Note: `c` (spelled `C` in scikit-learn) is just the inverse of $\lambda$ - the smaller $c$, the stronger the regularization. Smaller values of $c$ select fewer variables

(Hint 2: First try c = 10 ^ i with i = -10 ... 10)

In [None]:
# 10-fold cross-validated search over the regularization strength C of the
# logistic regression, trying C = 10^i for i = -10 ... 10 as the hint suggests.
k_cv = 10
c_grid = [10.0 ** i for i in range(-10, 11)]

# Same predictors as the Question 7 model (the columns of its training matrix X).
train_X = train_df[X.columns]
# Cast the labels to int so they remain 0/1 class labels even if y has been rescaled to floats.
train_y = train_df.y.astype(int)

gs = grid_search.GridSearchCV(
    estimator = linear_model.LogisticRegression(),
    param_grid = {'C': c_grid},
    cv = cross_validation.KFold(len(train_df), n_folds = k_cv)
)

gs.fit(train_X, train_y)

# One row per candidate C with its mean validation accuracy across the folds.
score_df = pd.DataFrame({'C': [score.parameters['C'] for score in gs.grid_scores_],
    'Score': [score.mean_validation_score for score in gs.grid_scores_]})

gs.best_params_

Answer:

## Question 13: Now use the best `c` you found above and repeat your analysis; look over your coefficients

The coefficients are good

## Question 14: If you want to drop 3 variables from your analysis, which variables will you choose?

Answer: age, balance, and pdays