# Exercise 2: Regularization

## Agenda

1. Regularized regression in scikit-learn
2. Regularized classification in scikit-learn

## Part 1: Regularized regression in scikit-learn

**Goal:** Predict the violent crime rate for a community given socioeconomic and law enforcement data ([description](http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.names) , [data](http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data))

### Load and prepare the crime dataset

In [1]:
# read in the dataset
import pandas as pd
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data'
crime = pd.read_csv(url, header=None, na_values=['?'])
crime.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,8,,,Lakewoodcity,1,0.19,0.33,0.02,0.9,0.12,...,0.12,0.26,0.2,0.06,0.04,0.9,0.5,0.32,0.14,0.2
1,53,,,Tukwilacity,1,0.0,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,,,,,0.0,,0.67
2,24,,,Aberdeentown,1,0.0,0.42,0.49,0.56,0.17,...,0.01,0.21,0.02,,,,,0.0,,0.43
3,34,5.0,81440.0,Willingborotownship,1,0.04,0.77,1.0,0.08,0.12,...,0.02,0.39,0.28,,,,,0.0,,0.12
4,42,95.0,6096.0,Bethlehemtownship,1,0.01,0.55,0.02,0.95,0.09,...,0.04,0.09,0.02,,,,,0.0,,0.03


In [2]:
# examine the response variable
crime[127].describe()

count    1994.000000
mean        0.237979
std         0.232985
min         0.000000
25%         0.070000
50%         0.150000
75%         0.330000
max         1.000000
Name: 127, dtype: float64

In [3]:
# remove categorical features
crime.drop([0, 1, 2, 3, 4], axis=1, inplace=True)

In [4]:
# remove rows with any missing values
crime.dropna(inplace=True)

In [5]:
# check the shape
crime.shape

(319, 123)

In [6]:
# define X and y
X = crime.drop(127, axis=1)
y = crime[127]

In [7]:
# split into training and testing sets
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### Linear regression

In [8]:
# build a linear regression model
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(X_train, y_train)



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [9]:
# examine the coefficients
print (linreg.coef_)

[ -3.66188167e+00   6.98124465e-01  -2.61955467e-01  -2.85270027e-01
  -1.64740837e-01   2.46972333e-01  -1.09290051e+00  -5.96857796e-01
   1.11200239e+00  -7.21968931e-01   4.27346598e+00  -2.28040268e-01
   8.04875769e-01  -2.57934732e-01  -2.63458023e-01  -1.04616958e+00
   6.07784197e-01   7.73552561e-01   5.96468029e-02   6.90215922e-01
   2.16759430e-02  -4.87802949e-01  -5.18858404e-01   1.39478815e-01
  -1.24417942e-01   3.15003821e-01  -1.52633736e-01  -9.65003927e-01
   1.17142163e+00  -3.08546690e-02  -9.29085548e-01   1.24654586e-01
   1.98104506e-01   7.30804821e-01  -1.77337294e-01   8.32927588e-02
   3.46045601e-01   5.01837338e-01   1.57062958e+00  -4.13478807e-01
   1.39350802e+00  -3.49428114e+00   7.09577818e-01  -8.32141352e-01
  -1.39984927e+00   1.02482840e+00   2.13855006e-01  -6.18937325e-01
   5.28954490e-01   7.98294890e-02   5.93688560e-02  -1.68582667e-01
   7.31264051e-01  -1.39635208e+00   2.38507704e-01   5.50621439e-01
  -5.61447867e-01   6.18989764e-01

In [10]:
# make predictions
y_pred = linreg.predict(X_test)

In [11]:
# calculate RMSE (root mean square error)
from sklearn import metrics
import numpy as np
print (np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

0.233813676495


### Lasso regression

**alpha:** must be positive, increase for more regularization

**normalize:** scales the features (without using StandardScaler)

In [12]:
# try alpha=0.001 and examine coefficients
from sklearn.linear_model import Lasso
lassoreg = Lasso(alpha=0.001, normalize=True)
lassoreg.fit(X_train, y_train)
print (lassoreg.coef_)

[ 0.          0.          0.00891952 -0.27423369  0.          0.          0.
 -0.         -0.          0.          0.          0.         -0.         -0.
 -0.         -0.19414627  0.          0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.          0.          0.          0.
  0.04335664 -0.          0.         -0.          0.03491474 -0.
 -0.06685424  0.          0.         -0.          0.10575313  0.          0.
  0.00890807  0.         -0.1378172  -0.30954312 -0.         -0.         -0.
 -0.          0.          0.          0.          0.         -0.          0.
  0.          0.          0.          0.          0.         -0.          0.
  0.          0.         -0.          0.         -0.         -0.          0.
  0.05257892 -0.          0.         -0.         -0.          0.          0.
  0.          0.          0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.          0.         -0.         -0.          0.
  0.1386108

In [13]:
# try alpha=0.01 and examine coefficients
lassoreg = Lasso(alpha=0.01, normalize=True)
lassoreg.fit(X_train, y_train)
print (lassoreg.coef_)

[ 0.          0.          0.         -0.03974695  0.          0.          0.
  0.          0.         -0.          0.          0.         -0.         -0.
 -0.         -0.         -0.          0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.          0.
  0.          0.          0.         -0.          0.         -0.         -0.
  0.          0.         -0.          0.          0.          0.          0.
  0.         -0.         -0.27503063 -0.         -0.         -0.         -0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -0.          0.          0.
  0.          0.          0.          0.         -0.          0.          0.
 -0.          0.         -0.         -0.          0.          0.         -0.
  0.          0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.          0.          0.         -0.          0.          0.

In [14]:
# calculate RMSE (for alpha=0.01)
y_pred = lassoreg.predict(X_test)
print (np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

0.198165225429


**LassoCV:** lasso regression with built-in cross-validation of the alpha parameter

**n_alphas:** number of alpha values (automatically chosen) to try

In [15]:
# select the best alpha with LassoCV
from sklearn.linear_model import LassoCV
lassoregcv = LassoCV(n_alphas=100, normalize=True, random_state=1)
lassoregcv.fit(X_train, y_train)
lassoregcv.alpha_



0.0015161594598125873

In [16]:
# examine the coefficients
print (lassoregcv.coef_)

[ 0.          0.          0.         -0.28113506  0.          0.          0.
  0.          0.          0.          0.          0.         -0.         -0.
 -0.         -0.15481092  0.          0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.          0.         -0.          0.
  0.06451487  0.          0.         -0.          0.         -0.
 -0.01920421  0.          0.         -0.          0.03386202  0.          0.
  0.08901243  0.         -0.08759757 -0.36986917 -0.         -0.         -0.
 -0.          0.          0.          0.          0.         -0.          0.
  0.          0.          0.          0.          0.         -0.          0.
  0.          0.         -0.          0.          0.         -0.          0.
  0.01740599 -0.          0.         -0.         -0.          0.          0.
  0.          0.          0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.          0.         -0.         -0.          0.
  0.1347103

In [17]:
# predict method uses the best alpha value
y_pred = lassoregcv.predict(X_test)
print (np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

0.160209558014


## Part 2: Regularized classification in scikit-learn

**Goal:** Predict the origin of wine using chemical analysis ([description](http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names) , [data](http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data))

### Load and prepare the wine dataset

In [18]:
# read in the dataset
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
wine = pd.read_csv(url, header=None)
wine.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [19]:
# examine the response variable
wine[0].value_counts()

2    71
1    59
3    48
Name: 0, dtype: int64

In [20]:
# define X and y
X = wine.drop(0, axis=1)
y = wine[0]

In [21]:
# split into training and testing sets
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### Logistic regression (unregularized)

In [22]:
# build a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train, y_train)

LogisticRegression(C=1000000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [23]:
# examine the coefficients
print (logreg.coef_)

[[ -2.96611429e+00   3.76016086e+00   1.03739068e+01  -2.02233635e+00
    6.75105692e-02  -3.40242953e+00   7.40949474e+00   2.02740610e+00
   -7.42469111e+00  -1.63380461e+00  -1.33305103e+00   4.76239817e+00
    4.83827932e-02]
 [  4.45917818e+00  -4.48806626e+00  -1.22444187e+01   1.26788943e+00
   -1.85613537e-02   2.68433491e+00   3.64057127e+00   1.86884959e+00
    5.01005936e+00  -6.12009386e+00   2.90174583e+00  -6.25483654e+00
   -4.03486220e-02]
 [ -9.70202100e-01   2.08267742e+00   9.41533469e-01   2.38031750e-01
   -2.49570501e-03  -9.80986332e-01  -6.54894059e+00  -4.83300999e-01
   -2.65887484e+00   2.57456803e+00  -1.30417741e+00  -2.34301026e+00
    9.48511805e-03]]


In [24]:
# generate predicted probabilities
y_pred_prob = logreg.predict_proba(X_test)
print (y_pred_prob)

[[  1.54354875e-07   4.81872319e-08   9.99999797e-01]
 [  1.89412176e-12   1.00000000e+00   4.32286226e-10]
 [  9.99999902e-01   3.95673409e-09   9.37496136e-08]
 [  1.23925454e-06   9.99998637e-01   1.24120109e-07]
 [  9.99730906e-01   5.03457974e-17   2.69093680e-04]
 [  1.85760707e-10   1.99914662e-05   9.99980008e-01]
 [  9.99661315e-01   1.79283959e-04   1.59401444e-04]
 [  9.99999720e-01   8.43071875e-19   2.79993090e-07]
 [  3.04554840e-12   2.73618715e-10   1.00000000e+00]
 [  7.02538797e-12   9.99997008e-01   2.99234252e-06]
 [  9.99967668e-01   5.38832610e-10   3.23315305e-05]
 [  9.21469774e-01   7.85299972e-02   2.29075629e-07]
 [  2.62460749e-14   1.00000000e+00   1.12565311e-10]
 [  9.99999998e-01   8.63011598e-11   1.72563981e-09]
 [  1.33575013e-06   9.99998664e-01   3.47692519e-10]
 [  5.20167742e-11   9.99999959e-01   4.13246885e-08]
 [  7.77079386e-21   4.96471138e-01   5.03528862e-01]
 [  9.99626597e-01   3.73376179e-04   2.68568769e-08]
 [  2.40752340e-10   9.99999

In [25]:
# calculate log loss
print (metrics.log_loss(y_test, y_pred_prob))

0.295952657137


### Logistic regression (regularized)

**C:** must be positive, decrease for more regularization

**penalty:** l1 (lasso)

In [26]:
# standardize X_train and X_test
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [27]:
# try C=0.1 with L1 penalty
logreg = LogisticRegression(C=0.1, penalty='l1')
logreg.fit(X_train_scaled, y_train)
print (logreg.coef_)

[[ 0.21042049  0.          0.          0.          0.          0.
   0.4876934   0.          0.          0.          0.          0.15304469
   1.47741129]
 [-0.65718438 -0.05609679 -0.1139552   0.          0.          0.          0.
   0.          0.         -0.73819248  0.24414572  0.         -0.63401931]
 [ 0.          0.          0.          0.          0.          0.
  -0.84161272  0.          0.          0.61526895 -0.49025432 -0.30489546
   0.        ]]


In [28]:
# generate predicted probabilities and calculate log loss
y_pred_prob = logreg.predict_proba(X_test_scaled)
print (metrics.log_loss(y_test, y_pred_prob))

0.362247453831


**Pipeline:** chain steps together

In [29]:
# pipeline of StandardScaler and LogisticRegression
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(StandardScaler(), LogisticRegression())

**GridSearchCV:** search a grid of parameters

In [30]:
# grid search for best combination of C and penalty
from sklearn.grid_search import GridSearchCV
C_range = 10.**np.arange(-2, 3)
penalty_options = ['l1']
param_grid = dict(logisticregression__C=C_range, logisticregression__penalty=penalty_options)
grid = GridSearchCV(pipe, param_grid, cv=10, scoring='log_loss')
grid.fit(X, y)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'logisticregression__C': array([  1.00000e-02,   1.00000e-01,   1.00000e+00,   1.00000e+01,
         1.00000e+02]), 'logisticregression__penalty': ['l1']},
       pre_dispatch='2*n_jobs', refit=True, scoring='log_loss', verbose=0)

In [31]:
# print all log loss scores
grid.grid_scores_

[mean: -1.09861, std: 0.00000, params: {'logisticregression__C': 0.01, 'logisticregression__penalty': 'l1'},
 mean: -0.35491, std: 0.06890, params: {'logisticregression__C': 0.10000000000000001, 'logisticregression__penalty': 'l1'},
 mean: -0.09434, std: 0.06112, params: {'logisticregression__C': 1.0, 'logisticregression__penalty': 'l1'},
 mean: -0.05805, std: 0.06373, params: {'logisticregression__C': 10.0, 'logisticregression__penalty': 'l1'},
 mean: -0.07518, std: 0.10056, params: {'logisticregression__C': 100.0, 'logisticregression__penalty': 'l1'}]

In [32]:
# examine the best model
print (grid.best_score_)
print (grid.best_params_)

-0.058052481228
{'logisticregression__C': 10.0, 'logisticregression__penalty': 'l1'}
