In [1]:
### regularization with linear regression
# Read the communities data ('?' marks a missing value), drop the first five
# columns (the categorical features, per the original note), then drop every
# row that still has a missing value.
import pandas as pd
crime_raw = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data', header=None, na_values=['?'])
# Build the cleaned frame as a non-mutating chain: dropna(inplace=True) on a
# frame that was just sliced out of another one can raise pandas'
# SettingWithCopyWarning, and in-place edits make a cell harder to re-run.
crime = crime_raw.iloc[:, 5:].dropna()
crime.head()

Unnamed: 0,5,6,7,8,9,10,11,12,13,14,...,118,119,120,121,122,123,124,125,126,127
0,0.19,0.33,0.02,0.9,0.12,0.17,0.34,0.47,0.29,0.32,...,0.12,0.26,0.2,0.06,0.04,0.9,0.5,0.32,0.14,0.2
16,0.15,0.31,0.4,0.63,0.14,0.06,0.58,0.72,0.65,0.47,...,0.06,0.39,0.84,0.06,0.06,0.91,0.5,0.88,0.26,0.49
20,0.25,0.54,0.05,0.71,0.48,0.3,0.42,0.48,0.28,0.32,...,0.09,0.46,0.05,0.09,0.05,0.88,0.5,0.76,0.13,0.34
21,1.0,0.42,0.47,0.59,0.12,0.05,0.41,0.53,0.34,0.33,...,1.0,0.07,0.15,1.0,0.35,0.73,0.0,0.31,0.21,0.69
23,0.11,0.43,0.04,0.89,0.09,0.06,0.45,0.48,0.31,0.46,...,0.16,0.12,0.07,0.04,0.01,0.81,1.0,0.56,0.09,0.63


In [2]:
# The last column is the response; every column before it is a feature.
n_features = crime.shape[1] - 1
X = crime.iloc[:, :n_features]
y = crime.iloc[:, n_features]

In [3]:
# split into train/test
# train_test_split lives in sklearn.model_selection in current scikit-learn;
# sklearn.cross_validation is the legacy location and no longer exists in
# recent releases. Prefer the current module and fall back to the legacy one
# so this cell runs on both old and new installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# random_state is fixed so the split is reproducible from run to run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [4]:
### linear regression model with regularization
# plain linear regression first: the baseline the penalised models are compared with
from sklearn.linear_model import LinearRegression
lm = LinearRegression().fit(X_train, y_train)
# last expression of the cell, so the fitted coefficients are displayed
lm.coef_

array([ -3.66188167e+00,   6.98124465e-01,  -2.61955467e-01,
        -2.85270027e-01,  -1.64740837e-01,   2.46972333e-01,
        -1.09290051e+00,  -5.96857796e-01,   1.11200239e+00,
        -7.21968931e-01,   4.27346598e+00,  -2.28040268e-01,
         8.04875769e-01,  -2.57934732e-01,  -2.63458023e-01,
        -1.04616958e+00,   6.07784197e-01,   7.73552561e-01,
         5.96468029e-02,   6.90215922e-01,   2.16759430e-02,
        -4.87802949e-01,  -5.18858404e-01,   1.39478815e-01,
        -1.24417942e-01,   3.15003821e-01,  -1.52633736e-01,
        -9.65003927e-01,   1.17142163e+00,  -3.08546690e-02,
        -9.29085548e-01,   1.24654586e-01,   1.98104506e-01,
         7.30804821e-01,  -1.77337294e-01,   8.32927588e-02,
         3.46045601e-01,   5.01837338e-01,   1.57062958e+00,
        -4.13478807e-01,   1.39350802e+00,  -3.49428114e+00,
         7.09577818e-01,  -8.32141352e-01,  -1.39984927e+00,
         1.02482840e+00,   2.13855006e-01,  -6.18937325e-01,
         5.28954490e-01,

In [5]:
# make predictions and evaluate
import numpy as np
from sklearn import metrics
preds = lm.predict(X_test)
print 'RSME (no regularization) = ', np.sqrt(metrics.mean_squared_error(y_test,preds))

RSME (no regularization) =  0.233813676495


In [7]:
### ridge regression model
# ridge regression (alpha must be positive, larger means more regularization)
from sklearn.linear_model import Ridge
rreg = Ridge(alpha=0.1, normalize=True)
rreg.fit(X_train,y_train)
rreg.coef_
preds = rreg.predict(X_test)
print 'RMSE (Ridge reg.) =', np.sqrt(metrics.mean_squared_error(y_test,preds))

 RMSE (Ridge reg.) = 0.164279068049


In [11]:
# use RidgeCV to select the best alpha
from sklearn.linear_model import RidgeCV
alpha_range = 10.**np.arange(-2,3)
rregcv = RidgeCV(normalize=True,scoring='mean_squared_error',alphas=alpha_range)
rregcv.fit(X_train,y_train)
rregcv.alpha_
preds = rregcv.predict(X_test)
print 'RMSE (Ridge CV reg.)', np.sqrt(metrics.mean_squared_error(y_test,preds))
print rregcv.alpha_

RMSE (Ridge CV reg.) 0.163129782343
1.0


In [9]:
### Lasso Regression Model
# lasso (alpha must be positive, larger means more regulation)
from sklearn.linear_model import Lasso
las = Lasso(alpha=0.01,normalize=True)
las.fit(X_train,y_train)
las.coef_
preds = las.predict(X_test)
print 'RMSE (Lasso reg.) =', np.sqrt(metrics.mean_squared_error(y_test,preds))

RMSE (Lasso reg.) = 0.198165225429


In [10]:
# use LassoCV to select best alpha (tries 100 alphas by default)
from sklearn.linear_model import LassoCV
lascv = LassoCV(normalize=True, alphas=alpha_range)
lascv.fit(X_train,y_train)
lascv.alpha_
lascv.coef_
preds = lascv.predict(X_test)
print 'RMSE (Lasso CV reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))

RMSE (Lasso CV reg.) = 0.198165225429


In [15]:
### regularization with logistic regression
# get and prepare data
titanic = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT5/master/data/titanic_train.csv')
# encode Sex numerically: female -> 0, male -> 1
titanic['Sex'] = titanic.Sex.map({'female':0, 'male':1})
# Fill missing ages with the mean age and assign the result back to the column.
# fillna(inplace=True) called through `titanic.Age` works on an intermediate
# Series and is not guaranteed to write through to the frame.
titanic['Age'] = titanic.Age.fillna(titanic.Age.mean())
# dummy-encode Embarked and drop the first dummy column (it becomes the baseline)
embarked_dummies = pd.get_dummies(titanic.Embarked, prefix='Embarked').iloc[:, 1:]
titanic = pd.concat([titanic, embarked_dummies], axis=1)
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",1,22,1,0,A/5 21171,7.25,,S,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38,1,0,PC 17599,71.2833,C85,C,0,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26,0,0,STON/O2. 3101282,7.925,,S,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35,1,0,113803,53.1,C123,S,0,1
4,5,0,3,"Allen, Mr. William Henry",1,35,0,0,373450,8.05,,S,0,1


In [16]:
# predictors used by the model, and the Survived column as the response
feature_cols = ['Pclass', 'Sex', 'Age', 'Embarked_Q', 'Embarked_S']
y = titanic['Survived']
X = titanic[feature_cols]

In [17]:
# hold out a test split of the titanic data (fixed seed keeps it repeatable)
titanic_split = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = titanic_split

In [19]:
# standardize our data
from sklearn.preprocessing import StandardScaler
# fit the scaler on the training split only, then apply that same scaling to
# both the training and the test split
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = map(scaler.transform, (X_train, X_test))

In [26]:
### logistic regression model without regularization
# logistic regression
from sklearn.linear_model import LogisticRegression
# LogisticRegression() is penalised by default (penalty='l2', C=1.0). C is the
# inverse of the regularization strength, so a very large C makes the penalty
# negligible and gives the unregularized baseline this section describes.
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train_scaled, y_train)
logreg.coef_  # evaluated but not displayed: only a cell's last expression is echoed
y_pred = logreg.predict(X_test_scaled)




In [27]:
# Access accuracy
print 'Accuracy (no penalty) =', metrics.accuracy_score(y_test, y_pred)

Accuracy (no penalty) = 0.793721973094


In [28]:
### logistic regression with L1 penalty
# C=0.1 with the 'l1' penalty, fitted on the standardized training features
logreg_l1 = LogisticRegression(penalty='l1', C=0.1).fit(X_train_scaled, y_train)
logreg_l1.coef_  # evaluated but not displayed: only a cell's last expression is echoed
y_pred_l1 = logreg_l1.predict(X_test_scaled)

In [29]:
# access accuracy
print 'Accuracy (L1 penalty) =', metrics.accuracy_score(y_test,y_pred_l1)

Accuracy (L1 penalty) = 0.784753363229


In [30]:
### Logistic Regression with L2 Penalty
# C=0.1 with the 'l2' penalty, fitted on the standardized training features
logreg_l2 = LogisticRegression(penalty='l2', C=0.1).fit(X_train_scaled, y_train)
logreg_l2.coef_  # evaluated but not displayed: only a cell's last expression is echoed
y_pred_l2 = logreg_l2.predict(X_test_scaled)

In [31]:
# access accuracy
print 'Accuracy (L2 penalty) =', metrics.accuracy_score(y_test,y_pred_l2)

Accuracy (L2 penalty) = 0.789237668161
