# Guided Practice/Demo

The following code samples are provided directly from the lesson and should serve as a jumping off point for students to run the code on their own.

In [40]:
import numpy as np
import pandas as pd
from sklearn import linear_model, metrics

df = pd.DataFrame({'x': range(100), 'y': range(100)})
biased_df  = df.copy()
biased_df.loc[:20, 'x'] = 1
biased_df.loc[:20, 'y'] = 1

df.head(100)
biased_df.head(100)

def append_jitter(series):
   jitter = np.random.random_sample(size=100)
   return series + jitter

# Series( [1,2,3]), [2,3,4] => Series( [1,2,3,2,3,4] )

df['x'] = append_jitter(df.x)
df['y'] = append_jitter(df.y)

biased_df['x'] = append_jitter(biased_df.x)
biased_df['y'] = append_jitter(biased_df.y)



## fit
lm = linear_model.LinearRegression().fit(df[['x']], df['y'])
print metrics.mean_squared_error(df['y'], lm.predict(df[['x']]))

## biased fit will always be higher because of the first 20 locations are 1 and higher
lm = linear_model.LinearRegression().fit(biased_df[['x']], biased_df['y'])
print metrics.mean_squared_error(df['y'], lm.predict(df[['x']]))

0.134878849402
0.141752281035


In [36]:
print biased_df.head(30)
print df.head(30)

            x          y
0    1.696415   1.654652
1    1.099702   1.375220
2    1.510861   1.794541
3    1.969568   1.186030
4    1.295236   1.297799
5    1.041796   1.236327
6    1.007597   1.368812
7    1.725729   1.939203
8    1.674746   1.949590
9    1.713017   1.344209
10   1.298806   1.279035
11   1.626237   1.657009
12   1.327749   1.483656
13   1.036698   1.167385
14   1.195128   1.128646
15   1.264888   1.466548
16   1.300450   1.641377
17   1.472034   1.437196
18   1.647136   1.814267
19   1.888323   1.448401
20   1.124959   1.529168
21  21.148196  21.098311
22  22.077154  22.552374
23  23.761553  23.254318
24  24.577879  24.671973
25  25.098966  25.107203
26  26.096067  26.199726
27  27.747991  27.976180
28  28.868522  28.837590
29  29.690261  29.182976
            x          y
0    0.445332   0.500281
1    1.102689   1.620778
2    2.352411   2.715300
3    3.006331   3.859952
4    4.487518   4.303899
5    5.274434   5.783733
6    6.092070   6.193411
7    7.637340   7.723053


In [42]:
from sklearn import cross_validation
wd = '../../assets/dataset/'
bikeshare = pd.read_csv(wd + 'bikeshare.csv')
weather = pd.get_dummies(bikeshare.weathersit, prefix='weather')
modeldata = bikeshare[['temp', 'hum']].join(weather[['weather_1', 'weather_2', 'weather_3']])

bikeshare.describe()
weather.head()
modeldata.head()

y = bikeshare.casual

kf = cross_validation.KFold(len(modeldata), n_folds=5, shuffle=True)
scores = []
for train_index, test_index in kf:
    lm = linear_model.LinearRegression().fit(modeldata.iloc[train_index], y.iloc[train_index])
    scores.append(metrics.mean_squared_error(y.iloc[test_index], lm.predict(modeldata.iloc[test_index])))

print np.mean(scores)

# this score will be lower, but we're trading off bias error for generalized error taking the 
# average of larger sets
lm = linear_model.LinearRegression().fit(modeldata, y)
print metrics.mean_squared_error(y, lm.predict(modeldata))

1673.72373533
1672.58110765


In [58]:
for fold in range(2,201,5):
    start_validation(fold)

1662.16173645
1673.61336177
1627.0421578
1674.12603625
1667.33108956
1654.63008859
1647.41246285
1660.09393348
1673.11404302
1767.3385275
1633.03567868
1674.13195178
1648.20660035
1700.19517386
1677.28068516
1656.29428691
1636.06594633
1648.65391007
1663.66309447
1659.8102078
1673.29063491
1638.82392643
1662.90137517
1769.57251638
1722.89087627
1659.72279247
1673.72600056
1660.11000491
1692.75601246
1694.06035687
1684.45830425
1690.44858918
1700.10996978
1699.29935999
1676.62129657
1680.24360878
1680.17183639
1673.34466394
1857.55747613
1650.91121973
1727.98148923
1770.35255398
1747.60820306
1727.79800748
1703.51930067
1697.425683
1703.91842214
1694.01312661
1687.14912474
1699.75648348
1703.23219881
1673.58190314
1664.27841696
1656.05901981
1644.71145859
1656.41440538
1665.61513695
1666.92653231
1669.54454824
1673.47220499
1766.55840504
1870.04080482
1727.9526048
1740.72604093
1754.24564941
1740.67006878
1752.21709045
1765.12372489
1773.07344741
1738.61159416
1716.73697268
1717.3832931

In [59]:
def start_validation(fold):
    kf = cross_validation.KFold(len(modeldata), n_folds=fold)
    scores = []
    for train_index, test_index in kf:
        lm = linear_model.LinearRegression().fit(modeldata.iloc[train_index], y.iloc[train_index])
        scores.append(metrics.mean_squared_error(y.iloc[test_index], lm.predict(modeldata.iloc[test_index])))
        print np.mean(scores)


In [61]:
lm = linear_model.LinearRegression().fit(modeldata, y)
print metrics.mean_squared_error(y, lm.predict(modeldata))
lm = linear_model.Lasso().fit(modeldata, y)
print metrics.mean_squared_error(y, lm.predict(modeldata))
lm = linear_model.Ridge().fit(modeldata, y)
print metrics.mean_squared_error(y, lm.predict(modeldata))

1672.58110765
1725.41581608
1672.60490113


In [62]:
modeldata.shape
## Use ridget when you have more observations than features like this data set, reduces bias 
## and added more variances, by using L2 ridge, 1672.604

(17379, 5)

In [68]:
alphas = np.logspace(-10, 10, 21)
for a in alphas:
    print 'Alpha:', a
    lm = linear_model.Ridge(alpha=a)
    lm.fit(modeldata, y)
    print lm.coef_
    print metrics.mean_squared_error(y, lm.predict(modeldata))

Alpha: 1e-10
[ 112.68901765  -84.01121684  -24.68489063  -21.00314493  -21.71893628]
1672.58110765
Alpha: 1e-09
[ 112.68901765  -84.01121684  -24.68489061  -21.00314491  -21.71893626]
1672.58110765
Alpha: 1e-08
[ 112.68901765  -84.01121684  -24.6848904   -21.00314471  -21.71893606]
1672.58110765
Alpha: 1e-07
[ 112.68901763  -84.01121682  -24.68488837  -21.00314268  -21.71893403]
1672.58110765
Alpha: 1e-06
[ 112.68901745  -84.01121667  -24.68486804  -21.00312237  -21.71891373]
1672.58110765
Alpha: 1e-05
[ 112.68901562  -84.01121509  -24.68466472  -21.00291929  -21.71871079]
1672.58110765
Alpha: 0.0001
[ 112.68899732  -84.01119938  -24.68263174  -21.00088873  -21.71668162]
1672.58110765
Alpha: 0.001
[ 112.68881437  -84.01104228  -24.66232204  -20.98060316  -21.69640993]
1672.58110774
Alpha: 0.01
[ 112.68698753  -84.00947323  -24.46121539  -20.77973778  -21.49568404]
1672.58111645
Alpha: 0.1
[ 112.66896732  -83.99396383  -22.63109556  -18.95202277  -19.66942371]
1672.58185208
Alpha: 1.0
[

In [77]:
from sklearn import grid_search

alphas = np.logspace(-10, 10, 21)
gs = grid_search.GridSearchCV(
    estimator=linear_model.Ridge(),
    param_grid={'alpha': alphas},
    scoring='mean_squared_error')

gs.fit(modeldata, y)

print -gs.best_score_ # mean squared error here comes in negative, so let's make it positive.
print gs.best_estimator_ # explains which grid_search setup worked best
print gs.grid_scores_ # shows all the grid pairings and their performances.

1814.09369133
Ridge(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)
[mean: -1817.58711, std: 542.14315, params: {'alpha': 1e-10}, mean: -1817.58711, std: 542.14315, params: {'alpha': 1.0000000000000001e-09}, mean: -1817.58711, std: 542.14315, params: {'alpha': 1e-08}, mean: -1817.58711, std: 542.14315, params: {'alpha': 9.9999999999999995e-08}, mean: -1817.58711, std: 542.14315, params: {'alpha': 9.9999999999999995e-07}, mean: -1817.58711, std: 542.14317, params: {'alpha': 1.0000000000000001e-05}, mean: -1817.58707, std: 542.14331, params: {'alpha': 0.0001}, mean: -1817.58663, std: 542.14477, params: {'alpha': 0.001}, mean: -1817.58230, std: 542.15933, params: {'alpha': 0.01}, mean: -1817.54318, std: 542.30102, params: {'alpha': 0.10000000000000001}, mean: -1817.20111, std: 543.63587, params: {'alpha': 1.0}, mean: -1814.09369, std: 556.35563, params: {'alpha': 10.0}, mean: -1818.51694, std: 653.68607, params: 

In [73]:
print alphas

[  1.00000000e-10   1.00000000e-09   1.00000000e-08   1.00000000e-07
   1.00000000e-06   1.00000000e-05   1.00000000e-04   1.00000000e-03
   1.00000000e-02   1.00000000e-01   1.00000000e+00   1.00000000e+01
   1.00000000e+02   1.00000000e+03   1.00000000e+04   1.00000000e+05
   1.00000000e+06   1.00000000e+07   1.00000000e+08   1.00000000e+09
   1.00000000e+10]


In [81]:
num_to_approach, start, steps, optimized = 3.2, 0., [-1, 1], False
while not optimized:
    current_distance = num_to_approach - start
    got_better = False
    next_steps = [start + i for i in steps]
    for n in next_steps:
        distance = np.abs(num_to_approach - n)
        if distance < current_distance:
            got_better = True
            print distance, 'is better than', current_distance
            current_distance = distance
            start = n
    if got_better:
        print 'found better solution! using', current_distance
        a += 1
    else:
        optimized = True
        print start, 'is closest to', num_to_approach


2.2 is better than 3.2
found better solution! using 2.2
1.2 is better than 2.2
found better solution! using 1.2
0.2 is better than 1.2
found better solution! using 0.2
3.0 is closest to 3.2


In [79]:
lm = linear_model.SGDRegressor()
lm.fit(modeldata, y)
print lm.score(modeldata, y)
print metrics.mean_squared_error(y, lm.predict(modeldata))

0.308508468911
1680.90951975


# Independent Practice

Use the following code to work through the problems given.

In [84]:
params = {} # put your gradient descent parameters here
gs = grid_search.GridSearchCV(
    estimator=linear_model.SGDRegressor(),
    cv=cross_validation.KFold(len(modeldata), n_folds=7, shuffle=True),
    param_grid=params,
    scoring='mean_squared_error',
    )

gs.fit(modeldata, y)

print 'BEST ESTIMATOR'
print -gs.best_score_
print gs.best_estimator_
print 'ALL ESTIMATORS'
print gs.grid_scores_

BEST ESTIMATOR
1686.26690474
SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', n_iter=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, verbose=0, warm_start=False)
ALL ESTIMATORS
[mean: -1686.26690, std: 154.40775, params: {}]
