In [11]:
import numpy as np
import pandas as pd
from sklearn import linear_model, metrics, cross_validation

In [12]:
# Helper functions live together in this cell; later cells call them as needed.
def read_csv(path):
    """Load the CSV at ``path`` and return the result of ``pd.read_csv``.

    ``path`` is handed to ``pd.read_csv`` unchanged, so anything that
    function accepts as its first argument works here.
    """
    frame = pd.read_csv(path)
    return frame

In [13]:
# Relative path to the bikeshare CSV file.
path = '../../assets/datasets/bikeshare.csv'
# Load it through the read_csv helper; `bikeshare` is the frame every later cell works from.
bikeshare = read_csv(path)

In [14]:
# Show the dtype pandas assigned to each column (last expression, so the cell displays it).
bikeshare.dtypes

instant         int64
dteday         object
season          int64
yr              int64
mnth            int64
hr              int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
casual          int64
registered      int64
cnt             int64
dtype: object

In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
bikeshare.describe()

Unnamed: 0,instant,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
mean,8690.0,2.50164,0.502561,6.537775,11.546752,0.02877,3.003683,0.682721,1.425283,0.496987,0.475775,0.627229,0.190098,35.676218,153.786869,189.463088
std,5017.0295,1.106918,0.500008,3.438776,6.914405,0.167165,2.005771,0.465431,0.639357,0.192556,0.17185,0.19293,0.12234,49.30503,151.357286,181.387599
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.02,0.0,0.0,0.0,0.0,0.0,1.0
25%,4345.5,2.0,0.0,4.0,6.0,0.0,1.0,0.0,1.0,0.34,0.3333,0.48,0.1045,4.0,34.0,40.0
50%,8690.0,3.0,1.0,7.0,12.0,0.0,3.0,1.0,1.0,0.5,0.4848,0.63,0.194,17.0,115.0,142.0
75%,13034.5,3.0,1.0,10.0,18.0,0.0,5.0,1.0,2.0,0.66,0.6212,0.78,0.2537,48.0,220.0,281.0
max,17379.0,4.0,1.0,12.0,23.0,1.0,6.0,1.0,4.0,1.0,1.0,1.0,0.8507,367.0,886.0,977.0


In [16]:
# Preview the first rows of the frame to sanity-check the load.
bikeshare.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0,0,1,1


In [17]:
# Numeric predictors taken straight from the raw frame.
feature_cols = ['temp', 'hum']

# One indicator column per distinct `weathersit` value, named weather_<value>.
weather = pd.get_dummies(bikeshare['weathersit'], prefix='weather')

# Feature matrix: the numeric predictors plus three of the weather indicators.
# weather_4 is not selected -- presumably it serves as the reference category
# so the indicator columns are not perfectly collinear (confirm intent).
modeldata = bikeshare.loc[:, feature_cols].join(
    weather.loc[:, ['weather_1', 'weather_2', 'weather_3']]
)

# Regression target: the `casual` column.
y = bikeshare['casual']

In [18]:
# 5-fold split over the row positions of modeldata. Rows are shuffled before
# being assigned to folds; random_state pins that shuffle so the fold
# membership -- and the cross-validated MSE computed from it -- is identical
# on every Restart & Run All instead of drifting from run to run.
kf = cross_validation.KFold(len(modeldata), n_folds=5, shuffle=True, random_state=42)

In [19]:
# Manual k-fold cross-validation: for every fold, fit a linear regression on
# the training rows and record the mean squared error on the held-out rows.
lm_obj = linear_model.LinearRegression() # instantiated once; refitted on every fold below
scores = [] # collects one held-out MSE per fold
for train_index, test_index in kf: # each item of kf unpacks into (train positions, test positions)
    x_train = modeldata.iloc[train_index] # training rows for this fold, selected by position
    y_train = y.iloc[train_index]
    
    x_test = modeldata.iloc[test_index] # held-out rows for this fold
    y_test = y.iloc[test_index]
        
    lm = lm_obj.fit(x_train, y_train) # refit on this fold; by scikit-learn convention fit() returns the estimator itself, so lm should be lm_obj
    x_test_pred = lm.predict(x_test) # despite the name, these are predicted target values for x_test

    mse = metrics.mean_squared_error(y_test,x_test_pred) 
    # held-out MSE for this fold
    scores.append(mse) 
    # after the loop, scores holds one MSE per fold

In [20]:
# Cross-validated error estimate: the mean of the per-fold MSE values.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(np.mean(scores))

1673.6009276


In [21]:
# In-sample baseline: fit ordinary least squares on every row, then score it
# on those same rows. This MSE comes out slightly lower than the
# cross-validated mean because the model is evaluated on data it was fitted
# to, which makes it an optimistic estimate of the error on unseen data.
lm_1 = linear_model.LinearRegression().fit(modeldata, y)
y_est_1 = lm_1.predict(modeldata)  # predictions for the training rows themselves
# Single-argument print(...) runs unchanged on both Python 2 and Python 3.
print(metrics.mean_squared_error(y, y_est_1))

1672.58110765


In [22]:
# Same in-sample evaluation with Lasso (L1-penalised linear regression) at
# its default settings: fit on all rows, predict those rows, report the MSE.
lm_2 = linear_model.Lasso().fit(modeldata, y)
y_est_2 = lm_2.predict(modeldata)
# Single-argument print(...) runs unchanged on both Python 2 and Python 3.
print(metrics.mean_squared_error(y, y_est_2))

1725.41581608


In [23]:
# Same in-sample evaluation with Ridge (L2-penalised linear regression) at
# its default settings: fit on all rows, predict those rows, report the MSE.
lm_3 = linear_model.Ridge().fit(modeldata, y)
y_est_3 = lm_3.predict(modeldata)
# Single-argument print(...) runs unchanged on both Python 2 and Python 3.
print(metrics.mean_squared_error(y, y_est_3))

1672.60490113
