In [1]:
import dask.dataframe as dd
from dask.distributed import Client

from dask_ml.preprocessing import Categorizer, DummyEncoder, StandardScaler
from dask_ml.model_selection import train_test_split
from dask_ml.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.pipeline import Pipeline, make_pipeline


In [2]:
# Start a local Dask cluster: one worker with four threads and a 2 GB memory cap.
# processes=False keeps the scheduler and worker inside this kernel process
# (the repr below shows an inproc:// scheduler address).
client = Client(
    n_workers=1,
    threads_per_worker=4,
    processes=False,
    memory_limit='2GB',
)
client

0,1
Client  Scheduler: inproc://10.4.90.92/8232/1  Dashboard: http://localhost:8787/status,Cluster  Workers: 1  Cores: 4  Memory: 2.00 GB


In [3]:
# Load the daily and hourly bike-sharing tables from pinned gist snapshots.
# To read local copies instead, pass 'Bike-Sharing-Dataset/day.csv' and
# 'Bike-Sharing-Dataset/hour.csv' to dd.read_csv.
day = dd.read_csv(
    'https://gist.githubusercontent.com/louisdubaere/44d9cae2acbdb4c45144da05e62e677e/raw/dfbc17b394dbe3f966111fd4e2636b0184413b0c/day.csv'
)
hour = dd.read_csv(
    'https://gist.githubusercontent.com/louisdubaere/7b76bb68cbde26bfd9fcfefe0d498b75/raw/4fe5ee53f3f329b902ded808a5367066c86f0cb9/hour.csv'
)

In [4]:
# Preview the first rows of the hourly table to see which columns are available.
hour.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [5]:
# Column count and per-dtype tally for the hourly table.
hour.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 17 entries, instant to cnt
dtypes: object(1), float64(4), int64(12)

In [6]:
# Summary statistics for the numeric columns; head(8) pulls the eight
# summary rows (count through max).
hour.describe().head(8)

Unnamed: 0,instant,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
mean,8690.0,2.50164,0.502561,6.537775,11.546752,0.02877,3.003683,0.682721,1.425283,0.496987,0.475775,0.627229,0.190098,35.676218,153.786869,189.463088
std,5017.0295,1.106918,0.500008,3.438776,6.914405,0.167165,2.005771,0.465431,0.639357,0.192556,0.17185,0.19293,0.12234,49.30503,151.357286,181.387599
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.02,0.0,0.0,0.0,0.0,0.0,1.0
25%,4345.5,2.0,0.0,4.0,6.0,0.0,1.0,0.0,1.0,0.34,0.3333,0.48,0.1045,4.0,34.0,40.0
50%,8690.0,3.0,1.0,7.0,12.0,0.0,3.0,1.0,1.0,0.5,0.4848,0.63,0.194,17.0,115.0,142.0
75%,13034.5,3.0,1.0,10.0,18.0,0.0,5.0,1.0,2.0,0.66,0.6212,0.78,0.2537,48.0,220.0,281.0
max,17379.0,4.0,1.0,12.0,23.0,1.0,6.0,1.0,4.0,1.0,1.0,1.0,0.8507,367.0,886.0,977.0


In [7]:
# Per-column dtypes: dteday is the only object column, the rest are int64 or float64.
hour.dtypes

instant         int64
dteday         object
season          int64
yr              int64
mnth            int64
hr              int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
casual          int64
registered      int64
cnt             int64
dtype: object

# Create Model

In [8]:
# Target: total rentals per hour.
y = hour['cnt']

# Predictors: everything except
#   - cnt itself, plus casual and registered (in the preview rows these two add
#     up to cnt, so keeping them would leak the target),
#   - dteday and instant (date string and running row id),
#   - temp (presumably dropped as redundant with atemp -- confirm).
X = hour.drop(
    ['casual', 'registered', 'cnt', 'dteday', 'instant', 'temp'],
    axis=1,
)

In [9]:
# Ordered 80/20 hold-out split: the first 80% of rows train, the remainder test.
#
# len() on a Dask DataFrame already computes the row count through the active
# scheduler, so there is no need to ship the collection to a worker task.
length = len(X)
train_n = round(0.8 * length)

# Slice rows with .loc for both X and y. A bare integer slice such as
# X[0:train_n] is dispatched to .iloc on current Dask releases, and Dask's
# .iloc cannot select rows, so it raises instead of slicing.
# .loc is label-based and inclusive at both ends: train holds labels
# 0..train_n and test holds train_n+1 onward, so no row is shared or lost.
# NOTE(review): assumes the CSV loaded as a single partition so the index
# labels run 0..length-1 -- confirm; with several partitions the labels would
# restart in each partition and this split would no longer be 80/20.
X_train = X.loc[0:train_n]
X_test = X.loc[train_n + 1:length]
y_train = y.loc[0:train_n]
y_test = y.loc[train_n + 1:length]

In [10]:
# Integer-coded calendar and weather fields to be treated as categoricals.
categories = [
    'season', 'yr', 'mnth', 'hr',
    'holiday', 'weekday', 'workingday', 'weathersit',
]

# Continuous float64 columns singled out for standardisation.
scale = ['atemp', 'hum', 'windspeed']

In [11]:
# Baseline pipeline: cast the categorical columns, one-hot encode them,
# standardise, then fit ordinary least squares.
#
# StandardScaler has no column-selection argument; its first parameter is
# `copy`. Passing the `scale` list positionally therefore only set `copy` to a
# truthy value, and it raises TypeError on scikit-learn releases where the
# constructor arguments are keyword-only. The scaler is built with defaults
# here, which keeps the behaviour the recorded score was produced with: every
# column coming out of DummyEncoder is standardised, not just those in `scale`.
# NOTE(review): to standardise only the `scale` columns, wrap the scaler in a
# ColumnTransformer and re-run the scores.
pipe = make_pipeline(
    Categorizer(columns=categories),
    DummyEncoder(),
    StandardScaler(),
    LinearRegression(),
)

In [12]:
# Simple linear regression baseline: fit the pipeline as a task on the
# cluster, wait for the fitted estimator, then score it on the held-out rows.
fitted_pipe = client.submit(pipe.fit, X_train, y_train)
linear_model = fitted_pipe.result()  # blocks until fitting has finished
score = client.submit(linear_model.score, X_test, y_test)
score.result()

0.6336091686562537

## Try Ridge and Lasso Regression


In [13]:
# Regularised alternatives to the plain linear model. Each estimator is wrapped
# in a 5-fold grid search over the same alpha grid.
lasso = Lasso()
ridge = Ridge()

# NOTE(review): the near-zero alphas (1e-15 ... 1e-8) are close to an
# unpenalised fit; Lasso's solver tends to converge very slowly there, which
# is probably why the Lasso cell is marked as taking too long. Consider a
# separate grid with larger alphas for Lasso.
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 2]}

ridge_regression = GridSearchCV(ridge, parameters, cv=5)
lasso_regression = GridSearchCV(lasso, parameters, cv=5)

# Same preprocessing as the baseline pipeline, with the grid search as the
# final step.
# StandardScaler has no column-selection argument; its first parameter is
# `copy`. Passing the `scale` list positionally only set `copy` to a truthy
# value, and it raises TypeError on scikit-learn releases where the constructor
# arguments are keyword-only. Building it with defaults keeps the behaviour the
# recorded score was produced with: all encoded columns are standardised.
# NOTE(review): to standardise only the `scale` columns, wrap the scaler in a
# ColumnTransformer and re-run the scores.
pipe_ridge = make_pipeline(
    Categorizer(columns=categories),
    DummyEncoder(),
    StandardScaler(),
    ridge_regression,
)
pipe_lasso = make_pipeline(
    Categorizer(columns=categories),
    DummyEncoder(),
    StandardScaler(),
    lasso_regression,
)


In [14]:
# Fit the Ridge grid-search pipeline on the cluster, wait for the fitted
# estimator, then score it on the held-out rows.
fitted_ridge = client.submit(pipe_ridge.fit, X_train, y_train)
ridge_model = fitted_ridge.result()  # blocks until the grid search has finished
score = client.submit(ridge_model.score, X_test, y_test)
score.result()

0.6314663651907226

In [None]:
# Takes too long to run -- this cell has no recorded output.
# Same procedure as for Ridge: fit the Lasso grid-search pipeline on the
# cluster, wait for the fitted estimator, then score it on the held-out rows.
fitted_lasso = client.submit(pipe_lasso.fit, X_train, y_train)
lasso_model = fitted_lasso.result()  # blocks until the grid search has finished
score = client.submit(lasso_model.score, X_test, y_test)
score.result()