In [1]:
import pandas as pd
from dask import dataframe as dd
from dask_ml.linear_model import LinearRegression
from dask_ml.preprocessing import Categorizer, DummyEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from dask_ml.metrics import mean_squared_error
from dask_ml.metrics import r2_score
import dask_ml
from distributed import Client, progress

In [None]:
# Optional: uncomment to create a `distributed.Client` and display it.
# Left disabled here, so the rest of the notebook runs without it.
#client = Client()
#client

In [2]:
# Hourly bike-sharing records, read lazily as a dask dataframe.
DATA_URL = "https://gist.githubusercontent.com/geraldwal/b5a83f4c670abe0a662abce558e5d433/raw/bce4bbfc63355606e4503964e25798b5d2190b9b/hour%2520-%2520Python%2520Bike%2520Sharing"

ddf = dd.read_csv(DATA_URL, sep=",", parse_dates=["dteday"])

In [3]:
# Replace the abbreviated source column names with readable ones.
readable_names = {
    "yr": "year",
    "mnth": "month",
    "hr": "hour",
    "weathersit": "weather",
    "hum": "humidity",
    "cnt": "count",
}
ddf = ddf.rename(columns=readable_names)

In [4]:
# The date column is not used as a model feature; remove it.
ddf = ddf.drop(labels=["dteday"], axis=1)

In [5]:
# Columns treated as categorical and expanded into dummy (one-hot) columns.
cats = [
    "season",
    "month",
    "hour",
    "holiday",
    "weekday",
    "workingday",
    "weather",
]

# Categorizer converts the columns to categoricals; DummyEncoder then
# expands them into indicator columns.
pipeline = make_pipeline(
    Categorizer(columns=cats),
    DummyEncoder(columns=cats),
)
ddf_onehot = pipeline.fit_transform(ddf)


In [6]:
def split_data(dataset, Target, train_frac=0.875):
    """Split ``dataset`` into ordered train/test features and target.

    The leading ``train_frac`` share of the rows (selected by index label)
    becomes the training set and the remaining rows the test set. Rows are
    not shuffled.

    Parameters
    ----------
    dataset : dataframe-like
        Object supporting ``len()``, ``.columns`` and label-based ``.loc``.
    Target : str
        Name of the column to predict; it is excluded from the features.
    train_frac : float, default 0.875
        Fraction of rows assigned to the training set. Must lie strictly
        between 0 and 1.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``train_frac`` is not strictly between 0 and 1.
    """
    if not 0 < train_frac < 1:
        raise ValueError(
            "train_frac must be strictly between 0 and 1, got {!r}".format(train_frac)
        )

    X = dataset.loc[:, dataset.columns != Target]
    y = dataset.loc[:, Target]

    # Take the length once: on a lazy dataframe every len() call may trigger
    # a separate computation of the whole frame.
    n_rows = len(dataset)
    train_size = int(n_rows * train_frac)

    # .loc slices by label and includes both endpoints, so the training slice
    # stops at train_size - 1 to keep that row out of the test set.
    # NOTE(review): this assumes the index carries the labels 0..n_rows-1 in
    # order; a frame loaded in several partitions may restart its index in
    # each partition -- confirm before using on multi-partition data.
    train_rows = slice(0, train_size - 1)
    test_rows = slice(train_size, n_rows)

    return X.loc[train_rows], X.loc[test_rows], y.loc[train_rows], y.loc[test_rows]

In [7]:
# One ordered train/test split per target column.
(x_train_reg, x_test_reg, y_train_reg, y_test_reg) = split_data(
    dataset=ddf_onehot, Target="registered"
)
(x_train_casual, x_test_casual, y_train_casual, y_test_casual) = split_data(
    dataset=ddf_onehot, Target="casual"
)
(x_train_count, x_test_count, y_train_count, y_test_count) = split_data(
    dataset=ddf_onehot, Target="count"
)

## Converting to arrays


The model does not support dask dataframes, so every dataframe is converted to a dask array through its `.values` attribute.

However, for each of the three targets (count, casual and registered) the two other target columns must be dropped from the features. Since dask arrays do not have a `.drop` method, the columns have to be dropped from each dataframe before the conversion, which produces three separate sets of arrays.

### Count

In [8]:
# Remove the other two target columns from the features of the "count" model.
X_train_count, X_test_count = (
    features.drop(["casual", "registered"], axis=1)
    for features in (x_train_count, x_test_count)
)

In [9]:
# Rebind each name to the array behind its dataframe/series.
# NOTE(review): because the same names are reused, re-running this cell would
# call .values on the arrays themselves -- confirm before executing it twice.
X_train_count = X_train_count.values
X_test_count = X_test_count.values
y_train_count = y_train_count.values
y_test_count = y_test_count.values

In [12]:
def score_lin(X_train, X_test, y_train, y_test):
    """Fit a linear regression and report how it performs on the test data.

    A fresh ``LinearRegression`` is fitted on ``X_train`` / ``y_train``. The
    fitted intercept and coefficients are printed, followed by the mean
    squared error of the predictions for ``X_test`` against ``y_test``.

    Returns
    -------
    The predictions produced by the model for ``X_test``.
    """
    model = LinearRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print("Intercept:", model.intercept_)
    print("Coefficients:", model.coef_)

    mse = mean_squared_error(y_test, predictions)
    print("Mean squared error (MSE): {:.2f}".format(mse))
    # R2 reporting is currently disabled:
    # print("Variance score (R2): {:.2f}".format(r2_score(y_test, predictions)))
    return predictions

In [13]:
# Baseline linear model for the total "count" target.
baseline_count_pred = score_lin(
    X_train=X_train_count,
    X_test=X_test_count,
    y_train=y_train_count,
    y_test=y_test_count,
)

  contains = index in indices
  sub[blockwise_token(i)] = blockwise_token(indices.index(index))


Intercept: -46.43318465007447
Coefficients: [ 4.17647582e+01  4.53252527e-03  4.87626378e+01  1.17667996e+02
  9.30091170e+01 -8.26481840e+01 -3.01058523e+01 -1.75253500e+01
  1.14304510e+01 -2.08662539e+00  2.39666817e+01 -8.20126711e+00
 -5.91505986e+00  7.69758120e+00  4.26826718e+00  1.86602811e+01
  6.21483584e+00 -9.66955871e+00  6.56212016e+00  2.76754615e+01
  5.31186735e+00 -1.22861247e+01 -1.48111252e+01 -1.16727374e+02
 -1.34112145e+02 -1.42260526e+02 -1.53298974e+02 -1.56516659e+02
 -1.40373291e+02 -8.32833171e+01  4.63516696e+01  1.78710350e+02
  3.88320083e+01 -1.20205431e+01  1.20923659e+01  4.90669450e+01
  4.50572514e+01  2.98894422e+01  3.75689026e+01  9.76098363e+01
  2.50818309e+02  2.23470182e+02  1.19574877e+02  4.06760365e+01
 -7.82784263e+00 -4.48764167e+01 -8.45165319e+01  1.50493837e+01
 -3.18874244e+00  8.72942326e+00 -3.52678012e+00 -1.25204936e+00
  1.62954843e+00  3.17294209e+00  3.49486608e+00  6.80519685e+00
  8.46190854e-01  3.26495580e+00  1.55915858e+

  contains = index in indices


Mean squared error (MSE): 14814.84


In [15]:
# Evaluate the predictions returned by score_lin into a concrete array for display.
baseline_count_pred.compute()

  contains = index in indices
  sub[blockwise_token(i)] = blockwise_token(indices.index(index))


array([338.0738301 , 259.17952237, 278.96130663, ..., 161.79918199,
       130.3656705 ,  83.29175121])