In [1]:
%load_ext watermark
%watermark

2019-05-23T15:45:22+01:00

CPython 3.6.8
IPython 7.5.0

compiler   : GCC 7.3.0
system     : Linux
release    : 4.15.0-50-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 8
interpreter: 64bit


In [2]:
import warnings
warnings.simplefilter("ignore")

# Dask para Machine Learning

Dask está integrado con scikit-learn, esto significa que nos puede ayudar a gestionar nuestros datasets de dos formas distintas:

### 1. Usar Dask para entrenar modelos de datasets de small data usando un cluster. 

Es posible usar dask para acelerar el entrenamiento de modelos y la búsqueda de hiperparámetros para datasets que caben en memoria.

Vamos a trabajar con el dataset de viajes en taxi en Nueva York del 2014.

Primero exportamos el dataset a formato parquet:

In [3]:
import dask.dataframe as dd

In [None]:
dd.read_csv("../data/nyc_taxi_data_2014.csv").to_parquet("../data/nyc_taxi_data_2014.parquet")

Ahora podemos leerlo

In [4]:
taxi_2014 = dd.read_parquet("../data/nyc_taxi_data_2014.parquet")

In [5]:
taxi_2014.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount
0,CMT,2014-01-09 20:45:25,2014-01-09 20:52:31,1,0.7,-73.99477,40.736828,1,N,-73.982227,40.73179,CRD,6.5,0.5,0.5,1.4,0.0,8.9
1,CMT,2014-01-09 20:46:12,2014-01-09 20:55:12,1,1.4,-73.982392,40.773382,1,N,-73.960449,40.763995,CRD,8.5,0.5,0.5,1.9,0.0,11.4
2,CMT,2014-01-09 20:44:47,2014-01-09 20:59:46,2,2.3,-73.98857,40.739406,1,N,-73.986626,40.765217,CRD,11.5,0.5,0.5,1.5,0.0,14.0
3,CMT,2014-01-09 20:44:57,2014-01-09 20:51:40,1,1.7,-73.960213,40.770464,1,N,-73.979863,40.77705,CRD,7.5,0.5,0.5,1.7,0.0,10.2
4,CMT,2014-01-09 20:47:09,2014-01-09 20:53:32,1,0.9,-73.995371,40.717248,1,N,-73.984367,40.720524,CRD,6.0,0.5,0.5,1.75,0.0,8.75


En este apartado vamos a ver como dask puede acelerar el entrenamiento de datasets que caben en memoria. Por lo tanto voy a coger una muestra del dataset para que funcione bien (mi ordenador es potente pero es un portatil :) )

Podemos convertirlo a un dataframe de pandas haciendo `compute()`

In [6]:
taxi_2014 = taxi_2014.sample(frac=0.01, random_state=42).compute()

In [7]:
taxi_2014.shape

(149997, 18)

Creamos un pipeline normal de scikit-learn

In [8]:
from category_encoders import OneHotEncoder
from mlxtend.feature_selection import ColumnSelector
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, make_union

from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor

Vamos a hacer una busqueda de hiperparametros para entrenar un modelo predictivo que prediga la propina de un viaje en taxi en funcion de las características del mismo.

In [9]:
variable_objetivo = "tip_amount"
y = taxi_2014[variable_objetivo]
X = taxi_2014[["vendor_id", "store_and_fwd_flag", "payment_type",
               "rate_code", "pickup_longitude", "pickup_latitude", 
               "passenger_count"]]

In [10]:
pipeline = make_pipeline(
    make_union(
        make_pipeline(
            ColumnSelector(cols=["vendor_id", "store_and_fwd_flag", "payment_type", "rate_code"]),
            OneHotEncoder()
        ),
        make_pipeline(
            ColumnSelector(cols=["pickup_longitude", "pickup_latitude", "passenger_count"]),
            StandardScaler()
        )  
    ),
    SGDRegressor()
)

In [11]:
pipeline

Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(n_jobs=None,
       transformer_list=[('pipeline-1', Pipeline(memory=None,
     steps=[('columnselector', ColumnSelector(cols=['vendor_id', 'store_and_fwd_flag', 'payment_type', 'rate_code'],
        drop_axis=False)), ('onehotencoder', OneHotEncoder(cols=None, d...m_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False))])

In [12]:
pipeline.steps

[('featureunion', FeatureUnion(n_jobs=None,
         transformer_list=[('pipeline-1', Pipeline(memory=None,
       steps=[('columnselector', ColumnSelector(cols=['vendor_id', 'store_and_fwd_flag', 'payment_type', 'rate_code'],
          drop_axis=False)), ('onehotencoder', OneHotEncoder(cols=None, drop_invariant=False, handle_unknown='impute',
         impu... drop_axis=False)), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True))]))],
         transformer_weights=None)),
 ('sgdregressor',
  SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
         eta0=0.01, fit_intercept=True, l1_ratio=0.15,
         learning_rate='invscaling', loss='squared_loss', max_iter=None,
         n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
         random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
         verbose=0, warm_start=False))]

Vamos a hacer ahora una búsqueda de hiperparámetros, pero vamos a usar dask para acelerar dicha busqueda. De dicha forma podremos explorar los parámetros de forma distribuida (con muchos ordenadores a la vez), lo que reduce el tiempo de búsqueda.

In [13]:
from dask.distributed import Client

In [14]:
client = Client()
client

0,1
Client  Scheduler: tcp://127.0.0.1:40911  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 67.50 GB


Vemos la lista de parámetros disponible

In [15]:
sorted(pipeline.get_params().keys())

['featureunion',
 'featureunion__n_jobs',
 'featureunion__pipeline-1',
 'featureunion__pipeline-1__columnselector',
 'featureunion__pipeline-1__columnselector__cols',
 'featureunion__pipeline-1__columnselector__drop_axis',
 'featureunion__pipeline-1__memory',
 'featureunion__pipeline-1__onehotencoder',
 'featureunion__pipeline-1__onehotencoder__cols',
 'featureunion__pipeline-1__onehotencoder__drop_invariant',
 'featureunion__pipeline-1__onehotencoder__handle_unknown',
 'featureunion__pipeline-1__onehotencoder__impute_missing',
 'featureunion__pipeline-1__onehotencoder__return_df',
 'featureunion__pipeline-1__onehotencoder__use_cat_names',
 'featureunion__pipeline-1__onehotencoder__verbose',
 'featureunion__pipeline-1__steps',
 'featureunion__pipeline-2',
 'featureunion__pipeline-2__columnselector',
 'featureunion__pipeline-2__columnselector__cols',
 'featureunion__pipeline-2__columnselector__drop_axis',
 'featureunion__pipeline-2__memory',
 'featureunion__pipeline-2__standardscaler',


Ahora procedemos a hacer la búsqueda aleatoria, pero en vez de usar sklearn vamos a usar [dask_searchcv](https://dask-searchcv.readthedocs.io/en/latest/). Dicho paquete se install haciendo `conda install dask-searchcv -c conda-forge`

Tiene el mismo aspecto, pero usa dask como el gestor del entrenamiento.
Como estoy usando un solo ordenador no va a proporcionar mucho valor pero si tuviesemos un cluster se notaría aun más la diferencia.

In [16]:
from dask_searchcv import RandomizedSearchCV, GridSearchCV

param_dist = {
    "sgdregressor__penalty": ["l1", "l2", "elasticnet"],
    "sgdregressor__learning_rate": ["constant", "optimal", "invscaling", "adaptive"],
    "sgdregressor__loss": ["squared_loss", "epsilon_insensitive"],
}


search = RandomizedSearchCV(pipeline, param_dist, cv=3)

In [17]:
%time search.fit(X, y)

CPU times: user 37 s, sys: 8.75 s, total: 45.7 s
Wall time: 25.9 s


RandomizedSearchCV(cache_cv=True, cv=3, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(n_jobs=None,
       transformer_list=[('pipeline-1', Pipeline(memory=None,
     steps=[('columnselector', ColumnSelector(cols=['vendor_id', 'store_and_fwd_flag', 'payment_type', 'rate_code'],
        drop_axis=False)), ('onehotencoder', OneHotEncoder(cols=None, d...m_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False))]),
          iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'sgdregressor__penalty': ['l1', 'l2', 'elasticnet'], 'sgdregressor__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'], 'sgdregressor__loss': ['squared_loss', 'epsilon_insensitive']},
          random_state=None, refit=True, return_train_score='warn',
          scheduler=None, scoring=None)

In [18]:
search.best_score_

0.2248068852281419

In [19]:
search.best_estimator_.get_params()["sgdregressor"]

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='epsilon_insensitive',
       max_iter=None, n_iter=None, n_iter_no_change=5,
       penalty='elasticnet', power_t=0.25, random_state=None, shuffle=True,
       tol=None, validation_fraction=0.1, verbose=0, warm_start=False)

Vamos a hacer la misma busqueda con scikit learn para ver las diferencias en tiempo de entrenamiento

In [20]:
from sklearn.model_selection import RandomizedSearchCV as SKRandomizedSearchCV
sk_search = SKRandomizedSearchCV(pipeline, param_dist, cv=3)

In [21]:
%time sk_search.fit(X, y)

CPU times: user 58.3 s, sys: 14.1 s, total: 1min 12s
Wall time: 51.6 s


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(n_jobs=None,
       transformer_list=[('pipeline-1', Pipeline(memory=None,
     steps=[('columnselector', ColumnSelector(cols=['vendor_id', 'store_and_fwd_flag', 'payment_type', 'rate_code'],
        drop_axis=False)), ('onehotencoder', OneHotEncoder(cols=None, d...m_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False))]),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'sgdregressor__penalty': ['l1', 'l2', 'elasticnet'], 'sgdregressor__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'], 'sgdregressor__loss': ['squared_loss', 'epsilon_insensitive']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

Vemos que usar la búsqueda de dask acelera aun usando solo nuestro ordenador. Ésto es asi por que [dask es más inteligente que sckit-learn a la hora de decidir como hacer los pasos necesarios para la búsqueda](https://dask-ml.readthedocs.io/en/latest/hyper-parameter-search.html)

Scikit learn hace todos los pasos de la búsqueda para cada combinación de hiperparámetros,por ejemplo para la siguiente búsqueda de malla de un pipeline:

```
pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier())])

grid = {'vect__ngram_range': [(1, 1)],
        'tfidf__norm': ['l1', 'l2'],
        'clf__alpha': [1e-3, 1e-4, 1e-5]}
```

Scikit-learn hace ésto:
![](https://dask-ml.readthedocs.io/en/latest/_images/unmerged_grid_search_graph.svg)

Dask sabe que hay pasos que se pueden reutilizar (sin tener que recalcularlos) y hace un DAG (Directed Acyclic Graph) optimizado:

![](https://dask-ml.readthedocs.io/en/latest/_images/merged_grid_search_graph.svg)

### 2. Usar dask para entrenar modelos de scikit-learn de datasets que no caben en memoria

Usamos el paquete [dask-ml](https://ml.dask.org/) que contiene implementaciones de algoritmos de scikitlearn pero que funcionan con dask. Se instala de la forma habitual, haciendo:

`conda install -c conda-forge dask-ml` (desde la terminal como siempre)

dicho paquete contiene implementaciones de modelos de ML que pueden trabajar de forma paralela con un cluster usando dask. De dicha forma se pueden entrenar modelos con datos que no caben en memoria.

In [22]:
from dask_ml.linear_model import PartialSGDRegressor, PartialSGDClassifier, LogisticRegression
from dask_ml.preprocessing import StandardScaler, OneHotEncoder, DummyEncoder, Categorizer

Vamos a trabajar ahora con el dataset de 2017, recordemos que tiene 113 millones de filas.

In [23]:
taxi_2017 = dd.read_parquet("../data/2017_Yellow_Taxi_Trip_Data.parquet/*.parquet"
                           ).categorize(["VendorID"])
taxi_2017 = taxi_2017[taxi_2017["tip_amount"] >=0]

In [24]:
X = taxi_2017[["n_pasajeros", "distancia", "VendorID"]]
y = taxi_2017[variable_objetivo]

In [25]:
X

Unnamed: 0_level_0,n_pasajeros,distancia,VendorID
npartitions=165,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,float64,float64,category[known]
,...,...,...
...,...,...,...
,...,...,...
,...,...,...


Ahora creamos el pipeline, pero usando cuando es posible los transformadores de `dask-ml`, hay que hacer ciertas manipulaciones para asegurarse de que trabaja con elementos de dask.

In [26]:
from sklearn.preprocessing import FunctionTransformer
from dask_ml.impute import SimpleImputer
from dask_ml.linear_model import LinearRegression
from dask_ml.wrappers import Incremental
from sklearn.linear_model import SGDRegressor

import dask.array as da
import numpy as np

In [28]:
ind_columns = X.columns.tolist()

pipeline = make_pipeline(
    FunctionTransformer(lambda x:  dd.from_array(x, columns=ind_columns), validate=False),
    Categorizer(categories={"VendorID": [[1,2], False]}),
    DummyEncoder(columns=["VendorID"]),
    FunctionTransformer(lambda x:  x.values, validate=False),
    Incremental(SGDRegressor())
)

In [29]:
pipeline.steps

[('functiontransformer-1',
  FunctionTransformer(accept_sparse=False, check_inverse=True,
            func=<function <lambda> at 0x7f6914087378>, inv_kw_args=None,
            inverse_func=None, kw_args=None, pass_y='deprecated',
            validate=False)),
 ('categorizer',
  Categorizer(categories={'VendorID': [[1, 2], False]}, columns=None)),
 ('dummyencoder', DummyEncoder(columns=['VendorID'], drop_first=False)),
 ('functiontransformer-2',
  FunctionTransformer(accept_sparse=False, check_inverse=True,
            func=<function <lambda> at 0x7f6914087ea0>, inv_kw_args=None,
            inverse_func=None, kw_args=None, pass_y='deprecated',
            validate=False)),
 ('incremental',
  Incremental(estimator=SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
         eta0=0.01, fit_intercept=True, l1_ratio=0.15,
         learning_rate='invscaling', loss='squared_loss', max_iter=None,
         n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
  

Ahora ajustamos el pipeline

In [30]:
%time pipeline.fit(X.values, y.values)

CPU times: user 22.7 s, sys: 2.18 s, total: 24.8 s
Wall time: 2min 31s


Pipeline(memory=None,
     steps=[('functiontransformer-1', FunctionTransformer(accept_sparse=False, check_inverse=True,
          func=<function <lambda> at 0x7f6914087378>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y='deprecated',
          validate=False)), ('categorizer', Categorizer(categories={'V...,
       verbose=0, warm_start=False),
      random_state=None, scoring=None, shuffle_blocks=True))])

Ha tardado 2 minutos, no está mal para un dataset que inicialmente era un csv que ocupaba 10 GB!

Ahora podemos usar `predict` con un array de dask como input

In [31]:
X.values

dask.array<values, shape=(nan, 3), dtype=object, chunksize=(nan, 3)>

In [32]:
pipeline.predict(X.values).to_dask_dataframe().head(10)

0    3.314809
1    1.116306
2    2.379460
3    1.206138
4    1.747883
5    1.751656
6    0.958237
7    1.333236
8    2.430601
9    3.484720
dtype: float64

podemos cargar el dataset de 2014 y generar predicciones 

In [33]:
taxi_2014 =  (dd
      .read_parquet("../data/nyc_taxi_data_2014.parquet/")
      .rename(columns={"vendor_id": "VendorID"})
      .categorize(["VendorID"])
     )[["passenger_count", "trip_distance", "VendorID"]]

In [34]:
taxi_2014.head()

Unnamed: 0,passenger_count,trip_distance,VendorID
0,1,0.7,CMT
1,1,1.4,CMT
2,2,2.3,CMT
3,1,1.7,CMT
4,1,0.9,CMT


In [35]:
taxi_2014.VendorID.unique().compute()

0    CMT
1    VTS
Name: VendorID, dtype: category
Categories (2, object): [CMT, VTS]

In [36]:
taxi_2014 = taxi_2014.assign(VendorID=taxi_2014.VendorID.str.replace('CMT', "1"))
taxi_2014 = taxi_2014.assign(VendorID=taxi_2014.VendorID.str.replace('VTS', "2"))
taxi_2014 = taxi_2014.assign(VendorID=taxi_2014.VendorID.astype(int))

In [37]:
predicciones = pipeline.predict(taxi_2014.values).to_dask_dataframe()

In [38]:
predicciones.head(10)

0    0.958412
1    1.234727
2    1.555958
3    1.353148
4    1.037359
5    1.037359
6    2.103145
7    1.511042
8    2.024198
9    1.589989
dtype: float64

Este pipeline no solo funciona con dataframes de dask, sino que tambien funciona con dataframes de pandas

In [39]:
# convertimos el dataset de dask a dataset de pandas
taxi_2014 = taxi_2014.compute()

In [40]:
type(taxi_2014)

pandas.core.frame.DataFrame

In [41]:
predicciones = pipeline.predict(taxi_2014.values).to_dask_dataframe()

In [42]:
predicciones.head(10)

0    0.958412
1    1.234727
2    1.555958
3    1.353148
4    1.037359
5    1.037359
6    2.103145
7    1.511042
8    2.024198
9    1.589989
dtype: float64