The purpose of this notebook is to build sklearn-like pipeline for transformation

In [6]:
import pandas as pd         

In [232]:
def load_data(path="../data/csv/all.csv"):
    dataframe = pd.read_csv(path, index_col=0)[:50000]
    return dataframe.loc[~dataframe["execTimeMs"].isnull()]

In [233]:
def prepare_dataframe(dataframe):
    output = dataframe.dropna(axis="columns")
    targets = output["execTimeMs"]
    features = output.drop(["execTimeMs", "jobId", "ctime_mean", "ctime_max", "ctime_sum"], axis=1)
    return features, targets

In [234]:
features, targets = prepare_dataframe(load_data())

In [235]:
features.dtypes

read_sum              float64
write_sum             float64
readSyscalls_sum      float64
writeSyscalls_sum     float64
readReal_sum          float64
writeReal_sum         float64
writeCancelled_sum    float64
rxBytes_sum           float64
rxPackets_sum         float64
rxErrors_sum          float64
rxDrop_sum            float64
rxFifo_sum            float64
rxFrame_sum           float64
rxCompressed_sum      float64
rxMulticast_sum       float64
txBytes_sum           float64
txPackets_sum         float64
txErrors_sum          float64
txDrop_sum            float64
txFifo_sum            float64
txColls_sum           float64
txCarrier_sum         float64
txCompressed_sum      float64
cpu_mean              float64
cpu_max               float64
memory_mean           float64
memory_max            float64
workflowName           object
size                  float64
executable             object
args                   object
inputs                 object
outputs                object
name      

# Preprocessing flow

In [236]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector as selector

In [237]:
def vectorize_list(series):
    def vectorize(list_string):
        return len(eval(list_string))
    return np.vectorize(vectorize)(series)

def ListTransformer():
    return FunctionTransformer(func=vectorize_list)

In [238]:
ignored_features = ['command']

list_transformer = Pipeline(steps=[("list", ListTransformer()), ("scaler", StandardScaler())])
list_features = list(['args', 'inputs', 'outputs'])

numerical_transformer = StandardScaler()
numerical_features = list(features.select_dtypes(include="number").columns)

categorical_transformer = OneHotEncoder(sparse=False, handle_unknown = "ignore")
categorical_features = list(set(features.select_dtypes(include="object").columns) ^ set(list_features) ^ set(ignored_features))

preprocessor = ColumnTransformer(
    transformers=[
        ('lists', list_transformer, list_features),
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

# Pipeline composition (with PCA)

In [239]:
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Lasso, SGDRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, GridSearchCV, HalvingGridSearchCV

In [240]:
base_steps = [('preprocessor', preprocessor),
              ('pca', PCA())]
dummy_pipeline = Pipeline(steps=base_steps +[('dummy', DummyRegressor())])
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.2, random_state=0)

In [241]:
pca_param_grid = {
    'pca__n_components': np.arange(1, 50, 3),    
}
knn_param_grid = {
    'knn__n_neighbors': np.arange(1, 30, 3),
}
regressor = ('knn', KNeighborsRegressor())
full_pipeline = Pipeline(steps= base_steps + [regressor])
grid_search = HalvingGridSearchCV(full_pipeline, {**knn_param_grid, **pca_param_grid}, cv=2, verbose=2, scoring="r2", n_jobs=-1)

In [246]:
from sklearn.metrics import r2_score

def rate_regressor(X_train, y_train, X_test, y_test, regressor, regressor_params, aggressive_elimination=True):
    full_pipeline = Pipeline(steps= base_steps + [regressor])
    pca_param_grid = {    'pca__n_components': np.arange(1, 50, 3),    }
    grid_search = HalvingGridSearchCV(full_pipeline, {**pca_param_grid, **regressor_params}, cv=2, verbose=2, scoring="r2", n_jobs=3, aggressive_elimination=aggressive_elimination)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_score_)
    print(grid_search.best_params_)
    print(r2_score(grid_search.best_estimator_.predict(X_test), y_test))
    return grid_search

# Here go regressor params

In [247]:
knn = ("knn", KNeighborsRegressor())
knn_params = {'knn__n_neighbors': np.arange(1, 30, 2)}

dtr = ("dtr", DecisionTreeRegressor())
dtr_params = {"dtr__criterion": ["mse", "friedman_mse", "mae", "poisson"]}

mlp = ("mlp", MLPRegressor())
mlp_params = {"mlp__hidden_layer_sizes": np.arange(1,200, 10),
#                 "mlp__activation": ["logistic", "tanh", "relu"],
             "mlp__alpha": np.arange(0.01, 0.1, 0.01)}

lasso = ("lasso", Lasso())
lasso_params = {"lasso__alpha": np.arange(0.01, 1, 0.05)}

svr = ("svr", SGDRegressor())
svr_params = {"svr__loss": ["squared_loss", "huber", "epsilon_insensitive"], "svr__penalty": ['l2', 'l1', 'elasticnet'],
             "svr__alpha": np.arange(0.0001, 0.2, 0.01), "svr__max_iter": [10000]}

rf = ("rf", RandomForestRegressor())
rf_params = {"rf__n_estimators": np.arange(5, 100, 5), "rf__criterion": ["mae", "mse"], "rf__max_features": ["auto", "sqrt", "log2"]}

In [248]:
import warnings
warnings.filterwarnings('ignore')

In [249]:
knn_cv = rate_regressor(X_train, y_train, X_test, y_test, knn, knn_params)

n_iterations: 6
n_required_iterations: 6
n_possible_iterations: 5
min_resources_: 164
max_resources_: 39998
aggressive_elimination: True
factor: 3
----------
iter: 0
n_candidates: 255
n_resources: 164
Fitting 2 folds for each of 255 candidates, totalling 510 fits
----------
iter: 1
n_candidates: 85
n_resources: 164
Fitting 2 folds for each of 85 candidates, totalling 170 fits
----------
iter: 2
n_candidates: 29
n_resources: 492
Fitting 2 folds for each of 29 candidates, totalling 58 fits
----------
iter: 3
n_candidates: 10
n_resources: 1476
Fitting 2 folds for each of 10 candidates, totalling 20 fits
----------
iter: 4
n_candidates: 4
n_resources: 4428
Fitting 2 folds for each of 4 candidates, totalling 8 fits
----------
iter: 5
n_candidates: 2
n_resources: 13284
Fitting 2 folds for each of 2 candidates, totalling 4 fits
0.2169224587779534
{'knn__n_neighbors': 1, 'pca__n_components': 31}
0.6943388908256861


In [250]:
dtr_cv = rate_regressor(X_train, y_train, X_test, y_test, dtr, dtr_params)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 1481
max_resources_: 39998
aggressive_elimination: True
factor: 3
----------
iter: 0
n_candidates: 68
n_resources: 1481
Fitting 2 folds for each of 68 candidates, totalling 136 fits
----------
iter: 1
n_candidates: 23
n_resources: 4443
Fitting 2 folds for each of 23 candidates, totalling 46 fits
----------
iter: 2
n_candidates: 8
n_resources: 13329
Fitting 2 folds for each of 8 candidates, totalling 16 fits
----------
iter: 3
n_candidates: 3
n_resources: 39987
Fitting 2 folds for each of 3 candidates, totalling 6 fits
0.734967982694674
{'dtr__criterion': 'poisson', 'pca__n_components': 28}
0.6959232741766164


In [251]:
mlp_cv = rate_regressor(X_train, y_train, X_test, y_test, mlp, mlp_params)

n_iterations: 8
n_required_iterations: 8
n_possible_iterations: 8
min_resources_: 18
max_resources_: 39998
aggressive_elimination: True
factor: 3
----------
iter: 0
n_candidates: 3060
n_resources: 18
Fitting 2 folds for each of 3060 candidates, totalling 6120 fits
----------
iter: 1
n_candidates: 1020
n_resources: 54
Fitting 2 folds for each of 1020 candidates, totalling 2040 fits
----------
iter: 2
n_candidates: 340
n_resources: 162
Fitting 2 folds for each of 340 candidates, totalling 680 fits
----------
iter: 3
n_candidates: 114
n_resources: 486
Fitting 2 folds for each of 114 candidates, totalling 228 fits
----------
iter: 4
n_candidates: 38
n_resources: 1458
Fitting 2 folds for each of 38 candidates, totalling 76 fits
----------
iter: 5
n_candidates: 13
n_resources: 4374
Fitting 2 folds for each of 13 candidates, totalling 26 fits
----------
iter: 6
n_candidates: 5
n_resources: 13122
Fitting 2 folds for each of 5 candidates, totalling 10 fits
----------
iter: 7
n_candidates: 2
n_r

In [252]:
lasso_cv = rate_regressor(X_train, y_train, X_test, y_test, lasso, lasso_params)

n_iterations: 6
n_required_iterations: 6
n_possible_iterations: 5
min_resources_: 164
max_resources_: 39998
aggressive_elimination: True
factor: 3
----------
iter: 0
n_candidates: 340
n_resources: 164
Fitting 2 folds for each of 340 candidates, totalling 680 fits
----------
iter: 1
n_candidates: 114
n_resources: 164
Fitting 2 folds for each of 114 candidates, totalling 228 fits
----------
iter: 2
n_candidates: 38
n_resources: 492
Fitting 2 folds for each of 38 candidates, totalling 76 fits
----------
iter: 3
n_candidates: 13
n_resources: 1476
Fitting 2 folds for each of 13 candidates, totalling 26 fits
----------
iter: 4
n_candidates: 5
n_resources: 4428
Fitting 2 folds for each of 5 candidates, totalling 10 fits
----------
iter: 5
n_candidates: 2
n_resources: 13284
Fitting 2 folds for each of 2 candidates, totalling 4 fits
0.8785268929407755
{'lasso__alpha': 0.060000000000000005, 'pca__n_components': 25}
0.9499696323840465


In [253]:
svr_cv = rate_regressor(X_train, y_train, X_test, y_test, svr, svr_params)

n_iterations: 8
n_required_iterations: 8
n_possible_iterations: 8
min_resources_: 18
max_resources_: 39998
aggressive_elimination: True
factor: 3
----------
iter: 0
n_candidates: 3060
n_resources: 18
Fitting 2 folds for each of 3060 candidates, totalling 6120 fits
----------
iter: 1
n_candidates: 1020
n_resources: 54
Fitting 2 folds for each of 1020 candidates, totalling 2040 fits
----------
iter: 2
n_candidates: 340
n_resources: 162
Fitting 2 folds for each of 340 candidates, totalling 680 fits
----------
iter: 3
n_candidates: 114
n_resources: 486
Fitting 2 folds for each of 114 candidates, totalling 228 fits
----------
iter: 4
n_candidates: 38
n_resources: 1458
Fitting 2 folds for each of 38 candidates, totalling 76 fits
----------
iter: 5
n_candidates: 13
n_resources: 4374
Fitting 2 folds for each of 13 candidates, totalling 26 fits
----------
iter: 6
n_candidates: 5
n_resources: 13122
Fitting 2 folds for each of 5 candidates, totalling 10 fits
----------
iter: 7
n_candidates: 2
n_r

In [254]:
rf_cv = rate_regressor(X_train, y_train, X_test, y_test, rf, rf_params)

n_iterations: 7
n_required_iterations: 7
n_possible_iterations: 7
min_resources_: 54
max_resources_: 39998
aggressive_elimination: True
factor: 3
----------
iter: 0
n_candidates: 1938
n_resources: 54
Fitting 2 folds for each of 1938 candidates, totalling 3876 fits
----------
iter: 1
n_candidates: 646
n_resources: 162
Fitting 2 folds for each of 646 candidates, totalling 1292 fits
----------
iter: 2
n_candidates: 216
n_resources: 486
Fitting 2 folds for each of 216 candidates, totalling 432 fits
----------
iter: 3
n_candidates: 72
n_resources: 1458
Fitting 2 folds for each of 72 candidates, totalling 144 fits
----------
iter: 4
n_candidates: 24
n_resources: 4374
Fitting 2 folds for each of 24 candidates, totalling 48 fits
----------
iter: 5
n_candidates: 8
n_resources: 13122
Fitting 2 folds for each of 8 candidates, totalling 16 fits
----------
iter: 6
n_candidates: 3
n_resources: 39366
Fitting 2 folds for each of 3 candidates, totalling 6 fits
0.652238697379432
{'pca__n_components': 28

In [257]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(lasso_cv.predict(X_test), y_test)

5109.67475816398

In [258]:
y_test.std()

500479.9367198438

In [265]:
pd.DataFrame(lasso_cv.cv_results_).mean_test_score

0      0.398289
1      0.640595
2      0.853540
3      0.883260
4      0.914458
         ...   
507    0.947661
508    0.947619
509    0.947678
510    0.878488
511    0.878527
Name: mean_test_score, Length: 512, dtype: float64