The purpose of this notebook is to build an sklearn-like pipeline for data transformation.

In [1]:
import pandas as pd         

In [2]:
def load_data(path="../data/csv/all.csv"):
    """Read the CSV at ``path`` and keep only rows that have an ``execTimeMs`` value.

    The first CSV column is used as the frame index.
    """
    jobs = pd.read_csv(path, index_col=0)
    has_exec_time = jobs["execTimeMs"].notnull()
    return jobs.loc[has_exec_time]

In [218]:
def prepare_dataframe(dataframe):
    """Split a frame into features, targets and the columns withheld from the features.

    Columns containing any missing value are discarded first. ``execTimeMs`` is the
    target; it and the other listed columns are removed from the features and
    returned separately as ``dropped``.

    Returns
    -------
    (features, targets, dropped)
    """
    withheld_columns = [
        "command", "execTimeMs", "jobId",
        "ctime_mean", "ctime_max", "ctime_sum",
        "read_sum", "write_sum",
        "readSyscalls_sum", "writeSyscalls_sum",
        "readReal_sum", "writeReal_sum", "writeCancelled_sum",
        "rxBytes_sum", "rxPackets_sum", "rxErrors_sum", "rxDrop_sum",
        "rxFifo_sum", "rxFrame_sum", "rxCompressed_sum", "rxMulticast_sum",
        "txBytes_sum", "txPackets_sum", "txErrors_sum", "txDrop_sum",
        "txFifo_sum", "txColls_sum", "txCarrier_sum", "txCompressed_sum",
        "cpu_mean", "cpu_max", "memory_mean", "memory_max",
    ]
    complete = dataframe.dropna(axis="columns")
    targets = complete["execTimeMs"]
    dropped = complete[withheld_columns]
    features = complete.drop(columns=dropped.columns)
    return features, targets, dropped

In [197]:
# Load the default CSV and split it; the withheld columns are not needed in this cell.
features, targets, _ = prepare_dataframe(load_data())

In [198]:
features.dtypes

workflowName          object
size                 float64
executable            object
args                  object
inputs                object
outputs               object
name                  object
cpu.manufacturer      object
cpu.brand             object
cpu.speed            float64
cpu.cores              int64
cpu.physicalCores      int64
cpu.processors         int64
mem.total              int64
mem.free               int64
mem.used               int64
mem.active             int64
mem.available          int64
mem.buffers            int64
mem.cached             int64
mem.slab               int64
mem.buffcache          int64
mem.swaptotal          int64
mem.swapused           int64
mem.swapfree           int64
dtype: object

# Preprocessing flow

In [199]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector as selector

In [200]:
def vectorize_list(series):
    """Replace every stringified Python list in ``series`` by its length.

    Parameters
    ----------
    series : array-like of str
        Values such as ``"['a', 'b']"``; each element must be a Python literal
        that supports ``len``.

    Returns
    -------
    numpy.ndarray
        Integer lengths, same shape as ``series`` (empty input gives an empty array).

    Raises
    ------
    ValueError, SyntaxError
        If an element is not a valid Python literal.
    """
    # Local import keeps this cell runnable on its own.
    import ast

    def literal_length(list_string):
        # literal_eval accepts literals only, so text read from the CSV cannot execute code
        # (the previous eval() would run arbitrary expressions).
        return len(ast.literal_eval(list_string))

    # Explicit otypes lets np.vectorize handle size-0 input instead of raising.
    return np.vectorize(literal_length, otypes=[int])(series)

def ListTransformer():
    """Build a ``FunctionTransformer`` that applies ``vectorize_list`` to its input."""
    length_encoder = FunctionTransformer(func=vectorize_list)
    return length_encoder

In [229]:
# Stringified-list columns are reduced to their lengths and then standardized.
list_transformer = Pipeline(steps=[("list", ListTransformer()), ("scaler", StandardScaler())])
list_features = ['args', 'inputs', 'outputs']

# Utilization columns that the "enhanced" preprocessors expect in addition to the raw features.
utilization_features = ["read_sum", "write_sum", "cpu_mean", "memory_max"]

numerical_transformer = StandardScaler()
numerical_features = list(features.select_dtypes(include="number").columns)
enhanced_numerical_features = numerical_features + utilization_features

# NOTE(review): `sparse` was renamed to `sparse_output` in newer scikit-learn releases;
# kept as-is to match the version this notebook was written against — confirm when upgrading.
categorical_transformer = OneHotEncoder(sparse=False, handle_unknown = "ignore")
# Object columns that are not list columns. An ordered filter replaces the former
# `set(...) ^ set(list_features)`: the symmetric difference would *add* any list column
# missing from the object columns, and set iteration order is not stable between kernels.
categorical_features = [
    column
    for column in features.select_dtypes(include="object").columns
    if column not in list_features
]
enhanced_categorical_features = categorical_features + utilization_features

# Raw features only.
preprocessor = ColumnTransformer(
    transformers=[
        ('lists', list_transformer, list_features),
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

# Utilization columns treated as numeric (standardized).
enh_numerical_preprocessor = ColumnTransformer(
    transformers=[
        ('lists', list_transformer, list_features),
        ('num', numerical_transformer, enhanced_numerical_features),
        ('cat', categorical_transformer, categorical_features)])

# Utilization columns treated as categories (one-hot encoded).
enhanced_preprocessor = ColumnTransformer(
    transformers=[
        ('lists', list_transformer, list_features),
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, enhanced_categorical_features)]
)

In [202]:
from scipy.stats import percentileofscore
import math

def calculate_quantile_rank(labels, label):
    """Return the percentile rank of ``label`` within ``labels``, scaled from 0-100 to 0-1."""
    percentile = percentileofscore(labels, label)
    return percentile / 100

def calculate_utilization_class(labels, label):
    """Name the quartile class of ``label`` within ``labels``.

    The quantile rank is compared with strict ``>`` against 0.75, 0.5 and 0.25,
    giving 'very high', 'high', 'medium', or otherwise 'low'.
    """
    rank = calculate_quantile_rank(labels, label)
    # Thresholds are checked from highest to lowest; the first one exceeded wins.
    thresholds = ((0.75, 'very high'), (0.5, 'high'), (0.25, 'medium'))
    for lower_bound, class_name in thresholds:
        if rank > lower_bound:
            return class_name
    return 'low'

def calculate_utilization_bucket(labels, label, num_buckets):
    """Return the index (as a string) of the equal-width rank bucket containing ``label``.

    The quantile rank of ``label`` within ``labels`` is mapped onto ``num_buckets``
    buckets; bucket ``k`` covers ranks in ``[k / num_buckets, (k + 1) / num_buckets)``,
    and the last bucket also includes rank 1.0.

    Parameters
    ----------
    labels : sequence of numbers
        Reference population used for ranking.
    label : number
        Value to rank.
    num_buckets : int
        Number of buckets; must be at least 1.

    Returns
    -------
    str
        One of ``"0"`` .. ``str(num_buckets - 1)``.

    Raises
    ------
    ValueError
        If ``num_buckets`` is smaller than 1.
    """
    if num_buckets < 1:
        raise ValueError("num_buckets must be a positive integer")
    rank = calculate_quantile_rank(labels, label)
    # Multiplying avoids the rounding of `rank / (1.0 / num_buckets)` (0.3 / 0.1 floors to 2).
    # A rank of exactly 1.0 would otherwise open an extra bucket `num_buckets`, so clamp it.
    bucket = min(math.floor(rank * num_buckets), num_buckets - 1)
    return str(bucket)

# Pipeline composition (with PCA)

In [203]:
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Lasso, SGDRegressor, ElasticNet, LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, GridSearchCV, HalvingGridSearchCV

In [204]:
# Leading steps shared by the experiment pipelines: a seeded PCA.
base_steps = [('pca', PCA(random_state=42))]
# Baseline pipeline: PCA followed by a DummyRegressor.
dummy_pipeline = Pipeline(steps=base_steps +[('dummy', DummyRegressor())])
# NOTE(review): `features` here is the un-preprocessed frame from prepare_dataframe
# (it still has object columns) — confirm these splits are only kept for reference.
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.2, random_state=0)

In [205]:
# Example search space: PCA component counts 1, 4, ..., 49.
pca_param_grid = {
    'pca__n_components': np.arange(1, 50, 3),    
}
# Neighbour counts 1, 4, ..., 28 for the KNN step.
knn_param_grid = {
    'knn__n_neighbors': np.arange(1, 30, 3),
}
regressor = ('knn', KNeighborsRegressor())
full_pipeline = Pipeline(steps= base_steps + [regressor])
# NOTE(review): this search object is not fitted in the visible part of the notebook;
# rate_regressor builds its own pipeline and search.
grid_search = HalvingGridSearchCV(full_pipeline, {**knn_param_grid, **pca_param_grid}, cv=2, verbose=2, scoring="r2", n_jobs=-1)

In [321]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error


def rate_regressor(X_train, y_train, X_test, y_test, regressor, regressor_params, verbose=2, aggressive_elimination=True, steps=base_steps):
    """Tune ``regressor`` together with the PCA size and score the best pipeline on the test split.

    Parameters
    ----------
    X_train, y_train, X_test, y_test
        Already-preprocessed (numeric) train/test features and targets.
    regressor : tuple
        ``(step_name, estimator)`` appended after ``steps``.
    regressor_params : dict
        Parameter grid for the regressor, keyed as ``"<step_name>__<param>"``.
    verbose : int
        Verbosity passed to ``HalvingGridSearchCV``.
    aggressive_elimination : bool
        Passed to ``HalvingGridSearchCV``.
    steps : list
        Leading pipeline steps; must contain a step named ``"pca"`` because the
        grid always searches ``pca__n_components``.

    Returns
    -------
    dict
        Keys ``r2``, ``adjusted_r2``, ``mae``, ``mape`` (all on the test split),
        ``best_score`` (cross-validated) and ``params`` (best parameter set).
    """
    print(f"Rating {regressor}")
    # Use the `steps` argument (it was previously ignored in favour of the global
    # base_steps); list(...) builds a fresh list so the caller's steps are not mutated.
    full_pipeline = Pipeline(steps=list(steps) + [regressor])
    vector_length = X_train.shape[1]
    # Candidates are 1 .. vector_length - 1; max(..., 2) keeps a single candidate
    # for one-column input instead of an empty grid.
    pca_param_grid = {'pca__n_components': np.arange(1, max(vector_length, 2), 1)}
    grid_search = HalvingGridSearchCV(
        full_pipeline,
        {**pca_param_grid, **regressor_params},
        cv=2,
        verbose=verbose,
        scoring="r2",
        n_jobs=3,
        aggressive_elimination=aggressive_elimination,
    )
    grid_search.fit(X_train, y_train)

    # Scores on the held-out split.
    prediction = grid_search.best_estimator_.predict(X_test)

    r2 = r2_score(y_test, prediction)
    # NOTE(review): the predictor count is the pre-PCA feature count, not the selected
    # number of components — confirm this is the intended penalty.
    n_samples = len(y_test)
    degrees_of_freedom = n_samples - X_test.shape[1] - 1
    if degrees_of_freedom > 0:
        adjusted_r2 = 1 - (1 - r2) * (n_samples - 1) / degrees_of_freedom
    else:
        # Adjusted R^2 is undefined when the test split has no more rows than predictors + 1.
        adjusted_r2 = float("nan")
    mae = mean_absolute_error(y_test, prediction)
    mape = mean_absolute_percentage_error(y_test, prediction)

    return {"r2": r2, "adjusted_r2": adjusted_r2, "mae": mae, "mape": mape,"best_score": grid_search.best_score_, "params": grid_search.best_params_}

# Regressors and their parameter grids

In [207]:
# Each regressor is a (step_name, estimator) tuple paired with a grid whose keys are
# prefixed with that step name. Stochastic estimators share random_state=5 so that
# repeated runs give the same results.
knn = ("knn", KNeighborsRegressor())
knn_params = {'knn__n_neighbors': np.arange(1, 30, 2)}

# NOTE(review): "mse"/"mae" (and "squared_loss", "auto" below) are older scikit-learn
# spellings; newer releases rename them — confirm against the pinned version.
dtr = ("dtr", DecisionTreeRegressor(random_state=5))
dtr_params = {"dtr__criterion": ["mse", "friedman_mse", "mae", "poisson"]}

# mlp = ("mlp", MLPRegressor())
# mlp_params = {"mlp__hidden_layer_sizes": np.arange(1,200, 10),
# #                 "mlp__activation": ["logistic", "tanh", "relu"],
#              "mlp__alpha": np.arange(0.01, 0.1, 0.01)}

lasso = ("lasso", Lasso(random_state=5))
lasso_params = {"lasso__alpha": np.arange(0.01, 1, 0.05)}

en = ("elasticnet", ElasticNet(random_state=5))
en_params = {"elasticnet__alpha": np.arange(0.01, 1, 0.05), "elasticnet__l1_ratio": np.arange(0, 1, 0.1)}

# Despite the "svr" step name this is an SGDRegressor; the name is kept because the
# grid keys and the result tables use it. Seeded for reproducibility.
svr = ("svr", SGDRegressor(random_state=5))
svr_params = {"svr__loss": ["squared_loss", "huber", "epsilon_insensitive"], "svr__penalty": ['l2', 'l1', 'elasticnet'],
             "svr__alpha": np.arange(0.0001, 0.2, 0.01), "svr__max_iter": [10000]}

# Seeded for reproducibility, consistent with the estimators above.
rf = ("rf", RandomForestRegressor(random_state=5))
rf_params = {"rf__n_estimators": np.arange(5, 100, 5), "rf__criterion": ["mae", "mse"], "rf__max_features": ["auto", "sqrt", "log2"]}

In [208]:
import warnings
# Silences every warning for the rest of the session.
# NOTE(review): this also hides deprecation and failed-fit warnings from the grid searches —
# consider narrowing the filter.
warnings.filterwarnings('ignore')

In [209]:
def make_datasets(dataframe):
    """Build the three filtered views of ``dataframe`` used as common datasets.

    Returns
    -------
    tuple of DataFrame
        ``(jobs with execTimeMs < 1200,
           jobs with execTimeMs between 2000 and 25000 inclusive,
           jobs whose name occurs more than 3000 times)``
    """
    exec_time = dataframe["execTimeMs"]
    jobs_below_1200ms = dataframe.loc[exec_time < 1200]
    jobs_between_2000ms_25000ms = dataframe.loc[exec_time.between(2000, 25000)]

    jobs_count = dataframe["name"].value_counts()
    frequent_names = jobs_count[jobs_count > 3000].index.values
    jobs_most_occurring = dataframe.loc[dataframe["name"].isin(frequent_names)]

    # The per-job frames (mDiffFit, haplotype_caller, mShrink) that used to be built here
    # were never returned, so that dead filtering has been removed.
    return jobs_below_1200ms, jobs_between_2000ms_25000ms, jobs_most_occurring

In [210]:
# Tuple of the three filtered frames returned by make_datasets (reads the CSV again).
datasets = make_datasets(load_data())

In [211]:
# One frame per distinct job name, in groupby order (reads the CSV again).
dfs_for_jobs = [
    pd.DataFrame(job_rows)
    for _job_name, job_rows in load_data().groupby('name', as_index=False)
]

In [272]:
def rate_data(features, targets, regressors, verbose=2, pipeline_steps=base_steps):
    """Split the data 70/30 and rate every regressor on the same split.

    Parameters
    ----------
    features, targets
        Preprocessed feature matrix and matching targets.
    regressors : iterable of ((step_name, estimator), param_grid)
        Regressors to rate, with their parameter grids.
    verbose : int
        Verbosity forwarded to ``rate_regressor``.
    pipeline_steps : list
        Leading pipeline steps forwarded to ``rate_regressor``.

    Returns
    -------
    pandas.DataFrame
        One row per regressor with columns
        ``name, pca, adjusted_r2, r2, mae, mape, best_score, params``.
    """
    X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.3, random_state=0)
    result_columns = ["name", "pca", "adjusted_r2","r2", "mae", "mape", "best_score", "params"]
    rows = []
    for (regressor, params) in regressors:
        # Keyword arguments are required here: passed positionally, `pipeline_steps`
        # would land in rate_regressor's `aggressive_elimination` slot.
        result = rate_regressor(X_train, y_train, X_test, y_test, regressor, params, verbose=verbose, steps=pipeline_steps)
        rows.append({"name": regressor[0], **result, "pca": result["params"]["pca__n_components"]})
    # Build the frame once; DataFrame.append in a loop is quadratic and removed in pandas 2.
    return pd.DataFrame(rows, columns=result_columns)

In [213]:
def rate_dataset(dataframe, regressors, verbose=2, max_rows=10000):
    """Rate ``regressors`` on the first ``max_rows`` rows of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw frame as returned by ``load_data``.
    regressors : iterable
        Forwarded to ``rate_data``.
    verbose : int
        Forwarded to ``rate_data``.
    max_rows : int
        Only the leading ``max_rows`` rows are rated (default 10000, the former
        hard-coded limit).

    Returns
    -------
    pandas.DataFrame
        The result table from ``rate_data`` (it was previously discarded).
    """
    # The printed length is that of the full frame, not of the truncated slice.
    print(f"Rating dataset of len {len(dataframe)}")
    # NOTE(review): the features are passed on without going through a preprocessor,
    # so object columns reach the PCA step — confirm whether this helper is still meant
    # to be used or should delegate to get_pipeline1_data.
    features, targets, _ = prepare_dataframe(dataframe[:max_rows])
    return rate_data(features, targets, regressors, verbose)

In [320]:
# (regressor, parameter grid) pairs rated by the experiments; random forest is disabled.
basic_regressors = [
    (knn, knn_params),
    (dtr, dtr_params),
    (lasso, lasso_params),
    (en, en_params),
    (svr, svr_params),
##     (rf, rf_params)
]

In [215]:
def simple_experiment():
    """Rate the basic regressors on every per-job frame, then on the common datasets."""
    print("Rating jobs datasets")
    for job_frame in dfs_for_jobs:
        first_row = job_frame.iloc[0]
        print(first_row["name"])
        rate_dataset(job_frame, basic_regressors)

    print("Rating common datasets")
    for common_frame in datasets:
        rate_dataset(common_frame, basic_regressors)

In [216]:
# rate_dataset(dfs_for_jobs[8], verbose=0)
#rate_dataset(dfs_for_jobs[1], basic_regressors, verbose=0)

## Z powyższego mikroeksperymentu - KNN i Lasso jako testowe

## Eksperyment 1

### Dane 
krótsze niż 1200ms, 2k-25k ms, częstsze niż 3000, wszystkie

In [298]:
# Datasets for experiment 1, keyed by the label used in the result table.
raw_dataframe = load_data()
# Number of rows per job name; used to select the frequently executed jobs.
jobs_count = raw_dataframe["name"].value_counts()
exp1_datasets = {
    "ShorterThan1.2Kms": raw_dataframe.loc[raw_dataframe["execTimeMs"] < 1200],
    "Between2KmsAnd25Kms": raw_dataframe.loc[raw_dataframe["execTimeMs"].between(2000, 25000)],
    "ExecutedMoreThan3Ktimes": raw_dataframe.loc[raw_dataframe["name"].isin(jobs_count[jobs_count > 3000].index.values)],
    "All": raw_dataframe
}

def get_pipeline1_data(data):
    """Baseline variant: the raw features run through ``preprocessor``, nothing added.

    Returns ``(transformed_features, labels)``.
    """
    raw_features, labels, _ = prepare_dataframe(data)
    # NOTE(review): the preprocessor is fitted on all rows here, before rate_data makes
    # its train/test split — confirm this is acceptable for the experiment.
    transformed = preprocessor.fit_transform(raw_features)
    return transformed, labels

def get_pipeline2_data(data):
    """Raw features plus utilization classes for read, write, cpu and memory.

    The classes are assigned from the measured values (not predicted) and are
    one-hot encoded by ``enhanced_preprocessor``.

    Returns ``(transformed_features, labels)``.
    """
    features, labels, dropped = prepare_dataframe(data)
    ranked_columns = ["read_sum", "write_sum", "cpu_mean", "memory_max"]
    for column in ranked_columns:
        measurements = dropped[column]
        # Classify each distinct value once and reuse it for repeated values; ranking a
        # value scans the whole column, so this avoids redundant scans while producing
        # exactly the same classes as the former per-row call.
        class_by_value = {
            value: calculate_utilization_class(measurements, value)
            for value in measurements.unique()
        }
        features[column] = measurements.map(class_by_value)
    features = enhanced_preprocessor.fit_transform(features)
    return features, labels

def get_pipeline3_data(data):
    """Raw features plus continuous quantile ranks for read, write, cpu and memory.

    The ranks are assigned from the measured values (not predicted) and are
    standardized by ``enh_numerical_preprocessor``.

    Returns ``(transformed_features, labels)``.
    """
    features, labels, dropped = prepare_dataframe(data)
    ranked_columns = ["read_sum", "write_sum", "cpu_mean", "memory_max"]
    for column in ranked_columns:
        # Average-rank percentile of every value within its own column, computed in one
        # sort. This is the same quantity calculate_quantile_rank gives for a value that
        # is present in the column (up to floating-point rounding), without the
        # O(n^2) cost of ranking each row against the whole column.
        features[column] = dropped[column].rank(pct=True)
    features = enh_numerical_preprocessor.fit_transform(features)
    return features, labels

def get_pipeline4_data(data, num_buckets=5):
    """Raw features plus utilization buckets for read, write, cpu and memory.

    Like ``get_pipeline2_data``, but the number of buckets is configurable and the
    labels are bucket indices instead of class names. The buckets are assigned
    from the measured values (not predicted).

    With ``num_buckets=4`` the split is comparable to ``get_pipeline2_data`` but not
    identical: a rank lying exactly on a quartile boundary goes to the higher
    bucket here and to the lower class there.

    Returns ``(transformed_features, labels)``.
    """
    features, labels, dropped = prepare_dataframe(data)
    ranked_columns = ["read_sum", "write_sum", "cpu_mean", "memory_max"]
    for column in ranked_columns:
        measurements = dropped[column]
        # Bucket each distinct value once and reuse it for repeated values; the result is
        # identical to the former per-row call but skips redundant scans of the column.
        bucket_by_value = {
            value: calculate_utilization_bucket(measurements, value, num_buckets)
            for value in measurements.unique()
        }
        features[column] = measurements.map(bucket_by_value)
    features = enhanced_preprocessor.fit_transform(features)
    return features, labels

### Przebieg
dla każdego zbioru sprawdzamy, który pipeline jest lepszy i w jakim wariancie

In [319]:
def run_experiment1():
    """Rate every pipeline variant on every dataset in ``exp1_datasets``.

    For each dataset the variants are evaluated in this order: ``simple``,
    ``two_step_4_cat``, ``two_step_continuous`` and ``two_step_<n>_cat`` for
    n = 5..10.

    Returns
    -------
    pandas.DataFrame
        Concatenated ``rate_data`` tables with the extra columns ``dataset`` and
        ``pipeline``; the per-table row index is kept (not reset).
    """
    result_columns = ["dataset", "pipeline", "name", "pca", "adjusted_r2","r2", "mae", "mape", "best_score", "params"]

    def run_pipeline(pipeline_name, features, targets):
        # Rate one prepared variant and tag its rows with the variant name.
        pipeline_df = rate_data(features, targets, basic_regressors, verbose=0)
        pipeline_df["pipeline"] = pipeline_name
        return pipeline_df

    dataset_frames = []
    for name, dataset in exp1_datasets.items():
        print(f"Evaluating dataset {name} of length {len(dataset)}")
        pipeline_frames = [
            run_pipeline("simple", *get_pipeline1_data(dataset)),
            run_pipeline("two_step_4_cat", *get_pipeline2_data(dataset)),
            run_pipeline("two_step_continuous", *get_pipeline3_data(dataset)),
        ]
        for num_buckets in range(5, 11):
            pipeline_frames.append(
                run_pipeline(f"two_step_{num_buckets}_cat", *get_pipeline4_data(dataset, num_buckets))
            )
        dataset_df = pd.concat(pipeline_frames)
        dataset_df["dataset"] = name
        dataset_frames.append(dataset_df)

    if not dataset_frames:
        return pd.DataFrame(columns=result_columns)
    # Concatenate once (DataFrame.append is removed in pandas 2) and restore the
    # column order the appended frame used to have.
    return pd.concat(dataset_frames)[result_columns]

In [322]:
# Run the whole experiment (every dataset x every pipeline variant) and persist the table.
exp1_df = run_experiment1()
exp1_df.to_csv("exp1.csv")

Evaluating dataset ShorterThan1.2Kms of length 137871
Rating ('knn', KNeighborsRegressor())
Rating ('dtr', DecisionTreeRegressor(random_state=5))
Rating ('lasso', Lasso(random_state=5))
Rating ('elasticnet', ElasticNet(random_state=5))
Rating ('svr', SGDRegressor())
Rating ('knn', KNeighborsRegressor())
Rating ('dtr', DecisionTreeRegressor(random_state=5))
Rating ('lasso', Lasso(random_state=5))
Rating ('elasticnet', ElasticNet(random_state=5))
Rating ('svr', SGDRegressor())
Rating ('knn', KNeighborsRegressor())
Rating ('dtr', DecisionTreeRegressor(random_state=5))
Rating ('lasso', Lasso(random_state=5))
Rating ('elasticnet', ElasticNet(random_state=5))
Rating ('svr', SGDRegressor())
Rating ('knn', KNeighborsRegressor())
Rating ('dtr', DecisionTreeRegressor(random_state=5))
Rating ('lasso', Lasso(random_state=5))
Rating ('elasticnet', ElasticNet(random_state=5))
Rating ('svr', SGDRegressor())
Rating ('knn', KNeighborsRegressor())
Rating ('dtr', DecisionTreeRegressor(random_state=5))
Ra

In [326]:
# Second copy of the experiment 1 results.
exp1_df.to_csv("backup_1.csv")

### Output eksperymentu
wykresy: R^2, MAE i MAPE dla każdego zbioru i każdego pipeline'u (powiedzmy że dla 3 najlepszych z każdego)
odpowiedzi: czy I, czy II, czy dzielić na więcej niż 4 kategorie

In [282]:
# Smoke test: baseline pipeline on the first per-job frame.
test_df = rate_data(*get_pipeline1_data(dfs_for_jobs[0]), basic_regressors, verbose=0)

Rating ('knn', KNeighborsRegressor())
Rating ('dtr', DecisionTreeRegressor(random_state=5))
Rating ('lasso', Lasso(random_state=5))
Rating ('elasticnet', ElasticNet(random_state=5))
Rating ('svr', SGDRegressor())


Unnamed: 0,name,pca,adjusted_r2,r2,mae,mape,best_score,params,dataset
0,knn,24,0.263035,0.419514,88.738581,0.140505,0.435897,"{'knn__n_neighbors': 7, 'pca__n_components': 24}",ab
1,dtr,29,-0.653621,-0.30251,121.040816,0.183098,0.258408,"{'dtr__criterion': 'poisson', 'pca__n_componen...",ab
2,lasso,30,0.274796,0.428777,89.789489,0.135582,0.421197,"{'lasso__alpha': 0.9600000000000001, 'pca__n_c...",ab
3,elasticnet,29,0.279193,0.432241,89.493725,0.135116,0.422105,"{'elasticnet__alpha': 0.21000000000000002, 'el...",ab
4,svr,8,0.290906,0.441467,90.094516,0.137391,0.432413,"{'pca__n_components': 8, 'svr__alpha': 0.10010...",ab


In [None]:
# Utilization-class variant on the second per-job frame.
rate_data(*get_pipeline2_data(dfs_for_jobs[1]), basic_regressors, verbose=0)

In [None]:
# Continuous quantile-rank variant on the second per-job frame.
rate_data(*get_pipeline3_data(dfs_for_jobs[1]), basic_regressors, verbose=0)

In [234]:
# Sweep the bucket count from 1 to 9 on the second per-job frame.
# The table returned by rate_data is not kept in this loop.
for num_buckets in range(1, 10):
    print(f"BUCKETS: {num_buckets}")
    rate_data(*get_pipeline4_data(dfs_for_jobs[1], num_buckets), basic_regressors, verbose=0)
    print("\n")

BUCKETS: 1
Rating regressor ('knn', KNeighborsRegressor())
test R2 0.4522747711769818 adjusted R2 0.25629968930452574 	 params {'knn__n_neighbors': 1, 'pca__n_components': 33}
Rating regressor ('dtr', DecisionTreeRegressor(random_state=5))
test R2 0.16555254049000412 adjusted R2 -0.13301122942641652 	 params {'dtr__criterion': 'mse', 'pca__n_components': 29}
Rating regressor ('lasso', Lasso(random_state=5))
test R2 -3.000750405374376 adjusted R2 -4.432211559590896 	 params {'lasso__alpha': 0.41000000000000003, 'pca__n_components': 33}
Rating regressor ('elasticnet', ElasticNet(random_state=5))
test R2 -6.4447639094097955 adjusted R2 -9.108486776079356 	 params {'elasticnet__alpha': 0.9600000000000001, 'elasticnet__l1_ratio': 0.9, 'pca__n_components': 37}
Rating regressor ('svr', SGDRegressor())
test R2 -43315494.238614306 adjusted R2 -58813698.95701759 	 params {'pca__n_components': 19, 'svr__alpha': 0.1101, 'svr__loss': 'epsilon_insensitive', 'svr__max_iter': 10000, 'svr__penalty': 'e

In [235]:
# Continue the sweep with bucket counts 10 to 19 on the same frame.
# The table returned by rate_data is not kept in this loop.
for num_buckets in range(10, 20):
    print(f"BUCKETS: {num_buckets}")
    rate_data(*get_pipeline4_data(dfs_for_jobs[1], num_buckets), basic_regressors, verbose=0)
    print("\n")

BUCKETS: 10
Rating regressor ('knn', KNeighborsRegressor())
test R2 0.7276179176091213 adjusted R2 0.47646041306688247 	 params {'knn__n_neighbors': 1, 'pca__n_components': 63}
Rating regressor ('dtr', DecisionTreeRegressor(random_state=5))
test R2 0.16109396354149208 adjusted R2 -0.6124427713747944 	 params {'dtr__criterion': 'poisson', 'pca__n_components': 67}
Rating regressor ('lasso', Lasso(random_state=5))
test R2 0.8019826536093838 adjusted R2 0.6193952303141403 	 params {'lasso__alpha': 0.9600000000000001, 'pca__n_components': 69}
Rating regressor ('elasticnet', ElasticNet(random_state=5))
test R2 0.7044021237245739 adjusted R2 0.4318378481978823 	 params {'elasticnet__alpha': 0.01, 'elasticnet__l1_ratio': 0.5, 'pca__n_components': 31}
Rating regressor ('svr', SGDRegressor())
test R2 0.009750849080299773 adjusted R2 -0.9033360303391642 	 params {'pca__n_components': 27, 'svr__alpha': 0.1601, 'svr__loss': 'squared_loss', 'svr__max_iter': 10000, 'svr__penalty': 'elasticnet'}
END



## Eksperyment 2
### Dane
krótsze niż 1200ms, 2k-25k ms, częstsze niż 3000, wszystkie

### Przebieg
Znajdujemy najlepszy klasyfikator dla każdego typu zasobów

### Output eksperymentu
Klasyfikatory i ich parametry

In [241]:
# TODO

## Eksperyment 3

In [242]:
# TODO