The purpose of this notebook is to build sklearn-like pipeline for transformation

In [1]:
import pandas as pd         
import os.path

N_JOBS = 6
DEBUG=True

In [2]:
def load_data(path="../../data/csv/all_v2.csv"):
    dataframe = pd.read_csv(path, index_col=0)
    return dataframe.loc[~dataframe["execTimeMs"].isnull()]

In [3]:
def prepare_dataframe(dataframe):
    output = dataframe.dropna(axis="columns")
    targets = output["execTimeMs"]
    dropped = output[["command", "execTimeMs", "jobId", "ctime_mean", "ctime_max", "ctime_sum", "read_sum","write_sum","readSyscalls_sum","writeSyscalls_sum","readReal_sum","writeReal_sum","writeCancelled_sum","rxBytes_sum","rxPackets_sum","rxErrors_sum","rxDrop_sum","rxFifo_sum","rxFrame_sum","rxCompressed_sum","rxMulticast_sum","txBytes_sum","txPackets_sum","txErrors_sum","txDrop_sum","txFifo_sum","txColls_sum","txCarrier_sum","txCompressed_sum","cpu_mean","cpu_max","memory_mean","memory_max"]]
    features = output.drop(dropped.columns, axis=1)
    return features, targets, dropped

In [4]:
features, targets, dropped = prepare_dataframe(load_data())

In [5]:
features.dtypes

workflowName              object
size                     float64
executable                object
args                      object
inputs                    object
outputs                   object
name                      object
cpu.manufacturer          object
cpu.brand                 object
cpu.speed                float64
cpu.cores                  int64
cpu.physicalCores          int64
cpu.processors             int64
mem.total                  int64
mem.free                   int64
mem.used                   int64
mem.active                 int64
mem.available              int64
mem.buffers                int64
mem.cached                 int64
mem.slab                   int64
mem.buffcache              int64
mem.swaptotal              int64
mem.swapused               int64
mem.swapfree               int64
total_cpus               float64
avg_cpus                 float64
avg_pods                 float64
total_ram_available        int64
average_ram_available    float64
dtype: obj

# Preprocessing flow

In [6]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector as selector

In [7]:
def vectorize_list(series):
    def vectorize(list_string):
        return len(eval(list_string))
    return np.vectorize(vectorize)(series)

def ListTransformer():
    return FunctionTransformer(func=vectorize_list)

In [8]:
list_transformer = Pipeline(steps=[("list", ListTransformer()), ("scaler", StandardScaler())])
list_features = list(['args', 'inputs', 'outputs'])

numerical_transformer = StandardScaler()
numerical_features = list(features.select_dtypes(include="number").columns)

categorical_transformer = OneHotEncoder(sparse=False, handle_unknown = "ignore")
categorical_features = list(set(features.select_dtypes(include="object").columns) ^ set(list_features))

def make_classifying_preprocessor(additional_features=["read_sum", "write_sum", "cpu_mean", "memory_max"]):
    external_features = categorical_features + additional_features
    return ColumnTransformer(
            transformers=[('lists', list_transformer, list_features), 
                          ('num', numerical_transformer, numerical_features),
                          ('cat', categorical_transformer, external_features)])

def make_regression_preprocessor(additional_features=["read_sum", "write_sum", "cpu_mean", "memory_max"]):
    external_features = numerical_features + additional_features
    return ColumnTransformer(
        transformers=[
            ('lists', list_transformer, list_features),            
            ('num', numerical_transformer, external_features),  
            ('cat', categorical_transformer, categorical_features)
        ])

preprocessor = make_classifying_preprocessor(additional_features=[])

In [9]:
from scipy.stats import percentileofscore
import math

def calculate_quantile_rank(labels, label):
    return percentileofscore(labels, label) / 100

def calculate_utilization_class(labels, label):
    def label_for_rank(rank):
        if rank > 0.75:
            return 'very high'
        elif rank > 0.5:
            return 'high'
        elif rank > 0.25:
            return 'medium'
        else:
            return 'low'
    return label_for_rank(calculate_quantile_rank(labels, label))

def calculate_utilization_bucket(labels, label, num_buckets):
    bucket_size = 1.0 / num_buckets
    def bucket_for_rank(rank):
        return str(math.floor(rank / bucket_size))
    return bucket_for_rank(calculate_quantile_rank(labels, label))

# Pipeline composition (with PCA)

In [10]:
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.linear_model import Lasso, SGDRegressor, ElasticNet, LinearRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.dummy import DummyRegressor
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, GridSearchCV, HalvingGridSearchCV
from sklearn.kernel_ridge import KernelRidge

In [11]:
base_steps = [('pca', PCA(random_state=42))]
dummy_pipeline = Pipeline(steps=base_steps +[('dummy', DummyRegressor())])
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.2, random_state=0)

In [12]:
pca_param_grid = {
    'pca__n_components': np.arange(1, 50, 1),    
}
knn_param_grid = {
    'knn__n_neighbors': np.arange(1, 30, 3),
}
regressor = ('knn', KNeighborsRegressor())
full_pipeline = Pipeline(steps= base_steps + [regressor])
grid_search = HalvingGridSearchCV(full_pipeline, {**knn_param_grid, **pca_param_grid}, cv=2, verbose=2, scoring="r2", n_jobs=-1)

In [13]:
def rae(actual, predicted):
    """ Relative Absolute Error (aka Approximation Error) """
    EPSILON=1e-10
    return np.sum(np.abs(actual - predicted)) / (np.sum(np.abs(actual - np.mean(actual))) + EPSILON)

In [14]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, make_scorer

In [15]:
rae_scorer=make_scorer(rae, greater_is_better=False)

In [16]:
def calculate_regression_score(true, pred, scores=[r2_score, mean_absolute_error, mean_absolute_percentage_error, rae]):
    executor = get_reusable_executor(max_workers=4)
    results = executor.map(lambda fun: fun(true, pred), scores)
    return results

In [17]:
from loky import get_reusable_executor

def rate_regressor(X_train, y_train, X_test, y_test, regressor, regressor_params, verbose=10, aggressive_elimination=True, steps=base_steps, scoring="r2"):
    if DEBUG:
        print(f"Rating {regressor}")
    full_pipeline = Pipeline(steps= base_steps + [regressor])
    vector_length = min(X_train.shape[0], X_train.shape[1])
    pca_param_grid = {'pca__n_components': np.arange(1, vector_length, 1),}
    grid_search = HalvingGridSearchCV(full_pipeline, {**pca_param_grid, **regressor_params}, cv=2, verbose=verbose, scoring=scoring, n_jobs=N_JOBS)
    if DEBUG:
        print("Evaluating grid search")
    grid_search.fit(X_train, y_train)
    
    # scores
    if DEBUG:
        print("Predicting on test set")
    prediction = grid_search.best_estimator_.predict(X_test)
    
    if DEBUG:
        print("Calculating scores")
    r2, mae, mape, rae = calculate_regression_score(y_test, prediction)
    if DEBUG:
        print("Calculated scores on test set")
    adjusted_r2 = 1 - (1-r2)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)
    return {"r2": r2, "adjusted_r2": adjusted_r2, "mae": mae, "mape": mape, "rae": rae,"best_score": grid_search.best_score_, "params": grid_search.best_params_}

# Here go regressor params

In [18]:
knn = ("knn", KNeighborsRegressor())
knn_params = {'knn__n_neighbors': np.arange(1, 30, 1)}

lasso = ("lasso", Lasso(random_state=5))
lasso_params = {"lasso__alpha": np.arange(0.01, 1, 0.05)}

mlp_regressor = ("mlp", MLPRegressor(random_state=5))
mlp_params = {"mlp__activation": ["relu", "logistic"], "mlp__hidden_layer_sizes": [(100,), (100, 50,)]}

dtr = ("dtr", DecisionTreeRegressor(random_state=5))
dtr_params = {"dtr__criterion": ["mse", "friedman_mse", "mae", "poisson"], "dtr__max_depth": [5, 10, 15, 25]}

en = ("elasticnet", ElasticNet(random_state=5))
en_params = {"elasticnet__alpha": np.arange(0.01, 1, 0.05), "elasticnet__l1_ratio": np.arange(0, 1, 0.1)}

svr = ("svr", SGDRegressor())
svr_params = {"svr__loss": ["squared_loss", "huber", "epsilon_insensitive"], "svr__penalty": ['l2', 'l1', 'elasticnet'],
             "svr__alpha": np.arange(0.0001, 0.2, 0.01), "svr__max_iter": [10000]}

In [19]:
import warnings
warnings.filterwarnings('ignore')

In [20]:
def rate_data(features, targets, regressors, verbose=10, pipeline_steps=base_steps):
    X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.3, random_state=0)
    df = pd.DataFrame(columns=["name", "pca", "adjusted_r2","r2", "mae", "mape", "rae", "best_score", "params"])
    for (regressor, params) in regressors:
        result = rate_regressor(X_train, y_train, X_test, y_test, regressor, params, verbose, pipeline_steps)
        df = df.append({"name": regressor[0], **result, "pca": result["params"]["pca__n_components"]}, ignore_index=True)
    return df

In [21]:
def rate_data_explicit(X_train, X_test, y_train, y_test, regressors, verbose=10, pipeline_steps=base_steps, scoring="r2"):
    df = pd.DataFrame(columns=["name", "pca", "adjusted_r2","r2", "mae", "mape", "best_score", "params"])
    for (regressor, params) in regressors:
        result = rate_regressor(X_train, y_train, X_test, y_test, regressor, params, verbose, pipeline_steps, scoring=scoring)
        df = df.append({"name": regressor[0], **result, "pca": result["params"]["pca__n_components"]}, ignore_index=True)
    return df

In [22]:
def rate_classifiers_for_data(features, targets, classifiers, verbose=10, pipeline_steps=base_steps):
    X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.3, random_state=0)
    df = pd.DataFrame(columns=["name", "pca", "accuracy","balanced_accuracy", "f1_micro", "f1_macro", "params"])
    for (classifier, params) in classifiers:
        result = rate_classifier(X_train, y_train, X_test, y_test, classifier, params, verbose, pipeline_steps)
        df = df.append({"name": classifier[0], **result, "pca": result["params"]["pca__n_components"]}, ignore_index=True)
    return df

In [23]:
def rate_dataset(dataframe, regressors, verbose=2):
    print(f"Rating dataset of len {len(dataframe)}")
    features, targets, _ = prepare_dataframe(dataframe[:10000])
    features = preprocessor.fit_transform(features)
    rate_data(features, targets, regressors, verbose)

In [24]:
basic_regressors = [
    (knn, knn_params),
    (dtr, dtr_params),
    (lasso, lasso_params),
    (mlp_regressor, mlp_params),
    (en, en_params),
    (svr, svr_params),
]

In [25]:
def simple_experiment():
    print("Rating jobs datasets")
    for dataset in dfs_for_jobs:
        print(dataset.iloc[0]["name"])
        rate_dataset(dataset, basic_regressors)

    print("Rating common datasets")
    for dataset in datasets:
        rate_dataset(dataset, basic_regressors)

In [26]:
# rate_dataset(dfs_for_jobs[8], verbose=0)
# rate_dataset(dfs_for_jobs[1], basic_regressors, verbose=0)

## Eksperyment 4

### Cel

Zmierzyć skuteczności najlepszych pipelinów dla każdego joba, zobaczyć czy warto schodzić w dół pod wzgledem błędów

### Dane

In [27]:
full_df = load_data().dropna(axis="columns")
raw_datasets = { x:pd.DataFrame(y) for x, y in full_df.groupby('name', as_index=False)}
datasets_split = {x:train_test_split(df, random_state=0, train_size=0.75) for x,df in raw_datasets.items()}

In [28]:
def get_numerical_pipeline_data_big(data, resources=["read_sum", "write_sum", "cpu_mean", "memory_max"]):
    """
    Raw data enhanced with resource utilization quantile scores, but scores are assigned - not predicted
    """
    features, labels, dropped = prepare_dataframe(data)
    for resource in resources:
        features[resource] = dropped[resource].map(lambda value: calculate_quantile_rank(dropped[resource], value))
    features = make_regression_preprocessor(resources).fit_transform(features)
    return pd.DataFrame(features, index=labels.index), labels

In [29]:
def get_symbolic_regression_data(data, resources=["read_sum", "write_sum", "cpu_mean", "memory_max"]):
    features, labels, dropped = prepare_dataframe(data)
    for resource in resources:
        features[resource] = dropped[resource].map(lambda value: calculate_quantile_rank(dropped[resource], value))
    for list_feature in list_features:
        features[list_feature] = features[list_feature].map(lambda list_val: vectorize_list(list_val))
    return pd.DataFrame(features, index=labels.index), labels

In [30]:
def get_categorical_pipeline_data_big(data, resources=["read_sum", "write_sum", "cpu_mean", "memory_max"]):
    features, labels, dropped = prepare_dataframe(data)
    for resource in resources:
        features[resource] = dropped[resource].map(lambda value: calculate_utilization_bucket(dropped[resource], value, num_buckets=8))
    features = make_classifying_preprocessor(resources).fit_transform(features)
    return pd.DataFrame(features, index=labels.index), labels

In [31]:
full_df.name.unique().shape

(28,)

### Przebieg


trenujemy pipeline ogólny, liczymy jego skuteczności dla każdego typu jobów. Z eksperymentu 1. - najlepszy pipeline to był:

dtr, mae, pca 71

dla każdego typu jobów trenujemy dla niego pipeline, liczymy skuteczności

In [32]:
train, test = datasets_split['add_replace']

In [33]:
def run_experiment4():
    exp4_resources = ["read_sum", "write_sum", "cpu_max", "cpu_mean", "memory_mean", "memory_max"]
        
    big_regressor = Pipeline([ # test pipeline
        ('pca', PCA(random_state=42, n_components=71)),
#         ('knn', KNeighborsRegressor(n_neighbors=11))
        ('dtr', DecisionTreeRegressor(criterion="mae", max_depth=15))
    ])
    print("Preparing data for big regressor")
    X, y = get_numerical_pipeline_data_big(full_df, exp4_resources)
    train_indices = np.concatenate([train.index for (_, (train, _)) in datasets_split.items()])
    test_indices = np.concatenate([test.index for (_, (_, test)) in datasets_split.items()])
    print(f"{len(X)} {len(y)} {len(train_indices)} {len(test_indices)}")
    X_train, X_test, y_train, y_test = X.loc[train_indices], X.loc[test_indices], y.loc[train_indices], y.loc[test_indices]
    print(f"Making split with test as {len(test_indices)/(len(test_indices) + len(train_indices))} of dataset")
    
    print("Training big regressor with train data")
    big_regressor.fit(X_train, y_train)
    
    print("Predicting with big regressor on test data")
    y_predicted = big_regressor.predict(X_test)
    
    print("Rating big regressor's overall performance")
    [r2, mae, mape, rae_score] = calculate_regression_score(y_test, y_predicted, [r2_score, mean_absolute_error, mean_absolute_percentage_error, rae])
    print(f"Scores for big regressor:")
    print(f"R2: {r2}")
    print(f"MAE: {mae}")
    print(f"MAPE: {mape}")
    print(f"RAE: {rae_score}")
    print(f"Params: {big_regressor}")
    dataframes = []
    for (job, (train, test)) in datasets_split.items():
        print(f"Comparing big regressor vs local regressor for job {job}")
        print("Preparing data for local regressor")
        joint_df = pd.concat([train, test])
        local_X, local_y = get_numerical_pipeline_data_big(joint_df, exp4_resources)
        print(f"{len(local_X)} {len(train.index)} {len(test.index)}")
        local_X_train, local_X_test, local_y_train, local_y_test = local_X.loc[train.index], local_X.loc[test.index], local_y.loc[train.index], local_y.loc[test.index]
        
        print(f"Rating local regressors for job {job}")
        regressor_df = rate_data_explicit(local_X_train, local_X_test, local_y_train, local_y_test, basic_regressors, verbose=0)
        print(regressor_df.head())
        print("Preparing data for big regressor")
        local_X_test, local_y_test = X.loc[test.index], y.loc[test.index]
        
        print(f"Rating big regressor for job {job}")
        local_predicted = big_regressor.predict(local_X_test)
        [r2, mae, mape, rae_score] = calculate_regression_score(local_predicted, local_y_test, [r2_score, mean_absolute_error, mean_absolute_percentage_error, rae])
        regressor_df = regressor_df.append({"name": "big", **{"r2": r2, "mae": mae, "mape": mape, "rae": rae_score}, "pca": "xD"}, ignore_index=True)
        regressor_df["job"] = job
        regressor_df["size"] = len(joint_df)
        dataframes.append(regressor_df)
        
    return pd.concat(dataframes)

In [34]:
rerun_4df =run_experiment4()

Preparing data for big regressor
111329 111329 83489 27840
Making split with test as 0.25006961348795015 of dataset
Training big regressor with train data
Predicting with big regressor on test data
Rating big regressor's overall performance
Scores for big regressor:
R2: 0.9687782232288124
MAE: 1756.4208404884564
MAPE: 0.21083831447114107
RAE: 0.06772082275064974
Params: Pipeline(steps=[('pca', PCA(n_components=71, random_state=42)),
                ('dtr', DecisionTreeRegressor(criterion='mae', max_depth=15))])
Comparing big regressor vs local regressor for job add_replace
Preparing data for local regressor
427 320 107
Rating local regressors for job add_replace
Rating ('knn', KNeighborsRegressor())
Evaluating grid search
Predicting on test set
Calculating scores
Calculated scores on test set
Rating ('dtr', DecisionTreeRegressor(random_state=5))
Evaluating grid search
Predicting on test set
Calculating scores
Calculated scores on test set
Rating ('lasso', Lasso(random_state=5))
Evaluat

In [37]:
rerun_4df.loc[rerun_4df.job == "add_replace"]

Unnamed: 0,name,pca,adjusted_r2,r2,mae,mape,best_score,params,rae,job,size
0,knn,26,-0.063188,0.330586,83.247706,0.120842,0.400444,"{'knn__n_neighbors': 1, 'pca__n_components': 26}",0.675916,add_replace,427
1,dtr,34,-0.404418,0.115737,98.256881,0.145331,-0.554401,"{'dtr__criterion': 'friedman_mse', 'dtr__max_d...",0.79778,add_replace,427
2,lasso,39,-0.03592,0.347754,87.358241,0.139217,0.345022,"{'lasso__alpha': 0.9600000000000001, 'pca__n_c...",0.70929,add_replace,427
3,mlp,25,-0.153411,0.273778,99.123728,0.162312,-1.559842,"{'mlp__activation': 'relu', 'mlp__hidden_layer...",0.804818,add_replace,427
4,elasticnet,24,-0.027851,0.352835,92.478178,0.151441,0.536394,"{'elasticnet__alpha': 0.41000000000000003, 'el...",0.750861,add_replace,427
5,svr,20,-0.084659,0.317066,92.81485,0.150815,0.387225,"{'pca__n_components': 20, 'svr__alpha': 0.1901...",0.753594,add_replace,427
6,big,xD,,0.983431,291.91358,0.211504,,,0.05804,add_replace,427


In [35]:
rerun_4df.to_csv("rerun4df.csv")

In [47]:
def run_experiment4_numerical_rae():
    exp4_resources = ["read_sum", "write_sum", "cpu_max", "cpu_mean", "memory_mean", "memory_max"]
        
    
    train_indices = np.concatenate([train.index for (_, (train, _)) in datasets_split.items()])
    test_indices = np.concatenate([test.index for (_, (_, test)) in datasets_split.items()])
    dataframes = []
    for (job, (train, test)) in datasets_split.items():
        print("Preparing data for local regressor")
        joint_df = pd.concat([train, test])
        local_X, local_y = get_numerical_pipeline_data_big(joint_df, exp4_resources)
        print(f"{len(local_X)} {len(train.index)} {len(test.index)}")
        local_X_train, local_X_test, local_y_train, local_y_test = local_X.loc[train.index], local_X.loc[test.index], local_y.loc[train.index], local_y.loc[test.index]
        
        print(f"Rating local regressors for job {job}")
        regressor_df = rate_data_explicit(local_X_train, local_X_test, local_y_train, local_y_test, basic_regressors, verbose=0, scoring=rae_scorer)
        print(regressor_df.head())
        regressor_df["job"] = job
        regressor_df["size"] = len(joint_df)
        dataframes.append(regressor_df)
        
    return pd.concat(dataframes)

In [48]:
def load_or_run(file, runner):
    if not os.path.isfile(file):
        print(f"Running experiment {file}")
        dataframe = runner()
        dataframe.to_csv(file)
    else:
        dataframe = pd.read_csv(file).round(2)
    return dataframe

In [49]:
N_JOBS=8

In [41]:
exp4_r2_df_incomplete = load_or_run("data/exp4_optimize_r2.csv", run_experiment4)
exp4_rae_incomplete = load_or_run("data/exp4_optimize_rae.csv", run_experiment4_numerical_rae)
# exp4_r2_df_memory_intensive = load_or_run("data/exp4_optimize_r2_memint.csv", run_experiment4_memory_intensive)
# exp4_r2_categorical_incomplete = load_or_run("data/exp4_optimize_r2_categorical.csv", run_experiment4_categorical)

In [None]:
table_df = exp4_r2_df_incomplete.drop(columns=["Unnamed: 0"])
for job in raw_datasets:
    job_df = table_df.loc[table_df.job == "sort_sam"].sort_values("mape", ascending=True)
    print(job_df.iloc[0]["mape"])

In [39]:
def export_for_symbolic_regression():
    exp4_resources = ["read_sum", "write_sum", "cpu_max", "cpu_mean", "memory_mean", "memory_max"]
            
    train_indices = np.concatenate([train.index for (_, (train, _)) in datasets_split.items()])
    test_indices = np.concatenate([test.index for (_, (_, test)) in datasets_split.items()])
    for (job, (train, test)) in datasets_split.items():
        print(f"Preparing data for symbolic regressor: {job}, {len(train) + len(test)}")
        joint_df = pd.concat([train, test])
        local_X, local_y = get_symbolic_regression_data(joint_df, exp4_resources)
        local_X_train, local_X_test, local_y_train, local_y_test = local_X.loc[train.index], local_X.loc[test.index], local_y.loc[train.index], local_y.loc[test.index]
        
        joint_train = local_X_train.join(local_y_train)
        joint_test = local_X_test.join(local_y_test)

        joint_train.to_csv(f"data/symbolic/{job}_train.csv", sep="\t", index=False)
        joint_test.to_csv(f"data/symbolic/{job}_test.csv", sep="\t", index=False)

In [40]:
def export_for_symbolic_regression_single_step():
    exp4_resources = []
            
    train_indices = np.concatenate([train.index for (_, (train, _)) in datasets_split.items()])
    test_indices = np.concatenate([test.index for (_, (_, test)) in datasets_split.items()])
    for (job, (train, test)) in datasets_split.items():
        print(f"Preparing data for symbolic regressor: {job}, {len(train) + len(test)}")
        joint_df = pd.concat([train, test])
        local_X, local_y = get_symbolic_regression_data(joint_df, exp4_resources)
        local_X_train, local_X_test, local_y_train, local_y_test = local_X.loc[train.index], local_X.loc[test.index], local_y.loc[train.index], local_y.loc[test.index]
        
        joint_train = local_X_train.join(local_y_train)
        joint_test = local_X_test.join(local_y_test)

        joint_train.to_csv(f"data/symbolicRaw/{job}_train.csv", sep="\t", index=False)
        joint_test.to_csv(f"data/symbolicRaw/{job}_test.csv", sep="\t", index=False)

In [41]:
export_for_symbolic_regression()

Preparing data for symbolic regressor: add_replace, 427
Preparing data for symbolic regressor: alignment_to_reference, 427
Preparing data for symbolic regressor: bwa-index, 63
Preparing data for symbolic regressor: combine_variants, 62
Preparing data for symbolic regressor: dedup, 427
Preparing data for symbolic regressor: faidx, 63
Preparing data for symbolic regressor: filtering_indel, 62
Preparing data for symbolic regressor: filtering_snp, 62
Preparing data for symbolic regressor: genotype_gvcfs, 1259
Preparing data for symbolic regressor: haplotype_caller, 8539
Preparing data for symbolic regressor: indel_realign, 427
Preparing data for symbolic regressor: mAdd, 160
Preparing data for symbolic regressor: mBackground, 15340
Preparing data for symbolic regressor: mBgModel, 160
Preparing data for symbolic regressor: mConcatFit, 160
Preparing data for symbolic regressor: mDiffFit, 65323
Preparing data for symbolic regressor: mImgtbl, 157
Preparing data for symbolic regressor: mJPEG, 4

In [42]:
export_for_symbolic_regression_single_step()

Preparing data for symbolic regressor: add_replace, 427
Preparing data for symbolic regressor: alignment_to_reference, 427
Preparing data for symbolic regressor: bwa-index, 63
Preparing data for symbolic regressor: combine_variants, 62
Preparing data for symbolic regressor: dedup, 427
Preparing data for symbolic regressor: faidx, 63
Preparing data for symbolic regressor: filtering_indel, 62
Preparing data for symbolic regressor: filtering_snp, 62
Preparing data for symbolic regressor: genotype_gvcfs, 1259
Preparing data for symbolic regressor: haplotype_caller, 8539
Preparing data for symbolic regressor: indel_realign, 427
Preparing data for symbolic regressor: mAdd, 160
Preparing data for symbolic regressor: mBackground, 15340
Preparing data for symbolic regressor: mBgModel, 160
Preparing data for symbolic regressor: mConcatFit, 160
Preparing data for symbolic regressor: mDiffFit, 65323
Preparing data for symbolic regressor: mImgtbl, 157
Preparing data for symbolic regressor: mJPEG, 4

# Tworzenie modeli zwróconych przez regresję symboliczną

In [43]:
import math

In [180]:
sr_models = {
    "add_replace": lambda row: 1309 + 6.299e-8* row["mem.active"] -2.82e-9 * row["total_ram_available"] - 259.6 * row["cpu.speed"],
    "alignment_to_reference": lambda row: 1095 + 2291*row["cpu_mean"] + 0.882 * math.exp(10.1 * row["read_sum"]),
    "bwa-index": lambda row: 862001 + 6.991e5 * row["write_sum"] + 2.62e-5 * row["mem.slab"],
    "combine_variants": lambda row: 10816 - 6.96 * row["avg_pods"] - 470 * row["avg_cpus"] - 1532 * row["cpu.speed"],
    "dedup": lambda row: 5367 + 1713*math.tan(1.82*row["cpu_mean"] - 0.464) - 2524*row["cpu_max"],
    "faidx": lambda row: 4035 + 1887*row["write_sum"] + 1.26**row["read_sum"],
    "filtering_indel": lambda row: 7525  - 9.58e-8*row["average_ram_available"]-1133*row["cpu.speed"],
    "filtering_snp": lambda row: 5099 + 6.10e-7 * row["mem.slab"] - 226 * row["cpu.speed"] * row["avg_cpus"],
    "genotype_gvcfs": lambda row: 5.25e4 + 16562*row["args"]*row["read_sum"] - 772 * row["size"],
    "haplotype_caller": lambda row: 4.166e4 + 2.8e4 * row["read_sum"]**2,
    "indel_realign": lambda row: 7172 + 1065*row["cpu_mean"] - 1218 * row["cpu.speed"] - 1879 * row["cpu_max"],
    "mAdd": lambda row: 25893 * row["write_sum"] + 478 * (1 if row["workflowName"] == "montage2" else 0) * row["inputs"] * (1 if row["cpu.brand"] == "epyc" else 0) * row["write_sum"] - 7825,
    "mBackground": lambda row: 1187 * (1 if row["workflowName"] == "montage2" else 0) + row["write_sum"] * math.factorial(int(8.12 * row["read_sum"])),
    "mBgModel": lambda row: 4.65e4 * row["read_sum"] * row["memory_mean"] + 3.4e4 * row["size"] * row["write_sum"] - 2.8e4 * row["cpu_mean"],
    "mConcatFit": lambda row: 9.71 * row["inputs"] + 0.805 * (1.34e5) ** row["write_sum"],
    "mDiffFit": lambda row: 3.82e5 * row["cpu_max"] - 1.25e5 - row["avg_pods"] - 2.52e5 * row["cpu_mean"] ** 2,
    "merge_gcvf": lambda row: 1.15e6 + 1.89e4 * row["inputs"] * row["read_sum"] + 1.91e7 * row["write_sum"] ** 2,
    "mImgtbl": lambda row: 149 - (183 / math.log(row["read_sum"] * 1.001)) ,
    "mJPEG": lambda row: 733 - 34 * row["cpu.speed"]** 2,
    "mProject": lambda row: 12324 + 1.68e4 * row["read_sum"] + 1.23e4 ** row["read_sum"],
    "mProjectPP": lambda row: 3102 * row["write_sum"] + 2.37e-8 * row["mem.total"] - 409 - 80.64 * row["cpu.speed"] * row["avg_cpus"],
    "mShrink": lambda row: 67.5 + 500 * row["size"] * row["write_sum"] + 272 * row["size"] ** 2,
    "mViewer": lambda row: 2.55e4*row["write_sum"] - 8373,
    "realign_target_creator": lambda row: 8.19e4 + 2.26e5 * row["write_sum"] ** 2,
    "select_variants_indel": lambda row: 2.173e4 + 8.85e4 * row["read_sum"] - 3.84e4 * row["write_sum"],
    "select_variants_snp": lambda row: 2.29e4+5.01e4*row["read_sum"],
    "seq_dict": lambda row: 6.37e3 + (33.9 * row["total_cpus"]*row["write_sum"]) / row["cpu_mean"],
    "sort_sam": lambda row: 1069 + 1.87 * row["avg_pods"] - 205 * row["cpu.speed"] - 3.35e-11 * row["avg_pods"] * row["total_ram_available"]
}

In [181]:
def n(series, value):
    scaler = StandardScaler()
    scaler.fit(series.to_numpy().reshape(-1, 1))
    return scaler.transform(np.array([value]).reshape(1, -1))[0]

In [182]:
def eval_dataset(job, train, test, resources):
    print(f"Evaluating 0/1: {job}, {len(train) + len(test)}")
    joint_df = pd.concat([train, test])
    local_X, local_y = get_symbolic_regression_data(joint_df, resources)
    local_X_train, local_X_test, local_y_train, local_y_test = local_X.loc[train.index], local_X.loc[test.index], local_y.loc[train.index], local_y.loc[test.index]

    y_pred = np.array([sr_models[job](row) for index, row in local_X_test.iterrows()])
    y_true = local_y_test
    r2, mae, mape, rae = calculate_regression_score(y_true, y_pred)
    adjusted_r2 = 1 - (1-r2)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)
    print(f"Evaluating 1/1: {job}")
    return {"name": "symbolic", "pca": None, "r2": r2, "adjusted_r2": adjusted_r2, "mae": mae, "mape": mape, "rae": rae,"best_score": r2, "params": None, "job": job, "size": len(joint_df)}

In [183]:
def eval_all_d(true, pred, scores=[r2_score, mean_absolute_error, mean_absolute_percentage_error, rae]):
    executor = get_reusable_executor(max_workers=4)
    results = executor.map(lambda fun: fun(true, pred), scores)
    return results

In [184]:
def evaluate_symbolic_regression():
    exp4_resources = ["read_sum", "write_sum", "cpu_max", "cpu_mean", "memory_mean", "memory_max"]
    results_df = pd.DataFrame(columns = ['name', 'pca', 'adjusted_r2', 'r2', 'mae', 'mape','best_score', 'params', 'rae', 'job', 'size'])
    executor = get_reusable_executor(max_workers=12)
    results = executor.map(lambda item: eval_dataset(item[0], item[1][0], item[1][1], exp4_resources), datasets_split.items())
    for result in results:
        results_df = results_df.append(result, ignore_index=True)
    return results_df

In [187]:
symbolic_results = evaluate_symbolic_regression()

In [192]:
exp4_r2_df_incomplete.mape.mean()

0.9436734693877551

### Wyniki

Wykresy:
- najlepsza skuteczność regresora (min mape/rae) vs liczba sampli
- skuteczności 5 najlepszych regresorów dla każdego (grid chart)
- najlepsza skuteczność regresora dla joba vs skuteczność dużego regresora dla tego joba

Odpowiedzi:
- czy zwiększenie granularności jest sensowne?
- czy jest jakiś widoczny próg liczby sampli przy zwiększonej granularności?