In [2]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import math
from xgboost import XGBRegressor
import sklearn.metrics as s_m
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval
from hyperopt.pyll import scope
from sklearn.model_selection import GridSearchCV
from data_preparation import *
from util import *
from xgboost_util import *

In [3]:
df = load_preprocessed_dataset(remove_duplicates=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1638 entries, 1909 to 768
Data columns (total 31 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   storageRegion        1638 non-null   object        
 1   storageProvider      1638 non-null   object        
 2   functionId           1638 non-null   object        
 3   functionName         1638 non-null   object        
 4   functionType         1638 non-null   object        
 5   RTT                  1638 non-null   float64       
 6   loopCounter          1638 non-null   float64       
 7   maxLoopCounter       1638 non-null   float64       
 8   startTime            1638 non-null   datetime64[ns]
 9   endTime              1638 non-null   datetime64[ns]
 10  upAll                1638 non-null   float64       
 11  downAll              1638 non-null   float64       
 12  numberDownloadFiles  1638 non-null   int64         
 13  sizeDownloadInMB     1638 non-null  

In [5]:
#df[(df['wfType'] == 'bwa') & (df['functionProvider'] == 'GCP') & (df['storageRegion'] == 'northVirginia') & (df['functionName'] == 'bwaMerge') & (df['functionRegion'] == 'centralEurope')]

In [6]:
network_related_cols = ['numberDownloadFiles', 'sizeDownloadInMB', 'numberUploadFiles', 'sizeUploadInMB', 'storageRegionEnc', 'storageProviderEnc']
concurrency_related_cols = ['loopCounter', 'maxLoopCounter']
function_related_cols = ['functionRegionEnc', 'functionProviderEnc', 'functionNameEnc', 'functionTypeEnc', 'wfTypeEnc']
time_related_cols = ['dayofweek', 'timeofday']

input_cols = network_related_cols + function_related_cols
output_col_rtt = 'RTT'
output_col_ct = 'ct'
group_col = 'kFoldGroupEnc'

In [9]:
df_temp = df[(df['functionName'] != 'genomePreparePop') & (df['functionName'] != 'genomeIndividualsMerge')]
X_train, y_train, groups_train, X_test, y_test, _, df_test = train_test_split_with_criterion(lambda x: (x['wfType'] == 'genome' and x['functionProvider'] == 'AWS'), df_temp, input_cols, output_col_rtt, group_col)

In [23]:
#X_train, y_train, group_train, X_test, y_test = train_test_split_with_criterion(lambda x: (x['functionName'] == 'bwaAlnR1' and x['provider'] == 'AWS'), df, input_cols, output_col)

In [24]:
#X_train, y_train, group_train, X_test, y_test = train_test_split_with_criterion(lambda x: x['kFoldGroupEnc'] == 30, df, input_cols, output_col)

In [51]:
def hyperopt_objective_tree(space: dict):
    custom_splitter = CousinCrossValidation.split(X=X_train, y=y_train, groups=groups_train)
    model = XGBRegressor()

    for k, v in space.items():
        space[k] = [v]

    search = GridSearchCV(estimator=model,
                          param_grid=space,
                          scoring="neg_root_mean_squared_error",
                          cv=custom_splitter,
                          verbose=0,
                          return_train_score=False)

    print(space)

    search.fit(X=X_train, y=y_train, groups=groups_train)
    return {'loss': -1.0 * search.best_score_, 'status': STATUS_OK}

In [7]:
space = hp.choice('classifier_type', [
    {
        'booster': 'gbtree',
        'max_depth': scope.int(hp.quniform('max_depth', 3, 18, 1)),
        'gamma': hp.uniform('gamma', 1, 9),
        'eta': hp.uniform('eta', 0.2, 0.5),
        'reg_alpha': hp.uniform('reg_alpha', 0, 4),
        'reg_lambda': hp.uniform('reg_lambda', 0, 4),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1),
        'min_child_weight': scope.int(hp.quniform('min_child_weight', 0, 10, 1)),
    },
    #{ overfitting hell
    #    'booster': 'gblinear',
    #    'reg_lambda': hp.uniform('lin_reg_lambda', 0, 4),
    #    'reg_alpha': hp.uniform('lin_reg_alpha', 0, 4),
    #},
    {
        'booster': 'dart',
        'max_depth': scope.int(hp.quniform('max_depth_', 3, 18, 1)),
        'gamma': hp.uniform('gamma_', 1, 9),
        'eta': hp.uniform('eta_', 0.2, 0.5),
        'sample_type': hp.choice('sample_type', ['uniform', 'weighted']),
        'rate_drop': hp.uniform('rate_drop', 0, 1),
        'one_drop': hp.choice('one_drop', [0, 1]),
        'skip_drop': hp.uniform('skip_drop', 0, 1)
    }
])

trials = Trials()
fmin_result = fmin(fn=hyperopt_objective_tree, space=space, algo=tpe.suggest, max_evals=20, trials=trials)
opt_params = space_eval(space, fmin_result)
opt_params

NameError: name 'hyperopt_objective_tree' is not defined

In [10]:
model = XGBRegressor(**get_ideal_params())
#model = XGBRegressor()
# define model evaluation method
#cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
#scores = cross_val_score(model, x, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score="raise")
# force scores to be positive

model.fit(X_train, y_train)

predictions = model.predict(X_test)

mse = s_m.mean_squared_error(y_test, predictions)
ase = s_m.mean_absolute_error(y_test, predictions)
mape = s_m.mean_absolute_percentage_error(y_test, predictions)

print('Mean MSE: %.3f \nMean ASE: %.3f \nMean APE: %.3f' % (mse, ase, mape))

Mean MSE: 32.417 
Mean ASE: 3.905 
Mean APE: 0.188


In [11]:
df_test['mape'] = np.abs(y_test - predictions) / y_test
df_test['pred'] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['mape'] = np.abs(y_test - predictions) / y_test
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['pred'] = predictions


In [12]:
df_test.sort_values(by="mape", ascending=False)

Unnamed: 0,storageRegion,storageProvider,functionId,functionName,functionType,RTT,loopCounter,maxLoopCounter,startTime,endTime,...,functionNameEnc,functionTypeEnc,wfTypeEnc,dayofweek,timeofday,ct,datatransferTime,kFoldGroupEnc,mape,pred
265,northernEurope,AWS,arn:aws:lambda:eu-west-2:717556240325:function...,genomeSifting,genomeSiftingType,3.906,-1.0,-1.0,2023-04-07 13:10:45.169,2023-04-07 13:10:49.075,...,12,12,1,4,47445,3.111,0.795,49,0.689943,6.600918
248,northernEurope,AWS,arn:aws:lambda:eu-west-2:717556240325:function...,genomeMutualOverlap,genomeMutualOverlapType,3.372,6.0,6.0,2023-04-07 13:08:23.528,2023-04-07 13:08:26.900,...,10,10,1,4,47303,2.569,0.803,41,0.668135,5.624953
249,northernEurope,AWS,arn:aws:lambda:eu-west-2:717556240325:function...,genomeMutualOverlap,genomeMutualOverlapType,3.385,3.0,6.0,2023-04-07 13:08:23.523,2023-04-07 13:08:26.908,...,10,10,1,4,47303,2.540,0.845,41,0.661729,5.624953
250,northernEurope,AWS,arn:aws:lambda:eu-west-2:717556240325:function...,genomeMutualOverlap,genomeMutualOverlapType,3.400,0.0,6.0,2023-04-07 13:08:23.521,2023-04-07 13:08:26.921,...,10,10,1,4,47303,2.577,0.823,41,0.654398,5.624953
287,northernEurope,AWS,arn:aws:lambda:eu-west-2:717556240325:function...,genomeSifting,genomeSiftingType,3.993,-1.0,-1.0,2023-04-07 13:13:15.058,2023-04-07 13:13:19.051,...,12,12,1,4,47595,3.187,0.806,49,0.653122,6.600918
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397,northVirginia,AWS,arn:aws:lambda:eu-west-2:717556240325:function...,genomeSifting,genomeSiftingType,6.888,-1.0,-1.0,2023-04-07 13:26:07.975,2023-04-07 13:26:14.863,...,12,12,1,4,48367,3.228,3.660,49,0.005426,6.850627
236,northernEurope,AWS,arn:aws:lambda:eu-west-2:717556240325:function...,genomeFrequency,genomeFrequencyType,142.327,5.0,6.0,2023-04-07 13:05:50.491,2023-04-07 13:08:12.818,...,7,7,1,4,47150,140.965,1.362,29,0.003959,142.890472
237,northernEurope,AWS,arn:aws:lambda:eu-west-2:717556240325:function...,genomeFrequency,genomeFrequencyType,142.473,3.0,6.0,2023-04-07 13:05:50.490,2023-04-07 13:08:12.963,...,7,7,1,4,47150,141.208,1.265,29,0.002930,142.890472
238,northernEurope,AWS,arn:aws:lambda:eu-west-2:717556240325:function...,genomeFrequency,genomeFrequencyType,142.648,2.0,6.0,2023-04-07 13:05:50.490,2023-04-07 13:08:13.138,...,7,7,1,4,47150,141.184,1.464,29,0.001700,142.890472
