In [1]:
from util import *
from data_preparation import *
from hyperopt import hp
from hyperopt.pyll import scope
from xgboost_util import *

In [2]:
df = load_preprocessed_dataset(remove_duplicates=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1638 entries, 1909 to 768
Data columns (total 31 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   storageRegion        1638 non-null   object        
 1   storageProvider      1638 non-null   object        
 2   functionId           1638 non-null   object        
 3   functionName         1638 non-null   object        
 4   functionType         1638 non-null   object        
 5   RTT                  1638 non-null   float64       
 6   loopCounter          1638 non-null   float64       
 7   maxLoopCounter       1638 non-null   float64       
 8   startTime            1638 non-null   datetime64[ns]
 9   endTime              1638 non-null   datetime64[ns]
 10  upAll                1638 non-null   float64       
 11  downAll              1638 non-null   float64       
 12  numberDownloadFiles  1638 non-null   int64         
 13  sizeDownloadInMB     1638 non-null  

In [21]:
input_cols = get_function_related_cols() + get_storage_related_cols()
output_col = 'RTT'
group_col = 'kFoldGroupEnc'

In [22]:
X_train, y_train, groups_train, X_test, y_test, _, df_test = train_test_split_with_criterion(lambda x: (x['wfType'] == 'bwa' and x['functionProvider'] == 'AWS'), df, input_cols, output_col, group_col)

Plain Model

In [23]:
model = XGBRegressor()
model.fit(X_train, y_train)

predictions = model.predict(X_test)

print_metrics(y_test, predictions)

RMSE: 1.890 
MAE: 1.400 
MAPE: 0.124


Hyperparameter Tuning

In [11]:
space = hp.choice('classifier_type', [
    {
        'booster': 'gbtree',
        'max_depth': scope.int(hp.quniform('max_depth', 3, 18, 1)),
        'gamma': hp.uniform('gamma', 1, 9),
        'eta': hp.uniform('eta', 0.2, 0.5),
        'reg_alpha': hp.uniform('reg_alpha', 0, 4),
        'reg_lambda': hp.uniform('reg_lambda', 0, 4),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1),
        'min_child_weight': scope.int(hp.quniform('min_child_weight', 0, 10, 1)),
    },
    #{ overfitting hell
    #    'booster': 'gblinear',
    #    'reg_lambda': hp.uniform('lin_reg_lambda', 0, 4),
    #    'reg_alpha': hp.uniform('lin_reg_alpha', 0, 4),
    #},
    {
        'booster': 'dart',
        'max_depth': scope.int(hp.quniform('max_depth_', 3, 18, 1)),
        'gamma': hp.uniform('gamma_', 1, 9),
        'eta': hp.uniform('eta_', 0.2, 0.5),
        'sample_type': hp.choice('sample_type', ['uniform', 'weighted']),
        'rate_drop': hp.uniform('rate_drop', 0, 1),
        'one_drop': hp.choice('one_drop', [0, 1]),
        'skip_drop': hp.uniform('skip_drop', 0, 1)
    }
])
opt_params = invoke_hyperopt(space, X_train, y_train, groups_train, num_tries=100)
opt_params

{'booster': ['dart'], 'eta': [0.4191075130890015], 'gamma': [6.174348587104987], 'max_depth': [12], 'one_drop': [1], 'rate_drop': [0.06887474126065463], 'sample_type': ['weighted'], 'skip_drop': [0.22974109566472978]}
{'booster': ['gbtree'], 'colsample_bytree': [0.8865997020108393], 'eta': [0.3032149225644467], 'gamma': [7.586262467552712], 'max_depth': [10], 'min_child_weight': [2], 'reg_alpha': [0.25515627334785584], 'reg_lambda': [0.8785741765024575]}
{'booster': ['gbtree'], 'colsample_bytree': [0.6311288953770593], 'eta': [0.41994698427454474], 'gamma': [2.69459186133526], 'max_depth': [5], 'min_child_weight': [8], 'reg_alpha': [0.30292207346415534], 'reg_lambda': [1.777599804005693]}
{'booster': ['dart'], 'eta': [0.389997314901093], 'gamma': [6.029405690821896], 'max_depth': [6], 'one_drop': [1], 'rate_drop': [0.5567183556403015], 'sample_type': ['weighted'], 'skip_drop': [0.45963478545539016]}
{'booster': ['gbtree'], 'colsample_bytree': [0.760831787208631], 'eta': [0.278180771669

{'booster': 'dart',
 'eta': 0.29436931168604835,
 'gamma': 6.371289473173438,
 'max_depth': 6,
 'one_drop': 0,
 'rate_drop': 0.8163974793036589,
 'sample_type': 'uniform',
 'skip_drop': 0.036468243999770014}

In [10]:
model = XGBRegressor(**get_ideal_params())
model.fit(X_train, y_train)

predictions = model.predict(X_test)

print_metrics(y_test, predictions)

RMSE: 1.435 
MAE: 1.066 
MAPE: 0.105


In [11]:
df_test['prediction'] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['prediction'] = predictions


In [12]:
df_test.to_csv(os.path.join(get_data_dir(), "rq1_xgboost_tuned_predictions.csv"))

In [13]:
model.save_model(os.path.join(get_data_dir(), "rq1_xgboost_tuned_model.json"))