In [5]:
from util import *
from data_preparation import *
from hyperopt import hp
from hyperopt.pyll import scope
from xgboost_util import *
import sklearn.neighbors as s_n

In [6]:
df = load_preprocessed_dataset(remove_duplicates=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1638 entries, 1909 to 768
Data columns (total 31 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   storageRegion        1638 non-null   object        
 1   storageProvider      1638 non-null   object        
 2   functionId           1638 non-null   object        
 3   functionName         1638 non-null   object        
 4   functionType         1638 non-null   object        
 5   RTT                  1638 non-null   float64       
 6   loopCounter          1638 non-null   float64       
 7   maxLoopCounter       1638 non-null   float64       
 8   startTime            1638 non-null   datetime64[ns]
 9   endTime              1638 non-null   datetime64[ns]
 10  upAll                1638 non-null   float64       
 11  downAll              1638 non-null   float64       
 12  numberDownloadFiles  1638 non-null   int64         
 13  sizeDownloadInMB     1638 non-null  

In [7]:
input_cols = get_function_related_cols() + get_storage_related_cols()
output_col = 'RTT'
group_col = 'kFoldGroupEnc'

In [8]:
X_train, y_train, groups_train, X_test, y_test, _, df_test = train_test_split_with_criterion(lambda x: (x['wfType'] == 'bwa' and x['functionProvider'] == 'AWS'), df, input_cols, output_col, group_col)

Plain Model

In [9]:
model = s_n.KNeighborsRegressor()
model.fit(X_train, y_train)

predictions = model.predict(X_test)

print_metrics(y_test, predictions)

RMSE: 1.729 
MAE: 1.311 
MAPE: 0.158


Hyperparameter Tuning

In [20]:
def hyperopt_objective_knn(space: dict):
    custom_splitter = CousinCrossValidation.split(X=X_train, y=y_train, groups=groups_train)
    model = s_n.KNeighborsRegressor()

    for k, v in space.items():
        space[k] = [v]

    search = GridSearchCV(estimator=model,
                          param_grid=space,
                          scoring="neg_root_mean_squared_error",
                          cv=custom_splitter,
                          verbose=0,
                          return_train_score=False)

    print(space)

    search.fit(X=X_train, y=y_train, groups=groups_train)
    return {'loss': -1.0 * search.best_score_, 'status': STATUS_OK}

In [25]:
space = {
        'n_neighbors': hp.choice('n_neighbors', [2, 3, 4, 5, 6, 7, 8, 9, 10]),
        'p': hp.choice('p',[1, 2, 3]),
        'weights': hp.choice('weights', ['uniform', 'distance'])
}

trials = Trials()
fmin_result = fmin(fn=hyperopt_objective_knn, space=space, algo=tpe.suggest, max_evals=100, trials=trials)
opt_params = space_eval(space, fmin_result)
opt_params

{'n_neighbors': [3], 'p': [1], 'weights': ['distance']}
{'n_neighbors': [7], 'p': [2], 'weights': ['distance']}
{'n_neighbors': [9], 'p': [3], 'weights': ['distance']}                          
{'n_neighbors': [7], 'p': [1], 'weights': ['distance']}                          
{'n_neighbors': [8], 'p': [3], 'weights': ['uniform']}                           
{'n_neighbors': [5], 'p': [1], 'weights': ['distance']}                          
{'n_neighbors': [4], 'p': [2], 'weights': ['distance']}                          
{'n_neighbors': [3], 'p': [1], 'weights': ['uniform']}                           
{'n_neighbors': [3], 'p': [3], 'weights': ['uniform']}                           
{'n_neighbors': [3], 'p': [3], 'weights': ['distance']}                          
{'n_neighbors': [10], 'p': [1], 'weights': ['uniform']}                           
{'n_neighbors': [10], 'p': [2], 'weights': ['distance']}                          
{'n_neighbors': [3], 'p': [1], 'weights': ['distance']}           

{'n_neighbors': 3, 'p': 1, 'weights': 'distance'}

In [26]:
model = s_n.KNeighborsRegressor(**opt_params)
model.fit(X_train, y_train)

predictions = model.predict(X_test)

print_metrics(y_test, predictions)

RMSE: 1.768 
MAE: 1.405 
MAPE: 0.174


In [11]:
df_test['prediction'] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['prediction'] = predictions


In [12]:
df_test.to_csv(os.path.join(get_data_dir(), "rq1_xgboost_tuned_predictions.csv"))

In [13]:
model.save_model(os.path.join(get_data_dir(), "rq1_xgboost_tuned_model.json"))