In [1]:
import copy

import torch
import torch.nn as nn
import torch.optim as optim
import tqdm

from data_preparation import *
from util import *

In [2]:
df = load_preprocessed_dataset(remove_duplicates=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1638 entries, 1909 to 768
Data columns (total 31 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   storageRegion        1638 non-null   object        
 1   storageProvider      1638 non-null   object        
 2   functionId           1638 non-null   object        
 3   functionName         1638 non-null   object        
 4   functionType         1638 non-null   object        
 5   RTT                  1638 non-null   float64       
 6   loopCounter          1638 non-null   float64       
 7   maxLoopCounter       1638 non-null   float64       
 8   startTime            1638 non-null   datetime64[ns]
 9   endTime              1638 non-null   datetime64[ns]
 10  upAll                1638 non-null   float64       
 11  downAll              1638 non-null   float64       
 12  numberDownloadFiles  1638 non-null   int64         
 13  sizeDownloadInMB     1638 non-null  

In [3]:
input_cols = get_function_related_cols()
output_col_rtt = 'RTT'
group_col = 'kFoldGroupEnc'

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
def measure(model, X_train, y_train, X_test, y_test):
    torch.manual_seed(0)
    np.random.seed(0)
    model.to(device=device)
    loss_fn = nn.MSELoss()  # mean square error
    optimizer = optim.Adam(model.parameters(), lr=0.0001)

    n_epochs = 500  # number of epochs to run
    batch_size = 10  # size of each batch
    batch_start = torch.arange(0, len(X_train), batch_size)

    # Hold the best model
    best_mse = np.inf  # init to infinity
    best_weights = None
    history = []
    torch_X_train = torch.tensor(X_train, dtype=torch.float32, device=device)
    torch_y_train = torch.tensor(y_train, dtype=torch.float32, device=device).reshape(-1, 1)
    torch_X_test = torch.tensor(X_test, dtype=torch.float32, device=device)
    torch_y_test = torch.tensor(y_test, dtype=torch.float32, device=device).reshape(-1, 1)

    for epoch in range(n_epochs):
        model.train()
        with tqdm.tqdm(batch_start, unit="batch", mininterval=0, disable=True) as bar:
            bar.set_description(f"Epoch {epoch}")
            for start in bar:
                # take a batch
                X_batch = torch_X_train[start:start + batch_size]
                y_batch = torch_y_train[start:start + batch_size]
                # forward pass
                y_pred = model(X_batch)
                loss = loss_fn(y_pred, y_batch)
                # backward pass
                optimizer.zero_grad()
                loss.backward()
                # update weights
                optimizer.step()
                # print progress
                bar.set_postfix(mse=float(loss))
        # evaluate accuracy at end of each epoch
        model.eval()
        y_pred = model(torch_X_test)
        mse = loss_fn(y_pred, torch_y_test)
        mse = float(mse)
        history.append(mse)
        if mse < best_mse:
            best_mse = mse
            best_weights = copy.deepcopy(model.state_dict())

    # restore model and return best accuracy
    y_pred = model(torch_X_test)


    rmse = np.sqrt(s_m.mean_squared_error(torch_y_test.detach().cpu().numpy(),
                                  y_pred.detach().cpu()))

    mae = s_m.mean_absolute_error(torch_y_test.detach().cpu().numpy(), y_pred.detach().cpu().numpy())
    mape = s_m.mean_absolute_percentage_error(torch_y_test.detach().cpu().numpy(),
                                                            y_pred.detach().cpu().numpy())
    return rmse, mae, mape

In [5]:
class NN(nn.Module):

    def append_linear_layer_stack(self, in_size, out_size, activation, dropout):
        new_layer = nn.Sequential(nn.Linear(in_size, out_size),
                                  activation,
                                  nn.Dropout1d(0.1) if dropout else nn.Identity())
        self.layer_stack.append(new_layer)

    def __init__(self, activation, hidden_width=200, hidden_depth = 2, dropout=False, input_dim=len(input_cols)):
        super(NN, self).__init__()

        self.layer_stack = []

        self.append_linear_layer_stack(input_dim, hidden_width, activation, dropout)
        for _ in range(hidden_depth - 1):
            self.append_linear_layer_stack(hidden_width, hidden_width, activation, dropout)

        self.layer_stack.append(nn.Linear(hidden_width, 1))

        self.layers = nn.Sequential(*self.layer_stack)

    def forward(self, x):
        return self.layers.forward(x)


In [12]:
def ablation_study_ann50(input_cols):
    X_train, y_train, groups_train, X_test, y_test, _, df_test = train_test_split_with_criterion(lambda x: (x['wfType'] == 'bwa' and x['functionProvider'] == 'AWS'), df, input_cols, output_col_rtt, group_col)

    rmse, mae, mape = measure(NN(nn.ReLU(), hidden_width=50, hidden_depth=1, dropout=True, input_dim=len(input_cols)),
                              X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
    print('%.2f & %.2f & %.2f \\\\' % (rmse, mae, mape))

def ablation_study_ann10(input_cols):
    X_train, y_train, groups_train, X_test, y_test, _, df_test = train_test_split_with_criterion(lambda x: (x['wfType'] == 'bwa' and x['functionProvider'] == 'AWS'), df, input_cols, output_col_rtt, group_col)

    rmse, mae, mape = measure(NN(nn.Sigmoid(), hidden_width=10, hidden_depth=1, dropout=True, input_dim=len(input_cols)),
                              X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
    print('%.2f & %.2f & %.2f \\\\' % (rmse, mae, mape))

In [7]:
ablation_study_ann50(get_function_related_cols())

10.78 & 8.66 & 1.76 \\


In [8]:
ablation_study_ann50(get_function_related_cols() + get_storage_related_cols())

6.56 & 5.47 & 0.98 \\


In [9]:
ablation_study_ann50(get_function_related_cols() + get_storage_related_cols() + get_concurrency_related_cols())

4.44 & 3.81 & 0.49 \\


In [11]:
ablation_study_ann50(get_function_related_cols() + get_storage_related_cols() + get_time_related_cols())

7.11 & 6.09 & 0.94 \\


In [13]:
ablation_study_ann10(get_function_related_cols())
ablation_study_ann10(get_function_related_cols() + get_storage_related_cols())
ablation_study_ann10(get_function_related_cols() + get_storage_related_cols() + get_concurrency_related_cols())
ablation_study_ann10(get_function_related_cols() + get_storage_related_cols() + get_time_related_cols())

11.22 & 9.02 & 1.89 \\
8.83 & 7.62 & 1.43 \\
9.40 & 8.04 & 1.52 \\
8.34 & 7.23 & 1.39 \\
