# Execution time prediction model

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

In [6]:
def data_preprocessing(data, features_to_use):
    data['binary_name'] = data['binary_name'].str.split('/').str[-1]
    features = data.loc[:, features_to_use]
    features.iloc[:,0] = features.iloc[:,0].apply(lambda x: abs(hash(x)))
    memory_target = data.iloc[:, -2]
    time_target = data.iloc[:, -1]
    return features, memory_target, time_target


# Load and preprocess data
data = pd.read_csv('data/training_data.csv')
features_to_use = [
    'binary_size_bytes','data_section_size_bytes','function_count',
    'high_complexity_functions','linear_memory_bytes','instance_count',
    'resource_count','is_ml_workload','model_file_size','request_payload_size'
]
df_shuffled = data.sample(frac=1, random_state=42).reset_index(drop=True)
features, memory_target, time_target = data_preprocessing(df_shuffled, features_to_use)


In [7]:
df_shuffled.head()

Unnamed: 0,binary_name,binary_size_bytes,data_section_size_bytes,import_count,export_count,function_count,global_variable_count,type_definition_count,instance_count,resource_count,...,avg_local_variables_per_function,high_complexity_functions,linear_memory_bytes,total_function_references,is_ml_workload,request_payload_size,model_file_size,payload,memory_kb,task_duration_sec
0,image_classification_resnet_onnx_batch.cwasm,1230768,22,107,211,985,4,1326,41,0,...,5.3684,51,1114112,0,1,297742,46782752,297742,216992,1.424655
1,matrix_transpose.cwasm,0,38,53,102,584,4,693,23,0,...,4.3864,21,1114112,0,0,1834603,0,1834603,51040,0.315899
2,image_classification_resnet_onnx_batch.cwasm,1230768,22,107,211,985,4,1326,41,0,...,5.3684,51,1114112,0,1,1790942,46782752,1790942,225968,1.63
3,fibonacci_optimized.cwasm,689248,38,58,110,606,4,714,25,0,...,4.4129,22,1114112,0,0,9,0,30,25888,0.228653
4,matrix_multiplication_component.cwasm,672200,38,53,102,585,4,694,23,0,...,4.4189,21,1114112,0,0,1337532,0,1337532,39040,0.337054


## Utils

In [23]:
def rmse_denormalized(y_true, y_pred_normalized, target_scaler):    
    # Denormalize predictions and true values
    y_true_denorm = target_scaler.inverse_transform(y_true.reshape(-1, 1)).ravel()
    y_pred_denorm = target_scaler.inverse_transform(y_pred_normalized.reshape(-1, 1)).ravel()
    
    # Calculate denormalized RMSE
    mse_denorm = mean_squared_error(y_true_denorm, y_pred_denorm)
    denormalized_rmse = np.sqrt(mse_denorm)
    return denormalized_rmse

def rmse(y_true, y_pred_normalized):    
    # Calculate denormalized RMSE
    mse_denorm = mean_squared_error(y_true, y_pred_normalized)
    denormalized_rmse = np.sqrt(mse_denorm)
    return denormalized_rmse

## Training 

In [9]:
# Fit scalers on training data (correct approach)
features_scaler = StandardScaler().fit(features.to_numpy())
target_scaler = StandardScaler().fit(time_target.to_numpy().reshape(-1, 1))

# Transform training data
X = features_scaler.transform(features.to_numpy())
y = target_scaler.transform(time_target.to_numpy().reshape(-1, 1)).ravel()

print("Training data shape:", X.shape)
print("Target scaler mean:", target_scaler.mean_[0])
print("Target scaler scale:", target_scaler.scale_[0])

Training data shape: (1039, 10)
Target scaler mean: 1.6361656818700674
Target scaler scale: 7.386948263760952


In [10]:
# Initialize SGD model
sgd_model = SGDRegressor(
    loss='squared_error',
    max_iter=1000,
    learning_rate='adaptive',
    eta0=0.01,
    random_state=42
)

print("SGD model initialized")
print(sgd_model)


SGD model initialized
SGDRegressor(learning_rate='adaptive', random_state=42)


### Dynamic training experiment

In [11]:
def train_model(model, X, y, epochs=1):
    """
    Train the model on the given batch of rows with partial fit
    Args:
        model: SGDRegressor model
        X: Array of feature rows (2D array)
        y: Array of target values (1D array)
    """
    # Use partial_fit for online learning
    for _ in range(epochs):
        model.partial_fit(X, y)
    return model

def predict_next_rows(model, rows):
    """
    Evaluate the model on the rows producing both an RMSE normalized and denormalized
    """
    if len(rows) == 0:
        return 0.0, 0.0
    
    # Get true values for these rows
    y_true = y[index+1:index+1+len(rows)]
    
    # Predict on normalized data
    y_pred_normalized = model.predict(rows)
    
    # Calculate normalized RMSE
    mse_normalized = mean_squared_error(y_true, y_pred_normalized)
    normalized_rmse = np.sqrt(mse_normalized)
    
    # Denormalize predictions and true values
    y_true_denorm = target_scaler.inverse_transform(y_true.reshape(-1, 1)).ravel()
    y_pred_denorm = target_scaler.inverse_transform(y_pred_normalized.reshape(-1, 1)).ravel()
    
    # Calculate denormalized RMSE
    mse_denorm = mean_squared_error(y_true_denorm, y_pred_denorm)
    denormalized_rmse = np.sqrt(mse_denorm)
    
    return normalized_rmse, denormalized_rmse

counter = 0
for index in range(1, len(X)):
    sgd_model = train_model(sgd_model, X[0: index], y[0: index])
    counter += 1
    rmse, denormalized_rmse=predict_next_rows(sgd_model,X[index+1:])
    if counter % 100 == 0:
        print(f"After {counter} training samples: \n RMSE: {rmse}, Denormalized RMSE: {denormalized_rmse}")


After 100 training samples: 
 RMSE: 0.9923179277010701, Denormalized RMSE: 7.330201193130286
After 200 training samples: 
 RMSE: 0.9643526836907568, Denormalized RMSE: 7.123623382442651
After 300 training samples: 
 RMSE: 0.9863975022619893, Denormalized RMSE: 7.286467316712342
After 400 training samples: 
 RMSE: 1.0402876990828975, Denormalized RMSE: 7.6845514125522865
After 500 training samples: 
 RMSE: 1.125380924976538, Denormalized RMSE: 8.313130669825133
After 600 training samples: 
 RMSE: 1.0981428198193337, Denormalized RMSE: 8.111924196225983
After 700 training samples: 
 RMSE: 1.1733495815932966, Denormalized RMSE: 8.667472654535242
After 800 training samples: 
 RMSE: 1.1364372661345434, Denormalized RMSE: 8.394803289945807
After 900 training samples: 
 RMSE: 1.10981052364153, Denormalized RMSE: 8.198112920717435
After 1000 training samples: 
 RMSE: 1.7379615671444568, Denormalized RMSE: 12.838232180901006


## Cross-Validation Evaluation

In [12]:
# Perform cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
cv_rmse_scores = []
for fold, (train_idx, val_idx) in enumerate(kfold.split(X)):
    print("num training samples", len(train_idx))
    print("num validation samples", len(val_idx))
    X_train_fold, X_val_fold = X[train_idx], X[val_idx]
    y_train_fold, y_val_fold = y[train_idx], y[val_idx]
    
    # Train model
    model_fold = SGDRegressor(
        loss='squared_error',
        max_iter=1000,
        learning_rate='adaptive',
        eta0=0.01,
        random_state=42
)
    
    # Use partial_fit for online learning (multiple passes)
    n_passes = 10  # Number of epochs
    for epoch in range(n_passes):
        model_fold.partial_fit(X_train_fold, y_train_fold)

    
    # Predict and evaluate
    y_pred_fold = model_fold.predict(X_val_fold)
    
    mse = mean_squared_error(y_val_fold, y_pred_fold)
    mae = mean_absolute_error(y_val_fold, y_pred_fold)
    
    # Inverse transform to get actual values
    y_val_actual = target_scaler.inverse_transform(y_val_fold.reshape(-1, 1)).ravel()
    y_pred_actual = target_scaler.inverse_transform(y_pred_fold.reshape(-1, 1)).ravel()
    
    mse_actual = mean_squared_error(y_val_actual, y_pred_actual)
    rmse_actual = rmse_denormalized(y_val_fold, y_pred_fold, target_scaler)
    cv_scores.append(mse_actual)
    cv_rmse_scores.append(rmse_actual)
    
    print(f"Fold {fold + 1}: MSE={mse_actual:.2f}, RMSE={rmse_actual:.2f}")

print(f"\nCross-Validation Results:")
print(f"Mean MSE: {np.mean(cv_scores):.2f} ± {np.std(cv_scores):.2f}")
print(f"Mean MAE: {np.mean(cv_rmse_scores):.2f} ± {np.std(cv_rmse_scores):.2f}")


num training samples 831
num validation samples 208
Fold 1: MSE=25.86, RMSE=5.08
num training samples 831
num validation samples 208
Fold 2: MSE=117.48, RMSE=10.84
num training samples 831
num validation samples 208
Fold 3: MSE=45.64, RMSE=6.76
num training samples 831
num validation samples 208
Fold 4: MSE=44.56, RMSE=6.68
num training samples 832
num validation samples 207
Fold 5: MSE=37.59, RMSE=6.13

Cross-Validation Results:
Mean MSE: 54.22 ± 32.40
Mean MAE: 7.10 ± 1.96


## Final Model Training


In [14]:
# Train final model on all training data
print("Training final SGD model on all training data...")
# Train using partial_fit for online learning
n_passes = 10  # Number of epochs
for epoch in range(n_passes):
    sgd_model.partial_fit(X, y)
print(f"Training completed with {n_passes} epochs!")
print("Model training completed!")

# Evaluate on training data
y_train_pred = sgd_model.predict(X)

# Inverse transform to get actual values
y_train_actual = target_scaler.inverse_transform(y.reshape(-1, 1)).ravel()
y_train_pred_actual = target_scaler.inverse_transform(y_train_pred.reshape(-1, 1)).ravel()

train_mse = mean_squared_error(y_train_actual, y_train_pred_actual)
rmse = rmse_denormalized(y, y_train_pred, target_scaler   )

print(f"\nTraining Results:")
print(f"MSE: {train_mse:.2f}")
print(f"RMSE: {rmse:.2f}")


Training final SGD model on all training data...
Training completed with 10 epochs!
Model training completed!

Training Results:
MSE: 50.43
RMSE: 7.10


## Save Model


In [15]:
# Save the model and scalers
import os
os.makedirs('model', exist_ok=True)

joblib.dump(sgd_model, 'time_model/time_model_sgd.pkl')
joblib.dump(features_scaler, 'time_model/scaler_x_time.pkl')
joblib.dump(target_scaler, 'time_model/scaler_y_time.pkl')



['time_model/scaler_y_time.pkl']

# Testing Data

In [24]:
import joblib

# Load a model
model = joblib.load('time_model/time_model_sgd.pkl')

# Load scalers
scaler_x = joblib.load('time_model/scaler_x_time.pkl')
scaler_y = joblib.load('time_model/scaler_y_time.pkl')

data = pd.read_csv('data/testing_data.csv')
features_to_use = [
    'binary_size_bytes','data_section_size_bytes','function_count',
    'high_complexity_functions','linear_memory_bytes','instance_count',
    'resource_count','is_ml_workload','model_file_size','request_payload_size'
]
df_shuffled = data.sample(frac=1, random_state=42).reset_index(drop=True)
features, memory_target, time_target = data_preprocessing(df_shuffled, features_to_use)

# Transform test data
X = scaler_x.transform(features.to_numpy())
y = scaler_y.transform(time_target.to_numpy().reshape(-1, 1)).ravel()

print("Testing data shape:", X.shape)
print("Testing scaler mean:", scaler_y.mean_[0])
print("Testing scaler scale:", scaler_y.scale_[0])

# Evaluate on training data
y_train_pred = model.predict(X)

# Inverse transform to get actual values
y_train_actual = scaler_y.inverse_transform(y.reshape(-1, 1)).ravel()
y_train_pred_actual = scaler_y.inverse_transform(y_train_pred.reshape(-1, 1)).ravel()

test_mse = mean_squared_error(y_train_actual, y_train_pred_actual)
rmse = rmse(y, y_train_pred)
rmse_denormalized = rmse_denormalized(y, y_train_pred, target_scaler)

print(f"Testing Results:")
print(f"MSE: {test_mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"RMSE Denormalized: {rmse_denormalized:.2f}")

Testing data shape: (150, 10)
Testing scaler mean: 1.6361656818700674
Testing scaler scale: 7.386948263760952
Testing Results:
MSE: 48.86
RMSE: 0.95
RMSE Denormalized: 6.99
