# Execution time prediction model

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

In [13]:
def data_preprocessing(data, features_to_use):
    """
    Split the raw benchmark frame into a feature matrix and the two targets.

    Args:
        data: DataFrame holding a 'binary_name' column; its last two columns
            are read positionally as the memory and time targets, in that order.
        features_to_use: Ordered list of column names to keep as features.

    Returns:
        Tuple (features, memory_target, time_target): a new DataFrame with the
        selected columns, plus the second-to-last and last columns of `data`.

    Side effects:
        `data['binary_name']` is rewritten in place so it only holds the part
        after the last '/'.
    """
    import zlib  # local import so this cell does not depend on editing the import cell

    # Keep only the file name of each binary path (mutates the caller's frame).
    data['binary_name'] = data['binary_name'].str.split('/').str[-1]

    # Explicit copy: the encoding below must never write back into `data`.
    features = data.loc[:, features_to_use].copy()

    if features.shape[1] > 0:
        first_name = features.columns[0]
        first_col = features.iloc[:, 0]
        # Only a non-numeric leading column (e.g. a name) needs encoding; numeric
        # columns are left untouched instead of being passed through abs(hash()).
        if not pd.api.types.is_numeric_dtype(first_col):
            # crc32 is stable across interpreter runs, unlike the built-in hash()
            # of a str, which is randomized per process.
            features[first_name] = first_col.map(
                lambda value: zlib.crc32(str(value).encode('utf-8'))
            )

    # Targets are addressed by position: second-to-last and last column.
    memory_target = data.iloc[:, -2]
    time_target = data.iloc[:, -1]
    return features, memory_target, time_target


# Read the raw measurements, shuffle them reproducibly, then split into features and targets
data = pd.read_csv('data/training_data.csv')

# Columns fed to the model, in the order the scaler and the model will see them
features_to_use = [
    'binary_size_bytes',
    'data_section_size_bytes',
    'function_count',
    'high_complexity_functions',
    'linear_memory_bytes',
    'instance_count',
    'resource_count',
    'is_ml_workload',
    'model_file_size',
    'request_payload_size',
]

# frac=1 keeps every row but in random order; the fixed seed makes the order repeatable
df_shuffled = data.sample(frac=1, random_state=42).reset_index(drop=True)
features, memory_target, time_target = data_preprocessing(df_shuffled, features_to_use)


In [14]:
# Preview the first rows of the shuffled frame. By this point data_preprocessing
# has already rewritten `binary_name` in place to the part after the last '/'.
df_shuffled.head()

Unnamed: 0,binary_name,binary_size_bytes,data_section_size_bytes,import_count,export_count,function_count,global_variable_count,type_definition_count,instance_count,resource_count,...,high_complexity_functions,linear_memory_bytes,stack_pointer_offset,total_function_references,is_ml_workload,request_payload_size,model_file_size,payload,memory_kb,task_duration_sec
0,matrix_multiplication_component.cwasm,672200,38,53,102,585,4,694,23,0,...,21,1114112,128,0,0,5954204,0,5954204,102368,1.037427
1,image_classification_resnet_onnx_batch.cwasm,1230768,22,107,211,985,4,1326,41,0,...,51,1114112,6160,0,1,2136542,46782752,2136542,236352,1.496032
2,fibonacci_optimized.cwasm,689248,38,58,110,606,4,714,25,0,...,22,1114112,128,0,0,9,0,40,26256,0.225334
3,matrix_transpose.cwasm,672048,38,53,102,584,4,693,23,0,...,21,1114112,128,0,0,3001035,0,3001035,59184,0.254224
4,fibonacci.cwasm,635224,38,53,102,569,4,678,23,0,...,22,1114112,128,0,0,9,0,13,26800,0.215089


## Utils

In [15]:
def rmse_denormalized(y_true, y_pred_normalized, target_scaler):
    """
    RMSE between targets and predictions after undoing the target scaling.

    Args:
        y_true: Normalized true targets; any 1D array-like (ndarray, list, Series).
        y_pred_normalized: Normalized predictions, same length as `y_true`.
        target_scaler: Object exposing `inverse_transform` on (n, 1) arrays.

    Returns:
        Root mean squared error computed on the inverse-transformed values.
    """
    # np.asarray lets callers pass lists or pandas Series, which have no .reshape
    true_column = np.asarray(y_true).reshape(-1, 1)
    pred_column = np.asarray(y_pred_normalized).reshape(-1, 1)

    # Denormalize predictions and true values
    y_true_denorm = target_scaler.inverse_transform(true_column).ravel()
    y_pred_denorm = target_scaler.inverse_transform(pred_column).ravel()

    # Calculate denormalized RMSE
    return np.sqrt(mean_squared_error(y_true_denorm, y_pred_denorm))

## Training 

In [16]:
# Standardize features and the time target.
# NOTE(review): both scalers are fitted on every row of the dataset, including rows
# that the cross-validation cell later holds out as validation folds — confirm that
# this is acceptable for the reported CV scores.
feature_matrix = features.to_numpy()
time_column = time_target.to_numpy().reshape(-1, 1)  # scalers expect a 2D column

features_scaler = StandardScaler().fit(feature_matrix)
target_scaler = StandardScaler().fit(time_column)

# Normalized design matrix and flat normalized target used by all later cells
X = features_scaler.transform(feature_matrix)
y = target_scaler.transform(time_column).ravel()

print("Training data shape:", X.shape)
print("Target scaler mean:", target_scaler.mean_[0])
print("Target scaler scale:", target_scaler.scale_[0])

Training data shape: (1059, 10)
Target scaler mean: 1.3870363366591123
Target scaler scale: 6.59215729404237


In [17]:
# Hyperparameters of the online regressor, gathered in one place.
# NOTE(review): per the scikit-learn docs `max_iter` only affects `fit`; the cells
# shown here train exclusively through `partial_fit` — confirm it is still wanted.
sgd_params = {
    'loss': 'squared_error',
    'max_iter': 1000,
    'learning_rate': 'adaptive',
    'eta0': 0.01,
    'random_state': 42,
}
sgd_model = SGDRegressor(**sgd_params)

print("SGD model initialized")
print(sgd_model)


SGD model initialized
SGDRegressor(learning_rate='adaptive', random_state=42)


### Dynamic training experiment

In [18]:
def train_model(model, X, y, epochs=1):
    """
    Feed one batch to `model.partial_fit` for a given number of passes.

    Args:
        model: Estimator exposing `partial_fit(X, y)` (an SGDRegressor in this notebook)
        X: Feature rows of the batch (2D array)
        y: Target values aligned with `X` (1D array)
        epochs: Number of times the same batch is handed to `partial_fit`

    Returns:
        The same `model` object that was passed in.
    """
    for _pass in range(epochs):
        # One incremental update over the whole batch per pass
        model.partial_fit(X, y)
    return model

def predict_next_rows(model, rows, y_true=None):
    """
    Evaluate the model on `rows`, producing both a normalized and a denormalized RMSE.

    Args:
        model: Fitted estimator exposing `predict`.
        rows: 2D array of normalized feature rows to score.
        y_true: Normalized targets aligned with `rows`. When omitted, the targets
            are taken from the notebook globals as `y[index+1:index+1+len(rows)]`,
            which is only meaningful while the driver loop that sets `index`
            is running.

    Returns:
        Tuple (normalized_rmse, denormalized_rmse). `(0.0, 0.0)` is returned as a
        placeholder when `rows` is empty.
    """
    if len(rows) == 0:
        return 0.0, 0.0

    if y_true is None:
        # Legacy path: rely on the globals `y` and `index` set by the calling loop
        y_true = y[index+1:index+1+len(rows)]
    y_true = np.asarray(y_true)

    # Predict on normalized data
    y_pred_normalized = model.predict(rows)

    # Calculate normalized RMSE
    normalized_rmse = np.sqrt(mean_squared_error(y_true, y_pred_normalized))

    # Denormalize predictions and true values
    y_true_denorm = target_scaler.inverse_transform(y_true.reshape(-1, 1)).ravel()
    y_pred_denorm = target_scaler.inverse_transform(y_pred_normalized.reshape(-1, 1)).ravel()

    # Calculate denormalized RMSE
    denormalized_rmse = np.sqrt(mean_squared_error(y_true_denorm, y_pred_denorm))

    return normalized_rmse, denormalized_rmse

# Dynamic training experiment: grow the training prefix one row at a time and,
# after each step, score the model on the rows that come after it.
counter = 0
# The loop variable must stay named `index`: predict_next_rows reads the global
# `index` (and `y`) to find the targets that match the rows it is given.
for index in range(1, len(X)):
    # Every step calls partial_fit on the whole prefix X[0:index], so earlier rows
    # are presented to the model again on each iteration.
    sgd_model = train_model(sgd_model, X[0: index], y[0: index])
    counter += 1
    # Evaluation starts at index+1, so row `index` is in neither the training
    # prefix nor the evaluation slice of this step.
    # NOTE(review): confirm that skipping row `index` is intended.
    # On the last iteration the slice is empty and (0.0, 0.0) is returned.
    rmse, denormalized_rmse=predict_next_rows(sgd_model,X[index+1:])
    if counter % 100 == 0:
        # `counter` counts loop iterations (i.e. the prefix length), not the number
        # of partial_fit sample presentations.
        print(f"After {counter} training samples: \n RMSE: {rmse}, Denormalized RMSE: {denormalized_rmse}")


After 100 training samples: 
 RMSE: 1.0059768140157193, Denormalized RMSE: 6.631557392151228
After 200 training samples: 
 RMSE: 1.0619029954057468, Denormalized RMSE: 7.000231576729437
After 300 training samples: 
 RMSE: 1.1245272603661478, Denormalized RMSE: 7.413060581772185
After 400 training samples: 
 RMSE: 1.1601299114353778, Denormalized RMSE: 7.647758857705453
After 500 training samples: 
 RMSE: 1.2569196962761335, Denormalized RMSE: 8.285812343832236
After 600 training samples: 
 RMSE: 1.2625764952861083, Denormalized RMSE: 8.323102852686771
After 700 training samples: 
 RMSE: 1.268253784349954, Denormalized RMSE: 8.360528435199388
After 800 training samples: 
 RMSE: 1.461693107900496, Denormalized RMSE: 9.635710882897714
After 900 training samples: 
 RMSE: 0.6956945100381301, Denormalized RMSE: 4.586127638773092
After 1000 training samples: 
 RMSE: 0.9705139198112929, Denormalized RMSE: 6.397780415453665


## Cross-Validation Evaluation

In [19]:
# Perform 5-fold cross-validation with a fresh SGD model per fold.
# NOTE(review): X and y were standardized with scalers fitted on all rows, so each
# validation fold contributed to the scaling statistics — confirm this is acceptable.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []       # per-fold MSE on inverse-transformed targets
cv_rmse_scores = []  # per-fold RMSE on inverse-transformed targets
for fold, (train_idx, val_idx) in enumerate(kfold.split(X)):
    print("num training samples", len(train_idx))
    print("num validation samples", len(val_idx))
    X_train_fold, X_val_fold = X[train_idx], X[val_idx]
    y_train_fold, y_val_fold = y[train_idx], y[val_idx]

    # Train model (same hyperparameters as the notebook-level sgd_model)
    model_fold = SGDRegressor(
        loss='squared_error',
        max_iter=1000,
        learning_rate='adaptive',
        eta0=0.01,
        random_state=42
    )

    # Use partial_fit for online learning (multiple passes)
    n_passes = 10  # Number of epochs
    for epoch in range(n_passes):
        model_fold.partial_fit(X_train_fold, y_train_fold)

    # Predict on the held-out fold (normalized target space)
    y_pred_fold = model_fold.predict(X_val_fold)

    # Inverse transform to get actual values
    y_val_actual = target_scaler.inverse_transform(y_val_fold.reshape(-1, 1)).ravel()
    y_pred_actual = target_scaler.inverse_transform(y_pred_fold.reshape(-1, 1)).ravel()

    mse_actual = mean_squared_error(y_val_actual, y_pred_actual)
    rmse_actual = rmse_denormalized(y_val_fold, y_pred_fold, target_scaler)
    cv_scores.append(mse_actual)
    cv_rmse_scores.append(rmse_actual)

    print(f"Fold {fold + 1}: MSE={mse_actual:.2f}, RMSE={rmse_actual:.2f}")

print("\nCross-Validation Results:")
print(f"Mean MSE: {np.mean(cv_scores):.2f} ± {np.std(cv_scores):.2f}")
# cv_rmse_scores holds RMSE values, so the summary is labelled RMSE (it used to say MAE)
print(f"Mean RMSE: {np.mean(cv_rmse_scores):.2f} ± {np.std(cv_rmse_scores):.2f}")


num training samples 847
num validation samples 212
Fold 1: MSE=35.85, RMSE=5.99
num training samples 847
num validation samples 212
Fold 2: MSE=0.65, RMSE=0.80
num training samples 847
num validation samples 212
Fold 3: MSE=43.65, RMSE=6.61
num training samples 847
num validation samples 212
Fold 4: MSE=89.59, RMSE=9.47
num training samples 848
num validation samples 211
Fold 5: MSE=42.37, RMSE=6.51

Cross-Validation Results:
Mean MSE: 42.42 ± 28.34
Mean MAE: 5.87 ± 2.81


## Final Model Training


In [21]:
# Final fit: keep updating the notebook-level `sgd_model` on the complete dataset.
# NOTE(review): `sgd_model` has already been updated by the dynamic training
# experiment loop, so this continues from that state rather than from a fresh
# model — confirm that is intended.
print("Training final SGD model on all training data...")

n_passes = 10  # full passes over (X, y) through partial_fit
for epoch in range(n_passes):
    sgd_model.partial_fit(X, y)
print(f"Training completed with {n_passes} epochs!")
print("Model training completed!")

# In-sample predictions, still in the normalized target space
y_train_pred = sgd_model.predict(X)

# Undo the target scaling for both the true values and the predictions
y_column = y.reshape(-1, 1)
y_train_pred_column = y_train_pred.reshape(-1, 1)
y_train_actual = target_scaler.inverse_transform(y_column).ravel()
y_train_pred_actual = target_scaler.inverse_transform(y_train_pred_column).ravel()

# Training-set error on the inverse-transformed values
train_mse = mean_squared_error(y_train_actual, y_train_pred_actual)
rmse = rmse_denormalized(y, y_train_pred, target_scaler)

print("\nTraining Results:")
print(f"MSE: {train_mse:.2f}")
print(f"RMSE: {rmse:.2f}")


Training final SGD model on all training data...
Training completed with 10 epochs!
Model training completed!

Training Results:
MSE: 41.21
RMSE: 6.42


## Save Model


In [22]:
# Save the model and scalers
import os

# All three artifacts go into this directory. The directory that is created must be
# the one the dump paths point into; previously 'model' was created while the files
# were written under 'time_model/', which fails when 'time_model' does not exist.
MODEL_DIR = 'time_model'
os.makedirs(MODEL_DIR, exist_ok=True)

joblib.dump(sgd_model, f'{MODEL_DIR}/time_model_sgd.pkl')
joblib.dump(features_scaler, f'{MODEL_DIR}/scaler_x_time.pkl')
joblib.dump(target_scaler, f'{MODEL_DIR}/scaler_y_time.pkl')



['time_model/scaler_y_time.pkl']