In [1]:

# Optimized TensorFlow imports for faster loading
import os
import joblib
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TensorFlow warnings
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # Disable GPU for faster imports
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'  # Disable optimizations that slow imports

import pandas as pd
import numpy as np
# Import matplotlib only when needed for plotting
# import matplotlib.pyplot as plt

# Optimized TensorFlow imports
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Import only what's needed from sklearn
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error





In [7]:
def data_preprocessing(data):
    data['binary_name'] = data['binary_name'].str.split('/').str[-1]
    features = data.iloc[:, :-1]
    features.iloc[:,0] = features.iloc[:,0].apply(lambda x: abs(hash(x)))
    memory_target = data.iloc[:, -2]
    time_target = data.iloc[:, -1]
    return features, memory_target, time_target

def data_preprocessing_old(data):
    data['binary_name'] = data['binary_name'].str.split('/').str[-1]
    features = data.iloc[:, :-1]
    features.iloc[:,0] = features.iloc[:,0].apply(lambda x: abs(hash(x)))

    target = data.iloc[:, -1]
    return features, target

# Training

In [8]:

# Load and preprocess data
data = pd.read_csv('data/memory_training_data.csv')
features, memory_target, time_target = data_preprocessing(data)

# Fit scalers on training data (correct approach)
features_scaler = StandardScaler().fit(features.to_numpy())
target_scaler = StandardScaler().fit(memory_target.to_numpy().reshape(-1, 1))

# Transform training data
X = features_scaler.transform(features.to_numpy())
y = target_scaler.transform(memory_target.to_numpy().reshape(-1, 1))

print("Training data shape:", X.shape)
print("Target scaler mean:", target_scaler.mean_[0])
print("Target scaler scale:", target_scaler.scale_[0])


Training data shape: (1059, 22)
Target scaler mean: 76030.06610009442
Target scaler scale: 65294.42703903871


In [26]:

def build_model_32_32():
    model = Sequential([
        Dense(32, activation='relu', input_shape=(X.shape[1],)),
        Dense(32, activation='relu'),
        Dense(1)  # output layer for regression
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model


def build_model_32_32_32():
    model = Sequential([
        Dense(32, activation='relu', input_shape=(X.shape[1],)),
        Dense(32, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1)  # output layer for regression
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model


def build_model_64_64():
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X.shape[1],)),
        Dense(64, activation='relu'),
        Dense(1)  # output layer for regression
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model



In [10]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold


def train_model(model):
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    mse_scores_normalized = []
    mse_scores_denormalized = []
    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model.fit(X_train, y_train, epochs=30, batch_size=32, verbose=0)

        # Evaluate on validation set
        y_pred = model.predict(X_val).flatten()
        mse = mean_squared_error(y_val, y_pred)
        mse_scores_normalized.append(mse)

        y_original = target_scaler.inverse_transform(y_val.reshape(-1, 1)).flatten()
        y_pred_original = target_scaler.inverse_transform(y_pred.reshape(-1, 1)).flatten()
        mse = mean_squared_error(y_original, y_pred_original)
        mse_scores_denormalized.append(mse)

    rmse_normalized = np.sqrt(mse_scores_normalized)
    rmse_denormalized = np.sqrt(mse_scores_denormalized)


    print("MSE scores per fold normalized:", mse_scores_normalized)
    print("Average MSE normalized:", np.mean(mse_scores_normalized))
    print("Average RMSE normalized:", np.mean(np.sqrt(rmse_normalized)))

    print("MSE scores per fold denormalized:", mse_scores_denormalized)
    print("Average MSE denormalized:", np.mean(mse_scores_denormalized))
    print("Average RMSE denormalized:", np.mean(np.sqrt(rmse_denormalized)))

    return model


In [11]:
modeL_32_32 = build_model_32_32()
modeL_32_32 = train_model(modeL_32_32)

MSE scores per fold normalized: [0.0007889527649378372, 0.00036701755595519814, 0.00014318278126069915, 6.676668295265827e-05, 0.00013849572887092546]
Average MSE normalized: 0.0003008831027954636
Average RMSE normalized: 0.12285443784605551
MSE scores per fold denormalized: [3363591.4235578575, 1564728.313544939, 610439.9069231141, 284650.7363911035, 590456.193915923]
Average MSE denormalized: 1282773.3148665875
Average RMSE denormalized: 31.392714056980964


In [78]:
modeL_32_32_32 = build_model_32_32_32()
modeL_32_32_32 = train_model(modeL_32_32_32)

MSE scores per fold normalized: [0.0010402829765146825, 0.0010685254869958237, 0.0016953465766645537, 0.0009949916146329337, 0.0015583454213635634]
Average MSE normalized: 0.0012714984152343112
Average RMSE normalized: 0.18791938442104478
MSE scores per fold denormalized: [5965903.668121852, 6127874.85558521, 9722620.803918969, 5706162.568171195, 8936939.812908104]
Average MSE denormalized: 7291900.3417410655
Average RMSE denormalized: 51.713365604732324


In [12]:
modeL_64_64 = build_model_64_64()
modeL_64_64 = train_model(modeL_64_64)

MSE scores per fold normalized: [0.0008439689618510299, 0.0003287063959296532, 0.00013910883427778673, 7.785796768343372e-05, 1.2098889432348258e-05]
Average MSE normalized: 0.0002803482098348504
Average RMSE normalized: 0.1133213952241672
MSE scores per fold denormalized: [3598147.758808856, 1401394.3333074462, 593071.7103568743, 331935.8391115441, 51581.76094445685]
Average MSE denormalized: 1195226.2805058355
Average RMSE denormalized: 28.956756547593198


In [79]:
modeL_64_64 = build_model_64_64()
modeL_64_64 = train_model(modeL_64_64)

MSE scores per fold normalized: [0.0013119422666795723, 0.001168014017426567, 0.0016411668314494797, 0.0012422274683996104, 0.0012820575477329706]
Average MSE normalized: 0.00132908162633764
Average RMSE normalized: 0.19068425902438474
MSE scores per fold denormalized: [7523839.138258648, 6698429.135867881, 9411905.252626376, 7124034.574676393, 7352453.688970384]
Average MSE denormalized: 7622132.358079937
Average RMSE denormalized: 52.47422728393023


## Test Data

In [17]:
test_data = pd.read_csv('data/memory_test_data.csv')
X_test, y_test_memory, y_test_time = data_preprocessing(test_data)

X_test_normalized = features_scaler.transform(X_test.to_numpy())
y_test_normalized = target_scaler.transform(y_test_memory.to_numpy().reshape(-1, 1))

In [19]:
# Train final model on full training data
final_model = build_model_64_64()
final_model.fit(X, y, epochs=30, batch_size=32, verbose=0)

# Make predictions on test set
y_test_pred_normalized = final_model.predict(X_test_normalized).flatten()

# Calculate normalized RMSE
mse_normalized = mean_squared_error(y_test_normalized, y_test_pred_normalized)
rmse_normalized = np.sqrt(mse_normalized)

print(f"Test RMSE (normalized): {rmse_normalized:.6f}")


Test RMSE (normalized): 0.019080


In [21]:
# Calculate denormalized RMSE and diagnose the issue
y_test_original = target_scaler.inverse_transform(y_test_normalized.reshape(-1, 1)).flatten()
y_test_pred_original = target_scaler.inverse_transform(y_test_pred_normalized.reshape(-1, 1)).flatten()

mse_denormalized = mean_squared_error(y_test_original, y_test_pred_original)
rmse_denormalized = np.sqrt(mse_denormalized)

print(f"Test RMSE (normalized): {rmse_normalized:.6f}")
print(f"Test RMSE (denormalized): {rmse_denormalized:.6f}")

# Diagnostic information
print(f"\n=== DIAGNOSTIC INFORMATION ===")
print(f"Target scaler mean: {target_scaler.mean_[0]:.2f}")
print(f"Target scaler scale: {target_scaler.scale_[0]:.2f}")
print(f"Expected denormalized RMSE ≈ {rmse_normalized * target_scaler.scale_[0]:.2f}")

print(f"\nTraining target range: {memory_target.min():.2f} to {memory_target.max():.2f}")
print(f"Test target range: {y_test_memory.min():.2f} to {y_test_memory.max():.2f}")

print(f"\nTraining target mean: {memory_target.mean():.2f}")
print(f"Test target mean: {y_test_memory.mean():.2f}")

# Check if test data is outside training range
test_out_of_range = (y_test_memory.min() < memory_target.min()) or (y_test_memory.max() > memory_target.max())
print(f"Test data outside training range: {test_out_of_range}")

Test RMSE (normalized): 0.019080
Test RMSE (denormalized): 1245.823172

=== DIAGNOSTIC INFORMATION ===
Target scaler mean: 76030.07
Target scaler scale: 65294.43
Expected denormalized RMSE ≈ 1245.82

Training target range: 26240.00 to 236352.00
Test target range: 26288.00 to 230032.00

Training target mean: 76030.07
Test target mean: 79943.74
Test data outside training range: False


## Final Results

Test RMSE (normalized): 0.019080
Test RMSE (denormalized): 1245.823172

Training average RMSE normalized: 0.19068425902438474
Training Average RMSE denormalized: 52.47422728393023

In [24]:
def save_model(model, x_scaler, y_scaler):    
    model.save("model/memory_model.keras")
    joblib.dump(x_scaler, "model/scaler_x.pkl")
    joblib.dump(y_scaler, "model/scaler_y.pkl")


In [25]:
save_model(final_model, features_scaler, target_scaler)