In [1]:

# Optimized TensorFlow imports for faster loading
import os
import joblib
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TensorFlow warnings
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # Disable GPU for faster imports
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'  # Disable optimizations that slow imports

import pandas as pd
import numpy as np
# Import matplotlib only when needed for plotting
# import matplotlib.pyplot as plt

# Optimized TensorFlow imports
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Import only what's needed from sklearn
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error





In [2]:
def data_preprocessing(data):
    data['binary_name'] = data['binary_name'].str.split('/').str[-1]
    features = data.iloc[:, :-1]
    features.iloc[:,0] = features.iloc[:,0].apply(lambda x: abs(hash(x)))
    memory_target = data.iloc[:, -2]
    time_target = data.iloc[:, -1]
    return features, memory_target, time_target

def data_preprocessing_old(data):
    data['binary_name'] = data['binary_name'].str.split('/').str[-1]
    features = data.iloc[:, :-1]
    features.iloc[:,0] = features.iloc[:,0].apply(lambda x: abs(hash(x)))

    target = data.iloc[:, -1]
    return features, target

# Training

In [3]:

# Load and preprocess data
data = pd.read_csv('data/training_data.csv')
features, memory_target, time_target = data_preprocessing(data)

# Fit scalers on training data (correct approach)
features_scaler = StandardScaler().fit(features.to_numpy())
target_scaler = StandardScaler().fit(memory_target.to_numpy().reshape(-1, 1))

# Transform training data
X = features_scaler.transform(features.to_numpy())
y = target_scaler.transform(memory_target.to_numpy().reshape(-1, 1))

print("Training data shape:", X.shape)
print("Target scaler mean:", target_scaler.mean_[0])
print("Target scaler scale:", target_scaler.scale_[0])


Training data shape: (1039, 21)
Target scaler mean: 70735.3532242541
Target scaler scale: 63897.676863767854


In [4]:

def build_model_32_32():
    model = Sequential([
        Dense(32, activation='relu', input_shape=(X.shape[1],)),
        Dense(32, activation='relu'),
        Dense(1)  # output layer for regression
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model


def build_model_32_32_32():
    model = Sequential([
        Dense(32, activation='relu', input_shape=(X.shape[1],)),
        Dense(32, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1)  # output layer for regression
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model


def build_model_64_64():
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X.shape[1],)),
        Dense(64, activation='relu'),
        Dense(1)  # output layer for regression
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model



In [5]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold


def train_model(model):
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    mse_scores_normalized = []
    mse_scores_denormalized = []
    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model.fit(X_train, y_train, epochs=30, batch_size=32, verbose=0)

        # Evaluate on validation set
        y_pred = model.predict(X_val).flatten()
        mse = mean_squared_error(y_val, y_pred)
        mse_scores_normalized.append(mse)

        y_original = target_scaler.inverse_transform(y_val.reshape(-1, 1)).flatten()
        y_pred_original = target_scaler.inverse_transform(y_pred.reshape(-1, 1)).flatten()
        mse = mean_squared_error(y_original, y_pred_original)
        mse_scores_denormalized.append(mse)

    rmse_normalized = np.sqrt(mse_scores_normalized)
    rmse_denormalized = np.sqrt(mse_scores_denormalized)


    print("MSE scores per fold normalized:", mse_scores_normalized)
    print("Average MSE normalized:", np.mean(mse_scores_normalized))
    print("Average RMSE normalized:", np.mean(np.sqrt(rmse_normalized)))

    print("MSE scores per fold denormalized:", mse_scores_denormalized)
    print("Average MSE denormalized:", np.mean(mse_scores_denormalized))
    print("Average RMSE denormalized:", np.mean(np.sqrt(rmse_denormalized)))

    return model


In [6]:
modeL_32_32 = build_model_32_32()
modeL_32_32 = train_model(modeL_32_32)

MSE scores per fold normalized: [0.0009477299459771471, 0.00027440147571780604, 0.0002512071505748038, 0.00025805398463400113, 0.00021089908371480743]
Average MSE normalized: 0.00038845832812371315
Average RMSE normalized: 0.13546204780898727
MSE scores per fold denormalized: [3869500.613999293, 1120356.630235452, 1025656.1397250616, 1053609.966537934, 861085.5268348289]
Average MSE denormalized: 1586041.775466514
Average RMSE denormalized: 34.24208320434995


In [7]:
modeL_32_32_32 = build_model_32_32_32()
modeL_32_32_32 = train_model(modeL_32_32_32)

MSE scores per fold normalized: [0.0007488197519334065, 0.0003492657275182174, 0.00043118573551030614, 0.0001696568661155276, 0.0001444574494359142]
Average MSE normalized: 0.0003686771061026743
Average RMSE normalized: 0.1339978256278141
MSE scores per fold denormalized: [3057370.0879063974, 1426019.6726177656, 1760498.2387144016, 692691.6316181513, 589805.6560459413]
Average MSE denormalized: 1505277.0573805314
Average RMSE denormalized: 33.87195255474952


In [8]:
modeL_64_64 = build_model_64_64()
modeL_64_64 = train_model(modeL_64_64)

MSE scores per fold normalized: [0.0003943105392221005, 0.0004114478049461689, 0.00010141128547494544, 6.439411282995394e-05, 3.6663754209227614e-05]
Average MSE normalized: 0.00020164549933647928
Average RMSE normalized: 0.11021672657826931
MSE scores per fold denormalized: [1609934.9474880327, 1679911.5495133768, 414055.2181651409, 262915.17635530693, 149695.55386241968]
Average MSE denormalized: 823302.4890768554
Average RMSE denormalized: 27.86058627430888


## Test Data

In [10]:
test_data = pd.read_csv('data/testing_data.csv')
X_test, y_test_memory, y_test_time = data_preprocessing(test_data)

X_test_normalized = features_scaler.transform(X_test.to_numpy())
y_test_normalized = target_scaler.transform(y_test_memory.to_numpy().reshape(-1, 1))

In [11]:
# Train final model on full training data
final_model = build_model_64_64()
final_model.fit(X, y, epochs=30, batch_size=32, verbose=0)

# Make predictions on test set
y_test_pred_normalized = final_model.predict(X_test_normalized).flatten()

# Calculate normalized RMSE
mse_normalized = mean_squared_error(y_test_normalized, y_test_pred_normalized)
rmse_normalized = np.sqrt(mse_normalized)

print(f"Test RMSE (normalized): {rmse_normalized:.6f}")


Test RMSE (normalized): 0.020499


In [12]:
# Calculate denormalized RMSE and diagnose the issue
y_test_original = target_scaler.inverse_transform(y_test_normalized.reshape(-1, 1)).flatten()
y_test_pred_original = target_scaler.inverse_transform(y_test_pred_normalized.reshape(-1, 1)).flatten()

mse_denormalized = mean_squared_error(y_test_original, y_test_pred_original)
rmse_denormalized = np.sqrt(mse_denormalized)

print(f"Test RMSE (normalized): {rmse_normalized:.6f}")
print(f"Test RMSE (denormalized): {rmse_denormalized:.6f}")

# Diagnostic information
print(f"\n=== DIAGNOSTIC INFORMATION ===")
print(f"Target scaler mean: {target_scaler.mean_[0]:.2f}")
print(f"Target scaler scale: {target_scaler.scale_[0]:.2f}")
print(f"Expected denormalized RMSE ≈ {rmse_normalized * target_scaler.scale_[0]:.2f}")

print(f"\nTraining target range: {memory_target.min():.2f} to {memory_target.max():.2f}")
print(f"Test target range: {y_test_memory.min():.2f} to {y_test_memory.max():.2f}")

print(f"\nTraining target mean: {memory_target.mean():.2f}")
print(f"Test target mean: {y_test_memory.mean():.2f}")

# Check if test data is outside training range
test_out_of_range = (y_test_memory.min() < memory_target.min()) or (y_test_memory.max() > memory_target.max())
print(f"Test data outside training range: {test_out_of_range}")

Test RMSE (normalized): 0.020499
Test RMSE (denormalized): 1309.815920

=== DIAGNOSTIC INFORMATION ===
Target scaler mean: 70735.35
Target scaler scale: 63897.68
Expected denormalized RMSE ≈ 1309.82

Training target range: 25040.00 to 233504.00
Test target range: 25056.00 to 233600.00

Training target mean: 70735.35
Test target mean: 78808.64
Test data outside training range: True


## Final Results

Test RMSE (normalized): 0.020499
Test RMSE (denormalized): 1309.815920

Training average RMSE normalized: 0.11021672657826931
Training Average RMSE denormalized: 27.86058627430888

In [16]:
def save_model(model, x_scaler, y_scaler):    
    model.save("memory_model/memory_model.keras")
    joblib.dump(x_scaler, "memory_model/scaler_x.pkl")
    joblib.dump(y_scaler, "memory_model/scaler_y.pkl")


In [17]:
save_model(final_model, features_scaler, target_scaler)