In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the source table from the CSV file; the relative path is resolved
# against the notebook's current working directory.
dataset = pd.read_csv(filepath_or_buffer="dataset.csv")

In [4]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Column groups: numeric inputs are standardized, categorical inputs are
# one-hot encoded into a dense array.
numerical_features = ["dataset_size", "batch_size", "epochs", "worker_cpu", "worker_gpu", "worker_mem"]
categorical_features = ["job_type", "model"]

# NOTE(review): both transformers are fit on the full table, i.e. before any
# train/test split is made -- confirm this is acceptable for the evaluation.
scaler = StandardScaler()
numerical_scaled = scaler.fit_transform(dataset[numerical_features])

ohe = OneHotEncoder(sparse_output=False)
categorical_encoded = ohe.fit_transform(dataset[categorical_features])

# Feature matrix layout: scaled numeric columns first, one-hot columns after.
x = np.concatenate((numerical_scaled, categorical_encoded), axis=1)
# Regression target.
y = dataset['runtime'].values

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
split_parts = train_test_split(x, y, random_state=42, test_size=0.2)
x_train, x_test, y_train, y_test = split_parts

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

# Baseline random forest: 200 trees, depth capped at 15, single worker,
# fixed seed.
rf_params = {
    "n_estimators": 200,
    "max_depth": 15,
    "random_state": 42,
    "n_jobs": 1,
}
model = RandomForestRegressor(**rf_params)
model.fit(x_train, y_train)

# Score the fitted forest on the held-out split.
y_pred = model.predict(x_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"MSE: {mse:.3f}, R2: {r2:.3f}")

MSE: 7493.224, R2: 0.866


In [7]:
# Gradient-boosted alternative, trained on the same split as the forest.
xgb_params = dict(
    n_estimators=300,
    max_depth=10,
    learning_rate=0.1,
    n_jobs=1,
    random_state=42,
)
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(x_train, y_train)

# Held-out predictions and metrics (overwrites the previous y_pred/mse/r2).
y_pred = xgb_model.predict(x_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"MSE: {mse:.3f}, R2: {r2:.3f}")

MSE: 27986.754, R2: 0.499


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyperparameter values; the randomized search samples n_iter
# combinations from this grid rather than trying all of them.
param_grid = {
    "n_estimators": [200, 300, 400],
    "max_depth": [10, 15, 20, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)
# Seed the search itself as well as the estimator: without random_state the
# ten sampled candidates (and therefore best_model) change on every run.
search = RandomizedSearchCV(
    rf_model,
    param_grid,
    cv=3,
    n_iter=10,
    scoring="r2",
    verbose=2,
    random_state=42,
)
search.fit(x_train, y_train)

# Best-scoring candidate as exposed by the fitted search object.
best_model = search.best_estimator_

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=   7.3s
[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=   6.2s
[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=   7.4s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   2.6s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   2.6s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   2.6s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=   5.2s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=   3.9s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=   3.9s
[CV] END max_depth=No

In [11]:
# Evaluate the search's best estimator on the held-out split.
y_pred = best_model.predict(x_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"MSE: {mse:.3f}, R2: {r2:.3f}")

MSE: 7480.036, R2: 0.866


In [12]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Polynomial regression comparison: expand the features to the given degree,
# then fit an ordinary least-squares model on the expanded matrix.
degree = 2
poly_steps = (PolynomialFeatures(degree), LinearRegression())
poly_model = make_pipeline(*poly_steps)
poly_model.fit(x_train, y_train)

# Held-out predictions and metrics for the polynomial model.
y_pred = poly_model.predict(x_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"MSE: {mse:.3f}, R2: {r2:.3f}")

MSE: 18684.645, R2: 0.665


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib

# Build and persist a self-contained predictor that accepts the raw columns:
# preprocessing and the regressor travel together in a single pickled object.
categorical_features = ["job_type", "model"]
numerical_features = ["dataset_size", "batch_size", "epochs", "worker_cpu", "worker_gpu", "worker_mem"]

# Same transformations as the exploratory cells, expressed per column group.
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numerical_features),
    ("cat", OneHotEncoder(sparse_output=False), categorical_features)
])

# Use the configuration of the evaluated baseline forest (200 trees, depth 15,
# fixed seed) instead of an unseeded default forest, so the exported artifact
# corresponds to a model that was actually scored and can be rebuilt exactly.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(
        n_estimators=200,
        max_depth=15,
        random_state=42,
    ))
])

# The final model is trained on every available row, not just the training split.
pipeline.fit(dataset[numerical_features + categorical_features], dataset["runtime"])
joblib.dump(pipeline, "runtime_predictor.pkl")


['runtime_predictor.pkl']