# Analysis of an MLflow experiment

In [None]:
import mlflow
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Name of the MLflow experiment to analyse (the lookup below is by name).
experiment_name = "carbs_indefinite_normalized"
exp = mlflow.get_experiment_by_name(experiment_name)
# get_experiment_by_name returns None for an unknown name; stop here with a
# clear message instead of an AttributeError on `exp.experiment_id` below.
if exp is None:
    raise ValueError(f"MLflow experiment {experiment_name!r} was not found")

# Fetch the experiment's runs as a pandas DataFrame.
runs = mlflow.search_runs(
    experiment_ids=[exp.experiment_id],
    output_format="pandas",
)

runs

In [None]:
# Display the column labels available in the `runs` table.
runs.columns

In [None]:
# Parse the run timestamps (numeric values are read as milliseconds), order
# the runs by start time and keep only those started on or after the cutoff.
cutoff_date = '2025-08-15'
for time_col in ('start_time', 'end_time'):
    runs[time_col] = pd.to_datetime(runs[time_col], unit='ms')
runs = runs.sort_values(by='start_time')
started_since_cutoff = runs['start_time'] >= cutoff_date
runs = runs[started_since_cutoff]
runs

In [None]:
# Scatter plot of metrics.output against metrics.cost_sec on log-log axes,
# with each point coloured by the run's start time.
# 'int64' is spelled out because plain `int` can resolve to a 32-bit integer
# on some platforms, which is too narrow for the timestamps' integer form.
start_time_as_int = pd.to_datetime(runs['start_time']).astype('int64')

plt.figure(figsize=(10, 6))
plt.scatter(
    runs['metrics.cost_sec'],
    runs['metrics.output'],
    alpha=0.7,
    c=start_time_as_int,  # color by start_time
    cmap='viridis',
)
plt.colorbar(label='Start Time')
plt.xlabel('Cost (seconds)')
plt.ylabel('Output')
plt.title('Output vs Cost Scatter Plot')
plt.xscale('log')
plt.yscale('log')
plt.grid(True)
plt.show()

In [None]:
# Select the row of `runs` whose metrics.output is the largest.
best_output_label = runs['metrics.output'].idxmax()
best_output_run = runs.loc[best_output_label]
best_output_run

In [None]:
# Correlation of every numeric column with metrics.output, strongest
# positive correlation first.
numeric_corr_matrix = runs.corr(numeric_only=True)
output_corr = numeric_corr_matrix["metrics.output"]
corr = output_corr.sort_values(ascending=False)
print(corr)

In [None]:
# Display the current contents of the `runs` table.
runs

In [None]:
def pareto_front(df, x_col="metrics.cost_sec", y_col="metrics.output"):
    """Return the rows of *df* on the cost/output Pareto front.

    Rows are scanned in order of increasing ``x_col`` (ties broken by
    decreasing ``y_col``); a row is kept when its ``y_col`` value is strictly
    greater than that of every row scanned before it.

    Args:
        df: Table holding one row per candidate.
        x_col: Column to minimise (cost).
        y_col: Column to maximise (output).

    Returns:
        A DataFrame with the same columns as *df*, containing the front rows
        ordered by increasing ``x_col`` and keeping their original index
        labels. The result is empty (but keeps the columns) when no row
        qualifies.

    Rows with a missing value in ``x_col`` or ``y_col`` are ignored, because
    they cannot be placed on a cost/output trade-off curve.
    """
    candidates = df.dropna(subset=[x_col, y_col]).sort_values(
        [x_col, y_col], ascending=[True, False]
    )
    outputs = candidates[y_col].astype("float64")
    # Best output among strictly earlier (cheaper) rows; -inf before the first.
    best_before = outputs.cummax().shift(fill_value=-float("inf"))
    on_front = (outputs > best_before).to_numpy()
    return candidates[on_front]

# Overlay the Pareto front (red line) on the output-vs-cost scatter plot.
pareto_runs = pareto_front(runs)

# 'int64' is spelled out because plain `int` can resolve to a 32-bit integer
# on some platforms, which is too narrow for the timestamps' integer form.
start_time_as_int = pd.to_datetime(runs['start_time']).astype('int64')

plt.figure(figsize=(10, 6))
plt.scatter(
    runs['metrics.cost_sec'],
    runs['metrics.output'],
    alpha=0.3,
    c=start_time_as_int,
    cmap='viridis',
)
plt.plot(
    pareto_runs['metrics.cost_sec'],
    pareto_runs['metrics.output'],
    color='red',
    marker='o',
    label='Pareto Front',
)
plt.colorbar(label='Start Time')
plt.xlabel('Cost (seconds)')
plt.ylabel('Output')
plt.title('Output vs Cost with Pareto Front')
plt.xscale('log')
plt.yscale('log')
plt.grid(True)
plt.legend()
# Render explicitly so the cell does not echo the Legend object's repr.
plt.show()

In [None]:
# Display the runs selected by pareto_front.
pareto_runs

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest that predicts metrics.output from the logged
# parameters, then rank the parameters by the forest's feature importances.
param_cols = [c for c in runs.columns if c.startswith("params.")]

# Runs without a recorded metrics.output cannot be used as training targets;
# fitting on a NaN target raises, so they are left out.
labelled_runs = runs.dropna(subset=["metrics.output"])

# Values that cannot be parsed as numbers become NaN and are then set to 0.
X = labelled_runs[param_cols].apply(pd.to_numeric, errors="coerce").fillna(0)
y = labelled_runs["metrics.output"]

model = RandomForestRegressor(n_estimators=200, random_state=0)
model.fit(X, y)

importances = pd.Series(
    model.feature_importances_, index=param_cols
).sort_values(ascending=False)
print(importances)

In [None]:
# Take the top 1% of runs by metrics.output and show the most frequent
# value(s) of each parameter among them.
# With fewer than 100 runs, 1% truncates to zero rows, so keep at least one.
top_count = max(1, int(len(runs) * 0.01))
top = runs.nlargest(top_count, "metrics.output")
top_params = top[param_cols].mode()
top_params

In [None]:
# Display the `top` table.
top

In [None]:
# Display the list of parameter column names.
param_cols

In [None]:
import itertools

# Candidate values per parameter; the fitted model is evaluated on every
# combination to find the settings with the highest predicted output.
search_space = {
    "params.num_iterations": [50, 100, 200, 300, 400, 500, 1000],
    "params.lr": [0.000303, 0.0005, 0.0007, 0.0009, 0.001, 0.002],
    "params.epsilon": [0.00001, 0.00005, 0.0001, 0.0003, 0.001, 0.005, 0.05, 0.1],
    # 0.9 was listed twice, which duplicated a sixth of the grid rows.
    "params.q_lambda": [0.85, 0.9, 0.92, 0.95, 0.95755, 0.99],
    "params.gamma": [0.85, 0.9, 0.92, 0.95, 0.99],
    "params.update_epochs": [1, 2],
    "params.num_minibatches": [8, 9, 10, 11, 12, 13],
}

grid = pd.DataFrame(
    list(itertools.product(*search_space.values())),
    columns=list(search_space),
)

# The model has to be queried with the columns it was fitted on, in the same
# order. Parameters that are not part of the search space are set to 0 (the
# same value used for missing parameters at training time); search-space
# keys that are not in param_cols are not passed to the model.
grid_features = grid.reindex(columns=param_cols, fill_value=0)

preds = model.predict(grid_features)
grid["pred_output"] = preds

grid.sort_values("pred_output", ascending=False).head(10)

In [None]:
# Scratch cell: evaluates to 0.0008.
# NOTE(review): presumably a candidate hyperparameter value — confirm or delete.
8e-4