Dataset: https://www.kaggle.com/datasets/awodijitemitope/steps-count-for-fitness-journey-dataset

Import libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [None]:
# Load the exported step-count CSV (path is relative to the working directory).
df = pd.read_csv("Steps Export_ 3-17-21to9-23-22.csv")
df.head()  # preview the first rows

In [None]:
# Translate the export's column headers into the snake_case names used by
# the rest of the notebook. Headers not present in the frame are ignored
# by DataFrame.rename.
# NOTE(review): the "_km" / "_min" suffixes assume the export reports
# kilometres and minutes — confirm against the dataset description.
rename_map = {
    "Steps": "steps",
    "Distance": "distance_km",
    "Minutes Active": "duration_min",
    "Calories": "calories",
}
df = df.rename(mapper=rename_map, axis="columns")
df.head()

In [None]:
# Columns that must be present (non-null) for a row to be usable.
required = ["steps", "duration_min", "distance_km", "calories"]

# Drop rows with missing required values, then keep only rows with a
# positive duration, a non-negative step count and positive calories.
df_clean = df.dropna(subset=required).loc[
    lambda frame: (frame["duration_min"] > 0)
    & (frame["steps"] >= 0)
    & (frame["calories"] > 0)
]
df_clean.describe()


In [None]:
# Derive rate features on a fresh copy so df_clean stays untouched:
#   steps_per_min - steps divided by the active duration in minutes
#   speed_kmh     - distance divided by the duration expressed in hours
df_fe = df_clean.assign(
    steps_per_min=lambda frame: frame["steps"] / frame["duration_min"],
    speed_kmh=lambda frame: frame["distance_km"] / (frame["duration_min"] / 60),
)
df_fe.head()


In [None]:
# Column to predict.
target = "calories"

# Numeric feature candidates; any that are missing from df_fe are skipped.
numeric = [
    col
    for col in (
        "steps",
        "duration_min",
        "distance_km",
        "steps_per_min",
        "speed_kmh",
    )
    if col in df_fe.columns
]
categorical = []

X = df_fe[[*numeric, *categorical]]
y = df_fe[target]

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Per-column-type preprocessing: scale numeric columns, one-hot encode
# categorical ones (categories unseen at fit time are ignored at predict time).
numeric_t = StandardScaler()
categorical_t = OneHotEncoder(handle_unknown="ignore")

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_t, numeric),
        ("cat", categorical_t, categorical),
    ]
)

# Random forest with a fixed seed, trained using all available cores.
model = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)

# Chain preprocessing and the regressor so they are fit and applied together.
pipe = Pipeline(steps=[("preprocess", preprocess), ("model", model)])


In [None]:
# Fit the preprocessing + model pipeline on the training split.
pipe.fit(X_train, y_train)

In [None]:
# Evaluate the fitted pipeline on the held-out split.
y_pred = pipe.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
# Take the square root explicitly: the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it raises TypeError on current releases. np.sqrt(MSE) gives the
# same RMSE on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Display (MAE, RMSE, R^2) as the cell output.
mae, rmse, r2

In [None]:
# Persist the whole fitted pipeline (preprocessing + model) to disk.
joblib.dump(pipe, "calorie_predictor_kaggle_rf.joblib")

Tentative prediction code

In [None]:
import joblib
import pandas as pd

# Reload the serialized pipeline from disk.
# NOTE: joblib.load unpickles arbitrary objects; only load files from a
# trusted source.
pipe_loaded = joblib.load("calorie_predictor_kaggle_rf.joblib")

def predict_calories(steps, duration_min, distance_km):
    """Predict calories for a single activity sample.

    Derives the ``steps_per_min`` and ``speed_kmh`` features from the raw
    inputs, builds a one-row DataFrame and runs it through the loaded
    pipeline (``pipe_loaded``).

    Args:
        steps: Number of steps taken.
        duration_min: Active duration in minutes; must be greater than zero.
        distance_km: Distance covered.

    Returns:
        The pipeline's prediction for the sample, as a ``float``.

    Raises:
        ValueError: If ``duration_min`` is zero, negative or NaN.
    """
    # Both derived features divide by the duration. Reject non-positive
    # values up front instead of failing with ZeroDivisionError or feeding
    # the model negative rates. `not (x > 0)` also rejects NaN.
    if not duration_min > 0:
        raise ValueError(
            f"duration_min must be greater than 0, got {duration_min!r}"
        )

    steps_per_min = steps / duration_min
    speed_kmh = distance_km / (duration_min / 60)
    sample = pd.DataFrame([{
        "steps": steps,
        "duration_min": duration_min,
        "distance_km": distance_km,
        "steps_per_min": steps_per_min,
        "speed_kmh": speed_kmh,
    }])
    return float(pipe_loaded.predict(sample)[0])

# Example call: steps=4000, duration_min=30, distance_km=2.8.
predict_calories(4000, 30, 2.8)
