# Load Data

In [1]:
import pandas as pd

# Read the three CSV files from the working directory, in a fixed order,
# and bind them to the names used by the rest of the notebook.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Show the first rows of the training frame as the cell output.
train.head()

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [2]:
# Print the (rows, columns) shape of each split, then the training column index.
for label, frame in (("Train shape:", train), ("Test shape:", test)):
    print(label, frame.shape)
print("Columns:", train.columns)

# Per-column count of missing values in the training frame
missing_per_column = train.isna().sum()
print(missing_per_column)

Train shape: (750000, 9)
Test shape: (250000, 8)
Columns: Index(['id', 'Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate',
       'Body_Temp', 'Calories'],
      dtype='object')
id            0
Sex           0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
Calories      0
dtype: int64


In [3]:
# Target column is pulled out first; the feature frame is the training data
# without the target and without the "id" column.
y = train["Calories"]
X = train.drop(["Calories", "id"], axis="columns")

# The test frame has no target, so only "id" is removed.
X_test = test.drop(["id"], axis="columns")


In [7]:
# Encode every object-dtype column as integer category codes.
#
# The label -> code mapping is derived once from the training column and then
# applied to BOTH frames. Encoding each frame independently assigns codes from
# the values observed in that frame, so a label missing from (or extra in) one
# split would shift the codes and the same string could map to different
# integers in train and test.
#
# With a fixed category list, any test label that never appears in training is
# encoded as -1 (the same code pandas uses for missing values).
for col in X.select_dtypes(include='object').columns:
    train_categories = X[col].astype('category').cat.categories
    X[col] = pd.Categorical(X[col], categories=train_categories).codes
    X_test[col] = pd.Categorical(X_test[col], categories=train_categories).codes


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from xgboost import XGBRegressor
import numpy as np

# Fit on log1p(y): a squared-error fit in log1p space lines up with the RMSLE
# score computed below, which compares log1p of predictions and targets.
y_log = np.log1p(y)

# Hold out 20% of the rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)

model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_val)  # predictions are in log1p space

# Back-transform to the original scale before scoring. A log-space prediction
# below 0 becomes negative after expm1, and mean_squared_log_error rejects
# negative inputs, so clip at 0 first. Non-negative predictions are unchanged.
y_pred_original_scale = np.clip(np.expm1(y_pred), 0, None)
rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_val), y_pred_original_scale))
print(f"Validation RMSLE: {rmsle:.5f}")


Validation RMSLE: 0.06153


In [9]:
# Predict on the test features (model output is in log1p space).
preds = model.predict(X_test)
# Invert log1p; clip at 0 so no negative value is written to the submission
# (a log-space prediction below 0 would otherwise back-transform to < 0).
preds_final = np.clip(np.expm1(preds), 0, None)

submission = sample_submission.copy()

# preds_final follows the row order of `test`. It is assigned positionally, so
# the submission template must list the same ids in the same order; fail loudly
# instead of silently writing predictions against the wrong ids.
if not np.array_equal(submission["id"].to_numpy(), test["id"].to_numpy()):
    raise ValueError("sample_submission ids do not match test ids row-for-row")

submission["Calories"] = preds_final
submission.to_csv("submission.csv", index=False)
submission.head()


Unnamed: 0,id,Calories
0,750000,26.410599
1,750001,109.45903
2,750002,86.587944
3,750003,126.70018
4,750004,74.865303
