In [None]:
# Task 3: Calories Burned Prediction with Pipeline
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load dataset
data = pd.read_csv("calories.csv")
print("Dataset columns:", data.columns.tolist())
print("\nFirst few rows:")
print(data.head())

# Identify target column: the first column whose name contains "calories".
target_candidates = [col for col in data.columns if 'calories' in col.lower()]
if not target_candidates:
    # Fail with an actionable message instead of a bare IndexError.
    raise ValueError(
        "No column containing 'calories' found; available columns: "
        f"{data.columns.tolist()}"
    )
target_col = target_candidates[0]

# Row identifiers (e.g. "User_ID") are arbitrary labels, not measurements.
# Left in X they would be scaled and fitted as a numeric feature, adding
# noise to the model, so they are excluded from the feature set.
id_cols = [
    col for col in data.columns
    if col != target_col
    and (col.lower() == 'id' or col.lower().endswith('_id'))
]
X = data.drop(columns=[target_col, *id_cols])
y = data[target_col]

# Separate categorical (object dtype) and numerical (everything else) features
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(exclude=['object']).columns.tolist()

# Create preprocessing pipeline: standardize numeric columns, one-hot encode
# categorical columns.
# NOTE: `drop='first'` is intentionally not combined with
# `handle_unknown='ignore'`. With both set, an unseen category encodes to all
# zeros, which is indistinguishable from the dropped first category, so the
# model would silently treat unknown values as that category.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ]
)

# Full pipeline with preprocessing and model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Split data (80/20, fixed seed for reproducibility) and train
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
pipeline.fit(X_train, y_train)

# Evaluate model on the held-out split
y_pred = pipeline.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"\nModel Performance - RMSE: {rmse:.2f}")

# Predict for new data.
# Each key is matched as a case-insensitive substring of the column name; the
# first matching key wins (dict order), mirroring a chained if/else.
# NOTE(review): the model is linear, so values outside the range seen in
# training are extrapolated -- confirm these fall within the data's range.
profile_overrides = {
    'age': 25,
    'weight': 70,
    'height': 175,
    'duration': 60,
    'heart': 130,
}

new_row = {}
for col in X.columns:
    col_lower = col.lower()
    override = next(
        (value for key, value in profile_overrides.items() if key in col_lower),
        None,
    )
    if override is not None:
        new_row[col] = override
    elif X[col].dtype == 'object':
        # Categorical column without an override: use the most frequent value.
        new_row[col] = X[col].mode()[0]
    else:
        # Numeric column without an override: use the column mean.
        new_row[col] = X[col].mean()
new_data = pd.DataFrame([new_row])

predicted_calories = pipeline.predict(new_data)[0]
print(f"\nPredicted Calories Burned: {predicted_calories:.2f}")

Columns in CSV: ['User_ID', 'Gender', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'Calories']
    User_ID  Gender  Age  Height  Weight  Duration  Heart_Rate  Body_Temp  \
0  14733363    male   68   190.0    94.0      29.0       105.0       40.8   
1  14861698  female   20   166.0    60.0      14.0        94.0       40.3   
2  11179863    male   69   179.0    79.0       5.0        88.0       38.7   
3  16180408  female   34   179.0    71.0      13.0       100.0       40.5   
4  17771927  female   27   154.0    58.0      10.0        81.0       39.8   

   Calories  
0     231.0  
1      66.0  
2      26.0  
3      71.0  
4      35.0  
Test RMSE: 11.49
Predicted Calories Burned: 443.12
