In [1]:
#%%!pip install autogluon.tabular --quiet

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from autogluon.tabular import TabularPredictor, TabularDataset
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
print("--- Setup Complete ---")

# Step 1: Data Preparation (Excluding User_ID)
#
# Builds two train/test pairs from the same 80/20 split of the calories data:
#   * unscaled: raw feature values
#   * scaled:   non-dummy feature columns standardised with StandardScaler
# Both test sets are written to CSV for the downstream validation script.
print("\n--- Step 1: Preparing data ---")
#### change your path
DATA_PATH = 'autogluon_train_results/raw_data/calories.csv'
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    # Print a readable hint, then re-raise so the run still fails loudly.
    print(f"ERROR: '{DATA_PATH}' file not found.")
    raise
else:
    print(f"Data loaded successfully. Shape: {df.shape}")

# --- NEW: Exclude User_ID before any other processing ---
df = df.drop(columns=['User_ID'])
print("--> 'User_ID' column has been excluded from the dataset.")

df['Calories'] = df['Calories'].astype(float)
# One-hot encode Gender; drop_first=True keeps a single indicator column.
df_encoded = pd.get_dummies(df, columns=['Gender'], drop_first=True)

# Version 1: Unscaled Data
train_df_unscaled, test_df_unscaled = train_test_split(df_encoded, test_size=0.2, random_state=42)
print(f"Unscaled data prepared. Train shape: {train_df_unscaled.shape}")

# Version 2: Scaled Data
# Scale every feature except the label and the Gender indicator column(s).
# Matching on the 'Gender_' prefix (rather than the literal 'Gender_male')
# keeps the indicator unscaled whatever category label the CSV uses.
numeric_cols = [
    c for c in df_encoded.columns
    if c != 'Calories' and not c.startswith('Gender_')
]
scaler = StandardScaler()
# Fit on the TRAINING rows only: fitting on the full dataset would let the
# held-out test rows influence the mean/variance (train/test leakage).
scaler.fit(train_df_unscaled[numeric_cols])
df_scaled = df_encoded.copy()
df_scaled[numeric_cols] = scaler.transform(df_encoded[numeric_cols])
# Reuse the row labels of the unscaled split so both versions contain exactly
# the same rows (the index is the default unique one produced by read_csv).
train_df_scaled = df_scaled.loc[train_df_unscaled.index]
test_df_scaled = df_scaled.loc[test_df_unscaled.index]
print(f"Scaled data prepared. Train shape: {train_df_scaled.shape}")

# --- Save the correct test sets for the validation script ---
print("\n--- Saving test data to local CSV files ---")
test_df_unscaled.to_csv('test_data_unscaled_noID.csv', index=False)
test_df_scaled.to_csv('test_data_scaled_noID.csv', index=False)

print("--- Data preparation complete ---")

# Step 2: AutoML Training
#
# Fits one AutoGluon regressor per data variant (unscaled, then scaled).
# Both runs share the same label, metric, preset and time limit; only the
# training frame and the output directory differ.
print("\n--- Step 2: Starting AutoML training ---")

# Arguments common to both TabularPredictor constructors.
_predictor_settings = {
    'label': 'Calories',
    'problem_type': 'regression',
    'eval_metric': 'root_mean_squared_error',
}
# Arguments common to both fit() calls.
_fit_settings = {'presets': 'best_quality', 'time_limit': 300}

print("\n--- Training (1/2): Unscaled Data ---")
_unscaled_model = TabularPredictor(path='autogluon_unscaled', **_predictor_settings)
predictor_unscaled = _unscaled_model.fit(TabularDataset(train_df_unscaled), **_fit_settings)

print("\n--- Training (2/2): Scaled Data ---")
_scaled_model = TabularPredictor(path='autogluon_scaled', **_predictor_settings)
predictor_scaled = _scaled_model.fit(TabularDataset(train_df_scaled), **_fit_settings)

print("--- AutoML training complete ---")

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.10.14
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Tue Jul 15 10:36:12 UTC 2025
CPU Count:          2
Memory Avail:       1.57 GB / 3.76 GB (41.9%)
Disk Space Avail:   2.56 GB / 4.78 GB (53.5%)
	We recommend a minimum available disk space of 10 GB, and large datasets may require more.
Presets specified: ['best_quality']
Using hyperparameters preset: hyperparameters='zeroshot'
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit 

--- Setup Complete ---

--- Step 1: Preparing data ---
Data loaded successfully. Shape: (15000, 9)
--> 'User_ID' column has been excluded from the dataset.
Unscaled data prepared. Train shape: (12000, 8)
Scaled data prepared. Train shape: (12000, 8)

--- Saving test data to local CSV files ---
--- Data preparation complete ---

--- Step 2: Starting AutoML training ---

--- Training (1/2): Unscaled Data ---


Beginning AutoGluon training ... Time limit = 75s
AutoGluon will save models to "/home/ec2-user/SageMaker/autogluon_unscaled/ds_sub_fit/sub_fit_ho"
Train Data Rows:    10666
Train Data Columns: 7
Label Column:       Calories
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    1615.15 MB
	Train Data (Original)  Memory Usage: 0.50 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 1 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFea


--- Training (2/2): Scaled Data ---


Beginning AutoGluon training ... Time limit = 75s
AutoGluon will save models to "/home/ec2-user/SageMaker/autogluon_scaled/ds_sub_fit/sub_fit_ho"
Train Data Rows:    10666
Train Data Columns: 7
Label Column:       Calories
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    1266.26 MB
	Train Data (Original)  Memory Usage: 0.50 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 1 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatu

--- AutoML training complete ---
