#### Mount G Drive

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

### Imports

In [22]:
import os
import sys
import warnings
import time

warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
# warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
import pprint
import ipywidgets as widgets
from IPython.display import display
from autogluon.tabular import TabularPredictor, TabularDataset
from sklearn.model_selection import train_test_split
import torch # Used to check for GPU availability

pp = pprint.PrettyPrinter(indent=2)

# Custom Functions
from utils import *

# print(f"Current Working Directory --> {os.getcwd()}")
# Make the directory one level above this notebook's working directory importable.
# os.getcwd() replaces the IPython-only `%pwd` magic so the cell is also valid plain Python.
current_working_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_working_dir, ".."))

# Guard the append so that re-running this cell does not pile up duplicate sys.path entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

print(f"Parent Dir >>> {parent_dir}")
print(f"Current Working Dir >>> {current_working_dir}")

# from configs import cfgs  # Absolute import

Parent Dir >>> C:\Users\maz\dev\Projects_\alzheimer
Current Working Dir >>> C:\Users\maz\dev\Projects_\alzheimer\research


In [23]:
# pd.set_option("display.max_columns", None)
# pd.set_option("display.max_rows", None)

#### Paths

In [24]:
# Location of the prepared dataset, relative to the notebook's working directory.
# (Alternative source: cfgs["DATASET_DIR"], disabled together with the `configs` import.)
dataset_dir = "..//dataset//modified"
dataset_path = Path(dataset_dir)
print("Dataset Path", "*" * 12, sep="\n")
print(f"Dataset: {dataset_path}")

# Training CSV that the data-loading cell reads.
path_train = dataset_path / "train.csv"
print(f"Train File Path --> {path_train}")

# Optional destinations for persisting the train/test split (disabled):
# split_train_csv = dataset_path / 'ag_train.csv'
# split_test_csv = dataset_path / 'ag_test.csv'
# print(f"Train Split File Path --> {split_train_csv}")
# print(f"Test Split File Path --> {split_test_csv}")

# Root folder under which trained models are saved.
models_dir = "..//models//"
models_path = Path(models_dir)
print("\n", "Model's Dir", "*" * 12, sep="\n")
print(f"Models Dir: {models_path}")

# Each experiment is saved in its own sub-folder named after the model.
model_name = 'ft_engineered'
model_save_path = models_path / model_name

# Create the save folder (parents included); an already existing folder is
# reported rather than treated as an error.
try:
    os.makedirs(model_save_path)
except FileExistsError:
    print(f"Directory '{model_save_path}' already exists.")
else:
    print(f"Directory '{model_save_path}' created successfully.")

print(f"Model's Save Path --> {model_save_path}")

Dataset Path
************
Dataset: ..\dataset\modified
Train File Path --> ..\dataset\modified\train.csv


Model's Dir
************
Models Dir: ..\models
Model's Save Path --> ..\models\ft_engineered


#### Define Constants

In [25]:
# Column the models are trained to predict.
label_column = 'composite_score'

# NOTE: despite the "_TO_DROP" suffix, the pipeline cell passes this list to
# SpecificColumnCategorizer(columns_to_categorize=...).
SPECIFIC_COL_TO_DROP = ['Year']

# Settings forwarded to ObjectToCategoryTransformer(threshold_ratio=..., max_unique=...).
THRESHOLD_RATIO = 0.1
MAX_UNIQUE = 50

# Redundant features, removed by the first ColumnDropper step of the pipeline.
COLS_TO_DROP = [
    'UID',
    'imss_03', 'imss_12',
    'issste_03', 'issste_12',
    'pem_def_mar_03', 'pem_def_mar_12',
    'insur_private_03', 'insur_private_12',
    'insur_other_03', 'insur_other_12',
    'seg_pop_12',
    'Tired_03', 'Tired_12',
    'Happy_03', 'Happy_12',
]

# Columns removed by the ColumnDropper step that runs after temporal feature engineering.
COLS_TO_DROP_AFTER_FT_ENG = [
    'delta_hinc_business',
    'delta_hinc_cap',
]

# Threshold handed to DropColumnsHighNA (presumably a percentage of missing values -- TODO confirm in utils).
THRESHOLD_MISSING = 70.0

# Imputation strategies handed to MissingValueImputer for numeric / categorical columns.
NUM_STRATEGY = "median"
CAT_STRATEGY = "mode"

# Threshold handed to IdentifyAndDropLowVarNum.
# Values tried so far: 0.01 (drops -> cols: 23), 0.005 (drops -> cols: 18), 0.00000001
THRESHOLD_QUASI_CONSTANT = 1e-8

#### Check GPU

In [26]:
# Request one GPU when CUDA is usable, otherwise none (bool -> 1 / 0).
num_gpus = int(torch.cuda.is_available())
print(f"Using {num_gpus} GPUs for training.")

Using 0 GPUs for training.


#### Data Loading

In [27]:
# Load the raw training CSV. `data` keeps the untouched frame; `df` is the working copy
# that the later cells transform.
try:
    data = pd.read_csv(path_train, encoding='utf8')
except FileNotFoundError as err:
    # Name the file that was actually requested, and stop the run with a real exception
    # so a missing file surfaces as a traceback instead of an exit() call.
    raise FileNotFoundError(
        f"Error: '{path_train}' not found. Please ensure the file is in the correct location."
    ) from err

df = data.copy()
display(df.head(2))
print(df.shape)

Unnamed: 0,UID,Year,composite_score,Age_03,Urban_03,Married_03,Marriages_03,Education_03,Num_Living_Child_03,Migration_03,...,Meet_FnF_12,SocialActivities_12,AttendReligiousServices_12,a16a_12,YrsLivedInUSA_12,a22_12,a33b_12,SpeaksEnglish_12,HousingEnvironment_12,PredictionYear
0,aard,2021,104,50-59,Urban,Widowed,1.0,7-9 Years,1 or 2,0.0,...,Once a week,Never,1.0,,,,,0.0,Concrete,9
1,abme,2021,106,50-59,Rural,Married or In Civil Union,1.0,1-5 Years,5 or 6,0.0,...,Never,Never,0.0,,,,,0.0,Concrete,9


(2889, 185)


In [28]:
# print("*" * 30)
# print("After Conversion of Data Types")
# print("*" * 30)
# # Get all dtypes as a Series
# all_dtypes = df.dtypes
# print("-" * 20)
# print("All dtypes (Series):")
# print("-" * 20)
# print(f"All Data Types -> {all_dtypes}")

# # Get unique dtypes
# unique_dtypes = df.dtypes.unique()
# print("-" * 45)
# print("Unique dtypes (NumPy array of dtype objects):")
# print("-" * 45)
# print(f"Unique Data Types -> {unique_dtypes}")

# # Get Columns Names
# print("-" * 14)
# print("Column Names:")
# print("-" * 14)
# print(f"{df.columns}")

# print("-" * 18)
# print("Number of Columns:")
# print("-" * 18)
# number_of_cols = len(df.columns)
# print(f"{number_of_cols}")

#### PipeLine

In [29]:
# Every stage below is wrapped in its own single-purpose Pipeline and then chained, in
# order, into `dataPreprocessing_pipeline`. The custom transformers are presumably
# provided by the `utils` wildcard import.

# Stage 1: dtype conversion (three categorizer steps applied in sequence).
data_type_conversion = Pipeline(steps=[
    (
        'specific_categorizer',
        SpecificColumnCategorizer(columns_to_categorize=SPECIFIC_COL_TO_DROP),
    ),
    (
        'object_to_category',
        ObjectToCategoryTransformer(threshold_ratio=THRESHOLD_RATIO, max_unique=MAX_UNIQUE),
    ),
    (
        'float_to_category',
        FloatToCategoryTransformer(),
    ),
    # ('bool_to_category', BooleanToCategoryTransformer())
])

# Stage 2: drop the columns listed in COLS_TO_DROP.
dropColumns = Pipeline(steps=[
    ('drop_columns', ColumnDropper(columns_to_drop=COLS_TO_DROP)),
])

# Stage 3: drop columns according to THRESHOLD_MISSING.
dropColumnsHighNA = Pipeline(steps=[
    ('drop_columns_high_na', DropColumnsHighNA(threshold=THRESHOLD_MISSING)),
])

# Stage 4: impute missing values with the configured numeric / categorical strategies.
missingValueImputer = Pipeline(steps=[
    (
        'missing_value_imputer',
        MissingValueImputer(num_strategy=NUM_STRATEGY, cat_strategy=CAT_STRATEGY),
    ),
])

# Stage 5: drop low-variance numeric columns according to THRESHOLD_QUASI_CONSTANT.
identifyAndDropLowVarNum = Pipeline(steps=[
    (
        'identify_and_drop_low_var_num',
        IdentifyAndDropLowVarNum(quasi_constant_threshold=THRESHOLD_QUASI_CONSTANT),
    ),
])

# Stage 6: wrap the plain function `engineer_temporal_features` as a transformer step;
# `drop_originals=True` is forwarded to it on every call.
temporal_feature_engineering = Pipeline(steps=[
    (
        'temporal_features',
        FunctionTransformer(func=engineer_temporal_features, kw_args={'drop_originals': True}),
    ),
])

# Stage 7: drop the columns listed in COLS_TO_DROP_AFTER_FT_ENG.
dropColumns_after_ft_eng = Pipeline(steps=[
    ('drop_columns', ColumnDropper(columns_to_drop=COLS_TO_DROP_AFTER_FT_ENG)),
])

# Full preprocessing chain; the numeric prefixes document the execution order.
# Further steps (e.g. scaling, encoding) can be inserted here, for instance:
# ('5_scaling', StandardScaler()),
dataPreprocessing_pipeline = Pipeline(steps=[
    ('1_data_type_conversion', data_type_conversion),
    ('2_drop_columns', dropColumns),
    ('3_drop_high_na_columns', dropColumnsHighNA),
    ('4_impute_missing_values', missingValueImputer),
    ('5_identify_and_drop_low_var_num', identifyAndDropLowVarNum),
    ('6_temporal_feature_engineering', temporal_feature_engineering),
    ('7_dropColumns_after_ft_eng', dropColumns_after_ft_eng),
])

In [30]:
%%capture
print("*" * 48)
print("--- Applying Pipeline | Data Preprocessing ---")
print("*" * 48, "\n")

df = dataPreprocessing_pipeline.fit_transform(df)

In [31]:
# Confirmation shown outside the captured cell; a plain string suffices (nothing to interpolate).
print("--- PipeLine Completed ---")

--- PipeLine Completed ---


#### Data Preparation

In [32]:
# Hold out 20% of the preprocessed rows for testing; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Optional: persist both splits to disk (disabled).
# train_data.to_csv(split_train_csv, index=False)
# test_data.to_csv(split_test_csv, index=False)
# print("\n")
# print(f"Split | 'Train' | Saved to >>> {split_train_csv}")
# print(f"Split | 'Test' | Saved to >>> {split_test_csv}")

# Summary of the resulting split sizes.
print(
    "\n",
    f"Training data size: {train_data.shape[0]}",
    f"Test data size: {test_data.shape[0]}",
    "Data preparation complete.",
    sep="\n",
)



Training data size: 2311
Test data size: 578
Data preparation complete.


#### Train

In [33]:
# Keyword arguments later unpacked into `predictor.fit(**fit_args)`.
# `num_gpus` controls GPU usage: 1 (or more) trains on GPU, 0 trains on CPU only.
fit_args = dict(
    train_data=train_data,
    presets='extreme_quality',
    time_limit=3600,  # seconds, i.e. one hour
    num_gpus=num_gpus,
)

In [34]:
# Initialize the TabularPredictor
predictor = TabularPredictor(
    label=label_column,
    path=model_save_path,
    eval_metric='root_mean_squared_error' # Default for regression, but explicitly stated for clarity
)



In [35]:
# Fit the models
start_time = time.time()

predictor.fit(**fit_args)

end_time = time.time()
total_time = end_time - start_time
hours = int(total_time // 3600)
minutes = int((total_time % 3600) // 60)
seconds = (total_time % 3600) % 60
print(f"Total time: {hours} hours, {minutes} minutes, and {seconds:.2f} seconds")

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.11.9
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          4
Memory Avail:       3.85 GB / 19.90 GB (19.3%)
Disk Space Avail:   233.44 GB / 953.87 GB (24.5%)
Presets specified: ['extreme_quality']
`extreme` preset uses a dynamic portfolio based on dataset size...
	Detected data size: small (<=30000 samples), using `zeroshot_2025_tabfm` portfolio.
		Note: `zeroshot_2025_tabfm` portfolio requires a CUDA compatible GPU for best performance.
		Make sure you have all the relevant dependencies installed: `pip install autogluon.tabular[tabarena]`.
		It is strongly recommended to use a machine with 64+ GB memory and a CUDA compatible GPU with 32+ GB vRAM when using this preset. 
		This portfolio will download foundation model weights from HuggingFace during training. Ensure you have an internet connection or have pre-downloaded the weights to use these models.
		Th

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x19ba2515e50>

#### Evaluation

In [None]:
# --- Evaluation ---
# Rank every trained model. Passing the held-out split makes the leaderboard report
# out-of-sample (test) scores next to the validation scores.
print("\n--- Model Evaluation ---")
leaderboard = predictor.leaderboard(data=test_data)

In [44]:
# Lift pandas' display truncation for both columns and rows (None = no limit).
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

In [63]:
# Keep only the leaderboard columns needed to compare the models, then display them.
df_lb = leaderboard.loc[
    :,
    [
        "model",
        "score_test",
        "score_val",
        "eval_metric",
        "stack_level",
        "can_infer",
        "fit_order",
    ],
]
df_lb

Unnamed: 0,model,score_test,score_val,eval_metric,stack_level,can_infer,fit_order
0,LightGBM_r33_BAG_L1,-45.733922,-47.389724,root_mean_squared_error,1,True,1
1,WeightedEnsemble_L2,-45.833239,-46.918285,root_mean_squared_error,2,True,7
2,TabM_r184_BAG_L1,-46.419047,-47.516198,root_mean_squared_error,1,True,3
3,XGBoost_r171_BAG_L1,-47.11115,-48.508782,root_mean_squared_error,1,True,5
4,CatBoost_BAG_L1,-47.452942,-48.194741,root_mean_squared_error,1,True,2
5,CatBoost_r51_BAG_L1,-50.188957,-50.499425,root_mean_squared_error,1,True,6
6,TabM_r69_BAG_L1,-53.710891,-54.119442,root_mean_squared_error,1,True,4


In [51]:
# Score the predictor on the held-out split and show the returned metrics as a one-row table.
performance_dict = predictor.evaluate(data=test_data)
performance = pd.DataFrame(data=performance_dict, index=[0])
print("Final Ensemble Performance on Test Data", "*" * 40, sep="\n")
performance

Final Ensemble Performance on Test Data
**************************************


Unnamed: 0,root_mean_squared_error,mean_squared_error,mean_absolute_error,r2,pearsonr,median_absolute_error
0,-45.833239,-2100.685791,-35.190224,0.379713,0.61796,-28.570831


#### Making Predictions

In [59]:
# --- Making Predictions ---
# Predict on the held-out rows with the label column removed (new, unseen data works the same way).
print("\n--- Making Predictions ---")
predictions = predictor.predict(data=test_data.drop(label_column, axis="columns"))


--- Making Predictions ---


In [62]:
# Side-by-side comparison of actual and predicted scores for the test rows.
# The target column is referenced through `label_column` (as in the prediction cell)
# instead of a hard-coded name, so changing the target only requires editing the constant.
predictions_df = pd.DataFrame(predictions)
results_df = pd.DataFrame({
    'actual_score': test_data[label_column],
    'predicted_score': predictions_df[label_column],
})

# --- Signed error ---
# Positive value: the prediction was too low. Negative value: the prediction was too high.
results_df['difference'] = results_df['actual_score'] - results_df['predicted_score']

# --- Absolute error ---
# Magnitude of the error, regardless of direction.
results_df['absolute_difference'] = results_df['difference'].abs()

# --- Display the results ---
print("Comparison of Actual vs. Predicted Scores:")
print("*" * 42)
results_df.head()

Comparison of Actual vs. Predicted Scores:
******************************************


Unnamed: 0,actual_score,predicted_score,difference,absolute_difference
471,175,214.911499,-39.911499,39.911499
1206,129,167.06279,-38.06279,38.06279
2382,78,119.418396,-41.418396,41.418396
2013,77,132.670044,-55.670044,55.670044
2885,140,117.531631,22.468369,22.468369


In [None]:
# Display a sample of the predictions
# print("Sample predictions:")
# print(predictions.head())

**The minus sign is there because the framework is designed to always maximize a score. Since Root Mean Squared Error (RMSE) is an error metric where a lower value is better, the framework negates it.**

### Detailed Breakdown

- Maximizing vs. Minimizing:
    - For metrics like `Accuracy` or `R-squared`, a higher score is better. So, an optimizer's job is to maximize them.
    - For metrics like `Root Mean Squared Error` (eval_metric), a lower score is better (an error of 0 is perfect). An optimizer's job is to minimize them.
- The Trick: Negating the Error:
    - To avoid having to build separate logic for maximizing and minimizing, many `AutoML` frameworks (including `AutoGluon`, which this notebook uses) apply a simple trick: they treat every problem as a maximization problem.
    - To do this with an error metric, they simply multiply it by -1.
    - Minimizing `RMSE` is the same as maximizing (-RMSE).

**Is it Good or Bad?**

It is neither good nor bad; it's just a reporting convention. The key is knowing how to interpret it.

**How to Interpret the Scores:**

- You should look for the score that is closest to zero.
- A score of -45.8 is better than a score of -53.7.

**Look at the leaderboard (df_lb):**

- The `WeightedEnsemble_L2` has a score_test of -45.833239.
- The `TabM_r69_BAG_L1` has a score_test of -53.710891.
  
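Since -45.8 is a higher number (closer to zero) than -53.7, the `WeightedEnsemble_L2` model performed better than `TabM_r69_BAG_L1`. (On the test set, `LightGBM_r33_BAG_L1`, with a score_test of -45.733922, is marginally ahead of the ensemble.)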

**What is the Actual Error?**

To get the real, interpretable error metric, just take the positive value.

Based on `Final Ensemble Performance` table:

- `root_mean_squared_error` is -45.833239.
- This means the actual `Root Mean Squared Error` of the model on the test data is `45.833239`.
- Similarly, the actual `Mean Absolute Error` is `35.190224`.


#### Clean up (Optional)

In [39]:
# --- Clean up (Optional) ---
# You can load the predictor later using:
# loaded_predictor = TabularPredictor.load(model_save_path)  # the folder this notebook saved the models to
# And then use it to make predictions on new data.