# Overview
- Train an Autogluon predictor (autoML)


In [22]:
# Import necessary packages
from utils import pd, np, TabularPredictor, accuracy_score

In [23]:
# Import data 
train_df = pd.read_csv("../data/processed/train_data.csv")
test_df = pd.read_csv("../data/processed/test_data.csv")

In [24]:
# Set random seed for reproducibility
np.random.seed(42)

In [25]:
# Initialize autogluon tabular predictor
predictor = TabularPredictor(
    label='label',
    eval_metric='accuracy',
    path='../models/autogluon'
)



In [26]:
# Train autogluon tabular predictor
predictor.fit(
    train_data=train_df,
    time_limit=600,  # 10 minutes time limit
    presets='medium_quality_faster_train',  # Use medium quality preset for faster training
    num_bag_folds=5,  # Number of folds for bagging
    num_stack_levels=2,  # Number of stacking levels
    verbosity=2,  # Show detailed training progress
)

Preset alias specified: 'medium_quality_faster_train' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.9.6
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.5.0: Tue Apr 22 19:54:49 PDT 2025; root:xnu-11417.121.6~2/RELEASE_ARM64_T6000
CPU Count:          10
Memory Avail:       34.22 GB / 64.00 GB (53.5%)
Disk Space Avail:   853.49 GB / 926.35 GB (92.1%)
Presets specified: ['medium_quality_faster_train']
Beginning AutoGluon training ... Time limit = 600s
AutoGluon will save models to "/Users/miriamlandau/Documents/predict_hand_imagery/models/autogluon"
Train Data Rows:    168
Train Data Columns: 258
Label Column:       label
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [np.int64(0), np.int64(1)]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during Pred

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x3c04099a0>

In [30]:
# Evaluate model performance
performance = predictor.evaluate(test_df)
performance

{'accuracy': 0.7619047619047619,
 'balanced_accuracy': np.float64(0.7716346153846154),
 'mcc': np.float64(0.5282450493547782),
 'roc_auc': np.float64(0.78125),
 'f1': 0.7222222222222222,
 'precision': 0.65,
 'recall': 0.8125}

In [31]:
# Display autogluon leadorboard
predictor.leaderboard(test_df)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,NeuralNetFastAI_BAG_L3,0.785714,0.797619,accuracy,0.584514,0.610228,29.397866,0.037381,0.032283,2.57876,3,True,34
1,NeuralNetTorch_BAG_L1,0.761905,0.779762,accuracy,0.110924,0.112273,2.304106,0.110924,0.112273,2.304106,1,True,12
2,CatBoost_BAG_L2,0.761905,0.821429,accuracy,0.283238,0.309759,19.550223,0.015824,0.02831,7.492809,2,True,19
3,XGBoost_BAG_L2,0.761905,0.797619,accuracy,0.294491,0.295251,12.816848,0.027076,0.013802,0.759434,2,True,23
4,RandomForestGini_BAG_L2,0.761905,0.732143,accuracy,0.299389,0.326428,12.416779,0.031975,0.044979,0.359366,2,True,17
5,NeuralNetFastAI_BAG_L2,0.761905,0.809524,accuracy,0.315873,0.313011,14.535238,0.048459,0.031562,2.477824,2,True,22
6,LightGBM_BAG_L3,0.761905,0.839286,accuracy,0.556002,0.587927,27.292856,0.008869,0.009983,0.47375,3,True,28
7,WeightedEnsemble_L4,0.761905,0.839286,accuracy,0.556649,0.588331,27.330467,0.000648,0.000404,0.037611,4,True,38
8,CatBoost_BAG_L3,0.761905,0.821429,accuracy,0.562952,0.61186,32.951194,0.015819,0.033916,6.132087,3,True,31
9,RandomForestGini_BAG_L3,0.761905,0.755952,accuracy,0.577109,0.625844,27.236423,0.029976,0.0479,0.417316,3,True,29


In [32]:
# Display feature importance
feature_importance = predictor.feature_importance(train_df) 
feature_importance.sort_values(by='importance', ascending=False).head()
# runs too long

Computing feature importance via permutation shuffling for 258 features using 168 rows with 5 shuffle sets...
	898.08s	= Expected runtime (179.62s per shuffle set)


KeyboardInterrupt: 

# Key Takeways

- autogluon performs with an accuracy of 0.76 with a stacked ensemble
