In [None]:
import pandas as pd
import numpy as np
from pycaret.classification import *
import warnings
warnings.filterwarnings('ignore')

print("üöÄ PYCARET SETUP COMPLETO - FULL POWER")
print("=" * 60)

# ===== LOAD DATA =====
# The training CSV location can be overridden through the TRAIN_DS_PATH
# environment variable so the notebook is not tied to one user's home
# directory. When the variable is unset (or empty) the original path is used,
# so existing runs behave exactly as before.
import os
train_path = os.environ.get('TRAIN_DS_PATH') or (
    '/Users/marcelosilva/Desktop/projectOne/5/A-TrainTestDataset/TrainDS.csv'
)
df_train = pd.read_csv(train_path)
print(f"‚úÖ Dataset: {df_train.shape}")

# ===== CHECK AVAILABLE MEMORY =====
import psutil
# Scale the "available" figure by 1024**3 to get the GB value printed below.
memory_gb = psutil.virtual_memory().available / (1024**3)
print(f"üíæ Mem√≥ria dispon√≠vel: {memory_gb:.1f} GB")

# ===== FULL, ROBUST SETUP =====
# NOTE(review): the "32GB RAM" in this banner is a fixed label, not a
# measured value; the measured availability is the line printed above.
print(f"\nüîß CONFIGURANDO SETUP COMPLETO (32GB RAM)")
print("=" * 60)

# Run the PyCaret classification setup, then print a short configuration
# report.
#
# The two values below are passed to setup() and echoed in the report. They
# are kept as plain variables because reading them back with
# get_config('fix_imbalance') / get_config('fold') is what made the recorded
# run fail with "'ClassificationExperiment' object has no attribute
# 'fix_imbalance'" even though setup() itself had succeeded.
use_smote = True
n_folds = 10

try:
    clf = setup(
        data=df_train,
        target='status_nutricional_who',

        # Features to ignore
        ignore_features=['id_anon', 'vd_zimc'],

        # Ordinal variables with their category order (critical)
        ordinal_features={
            'def_idade_gest': [0, 1, 2],           # preterm < adequate < post-term
            'adequacao_prenatal': [0, 1, 2],       # absent < insufficient < adequate
            'idade_mae_cat': [0, 1, 2],            # young < adult < mature
            'peso_cat': [0, 1, 2],                 # low < normal < high
            'classificacao_peso': [0, 1, 2]        # PIG < AIG < GIG
        },

        # Class balancing (original note: minority class is 2.4%)
        fix_imbalance=use_smote,
        fix_imbalance_method='smote',

        # Full preprocessing
        remove_multicollinearity=True,
        multicollinearity_threshold=0.95,
        feature_selection=True,
        feature_selection_method='univariate',
        transformation=True,
        normalize=True,

        # Validation
        fold=n_folds,                          # 10-fold CV
        fold_strategy='stratifiedkfold',       # keep class proportions per fold

        # Performance
        train_size=0.8,                        # 80% train, 20% hold-out
        session_id=123,                        # reproducibility
        use_gpu=False,                         # CPU only
        n_jobs=-1,                             # use all cores

        # Advanced settings
        profile=False,                         # skip profiling to save time
    )
except Exception as e:
    print(f"‚ùå ERRO no setup completo: {e}")
    # Re-raise so the cell really stops here. The previous exit() call did not
    # halt execution: the recorded output shows the model comparison still
    # running after the error message.
    raise

print("‚úÖ SETUP COMPLETO FUNCIONOU!")

# Configuration report. It lives outside the try block so that a problem while
# reporting is never mislabelled as a setup failure.
print(f"\nüìä INFORMA√á√ïES DO SETUP:")
# 'X_train_transformed' is the post-pipeline training frame. 'X_train' is the
# raw split: the recorded run printed (2914, 36) for it while the setup
# summary reported a transformed train shape of (8492, 8).
print(f"   Shape transformado: {get_config('X_train_transformed').shape}")
print(f"   SMOTE aplicado: {'Sim' if use_smote else 'N√£o'}")
print(f"   Folds: {n_folds}")
# ===== SHOW SELECTED FEATURES =====
# Lists which input columns are still present after PyCaret's preprocessing
# and which ones were dropped along the way.
print("üîç FEATURES QUE PASSARAM PELA SELE√á√ÉO:")
print("=" * 60)

# Surviving features must be read from the *transformed* training frame.
# get_config('X_train') is the raw split (the recorded run shows 36 columns
# there versus 8 after preprocessing), so using it would report every feature
# as selected and none as removed.
selected_features = get_config('X_train_transformed').columns
print(f"‚úÖ FEATURES SELECIONADAS ({len(selected_features)}):")
for i, feature in enumerate(selected_features, 1):
    print(f"   {i:2d}. {feature}")

# Original features (without the target and the ignored columns)
original_features = df_train.drop(columns=['status_nutricional_who', 'id_anon', 'vd_zimc']).columns
# Guard the ratio so an empty feature list cannot raise ZeroDivisionError.
retention_pct = (len(selected_features) / len(original_features)) * 100 if len(original_features) else 0.0
print(f"\nüìä ESTAT√çSTICAS:")
print(f"   Features originais: {len(original_features)}")
print(f"   Features selecionadas: {len(selected_features)}")
print(f"   Taxa de reten√ß√£o: {retention_pct:.1f}%")

# Features that were removed (matched by column name)
# NOTE(review): if preprocessing renames or expands columns, those would show
# up here as removed -- confirm against the transformed column names.
removed_features = set(original_features) - set(selected_features)
print(f"\n‚ùå FEATURES REMOVIDAS ({len(removed_features)}):")
for i, feature in enumerate(sorted(removed_features), 1):
    print(f"   {i:2d}. {feature}")

print("\nüéØ AN√ÅLISE DAS FEATURES SOBREVIVENTES:")
# ===== END OF NEW CODE =====

# ===== FULL MODEL COMPARISON =====
print("\nü§ñ COMPARA√á√ÉO COMPLETA - TODOS OS MODELOS")
print("=" * 60)

# Catalogue of the 16 candidate models as (model id, human-readable name),
# grouped by family. Only the ids are passed on; the names are documentation.
model_catalog = (
    # Tree-based
    ('rf', 'Random Forest'),
    ('et', 'Extra Trees'),
    ('dt', 'Decision Tree'),
    ('xgboost', 'XGBoost'),
    ('gbc', 'Gradient Boosting'),
    ('ada', 'AdaBoost'),
    # Linear models
    ('lr', 'Logistic Regression'),
    ('ridge', 'Ridge Classifier'),
    ('lda', 'Linear Discriminant Analysis'),
    ('qda', 'Quadratic Discriminant Analysis'),
    # SVM
    ('svm', 'SVM Linear'),
    ('rbfsvm', 'SVM RBF'),
    # Instance-based
    ('knn', 'K-Nearest Neighbors'),
    # Probabilistic
    ('nb', 'Naive Bayes'),
    ('gpc', 'Gaussian Process'),
    # Neural network
    ('mlp', 'Neural Network'),
)
all_models = [model_id for model_id, _ in model_catalog]

# Announce what is about to be compared, one numbered line per model id.
print(f"üéØ Testando {len(all_models)} modelos com setup completo...")
print("üìã Modelos inclu√≠dos:")
print("\n".join(
    f"   {position:2d}. {model_id}"
    for position, model_id in enumerate(all_models, start=1)
))

print("\n‚è≥ Executando compara√ß√£o completa (pode demorar 5-10 minutos)...")
print("üéØ Configura√ß√£o: 10-fold CV + SMOTE + vari√°veis ordinais + feature selection")

# Compare every candidate model; if that fails, retry with a smaller list of
# models. The closing success banner is printed only when one of the two
# comparisons actually completed -- previously it was printed unconditionally,
# even after both attempts had failed.
comparison_ok = False

try:
    # Full comparison with every setting applied
    comparison_full = compare_models(
        include=all_models,
        sort='F1',                    # rank by F1 (imbalanced data)
        n_select=10,                  # keep the top 10 models
        fold=10,                      # 10-fold CV
        verbose=True,                 # show progress
        turbo=False                   # include the slower estimators as well
    )

    print("\nüèÜ COMPARA√á√ÉO COMPLETA FINALIZADA!")
    print("üìä Resultados com TODAS as configura√ß√µes aplicadas:")
    print("   ‚úÖ 10-fold Cross Validation")
    print("   ‚úÖ SMOTE para classe minorit√°ria")
    print("   ‚úÖ Vari√°veis ordinais preservadas")
    print("   ‚úÖ Feature selection aplicada")
    print("   ‚úÖ Multicolinearidade removida")
    print("   ‚úÖ Normaliza√ß√£o aplicada")

    # NOTE(review): compare_models appears to return a bare estimator rather
    # than a list when only one model survives; handle both shapes so the
    # best-model line does not index into a single estimator. Confirm against
    # the installed PyCaret version.
    best_model = comparison_full[0] if isinstance(comparison_full, list) else comparison_full
    print(f"\nü•á MELHOR MODELO: {type(best_model).__name__}")
    comparison_ok = True

except Exception as e:
    print(f"‚ùå ERRO na compara√ß√£o completa: {e}")
    print("üîß Tentando vers√£o reduzida...")

    # Fallback: the more stable models
    stable_models = ['rf', 'xgboost', 'gbc', 'lr', 'nb', 'mlp', 'et', 'ada']

    try:
        comparison_stable = compare_models(
            include=stable_models,
            sort='F1',
            n_select=8,
            fold=10,
            verbose=True
        )
        print("‚úÖ COMPARA√á√ÉO EST√ÅVEL FUNCIONOU!")
        comparison_ok = True

    except Exception as e2:
        # Best effort: report the second failure and let the cell finish.
        print(f"‚ùå ERRO tamb√©m na vers√£o est√°vel: {e2}")

if comparison_ok:
    print(f"\nüéØ COMPARA√á√ÉO COMPLETA FINALIZADA!")
    print("üìà Agora os resultados devem ser MUITO melhores!")

üöÄ PYCARET SETUP COMPLETO - FULL POWER
‚úÖ Dataset: (3643, 39)
üíæ Mem√≥ria dispon√≠vel: 17.6 GB

üîß CONFIGURANDO SETUP COMPLETO (32GB RAM)


Unnamed: 0,Description,Value
0,Session id,123
1,Target,status_nutricional_who
2,Target type,Multiclass
3,Original data shape,"(3643, 39)"
4,Transformed data shape,"(9221, 8)"
5,Transformed train set shape,"(8492, 8)"
6,Transformed test set shape,"(729, 8)"
7,Ignore features,2
8,Ordinal features,8
9,Numeric features,30


‚úÖ SETUP COMPLETO FUNCIONOU!

üìä INFORMA√á√ïES DO SETUP:
   Shape transformado: (2914, 36)
‚ùå ERRO no setup completo: 'ClassificationExperiment' object has no attribute 'fix_imbalance'

ü§ñ COMPARA√á√ÉO COMPLETA - TODOS OS MODELOS
üéØ Testando 16 modelos com setup completo...
üìã Modelos inclu√≠dos:
    1. rf
    2. et
    3. dt
    4. xgboost
    5. gbc
    6. ada
    7. lr
    8. ridge
    9. lda
   10. qda
   11. svm
   12. rbfsvm
   13. knn
   14. nb
   15. gpc
   16. mlp

‚è≥ Executando compara√ß√£o completa (pode demorar 5-10 minutos)...
üéØ Configura√ß√£o: 10-fold CV + SMOTE + vari√°veis ordinais + feature selection


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.6263,0.5398,0.6263,0.5685,0.5921,0.0055,0.0055,0.08
gbc,Gradient Boosting Classifier,0.6136,0.5562,0.6136,0.5828,0.5912,0.0287,0.0304,0.578
rf,Random Forest Classifier,0.5604,0.5387,0.5604,0.5835,0.5709,0.0362,0.0364,0.193
et,Extra Trees Classifier,0.5374,0.5497,0.5374,0.5845,0.5589,0.0375,0.038,0.198
dt,Decision Tree Classifier,0.4705,0.5098,0.4705,0.5743,0.5125,0.0169,0.0177,0.046
ada,Ada Boost Classifier,0.3926,0.5208,0.3926,0.5835,0.4493,0.0269,0.0309,0.074
knn,K Neighbors Classifier,0.3535,0.5326,0.3535,0.595,0.426,0.0286,0.034,0.042
rbfsvm,SVM - Radial Kernel,0.3184,0.5475,0.3184,0.5933,0.3912,0.0284,0.0352,0.787
lda,Linear Discriminant Analysis,0.3154,0.5634,0.3154,0.5865,0.3859,0.0264,0.0323,0.045
lr,Logistic Regression,0.301,0.5664,0.301,0.5889,0.3708,0.028,0.0357,0.042


Processing:   0%|          | 0/78 [00:00<?, ?it/s]

In [None]:
# Check which features survived preprocessing.
# The transformed training frame is the one that reflects feature selection
# and multicollinearity removal; get_config('X_train') is the raw split and
# would list every original column as "selected".
selected_features = get_config('X_train_transformed').columns
print("Features selecionadas:")
print(selected_features.tolist())

# See which ones were removed (original columns minus target/ignored columns,
# compared by name against the surviving columns).
original_features = df_train.drop(columns=['status_nutricional_who', 'id_anon', 'vd_zimc']).columns
removed_features = set(original_features) - set(selected_features)
print(f"\nFeatures removidas ({len(removed_features)}):")
# Sorted so the listing is stable between runs (set iteration order is not).
print(sorted(removed_features))