## 1. Importação de Bibliotecas e Configurações

In [1]:
# Importações essenciais
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.impute import KNNImputer
from sklearn.model_selection import StratifiedGroupKFold, train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')


# Display configuration: default figure size, seaborn grid style, and no
# limit on the number of columns / line width when pandas prints a frame.
plt.rcParams['figure.figsize'] = [12, 8]
sns.set_style("whitegrid")
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)


## 2. Carregamento e Análise Inicial dos Dados

In [3]:
# Load the original sepsis dataset (path is relative to the working directory).
print("Carregando dataset de sepsis...")
df = pd.read_csv('dataset_sepsis_train.csv')

print(f"Shape do dataset: {df.shape}")
print(f"Total de registros: {len(df):,}")

# Schema overview. DataFrame.info() writes its report to stdout itself and
# returns None, so it is called directly: wrapping it in print() would append
# a stray "None" line after the report.
print("\nInforma√ß√µes do Dataset:")
df.info()

Carregando dataset de sepsis...
Shape do dataset: (1241768, 42)
Total de registros: 1,241,768

Informa√ß√µes do Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1241768 entries, 0 to 1241767
Data columns (total 42 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   Hour              1241768 non-null  int64  
 1   HR                1119123 non-null  float64
 2   O2Sat             1079708 non-null  float64
 3   Temp              419945 non-null   float64
 4   SBP               1060857 non-null  float64
 5   MAP               1087236 non-null  float64
 6   DBP               852691 non-null   float64
 7   Resp              1051181 non-null  float64
 8   EtCO2             46047 non-null    float64
 9   BaseExcess        67324 non-null    float64
 10  HCO3              52334 non-null    float64
 11  FiO2              103618 non-null   float64
 12  pH                86094 non-null    float64
 13  PaCO2             69132 no

In [None]:
# Missing-value analysis: per-column table plus a heatmap of the null pattern.
print("AN√ÅLISE DE VALORES FALTANTES:")
print("=" * 50)

# Count nulls once and reuse the result for both the count and the percentage.
missing_count = df.isnull().sum()
missing_stats = pd.DataFrame({
    'Coluna': df.columns,
    'Missing_Count': missing_count,
    'Missing_Percent': (missing_count / len(df)) * 100
})

# Keep only columns that actually have gaps, worst first.
missing_stats = missing_stats[missing_stats['Missing_Count'] > 0].sort_values('Missing_Percent', ascending=False)
print(missing_stats.to_string(index=False))

# Heatmap of the missingness pattern. Drawing one cell per (row, column) for
# the full frame is prohibitively slow and memory hungry on a dataset of this
# size, so the plot uses an evenly spaced subset of rows (original row order
# is preserved). Frames that are already small are plotted in full.
HEATMAP_MAX_ROWS = 5_000
row_step = max(1, len(df) // HEATMAP_MAX_ROWS)
heatmap_rows = df.iloc[::row_step]

plt.figure(figsize=(12, 8))
sns.heatmap(heatmap_rows.isnull(), cbar=True, yticklabels=False, cmap='viridis')
plt.title('Padr√£o de Valores Faltantes no Dataset', fontsize=16, fontweight='bold')
plt.xlabel('Colunas')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 3. Criação do Identificador de Paciente

In [None]:
# Build a PATIENT_ID column from Gender + Age (rounded to one decimal).
# NOTE(review): two different patients with the same gender and age receive
# the same identifier, so this is a grouping key rather than a guaranteed
# unique patient id -- confirm whether the raw data offers a real one.
print("üë§ CRIANDO IDENTIFICADOR DE PACIENTE")
print("=" * 45)

if not ('Gender' in df.columns and 'Age' in df.columns):
    # Fallback: one sequential id per row when the demographics are absent.
    print("‚ùå Colunas Gender ou Age n√£o encontradas!")
    df['PATIENT_ID'] = range(len(df))
    print("üîÑ Usando ID sequencial como fallback")
else:
    df['Age_Rounded'] = df['Age'].round(1)
    df['PATIENT_ID'] = df['Gender'].astype(str) + '_' + df['Age_Rounded'].astype(str)

    unique_patients = df['PATIENT_ID'].nunique()
    print(f"‚úÖ PATIENT_ID criado com sucesso!")
    print(f"üë• Total de pacientes √∫nicos: {unique_patients:,}")
    print(f"üìä Registros por paciente (m√©dia): {len(df) / unique_patients:.1f}")

    # Number of rows attached to each identifier.
    patient_counts = df['PATIENT_ID'].value_counts()

    print(f"\nüìà Estat√≠sticas de registros por paciente:")
    print(f"   M√≠nimo: {patient_counts.min()} registros")
    print(f"   M√°ximo: {patient_counts.max()} registros")
    print(f"   Mediana: {patient_counts.median():.1f} registros")
    print(f"   M√©dia: {patient_counts.mean():.1f} registros")

In [None]:
# Demographic overview: gender split, age summary and age-by-gender plots.
if 'Gender' in df.columns:
    print("\nüë• AN√ÅLISE DEMOGR√ÅFICA:")
    print("=" * 25)

    # Row count and share of rows for each gender value.
    gender_dist = df['Gender'].value_counts()
    print("Distribui√ß√£o por G√™nero:")
    for gender_value, n_rows in gender_dist.items():
        share = (n_rows / len(df)) * 100
        print(f"  {gender_value}: {n_rows:,} ({share:.1f}%)")

    if 'Age' in df.columns:
        age_values = df['Age']

        print(f"\nüìä Estat√≠sticas de Idade:")
        print(f"   M√≠nima: {age_values.min():.1f} anos")
        print(f"   M√°xima: {age_values.max():.1f} anos")
        print(f"   M√©dia: {age_values.mean():.1f} anos")
        print(f"   Mediana: {age_values.median():.1f} anos")

        # Side-by-side panels: age histogram split by gender, and a boxplot.
        fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 6))

        sns.histplot(data=df, x='Age', hue='Gender', bins=30, alpha=0.7, ax=ax_hist)
        ax_hist.set_title('Distribui√ß√£o de Idade por G√™nero', fontweight='bold')
        ax_hist.set_xlabel('Idade')
        ax_hist.set_ylabel('Frequ√™ncia')

        sns.boxplot(data=df, x='Gender', y='Age', ax=ax_box)
        ax_box.set_title('Boxplot de Idade por G√™nero', fontweight='bold')
        ax_box.set_ylabel('Idade')

        plt.tight_layout()
        plt.show()

## 4. Análise da Variável Target

In [None]:
# Target (SepsisLabel) analysis at row level and at PATIENT_ID level.
print("üéØ AN√ÅLISE DA VARI√ÅVEL TARGET (SepsisLabel)")
print("=" * 45)

if 'SepsisLabel' not in df.columns:
    print("‚ùå Coluna SepsisLabel n√£o encontrada!")
else:
    # Row-level class counts, ordered by label value.
    target_dist = df['SepsisLabel'].value_counts().sort_index()

    print("Distribui√ß√£o da Vari√°vel Target:")
    for label, count in target_dist.items():
        status = "Sem Sepsis" if label == 0 else "Com Sepsis"
        percentage = (count / len(df)) * 100
        print(f"  {status} ({label}): {count:,} ({percentage:.1f}%)")

    # Majority-to-minority ratio of the row counts.
    imbalance_ratio = target_dist.max() / target_dist.min()
    print(f"\n‚öñÔ∏è  Taxa de Desbalanceamento: {imbalance_ratio:.1f}:1")

    # Per-identifier view: max() is 1 when any row of that id is labelled 1.
    patient_sepsis = df.groupby('PATIENT_ID')['SepsisLabel'].max()
    patient_target_dist = patient_sepsis.value_counts().sort_index()

    print(f"\nüë• Distribui√ß√£o por Paciente √önico:")
    for label, count in patient_target_dist.items():
        status = "Nunca teve Sepsis" if label == 0 else "Teve Sepsis"
        percentage = (count / len(patient_sepsis)) * 100
        print(f"  {status}: {count:,} pacientes ({percentage:.1f}%)")

    # Two bar charts driven by one spec per panel:
    # (distribution, bar colours, title, x label, y label, tick labels)
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    panel_specs = [
        (target_dist, ['lightblue', 'lightcoral'],
         'Distribui√ß√£o Geral - SepsisLabel', 'SepsisLabel', 'Contagem',
         ['Sem Sepsis', 'Com Sepsis']),
        (patient_target_dist, ['lightgreen', 'salmon'],
         'Distribui√ß√£o por Paciente √önico', 'Status de Sepsis', 'N√∫mero de Pacientes',
         ['Nunca teve', 'Teve Sepsis']),
    ]
    for ax, (dist, colors, title, x_label, y_label, tick_labels) in zip(axes, panel_specs):
        dist.plot(kind='bar', ax=ax, color=colors, alpha=0.8)
        ax.set_title(title, fontweight='bold')
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_xticklabels(tick_labels, rotation=0)
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 5. Imputação Inteligente por Paciente

In [None]:
# Numeric columns eligible for imputation: every numeric dtype column except
# the identifier, the target and the helper rounding column.
cols_to_exclude = ['PATIENT_ID', 'SepsisLabel', 'Age_Rounded']
numeric_cols = [
    col
    for col in df.select_dtypes(include=[np.number]).columns.tolist()
    if col not in cols_to_exclude
]

print("üîÑ IMPUTA√á√ÉO INTELIGENTE POR PACIENTE")
print("=" * 40)
print(f"üìä Colunas num√©ricas para imputa√ß√£o: {len(numeric_cols)}")
# Show only a five-column preview when the list is long.
if len(numeric_cols) > 5:
    print(f"üìã Colunas: {numeric_cols[:5]}...")
else:
    print(f"üìã Colunas: {numeric_cols}")

# Work on a copy so the raw frame stays available for before/after comparison.
df_imputed = df.copy()

# Função de imputação por paciente
def impute_patient_data(patient_data, cols=None):
    """
    Fill gaps in one patient's rows using that patient's own observations.

    For every requested column present in ``patient_data``:
      1. forward fill (carry the last valid value down), then
      2. backward fill (carry the next valid value up, which covers leading gaps).

    A column that has no valid value at all for this patient is left as NaN.
    The previous implementation also attempted a "patient mean" fill at this
    point, but that step could never act: after forward + backward fill a
    column still contains NaN only when it is entirely NaN, and then its mean
    is NaN as well. Such columns are left for a later, global strategy.

    Uses ``Series.ffill()`` / ``Series.bfill()`` instead of the deprecated
    ``fillna(method=...)`` form.

    Parameters
    ----------
    patient_data : pandas.DataFrame
        Rows belonging to a single patient, in the order they should be
        filled. NOTE(review): rows are assumed to be in chronological order
        already; nothing here sorts them -- confirm against the caller.
    cols : list of str, optional
        Columns to impute. Defaults to the notebook-level ``numeric_cols``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``patient_data`` with the gaps filled; the input is not
        modified.
    """
    if cols is None:
        cols = numeric_cols

    patient_imputed = patient_data.copy()

    for col in cols:
        if col in patient_imputed.columns:
            patient_imputed[col] = patient_imputed[col].ffill().bfill()

    return patient_imputed

# Run the per-patient imputation over every PATIENT_ID group.
print("\nüîÑ Aplicando imputa√ß√£o por paciente...")

total_patients = df_imputed['PATIENT_ID'].nunique()
imputed_patients = []

for position, (_, patient_rows) in enumerate(df_imputed.groupby('PATIENT_ID'), start=1):
    # Progress line for the 1st, 1001st, 2001st, ... group.
    if (position - 1) % 1000 == 0:
        print(f"   Processando paciente {position}/{total_patients} ({position/total_patients*100:.1f}%)")

    imputed_patients.append(impute_patient_data(patient_rows))

# Stitch the groups back together; rows come out in group order with a fresh
# 0..n-1 index.
df_imputed = pd.concat(imputed_patients, ignore_index=True)

print(f"‚úÖ Imputa√ß√£o por paciente conclu√≠da!")

In [None]:
# Missing-value counts before vs. after the per-patient imputation.
print("üìä COMPARA√á√ÉO: ANTES vs DEPOIS DA IMPUTA√á√ÉO POR PACIENTE")
print("=" * 60)

missing_before = df[numeric_cols].isnull().sum()
missing_after = df_imputed[numeric_cols].isnull().sum()

# Per-column absolute and relative reduction. Columns that had nothing missing
# produce a NaN percentage here and are filtered out just below.
filled = missing_before.values - missing_after.values
filled_pct = (filled / missing_before.values * 100).round(1)

comparison = pd.DataFrame({
    'Coluna': numeric_cols,
    'Missing_Antes': missing_before.values,
    'Missing_Depois': missing_after.values,
    'Redu√ß√£o': filled,
    'Redu√ß√£o_%': filled_pct
})

# Only columns that originally had gaps, largest relative reduction first.
had_missing = comparison['Missing_Antes'] > 0
comparison_filtered = comparison[had_missing].sort_values('Redu√ß√£o_%', ascending=False)

if len(comparison_filtered) == 0:
    print("‚úÖ Nenhum valor faltante encontrado nas colunas num√©ricas!")
else:
    print(comparison_filtered.to_string(index=False))

    total_missing_before = missing_before.sum()
    total_missing_after = missing_after.sum()
    total_reduction = ((total_missing_before - total_missing_after) / total_missing_before * 100)

    print(f"\nüìà RESUMO GERAL:")
    print(f"   Total missing antes: {total_missing_before:,}")
    print(f"   Total missing depois: {total_missing_after:,}")
    print(f"   Redu√ß√£o total: {total_reduction:.1f}%")

## 6. Imputação Global para Valores Remanescentes

In [None]:
# Global fallback for values the per-patient pass could not fill.
print("üåê IMPUTA√á√ÉO GLOBAL PARA VALORES REMANESCENTES")
print("=" * 48)

# Columns that still contain NaN, with their remaining counts.
remaining_missing = df_imputed[numeric_cols].isnull().sum()
cols_with_missing = remaining_missing[remaining_missing > 0]

if len(cols_with_missing) == 0:
    print("‚úÖ Nenhum valor faltante remanescente - imputa√ß√£o por paciente foi suficiente!")
else:
    print(f"üìã Colunas com valores ainda faltantes: {len(cols_with_missing)}")
    print(cols_with_missing.to_string())

    print("\nüîÑ Aplicando estrat√©gias de imputa√ß√£o global...")

    has_gender = 'Gender' in df_imputed.columns

    for col, missing_count in cols_with_missing.items():
        # Strategy 1: fill each gender's gaps with that gender's median.
        if has_gender:
            gender_medians = df_imputed.groupby('Gender')[col].median()
            for gender, gender_median in gender_medians.items():
                mask = (df_imputed['Gender'] == gender) & (df_imputed[col].isnull())
                df_imputed.loc[mask, col] = gender_median

        # Strategy 2: whatever is still missing gets the overall median.
        global_median = df_imputed[col].median()
        df_imputed[col] = df_imputed[col].fillna(global_median)

        print(f"   ‚úÖ {col}: {missing_count} valores imputados")

    # Final check across all numeric columns.
    final_missing = df_imputed[numeric_cols].isnull().sum().sum()
    print(f"\nüìä Valores faltantes ap√≥s imputa√ß√£o global: {final_missing}")

## 7. Feature Engineering

In [None]:
# Feature engineering, part 1: per-patient summary statistics.
print("‚öôÔ∏è  FEATURE ENGINEERING")
print("=" * 25)

df_engineered = df_imputed.copy()

# 1. Per-patient statistics
print("üìä Criando estat√≠sticas por paciente...")

# mean / std / min / max / count of every numeric column within each PATIENT_ID.
patient_stats = df_engineered.groupby('PATIENT_ID')[numeric_cols].agg(
    ['mean', 'std', 'min', 'max', 'count']
)

# Flatten the (column, statistic) MultiIndex into "<column>_<statistic>" names.
patient_stats.columns = ['_'.join(col).strip() for col in patient_stats.columns]

# The sample standard deviation of a single observation is undefined and comes
# back as NaN, which would re-introduce missing values right after imputation
# (with the sequential-id fallback every patient has exactly one row, so every
# *_std feature would be entirely NaN). A lone observation has no spread, so
# record 0.0 for those patients. Columns with no observation at all (count 0)
# are deliberately left as NaN.
for col in numeric_cols:
    single_observation = patient_stats[f'{col}_count'] == 1
    patient_stats.loc[single_observation, f'{col}_std'] = 0.0

# Broadcast the per-patient statistics back onto every row of that patient.
df_engineered = df_engineered.merge(patient_stats, on='PATIENT_ID', how='left')

print(f"   ‚úÖ {len(patient_stats.columns)} estat√≠sticas por paciente criadas")

# 2. Trend features (meaningful when a patient has several measurements)
print("\nüìà Criando features de tend√™ncia...")

def calculate_trends(patient_data):
    """
    Compute a simple trend per column for one patient's rows.

    Only the first five entries of the notebook-level ``numeric_cols`` are
    used, to keep the number of generated features small. For each of them
    the trend is the last non-null value minus the first non-null value; it
    is 0 when the column is absent, when the patient has a single row, or
    when fewer than two non-null values exist.

    Returns a Series indexed by ``"<column>_trend"``.
    """
    multiple_rows = len(patient_data) > 1
    trends = {}

    for col in numeric_cols[:5]:
        delta = 0
        if multiple_rows and col in patient_data.columns:
            observed = patient_data[col].dropna()
            if len(observed) > 1:
                # Net change between the first and last available measurement.
                delta = observed.iloc[-1] - observed.iloc[0]
        trends[f'{col}_trend'] = delta

    return pd.Series(trends)

# One row of trend features per PATIENT_ID, with the id restored as a column.
patient_trends = df_engineered.groupby('PATIENT_ID').apply(calculate_trends)
patient_trends = patient_trends.reset_index()

# Attach the trends to every row of the matching patient.
df_engineered = df_engineered.merge(patient_trends, on='PATIENT_ID', how='left')

trend_feature_count = sum(1 for col in patient_trends.columns if '_trend' in col)
print(f"   ‚úÖ {trend_feature_count} features de tend√™ncia criadas")

# 3. Risk features
print("\nüè• Criando features de risco m√©dico...")

# Age-based features: a binary flag for age above 65 and a five-band age group.
if 'Age' in df_engineered.columns:
    age_bins = [0, 18, 35, 50, 65, 100]
    age_labels = ['Child', 'Young_Adult', 'Adult', 'Middle_Age', 'Senior']
    df_engineered['Age_Risk'] = (df_engineered['Age'] > 65).astype(int)
    df_engineered['Age_Group'] = pd.cut(df_engineered['Age'], bins=age_bins, labels=age_labels)

# Vital signs to combine, restricted to those actually present in the frame.
vital_signs = ['HR', 'Temp', 'SBP', 'MAP', 'Resp']
available_vitals = [col for col in vital_signs if col in df_engineered.columns]

if len(available_vitals) >= 2:
    # Instability score: sum of absolute z-scores across the available vitals.
    scaler = StandardScaler()
    vitals_scaled = scaler.fit_transform(df_engineered[available_vitals])
    df_engineered['Vitals_Instability_Score'] = np.abs(vitals_scaled).sum(axis=1)

    print(f"   ‚úÖ Score de instabilidade criado usando {len(available_vitals)} sinais vitais")

print(f"\nüìä Shape final ap√≥s Feature Engineering: {df_engineered.shape}")
print(f"üìà Novas features criadas: {df_engineered.shape[1] - df_imputed.shape[1]}")

## 8. Detecção e Tratamento de Outliers

In [None]:
# IQR-based outlier detection and treatment.
print("üîç DETEC√á√ÉO E TRATAMENTO DE OUTLIERS", "=" * 40, sep="\n")

# Treated copy; the engineered frame itself is left unchanged.
df_no_outliers = df_engineered.copy()

# Original numeric columns that are still present in the working frame.
original_numeric_cols = list(filter(lambda col: col in df_no_outliers.columns, numeric_cols))

def detect_outliers_iqr(data, column, multiplier=1.5):
    """
    Flag outliers in ``data[column]`` with the IQR rule.

    A value is an outlier when it lies below ``Q1 - multiplier * IQR`` or
    above ``Q3 + multiplier * IQR``, where Q1 / Q3 are the 25th / 75th
    percentiles of the column. Returns a boolean Series aligned with ``data``.
    """
    series = data[column]
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    spread = third_quartile - first_quartile

    too_low = series < first_quartile - multiplier * spread
    too_high = series > third_quartile + multiplier * spread
    return too_low | too_high

outlier_summary = []

# Only the first 10 numeric columns are analysed.
for col in original_numeric_cols[:10]:
    outlier_count = detect_outliers_iqr(df_no_outliers, col).sum()
    outlier_percent = (outlier_count / len(df_no_outliers)) * 100

    outlier_summary.append(
        dict(Coluna=col, Outliers=outlier_count, Porcentagem=outlier_percent)
    )

    # Cap (winsorize) only when outliers make up between 0.5% and 10% of rows.
    if 0.5 < outlier_percent < 10:
        q25 = df_no_outliers[col].quantile(0.25)
        q75 = df_no_outliers[col].quantile(0.75)
        iqr = q75 - q25

        # Clip values to the 1.5 * IQR fences.
        df_no_outliers[col] = df_no_outliers[col].clip(
            lower=q25 - 1.5 * iqr,
            upper=q75 + 1.5 * iqr,
        )

        print(f"   üîß {col}: {outlier_count} outliers ({outlier_percent:.1f}%) tratados por capping")

# Summary table of the columns in which at least one outlier was found.
outlier_df = pd.DataFrame(outlier_summary)
outlier_df = outlier_df[outlier_df['Outliers'] > 0].sort_values('Porcentagem', ascending=False)

if outlier_df.empty:
    print("‚úÖ Nenhum outlier significativo detectado!")
else:
    print("\nüìä Resumo de Outliers Detectados:")
    print(outlier_df.to_string(index=False, float_format='%.2f'))

## 9. Normalização e Escalonamento

In [None]:
# Scaling of the numeric features.
print("üìè NORMALIZA√á√ÉO E ESCALONAMENTO")
print("=" * 35)

df_scaled = df_no_outliers.copy()

# Numeric columns to scale: everything numeric except identifiers, the target
# and the categorical / helper columns listed here.
columns_to_exclude = ['PATIENT_ID', 'SepsisLabel', 'Gender', 'Age_Group', 'Age_Rounded']
numeric_features = [
    col
    for col in df_scaled.select_dtypes(include=[np.number]).columns.tolist()
    if col not in columns_to_exclude
]

print(f"üìä Colunas num√©ricas para escalonamento: {len(numeric_features)}")

# RobustScaler is used because it is less sensitive to outliers.
# NOTE(review): the scaler is fit on the full frame, i.e. before the
# train/test split performed later -- confirm this is intended.
scaler = RobustScaler()
scaled_values = scaler.fit_transform(df_scaled[numeric_features])
df_scaled[numeric_features] = scaled_values

print(f"‚úÖ Escalonamento aplicado usando RobustScaler")
print(f"üìà Features escalonadas: {len(numeric_features)}")

# Sanity check on the first five scaled columns.
print("\nüìä Estat√≠sticas ap√≥s escalonamento (primeiras 5 colunas):")
preview_cols = numeric_features[:5]
print(df_scaled[preview_cols].describe().round(3))

## 10. Divisão dos Dados

In [None]:
# Stratified train/test split that keeps each patient on one side only.
print("üîÄ DIVIS√ÉO DOS DADOS")
print("=" * 20)

# Prepare features and target
if 'SepsisLabel' in df_scaled.columns:
    # Drop the target, the identifier and the helper rounding column.
    feature_columns = [col for col in df_scaled.columns
                      if col not in ['SepsisLabel', 'PATIENT_ID', 'Age_Rounded']]

    X = df_scaled[feature_columns]
    y = df_scaled['SepsisLabel']

    print(f"üìä Shape das features (X): {X.shape}")
    print(f"üéØ Shape do target (y): {y.shape}")

    # A row-level random split would place rows of the same PATIENT_ID in both
    # sets. Those rows share identical per-patient aggregate features, so the
    # test set would not be independent of the training set. Split by patient
    # instead, still stratifying on the target: one fold out of TEST_FOLDS is
    # held out, which gives approximately (not exactly) 20% of the rows.
    TEST_FOLDS = 5
    groups = df_scaled['PATIENT_ID'] if 'PATIENT_ID' in df_scaled.columns else None

    if groups is not None and groups.nunique() >= TEST_FOLDS:
        splitter = StratifiedGroupKFold(n_splits=TEST_FOLDS, shuffle=True, random_state=42)
        train_idx, test_idx = next(splitter.split(X, y, groups=groups))
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    else:
        # No usable grouping key (or too few groups): fall back to the plain
        # stratified row-level split.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=0.2,
            stratify=y,
            random_state=42
        )

    print(f"\nüìà Divis√£o realizada:")
    print(f"   Treino: {X_train.shape[0]:,} amostras ({X_train.shape[0]/len(X)*100:.1f}%)")
    print(f"   Teste:  {X_test.shape[0]:,} amostras ({X_test.shape[0]/len(X)*100:.1f}%)")

    # Class balance in each set; .get() avoids a KeyError when a class is
    # absent from one side.
    train_dist = y_train.value_counts(normalize=True).sort_index()
    test_dist = y_test.value_counts(normalize=True).sort_index()

    print(f"\n‚öñÔ∏è  Distribui√ß√£o do target:")
    print(f"   Treino - Sem Sepsis: {train_dist.get(0, 0.0):.1%}, Com Sepsis: {train_dist.get(1, 0.0):.1%}")
    print(f"   Teste  - Sem Sepsis: {test_dist.get(0, 0.0):.1%}, Com Sepsis: {test_dist.get(1, 0.0):.1%}")

else:
    print("‚ùå Coluna SepsisLabel n√£o encontrada para divis√£o!")

## 11. Salvamento dos Dados Processados

In [None]:
# Persist the processed datasets and a small feature inventory.
print("üíæ SALVAMENTO DOS DADOS PROCESSADOS")
print("=" * 38)

# Full processed dataset.
output_file = 'dataset_sepsis_prepared_v2.csv'
df_scaled.to_csv(output_file, index=False)
print(f"‚úÖ Dataset completo salvo: {output_file}")
print(f"   Shape: {df_scaled.shape}")

if 'SepsisLabel' in df_scaled.columns:
    train_file = 'dataset_sepsis_train_v2.csv'
    test_file = 'dataset_sepsis_test_v2.csv'

    # Re-attach the target to each feature matrix before writing.
    train_data = pd.concat([X_train, y_train], axis=1)
    test_data = pd.concat([X_test, y_test], axis=1)

    train_data.to_csv(train_file, index=False)
    test_data.to_csv(test_file, index=False)

    print(f"‚úÖ Dataset de treino salvo: {train_file}")
    print(f"   Shape: {train_data.shape}")
    print(f"‚úÖ Dataset de teste salvo: {test_file}")
    print(f"   Shape: {test_data.shape}")

    # Feature inventory: dtype plus missing counts in the raw and final frames.
    # Features that do not exist in the raw frame report 0 original gaps.
    feature_types = [str(X[col].dtype) for col in feature_columns]
    missing_original = [
        df[col].isnull().sum() if col in df.columns else 0
        for col in feature_columns
    ]
    missing_final = [df_scaled[col].isnull().sum() for col in feature_columns]

    feature_info = pd.DataFrame({
        'Feature': feature_columns,
        'Type': feature_types,
        'Missing_Original': missing_original,
        'Missing_Final': missing_final
    })

    feature_info.to_csv('features_info_v2.csv', index=False)
    print(f"‚úÖ Informa√ß√µes das features salvas: features_info_v2.csv")

print(f"\nüéâ PROCESSAMENTO CONCLU√çDO COM SUCESSO!")
print(f"üìä Total de features: {len(feature_columns) if 'feature_columns' in locals() else 'N/A'}")
print(f"üè• Total de registros processados: {len(df_scaled):,}")
print(f"üë• Total de pacientes √∫nicos: {df_scaled['PATIENT_ID'].nunique():,}")