# In [None]:
# Pré-processamento e Feature Engineering
# Employee Attrition Dataset

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, f_classif
import warnings
warnings.filterwarnings('ignore')

class AttritionPreprocessor:
    """End-to-end preprocessing for the employee-attrition dataset.

    The methods are designed to be called in the order used by
    ``full_preprocessing_pipeline``: load, drop redundant columns, engineer
    features, encode categoricals, cap outliers, scale, select features and
    split.  Every step mutates ``self.df`` in place and returns it.

    NOTE(review): encoding (including target encoding), outlier capping,
    scaling and feature selection are all fitted on the full dataset *before*
    the train/validation/test split, so information from held-out rows leaks
    into the training features.  Keep that in mind when reading hold-out
    metrics produced downstream.
    """

    def __init__(self, random_state=42):
        """Create the preprocessor.

        Args:
            random_state: Seed forwarded to ``train_test_split`` in
                ``split_data``.
        """
        self.random_state = random_state
        self.label_encoders = {}
        self.scaler = StandardScaler()
        self.feature_names = None
        self.target_encoder = LabelEncoder()

    def _refresh_column_lists(self):
        """Recompute ``numeric_cols`` and ``categorical_cols`` from ``self.df``."""
        self.numeric_cols = self.df.select_dtypes(include=[np.number]).columns.tolist()
        # 'category' is included on purpose: pd.cut returns categorical columns,
        # and they must be treated as categorical so they get encoded later.
        self.categorical_cols = self.df.select_dtypes(
            include=['object', 'category']
        ).columns.tolist()

    def load_and_initial_analysis(self, filepath):
        """Load the CSV at *filepath* and print a short quality report.

        Sets ``self.df``, ``self.numeric_cols`` and ``self.categorical_cols``.

        Returns:
            The loaded DataFrame.
        """
        print("="*60)
        print("üîÑ PR√â-PROCESSAMENTO E FEATURE ENGINEERING")
        print("="*60)

        # Load the data
        self.df = pd.read_csv(filepath)
        print(f"\nüìä Dataset carregado: {self.df.shape[0]} linhas, {self.df.shape[1]} colunas")

        # Initial quality report
        print(f"\nüîç An√°lise de qualidade:")
        print(f"- Valores ausentes: {self.df.isnull().sum().sum()}")
        print(f"- Duplicatas: {self.df.duplicated().sum()}")

        # Identify variable types
        self._refresh_column_lists()

        print(f"- Vari√°veis num√©ricas: {len(self.numeric_cols)}")
        print(f"- Vari√°veis categ√≥ricas: {len(self.categorical_cols)}")

        return self.df

    def remove_redundant_columns(self):
        """Drop a fixed list of identifier-like columns plus constant columns.

        A column is considered constant when it has exactly one distinct
        non-null value.

        Returns:
            ``self.df`` after the columns were dropped.
        """
        print(f"\nüóëÔ∏è REMO√á√ÉO DE COLUNAS REDUNDANTES:")

        # Columns with a single distinct value
        unique_counts = self.df.nunique()
        constant_cols = unique_counts[unique_counts == 1].index.tolist()

        # Specific columns to remove
        columns_to_remove = ['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours']
        columns_to_remove.extend(constant_cols)

        # De-duplicate while keeping a deterministic order, and keep only the
        # columns that actually exist in the dataset.
        existing_cols_to_remove = [
            col for col in dict.fromkeys(columns_to_remove) if col in self.df.columns
        ]

        print(f"Removendo colunas: {existing_cols_to_remove}")

        self.df = self.df.drop(columns=existing_cols_to_remove)

        # Keep the column lists in sync with the frame
        self._refresh_column_lists()

        print(f"‚úÖ Dataset ap√≥s remo√ß√£o: {self.df.shape}")

        return self.df

    def create_new_features(self):
        """Add engineered features to ``self.df``.

        Requires the columns ``Age``, ``MonthlyIncome``, ``PercentSalaryHike``,
        ``YearsAtCompany``, ``TotalWorkingYears``, ``YearsSinceLastPromotion``,
        ``NumCompaniesWorked`` and ``DistanceFromHome``.  Features that depend
        on optional columns are only created when all of those columns exist.

        Returns:
            ``self.df`` with the new columns appended.
        """
        print(f"\nüõ†Ô∏è FEATURE ENGINEERING:")

        columns_before = len(self.df.columns)

        # 1. Age bands
        self.df['AgeGroup'] = pd.cut(self.df['Age'],
                                    bins=[0, 25, 35, 45, 55, 100],
                                    labels=['Young', 'Adult', 'Middle', 'Senior', 'Elder'])

        # 2. Salary bands split at the income quartiles
        salary_quartiles = self.df['MonthlyIncome'].quantile([0.25, 0.5, 0.75])
        self.df['SalaryLevel'] = pd.cut(self.df['MonthlyIncome'],
                                       bins=[0, salary_quartiles[0.25], salary_quartiles[0.5],
                                            salary_quartiles[0.75], float('inf')],
                                       labels=['Low', 'Medium', 'High', 'VeryHigh'])

        # 3. Salary hike expressed as an amount (hike percentage x monthly income)
        self.df['SalaryGrowthRate'] = (self.df['PercentSalaryHike'] / 100) * self.df['MonthlyIncome']

        # 4. Years at the company relative to years since age 18.
        #    The denominator is floored at 1 so ages below 18 cannot produce
        #    a division by zero or a negative ratio.
        adult_years = (self.df['Age'] - 18 + 1).clip(lower=1)
        self.df['CompanyTenureRatio'] = self.df['YearsAtCompany'] / adult_years

        # 5. Years at the current company relative to total experience
        self.df['ExperienceRatio'] = self.df['YearsAtCompany'] / (self.df['TotalWorkingYears'] + 1)

        # 6. Recent-promotion flag (promoted within the last year)
        self.df['RecentPromotion'] = (self.df['YearsSinceLastPromotion'] <= 1).astype(int)

        # 7. Career stability (experience per employer)
        self.df['CareerStability'] = self.df['TotalWorkingYears'] / (self.df['NumCompaniesWorked'] + 1)

        # 8. Overall satisfaction score (mean of the satisfaction columns)
        satisfaction_cols = ['JobSatisfaction', 'EnvironmentSatisfaction', 'RelationshipSatisfaction']
        if all(col in self.df.columns for col in satisfaction_cols):
            self.df['OverallSatisfaction'] = self.df[satisfaction_cols].mean(axis=1)

        # 9. High-performer flag (performance rating >= 3 and job satisfaction >= 3).
        #    Both inputs are checked so a missing column skips the feature
        #    instead of raising KeyError.
        if all(col in self.df.columns for col in ('PerformanceRating', 'JobSatisfaction')):
            self.df['HighPerformer'] = ((self.df['PerformanceRating'] >= 3) &
                                       (self.df['JobSatisfaction'] >= 3)).astype(int)

        # 10. Distance split at the median
        distance_median = self.df['DistanceFromHome'].median()
        self.df['DistanceCategory'] = np.where(self.df['DistanceFromHome'] > distance_median,
                                              'Far', 'Near')

        # 11. Workaholic flag (overtime and job involvement >= 3)
        if all(col in self.df.columns for col in ('JobInvolvement', 'OverTime')):
            self.df['Workaholic'] = ((self.df['OverTime'] == 'Yes') &
                                    (self.df['JobInvolvement'] >= 3)).astype(int)

        print(f"‚úÖ Criadas {len(self.df.columns) - columns_before} novas features")

        # Refresh the column lists; this also registers the pd.cut outputs
        # (category dtype) as categorical columns.
        self._refresh_column_lists()

        return self.df

    def handle_categorical_variables(self):
        """Encode the target and every categorical feature.

        ``Attrition`` is label-encoded into ``Attrition_Binary``.  Categorical
        features with more than 5 distinct values are replaced by the mean of
        ``Attrition_Binary`` per category (``<col>_TargetEnc``); the others are
        one-hot encoded with the first level dropped.  The original categorical
        feature columns are removed; ``Attrition`` itself is kept.

        Returns:
            ``self.df`` after encoding.
        """
        print(f"\nüè∑Ô∏è PROCESSAMENTO DE VARI√ÅVEIS CATEG√ìRICAS:")

        # Read the categorical columns from the frame itself rather than from a
        # possibly stale list, and include 'category' dtype so binned columns
        # are encoded too.  The target is handled separately.
        categorical_features = [
            col for col in self.df.select_dtypes(include=['object', 'category']).columns
            if col != 'Attrition'
        ]

        print(f"Vari√°veis categ√≥ricas a processar: {len(categorical_features)}")

        # Encode the target
        self.df['Attrition_Binary'] = self.target_encoder.fit_transform(self.df['Attrition'])

        # Split the features by cardinality
        cardinality = {col: self.df[col].nunique() for col in categorical_features}
        high_cardinality_cols = [col for col in categorical_features if cardinality[col] > 5]
        low_cardinality_cols = [col for col in categorical_features if cardinality[col] <= 5]

        print(f"- Alta cardinalidade (Target Encoding): {high_cardinality_cols}")
        print(f"- Baixa cardinalidade (One-Hot): {low_cardinality_cols}")

        # Target encoding for high-cardinality columns
        for col in high_cardinality_cols:
            target_mean = self.df.groupby(col, observed=True)['Attrition_Binary'].mean()
            # astype(float) guarantees a numeric column even when the source
            # column is categorical.
            self.df[f'{col}_TargetEnc'] = self.df[col].map(target_mean).astype(float)

        # One-hot encoding for low-cardinality columns (single concat)
        dummy_frames = [
            pd.get_dummies(self.df[col], prefix=col, drop_first=True)
            for col in low_cardinality_cols
        ]
        if dummy_frames:
            self.df = pd.concat([self.df, *dummy_frames], axis=1)

        # Remove the original categorical columns (the target is not in the list)
        self.df = self.df.drop(columns=categorical_features)

        self._refresh_column_lists()

        print(f"‚úÖ Processamento categ√≥rico conclu√≠do. Shape: {self.df.shape}")

        return self.df

    def handle_outliers(self, method='iqr', threshold=3):
        """Cap outliers in the numeric feature columns.

        Args:
            method: ``'iqr'`` flags values outside ``[Q1 - 1.5*IQR,
                Q3 + 1.5*IQR]`` and clips to those bounds.  ``'zscore'`` flags
                values whose absolute z-score exceeds *threshold* and, when any
                are found, clips the column to its 1st/99th percentiles.
            threshold: z-score cut-off; only used by ``'zscore'``.

        Columns with at most two distinct values, and (for ``'iqr'``) columns
        whose IQR is zero, are left untouched: their bounds would collapse to a
        single value and capping would erase the column's signal.

        Returns:
            ``self.df`` with capped values.

        Raises:
            ValueError: If *method* is neither ``'iqr'`` nor ``'zscore'``.
        """
        if method not in ('iqr', 'zscore'):
            raise ValueError(f"Unknown outlier method: {method!r} (expected 'iqr' or 'zscore')")

        print(f"\nüéØ TRATAMENTO DE OUTLIERS (m√©todo: {method}):")

        numeric_features = [col for col in self.df.select_dtypes(include=[np.number]).columns
                           if col not in ['Attrition_Binary']]

        outliers_info = {}

        for col in numeric_features:
            series = self.df[col]

            # Binary indicators are not meaningful candidates for capping.
            if series.nunique() <= 2:
                outliers_info[col] = 0
                continue

            if method == 'iqr':
                Q1 = series.quantile(0.25)
                Q3 = series.quantile(0.75)
                IQR = Q3 - Q1
                if IQR == 0:
                    # Degenerate bounds would flatten the whole column.
                    outliers_info[col] = 0
                    continue
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                outliers_mask = (series < lower_bound) | (series > upper_bound)
            else:  # 'zscore'
                z_scores = np.abs((series - series.mean()) / series.std())
                outliers_mask = z_scores > threshold
                lower_bound = series.quantile(0.01)
                upper_bound = series.quantile(0.99)

            outliers_count = outliers_mask.sum()
            outliers_info[col] = outliers_count

            # Cap (winsorize) instead of dropping rows
            if outliers_count > 0:
                self.df[col] = np.clip(series, lower_bound, upper_bound)

        # Summary
        total_outliers = sum(outliers_info.values())
        print(f"Total de outliers identificados: {total_outliers}")

        if total_outliers > 0:
            print("Top 5 vari√°veis com mais outliers:")
            sorted_outliers = sorted(outliers_info.items(), key=lambda x: x[1], reverse=True)
            for col, count in sorted_outliers[:5]:
                if count > 0:
                    print(f"  {col}: {count}")

        return self.df

    def scale_features(self):
        """Standardize every numeric feature column with ``self.scaler``.

        The target columns ``Attrition_Binary`` and ``Attrition`` are excluded.

        Returns:
            ``self.df`` with the scaled columns.
        """
        print(f"\nüìè NORMALIZA√á√ÉO DE FEATURES:")

        # Numeric features, excluding the target
        numeric_features = [col for col in self.df.select_dtypes(include=[np.number]).columns
                           if col not in ['Attrition_Binary', 'Attrition']]

        print(f"Normalizando {len(numeric_features)} features num√©ricas")

        # Apply the StandardScaler
        self.df[numeric_features] = self.scaler.fit_transform(self.df[numeric_features])

        print(f"‚úÖ Normaliza√ß√£o conclu√≠da")

        return self.df

    def select_best_features(self, k=20):
        """Keep the *k* best features according to ``f_classif`` scores.

        Args:
            k: Number of features to keep.  An integer larger than the number
                of available features is reduced to that number.

        Sets ``self.df_selected`` (selected features plus ``Attrition_Binary``)
        and ``self.feature_names``.

        Returns:
            Tuple ``(df_selected, feature_importance)`` where
            ``feature_importance`` has the columns ``Feature`` and ``Score``,
            sorted by descending score.
        """
        print(f"\nüéØ SELE√á√ÉO DE FEATURES (k={k}):")

        # Prepare the data
        X = self.df.drop(['Attrition', 'Attrition_Binary'], axis=1, errors='ignore')
        y = self.df['Attrition_Binary']

        # Never ask for more features than exist
        effective_k = min(k, X.shape[1]) if isinstance(k, int) else k

        # Fit SelectKBest with f_classif
        selector = SelectKBest(score_func=f_classif, k=effective_k)
        selector.fit(X, y)

        # Names and scores of the selected features
        support_mask = selector.get_support()
        selected_features = X.columns[support_mask].tolist()
        feature_scores = selector.scores_[support_mask]

        # Score table
        feature_importance = pd.DataFrame({
            'Feature': selected_features,
            'Score': feature_scores
        }).sort_values('Score', ascending=False)

        print("üèÜ TOP 10 FEATURES SELECIONADAS:")
        print(feature_importance.head(10))

        # Keep only the selected features plus the target
        self.df_selected = self.df[selected_features + ['Attrition_Binary']].copy()
        self.feature_names = selected_features

        print(f"\n‚úÖ Dataset final: {self.df_selected.shape}")

        return self.df_selected, feature_importance

    def split_data(self, test_size=0.2, val_size=0.1):
        """Split ``self.df_selected`` into stratified train/validation/test sets.

        Args:
            test_size: Fraction of all rows placed in the test set.
            val_size: Fraction of all rows placed in the validation set; when
                it is not greater than 0 no validation set is created.

        Returns:
            ``(X_train, X_val, X_test, y_train, y_val, y_test)``; ``X_val`` and
            ``y_val`` are ``None`` when no validation set is created.
        """
        print(f"\nüîÑ DIVIS√ÉO DOS DADOS:")
        print(f"Train: {1-test_size-val_size:.1%} | Validation: {val_size:.1%} | Test: {test_size:.1%}")

        # Prepare X and y
        X = self.df_selected.drop('Attrition_Binary', axis=1)
        y = self.df_selected['Attrition_Binary']

        # First split: train+validation vs test
        X_temp, X_test, y_temp, y_test = train_test_split(
            X, y, test_size=test_size, stratify=y, random_state=self.random_state
        )

        # Second split: train vs validation
        if val_size > 0:
            # Re-express val_size as a fraction of the rows left after the
            # test rows were taken out.
            val_size_adjusted = val_size / (1 - test_size)
            X_train, X_val, y_train, y_val = train_test_split(
                X_temp, y_temp, test_size=val_size_adjusted, stratify=y_temp,
                random_state=self.random_state
            )
        else:
            X_train, X_val, y_train, y_val = X_temp, None, y_temp, None

        print(f"‚úÖ Divis√£o conclu√≠da:")
        print(f"  Train: {X_train.shape}")
        if X_val is not None:
            print(f"  Validation: {X_val.shape}")
        print(f"  Test: {X_test.shape}")

        # Mean of the target in each split
        print(f"\nüìä Distribui√ß√£o do target:")
        print(f"  Train: {y_train.mean():.3f}")
        if y_val is not None:
            print(f"  Validation: {y_val.mean():.3f}")
        print(f"  Test: {y_test.mean():.3f}")

        return X_train, X_val, X_test, y_train, y_val, y_test

    def full_preprocessing_pipeline(self, filepath):
        """Run every preprocessing step on the CSV at *filepath*.

        Returns:
            Dict with the keys ``X_train``, ``X_val``, ``X_test``, ``y_train``,
            ``y_val``, ``y_test``, ``feature_importance`` and ``preprocessor``
            (this instance).
        """
        print("üöÄ EXECUTANDO PIPELINE COMPLETO DE PR√â-PROCESSAMENTO")
        print("="*70)

        # 1. Load the data
        self.load_and_initial_analysis(filepath)

        # 2. Remove redundant columns
        self.remove_redundant_columns()

        # 3. Feature engineering
        self.create_new_features()

        # 4. Encode categoricals
        self.handle_categorical_variables()

        # 5. Cap outliers
        self.handle_outliers()

        # 6. Scale features
        self.scale_features()

        # 7. Select features
        df_final, feature_importance = self.select_best_features()

        # 8. Split the data
        X_train, X_val, X_test, y_train, y_val, y_test = self.split_data()

        print("\nüéâ PR√â-PROCESSAMENTO CONCLU√çDO COM SUCESSO!")
        print("="*70)

        # Return the processed data
        return {
            'X_train': X_train, 'X_val': X_val, 'X_test': X_test,
            'y_train': y_train, 'y_val': y_val, 'y_test': y_test,
            'feature_importance': feature_importance,
            'preprocessor': self
        }

    def save_processed_data(self, results, output_dir='processed_data/'):
        """Write the splits and the feature scores in *results* to CSV files.

        Args:
            results: Dict shaped like the return value of
                ``full_preprocessing_pipeline``.
            output_dir: Target directory, created when missing.  A trailing
                path separator is optional.
        """
        import os

        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        def target_path(filename):
            # os.path.join keeps the files inside output_dir whether or not
            # the caller passed a trailing separator.
            return os.path.join(output_dir, filename)

        # Save the datasets
        results['X_train'].to_csv(target_path('X_train.csv'), index=False)
        results['X_test'].to_csv(target_path('X_test.csv'), index=False)
        results['y_train'].to_csv(target_path('y_train.csv'), index=False)
        results['y_test'].to_csv(target_path('y_test.csv'), index=False)

        if results['X_val'] is not None:
            results['X_val'].to_csv(target_path('X_val.csv'), index=False)
            results['y_val'].to_csv(target_path('y_val.csv'), index=False)

        # Save the feature scores
        results['feature_importance'].to_csv(target_path('feature_importance.csv'), index=False)

        print(f"üíæ Dados processados salvos em: {output_dir}")


# ========================================
# EXEMPLO DE USO
# ========================================

if __name__ == "__main__":

    # Build the preprocessor with a fixed seed, run every step on the raw
    # CSV, then persist the resulting splits and feature scores.
    preprocessor = AttritionPreprocessor(random_state=42)
    results = preprocessor.full_preprocessing_pipeline('IBM_Fn-UseC_-HR-Employee-Attrition.csv')
    preprocessor.save_processed_data(results)

    # Short recap of what was produced.
    print(
        "\nüìù RESUMO DOS DADOS PROCESSADOS:",
        f"Features selecionadas: {len(results['feature_importance'])}",
        f"Tamanho do conjunto de treino: {results['X_train'].shape}",
        f"Taxa de attrition no treino: {results['y_train'].mean():.3f}",
        sep="\n",
    )

    # Suggested follow-up work.
    print(
        "\nüéØ PR√ìXIMOS PASSOS:",
        "1. Executar modelagem com diferentes algoritmos",
        "2. Otimizar hiperpar√¢metros",
        "3. Avaliar interpretabilidade com SHAP",
        "4. Criar dashboard interativo",
        sep="\n",
    )