# EDA

1. __Cargar los datos__

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import balanced_accuracy_score, accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve


<span style="color:pink">Definir la ruta del archivo y cargar dataset</span>


In [None]:
# Location of the training CSV, relative to the notebook's working directory.
file_path = "../data/train_test/attrition_availabledata_29.csv"

# Load the whole file into a DataFrame using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=file_path)

2. __Información general del dataset__

<span style="color:pink"> Ver las primeras filas del dataset</span>

In [None]:
# Preview the first five rows (five is also the default for head()).
df.head(n=5)

<span style="color:pink">Información general sobre las variables</span>

In [None]:
# Print a concise summary of the DataFrame: columns, non-null counts and dtypes.
df.info()

3. __Identificar el número de variables e instancias__

In [None]:
# Rows are instances, columns are variables.
num_filas = len(df)
num_columnas = len(df.columns)

print(f"Número de instancias: {num_filas}")
print(f"Número de variables: {num_columnas}")

4. __Clasificación de variables__

<span style="color:pink">Qué variables son categóricas/ordinales/numéricas</span>

In [None]:
# Split the columns by dtype so each group can get its own preprocessing.
# Text columns (dtype "object") are treated as categorical.
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# Select every numeric dtype rather than only int64/float64: a numeric column
# stored with another width (e.g. int32 or float32) would otherwise end up in
# neither list and be silently left out of the downstream preprocessing.
numerical_cols = df.select_dtypes(include='number').columns.tolist()

print(f"Variables categóricas: {categorical_cols}")
print(f"Variables numéricas: {numerical_cols}")

5. __Detección de valores nulos__

<span style="color:pink"> Contar valores nulos por columna</span>

In [None]:
# Count nulls per column and keep only the columns that actually have some.
missing_values = df.isna().sum().loc[lambda counts: counts > 0]
print(f"Valores faltantes: \n{missing_values}")

6. __Identificar columnas constantes o IDs__

<span style="color:pink"> Identificar columnas con un solo valor</span>

In [None]:
# A column with exactly one distinct non-null value carries no information.
# DataFrame.nunique() counts distinct values per column in a single pass.
constant_cols = df.nunique().loc[lambda n_unique: n_unique == 1].index.tolist()
print(f"Columnas constantes: {constant_cols}")

<span style="color:pink"> Eliminar las variables constantes </span>

In [None]:
# Columns to remove from the dataset before modelling.
cols_to_drop = ['EmployeeCount', 'StandardHours']

# errors='ignore' skips any listed column that is not present, so the cell can
# be re-run safely after the columns have already been removed.
df = df.drop(columns=cols_to_drop, errors='ignore')

7. __Análisis de balance de clases__ (si es un problema de clasificación)

<span style="color:pink">Verificar si el dataset está desbalanceado</span>

In [None]:
# Only inspect the class balance when the target column is present.
if 'Attrition' in df.columns:
    # Bar chart with one bar per class of the target.
    plt.figure(figsize=(4, 4))
    sns.countplot(
        data=df,
        x='Attrition',
        hue='Attrition',  # colour each bar by its class
        palette='pastel',
        legend=False,     # the x axis already labels the classes
    )
    plt.title("Distribución de la variable objetivo")
    plt.show()

    # Share of each class, expressed as a percentage.
    class_share = df['Attrition'].value_counts(normalize=True)
    print(class_share * 100)

8. __Estadísticas descriptivas__

<span style= "color: pink">Estadísticas básicas de las variables numéricas</span>

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

9. __Visualización de la distribución de variables numéricas__

<span style= "color:pink">Histogramas para visualizar la distribución de las variables numéricas</span>

In [None]:

# Some numeric columns may have been dropped since the list was built,
# so keep only the names that still exist in the DataFrame.
numerical_cols = [name for name in numerical_cols if name in df.columns]

# One histogram per remaining numeric column, 20 bins each.
df.loc[:, numerical_cols].hist(bins=20, figsize=(15, 30))
plt.suptitle("Distribución de Variables Numéricas")
plt.show()

# Evaluation Strategy

<span style= "color:pink"><strong>Configuración de semilla</strong></span>

In [None]:
# Single seed shared by every randomised step so results are reproducible.
RANDOM_STATE = 100_474_933

# Also seed NumPy's global generator for code that relies on it.
np.random.seed(seed=RANDOM_STATE)

<span style= "color:pink"><strong>Separar variables predictoras y objetivo</strong></span>

In [None]:
# Target vector: the 'Attrition' column.
y = df['Attrition']

# Feature matrix: every column except the target.
X = df.drop('Attrition', axis=1)

<span style= "color:pink"><strong>División en Train/Test manteniendo la proporción de la clase (estratificación)</strong></span>

In [None]:
# Hold out one third of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=RANDOM_STATE,
    stratify=y,
)

<span style= "color:pink"><strong> Filtrar columnas existentes en X_train</strong></span>

In [None]:
# Restrict both column lists to names that are actual features in X_train
# (the target and any dropped columns are no longer there), preserving order.
numerical_cols = list(filter(lambda name: name in X_train.columns, numerical_cols))
categorical_cols = list(filter(lambda name: name in X_train.columns, categorical_cols))

print(f"Variables categóricas filtradas: {categorical_cols}")
print(f"Variables numéricas filtradas: {numerical_cols}")

<span style= "color:pink"><strong>Pipelines de preprocesamiento</strong></span>

In [None]:
# Categorical branch: fill gaps with the most frequent category, one-hot encode
# (categories unseen during fit become all-zero rows instead of raising), then
# compress the dense indicator matrix to 5 principal components.
# NOTE(review): PCA(n_components=5) needs at least 5 one-hot columns and at
# least 5 rows in every training fold - confirm this holds for the data.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ('pca', PCA(n_components=5))
])

# Numeric branch: scale first, then impute.
# KNNImputer picks neighbours by Euclidean distance over the observed features,
# so on raw data the columns with the largest magnitudes dominate the
# neighbour search. RobustScaler ignores NaNs when fitting and leaves them in
# place when transforming, so it can run first and give the imputer features on
# comparable scales. Step names are unchanged, so parameter paths such as
# 'imputer__n_neighbors' keep working.
num_transformer = Pipeline(steps=[
    ('scaler', RobustScaler()),
    ('imputer', KNNImputer(n_neighbors=5))
])


<span style= "color:pink"><strong>Combinación de transformaciones</strong></span>

In [None]:
# Route each group of columns through its own preprocessing pipeline:
# (name, transformer, columns it applies to).
column_branches = [
    ('num', num_transformer, numerical_cols),
    ('cat', cat_transformer, categorical_cols),
]

# Apply each branch to its columns and concatenate the outputs side by side.
preprocessor = ColumnTransformer(transformers=column_branches)

<span style= "color:pink"><strong>Definir pipeline de modelo</strong></span>

In [None]:
# Decision tree with class weights inversely proportional to class frequency,
# seeded so that the fitted tree is reproducible.
tree_model = DecisionTreeClassifier(
    class_weight='balanced',
    random_state=RANDOM_STATE,
)

# Full model: preprocessing followed by the classifier, so the preprocessing is
# re-fitted on whatever data the pipeline is fitted on.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', tree_model),
])

<span style= "color:pink"><strong>Evaluación inicial con validación cruzada</strong></span>

In [None]:
# Baseline estimate: mean balanced accuracy over 5 cross-validation folds of
# the training partition (the test partition is left untouched).
cv_score = np.mean(
    cross_val_score(
        estimator=clf,
        X=X_train,
        y=y_train,
        scoring='balanced_accuracy',
        cv=5,
    )
)
print(f"Balanced Accuracy (CV): {cv_score:.4f}")

<span style= "color:pink"><strong>..</strong></span>