Codificar `native-country`: se propone recodificar esta variable como un booleano: `es_estadounidense` vs. `no_lo_es`.

In [22]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, FunctionTransformer, OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
import joblib
from utils import *

In [23]:
# Load the raw income dataset. Forward slashes keep the relative path portable
# across operating systems and avoid the invalid "\d" / "\i" escape sequences
# that the backslash-separated string literal contained.
df = pd.read_csv("../data/income_data.csv")

In [24]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [25]:
# List the column labels of the raw frame before any recoding.
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [26]:
# Build the working frame from `df` (which is left untouched): add a boolean
# flag that is True when native-country equals 'United-States', then discard
# the original native-country column.
df2 = (
    df.assign(es_estadounidense=df['native-country'].eq('United-States'))
      .drop(columns=['native-country'])
)

In [27]:
# Rename the 'gender' column to 'sex'.
df2 = df2.rename(columns={'gender': 'sex'})

In [28]:
# Replace each capital amount column with a boolean "amount is positive" flag,
# then remove the original amount columns.
capital_flags = {
    'capital-gain': 'has_capital_gain',
    'capital-loss': 'has_capital_loss',
}
for amount_col, flag_col in capital_flags.items():
    df2[flag_col] = df2[amount_col].gt(0)
df2 = df2.drop(columns=list(capital_flags))

In [29]:
# Show the distinct labels of the target column, in order of first appearance.
pd.unique(df2["income"])

array(['<=50K', '>50K'], dtype=object)

In [30]:
# Encode the target as integers: 0 for '<=50K', 1 for '>50K'.
# Any label missing from the mapping becomes NaN.
income_codes = {'<=50K': 0, '>50K': 1}
df2["income"] = df2["income"].map(income_codes)

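Antes de ajustar ninguna transformación que aprenda de los datos (escalado, codificación) es necesario particionar el conjunto de datos en entrenamiento y prueba, para evitar fugas de información.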

In [31]:
# Separate the target (y) from the predictor columns (X)
y = df2['income']
X = df2.drop(columns='income')

In [32]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [33]:
# Report the shape of each target split
for label, target in (("Tamaño de y_train:", y_train), ("Tamaño de y_test:", y_test)):
    print(label, target.shape)

Tamaño de y_train: (39073,)
Tamaño de y_test: (9769,)


In [34]:
# List the column labels of the working frame after the recoding steps.
df2.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'hours-per-week', 'income', 'es_estadounidense', 'has_capital_gain',
       'has_capital_loss'],
      dtype='object')

In [35]:
# List the columns of the working frame that have a numeric dtype.
df2.select_dtypes(include="number").columns

Index(['age', 'fnlwgt', 'educational-num', 'hours-per-week', 'income'], dtype='object')

In [36]:
# Column labels of the working frame (same inspection as an earlier cell).
df2.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'hours-per-week', 'income', 'es_estadounidense', 'has_capital_gain',
       'has_capital_loss'],
      dtype='object')

In [37]:
# Numeric features: winsorize, log-transform, then scale
log_transform_columns = [
    'age',
    'fnlwgt',
]

# Numeric features: scale only
scale_only_columns = [
    'educational-num',
    'hours-per-week',
]

# Features encoded with an explicit category order
ordinal_columns = [
    'marital-status',
    'sex',
]

# Categorical features handled with target encoding
target_encoding_columns = [
    'education',
    'occupation',
    'relationship',
    'race',
    'workclass',
]

# Low-cardinality categorical features for one-hot encoding.
# Currently empty; add column names here if any are needed.
one_hot_columns = []

# Boolean features used as they are
binary_columns = [
    'has_capital_gain',
    'has_capital_loss',
    'es_estadounidense',
]



In [38]:
# Explicit category orders for the ordinal encoder, one list per column in
# `ordinal_columns` (marital-status first, then sex). The position of a value
# in its list is the integer it is encoded to.
#
# NOTE(review): the original marital-status list had only five values. Because
# the encoder uses handle_unknown='use_encoded_value', any other value seen
# during fit is not rejected but silently encoded as -1. The standard Adult
# income data also contains 'Married-spouse-absent' and 'Married-AF-spouse'
# (assumed here — confirm against df['marital-status'].unique()), so they are
# now listed explicitly. Their position in the order is a judgment call.
marital_status_order = [
    'Never-married',
    'Separated',
    'Married-spouse-absent',
    'Divorced',
    'Widowed',
    'Married-AF-spouse',
    'Married-civ-spouse',
]
sex_order = ['Female', 'Male']

# Column-wise preprocessing. Each entry is (name, transformer, columns); the
# name becomes the prefix of the output feature names.
preprocessor = ColumnTransformer(transformers=[

    # Ordinal encoding with the explicit orders above; values outside the
    # lists are encoded as -1 instead of raising.
    ('ordinal', OrdinalEncoder(
        categories=[marital_status_order, sex_order],
        handle_unknown='use_encoded_value', unknown_value=-1
    ), ordinal_columns),

    # Target encoding (needs y at fit time)
    ('target_enc', TargetEncoder(), target_encoding_columns),

    # One-hot encoding; the column list is currently empty, kept for future use
    ('one_hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), one_hot_columns),

    # Winsorize to the configured lower/upper bounds, apply log1p, then standardize
    ('winsor_log_scale', Pipeline([
        ('winsor', Winsorizer(lower=0.01, upper=0.99)),
        ('log1p', FunctionTransformer(np.log1p, feature_names_out='one-to-one')),
        ('scaler', StandardScaler())
    ]), log_transform_columns),

    # Standardize only
    ('scaler_only', StandardScaler(), scale_only_columns),

    # Boolean columns are passed through unchanged
    ('binary', 'passthrough', binary_columns)
])

In [39]:
# Fit the preprocessor on the training split only (y_train is passed along so
# the supervised target encoder can be fitted), then apply the already-fitted
# preprocessor to the test split without refitting.
X_train_proc = preprocessor.fit_transform(X_train, y_train)
X_test_proc  = preprocessor.transform(X_test)

In [40]:
# Export the processed train/test matrices to CSV in the preprocessing folder,
# labelled with the feature names reported by the fitted preprocessor.
columns = preprocessor.get_feature_names_out()

# Wrap each processed matrix in a DataFrame carrying those names
X_train_proc_df, X_test_proc_df = (
    pd.DataFrame(matrix, columns=columns)
    for matrix in (X_train_proc, X_test_proc)
)

# Write both frames without the index column
for frame, csv_path in (
    (X_train_proc_df, "../preprocessing/X_train_proc.csv"),
    (X_test_proc_df, "../preprocessing/X_test_proc.csv"),
):
    frame.to_csv(csv_path, index=False)

In [41]:
# Persist the fitted preprocessor to disk so it can be reloaded later
joblib.dump(value=preprocessor, filename="../preprocessing/preprocesador.pkl")

['../preprocessing/preprocesador.pkl']

In [42]:
# Save the untransformed feature splits and the target splits as CSV files
# (header row kept, index column omitted).
raw_exports = {
    "../preprocessing/X_train_raw.csv": X_train,
    "../preprocessing/X_test_raw.csv": X_test,
    "../preprocessing/y_train_raw.csv": y_train,
    "../preprocessing/y_test_raw.csv": y_test,
}
for csv_path, split in raw_exports.items():
    split.to_csv(csv_path, index=False)