In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

def load_data(filepath):
    """Read a CSV source into a pandas DataFrame.

    Args:
        filepath: Passed unchanged to ``pandas.read_csv``.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    dataset = pd.read_csv(filepath)
    return dataset

def clean_data(df, target_col='target'):
    """Drop duplicate rows and rows whose target value is missing.

    Args:
        df: Input DataFrame; it is not modified.
        target_col: Name of the target column. Defaults to ``'target'`` so
            existing single-argument callers keep their behavior.

    Returns:
        A new DataFrame with exact duplicate rows removed and every row
        with a null in ``target_col`` dropped. Index labels of the
        surviving rows are kept as they were in ``df``.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df``.
    """
    deduplicated = df.drop_duplicates()
    return deduplicated.dropna(subset=[target_col])

def validate_data(df):
    """Print each column's dtype and null count, then return ``df``.

    Args:
        df: DataFrame to inspect; it is not modified.

    Returns:
        The same ``df`` object that was passed in.
    """
    column_types = df.dtypes
    null_counts = df.isnull().sum()
    print("Tipos de datos:\n", column_types)
    print("Valores nulos:\n", null_counts)
    return df

def feature_engineering(df, target_col):
    """Impute, scale and one-hot encode the feature columns of ``df``.

    Columns of dtype ``int64``/``float64`` are mean-imputed and standardized;
    columns of dtype ``object`` are imputed with the most frequent value and
    one-hot encoded. Columns of any other dtype are not passed to either
    transformer.

    Args:
        df: DataFrame holding both the features and the target column.
        target_col: Name of the target column; it is excluded from the
            transformed features regardless of its dtype.

    Returns:
        A tuple ``(X_processed, y)`` where ``X_processed`` is the result of
        ``ColumnTransformer.fit_transform`` on the feature columns and ``y``
        is ``df[target_col]`` unchanged.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df``.

    Note:
        The preprocessor is fit on every row of ``df`` and is not returned.
        Callers that need train-only statistics should pass only the
        training rows.
    """
    # Remove the target BEFORE choosing feature columns. Selecting on ``df``
    # and then dropping the target from the numeric list fails when the
    # target is not int64/float64, and an object-typed target would be
    # listed as a categorical feature even though it is absent from ``X``.
    X = df.drop(columns=[target_col])
    y = df[target_col]

    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object']).columns

    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Categories unseen at fit time are ignored rather than raising.
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    X_processed = preprocessor.fit_transform(X)
    return X_processed, y

def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and target into train and test subsets.

    Args:
        X: Feature data.
        y: Target data aligned with ``X``.
        test_size: Forwarded to ``train_test_split``; defaults to 0.2.
        random_state: Forwarded to ``train_test_split``; defaults to 42.

    Returns:
        The result of ``train_test_split(X, y, ...)``, unchanged.
    """
    split_options = {'test_size': test_size, 'random_state': random_state}
    partitions = train_test_split(X, y, **split_options)
    return partitions
