In [8]:
import pandas as pd
import seaborn as sns
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer, make_column_transformer
import joblib

In [None]:
# Load seaborn's 'tips' example dataset.
df = sns.load_dataset('tips')

# Split into the feature matrix X and the regression target y (the 'tip' column).
X = df.drop('tip', axis=1)
y = df['tip']

# Columns routed to each preprocessing branch.
categorical_cols = ['sex', 'smoker', 'day', 'time']
numerical_cols = ['total_bill', 'size']

# Numeric branch: impute missing values with the median, then min-max scale.
numerical_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    MinMaxScaler()
)
# Categorical branch: impute with the most frequent value, then one-hot encode.
# handle_unknown='ignore' matters because the fitted pipeline is persisted for
# app.py: a category value that was not present at fit time is then encoded as
# an all-zero row instead of making predict() raise.
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(sparse_output=False, handle_unknown='ignore')
)

# Apply each branch to its own set of columns.
column_transformer = make_column_transformer(
    (numerical_pipeline, numerical_cols),
    (categorical_pipeline, categorical_cols)
)

# Full pipeline: preprocessing followed by the regressor (seeded for reproducibility).
pipeline = make_pipeline(column_transformer, RandomForestRegressor(random_state=42))

# Fit preprocessing and model together on the whole dataset.
pipeline.fit(X, y)

# NOTE: this score is computed on the same rows used for fitting, so it is an
# optimistic estimate and says nothing about performance on unseen data.
print('R2 en train', pipeline.score(X,y))

# Persist the fitted pipeline with joblib so app.py can load it.
joblib.dump(pipeline, 'pipeline.joblib')

R2 en train 0.9176612396276067


['pipeline.joblib']

In [None]:
# Shorter alternative: select columns by dtype instead of listing them by name.
# Relies on X and y defined in the previous cell.
from sklearn.compose import make_column_selector


column_transformer = make_column_transformer(
    (
        # Numeric branch: median imputation followed by min-max scaling.
        make_pipeline(
            SimpleImputer(strategy='median'),
            MinMaxScaler()
        ),
        make_column_selector(dtype_include='number')  # picks up numeric columns automatically
    ),
    (
        # Categorical branch: most-frequent imputation followed by one-hot encoding.
        # handle_unknown='ignore' keeps the persisted pipeline from raising when it
        # is given a category value that was not present at fit time.
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        ),
        make_column_selector(dtype_include=['object', 'category'])  # picks up categorical columns automatically
    )
)

# Same model and seed as the explicit-column version.
pipeline = make_pipeline(column_transformer, RandomForestRegressor(random_state=42))
pipeline.fit(X, y)
# Training-set score only: it is measured on the rows used for fitting.
print('R2 en train', pipeline.score(X, y))
# Overwrites the 'pipeline.joblib' file written by the previous cell.
joblib.dump(pipeline, 'pipeline.joblib')