In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, LabelBinarizer, OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
import dill

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# 1. Combine variables
def combinar_variables(df):
    """Merge 'VehicleType' and 'Transport' into one 'TransportVehicleType' column.

    For each row the result holds the 'VehicleType' value; where that value is
    missing (or is the literal string 'NaN') the row's 'Transport' value is
    used instead. The two source columns are removed from the result and the
    merged column is appended as the last column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'VehicleType' and 'Transport'.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.

    Raises
    ------
    KeyError
        If 'VehicleType' or 'Transport' is not a column of ``df``.
    """
    vehicle = df['VehicleType']

    # Rows without a usable vehicle type. The literal 'NaN' string is kept as
    # a sentinel so the output matches the earlier fillna('NaN')-based logic.
    missing = vehicle.isna() | (vehicle == 'NaN')

    # Keep the vehicle type where present, otherwise fall back to 'Transport'.
    combined = vehicle.where(~missing, df['Transport'])

    # drop() and assign() both return new frames, so the caller's DataFrame is
    # never written to.
    return (
        df.drop(columns=['VehicleType', 'Transport'])
          .assign(TransportVehicleType=combined)
    )

# 2. Standardize the numeric variables
def estandarizar_numericas(df):
    """Replace the numeric columns with standardized '<name>_z' columns.

    The six numeric columns listed below are passed through a freshly created
    StandardScaler, removed from the frame, and their scaled versions are
    appended (suffix '_z') after the remaining columns.

    Note: the scaler is fit on the frame passed in on every call, so the
    resulting z-scores are relative to that frame's own statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``numeric_cols``.

    Returns
    -------
    pandas.DataFrame
        A new frame: the non-numeric columns followed by the '_z' columns,
        sharing the index of ``df``.
    """
    numeric_cols = ['GroceryBill', 'DriveDistance', 'WasteBagCount', 'TVPCHours',
                    'ClothesNew', 'InternetHours']

    # Names for the standardized columns
    z_names = ['{}_z'.format(col) for col in numeric_cols]

    # Fit and apply the scaler in one go; the result is a plain 2-D array
    z_values = StandardScaler().fit_transform(df[numeric_cols])

    # Re-attach column names and the original row labels
    z_frame = pd.DataFrame(z_values, columns=z_names, index=df.index)

    remaining = df.drop(columns=numeric_cols)
    return pd.concat([remaining, z_frame], axis=1)


# 3. Create indicator columns for the multi-answer questions
def crear_columnas_recycling_cooking(df):
    """Expand 'Recycling' and 'Cooking_With' into 0/1 indicator columns.

    For each recycling category a 'Recycling<Category>' column is produced and
    for each cooking category a 'CookingWith<Category>' column; a cell is 1
    when ``category in cell`` is true for the row's answer (substring match
    for string answers, membership for list-like answers) and 0 otherwise.
    Missing answers (None, NaN, pd.NA) are flagged as 0 for every category.
    The two source columns are removed from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Recycling' and 'Cooking_With'.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.

    Raises
    ------
    KeyError
        If 'Recycling' or 'Cooking_With' is not a column of ``df``.
    """
    # Define desired categories
    desired_recycling_categories = ["Plastic", "Metal", "Paper", "Glass"]
    desired_cooking_categories = ["Airfryer", "Microwave", "Oven", "Grill", "Stove"]

    def _flag(cell, item):
        # A missing answer cannot be searched with `in`; count it as "not mentioned".
        if cell is None or cell is pd.NA or (isinstance(cell, float) and cell != cell):
            return 0
        return 1 if item in cell else 0

    indicators = {}

    # Indicator columns for the recycling categories
    for item in desired_recycling_categories:
        indicators[f"Recycling{item}"] = [_flag(x, item) for x in df['Recycling']]

    # Indicator columns for the cooking categories
    for item in desired_cooking_categories:
        indicators[f"CookingWith{item}"] = [_flag(x, item) for x in df['Cooking_With']]

    # drop() and assign() both return new frames, so the caller's DataFrame is
    # never written to; the new columns are appended in insertion order.
    return df.drop(columns=['Recycling', 'Cooking_With']).assign(**indicators)

# 4. Encode the single-answer categorical variables
def codificar_variables(df):
    """Encode the single-answer categorical columns as numbers.

    Each column named in ``encoders`` is run through its encoder:

    * OneHotEncoder columns are replaced by one integer 0/1 column per
      category (named by ``get_feature_names_out``), appended at the end of
      the frame.
    * All other columns are overwritten in place (same position) with the
      encoder's single-column output.

    NOTE(review): every encoder is fit on the frame passed in. The encoders
    created with an explicit ``categories`` list do not depend on which values
    happen to be present, but LabelBinarizer derives its classes from the
    data it sees - confirm the 'Sex' encoding stays consistent when this
    function is applied to small batches.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column used as a key in ``encoders``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from ``df``.
    """
    encoders = {
        "BodyType": OrdinalEncoder(categories=[['underweight', 'normal', 'overweight', 'obese']]),
        "Sex": LabelBinarizer(),
        "ShowerFreq": OrdinalEncoder(categories=[['less frequently', 'more frequently', 'daily', 'twice a day']]),
        "EnergyHeat": OneHotEncoder(categories=[['electricity', 'natural gas', 'coal', 'wood']], sparse_output=False),
        "TransportVehicleType": OneHotEncoder(categories=[['walk/bicycle', 'public','electric', 'hybrid', 'lpg' ,'petrol','diesel']], sparse_output=False),
        "SocialActivity": OrdinalEncoder(categories=[['never', 'sometimes', 'often']]),
        "AirTravelFreq": OrdinalEncoder(categories=[['never', 'rarely', 'frequently', 'very frequently']]),
        "WasteBagSize": OrdinalEncoder(categories=[['small', 'medium', 'large', 'extra large']]),
        "EnergyEfficiency": OrdinalEncoder(categories=[['No','Sometimes','Yes']]),
        "Diet": OneHotEncoder(categories=[['pescatarian', 'vegetarian', 'omnivore', 'vegan']], sparse_output=False)
    }

    # Work on a copy: the column assignments below would otherwise write into
    # the caller's DataFrame.
    df = df.copy()

    # Transform the dataset columns
    for column, encoder in encoders.items():
        # The encoders are fed a 2-D (n_samples, 1) array. Going through
        # to_numpy() does not rely on the column's backing extension array
        # exposing a reshape() method.
        values = df[column].to_numpy().reshape(-1, 1)
        encoded = encoder.fit_transform(values)

        # Dispatch on the encoder type instead of repeating the column names.
        if isinstance(encoder, OneHotEncoder):
            onehot_df = pd.DataFrame(
                encoded,
                index=df.index,
                columns=encoder.get_feature_names_out([column]),
            ).astype(int)
            # Append the indicator columns, then remove the source column.
            df = df.join(onehot_df).drop(columns=[column])
        else:
            # Single-column output: flatten (n, 1) -> (n,) before assigning.
            df[column] = encoded.ravel()

    return df

# Build the pipeline: each transformation function is wrapped in a
# FunctionTransformer and the steps run in the order listed.
pipeline = Pipeline(
    steps=[
        (step_name, FunctionTransformer(step_func))
        for step_name, step_func in (
            ('combine', combinar_variables),
            ('scale', estandarizar_numericas),
            ('recycling_cooking', crear_columnas_recycling_cooking),
            ('encode', codificar_variables),
        )
    ]
)

In [3]:
# Save the pipeline: serialize the `pipeline` object with dill into
# 'pipeline.pkl' (opened in binary write mode, so an existing file is overwritten).
with open('pipeline.pkl', 'wb') as f:
    dill.dump(pipeline, f)