In [2]:
## Pipeline de clasificación sobre Diamonds

import os

import joblib
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Load the "diamonds" dataset through seaborn.
df = sns.load_dataset('diamonds')

# Features: every column except the target, "cut".
X = df.drop(columns=['cut'])

# Encode the target: LabelEncoder turns the 'cut' values into integer codes.
# NOTE(review): `le` is not saved anywhere below, so whoever loads the
# persisted pipeline only gets integer codes back. Keep `le` (or
# `le.classes_`) around if the original labels must be recovered.
le = LabelEncoder()
y = le.fit_transform(df['cut'])

# Preprocessing, selected by column dtype:
#   - numeric columns: median imputation, then min-max scaling
#   - object/category columns: most-frequent imputation, then dense one-hot
column_transformer = ColumnTransformer([
    ("num_pipeline", make_pipeline(SimpleImputer(strategy='median'), MinMaxScaler()),
     make_column_selector(dtype_include='number')),
    ("cat_pipeline", make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(sparse_output=False)),
     make_column_selector(dtype_include=['object', 'category']))
])

# Full pipeline: preprocessing followed by a random forest with a fixed seed.
pipeline = make_pipeline(column_transformer, RandomForestClassifier(random_state=42))
pipeline.fit(X, y)

# Score the model on the same data it was fitted on.
# NOTE(review): this is a training-set score, not an estimate of performance
# on unseen data; use a held-out split or cross-validation for that.
y_pred = pipeline.predict(X)
accuracy = accuracy_score(y, y_pred)

print('accuracy en train', accuracy)


# Save the fitted pipeline under models/. The directory is created first
# because the dump fails with FileNotFoundError when it is missing, which
# would throw away the fit above.
model_path = '../models/pipeline_clasificacion.joblib'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(pipeline, model_path)


accuracy en train 0.999888765294772


['../models/pipeline_clasificacion.joblib']