In [4]:
import pickle
from pathlib import Path

import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [6]:
# Load the penguins dataset through Seaborn, keeping only rows without missing values
df = (
    sns.load_dataset("penguins")
    .dropna()
)
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


In [7]:
# Data preparation: separate the target from the candidate feature columns
y = df['species']  # target variable
# Every column other than the target; the ColumnTransformer defined later in this
# notebook decides which of these columns actually reach the models.
X = df.drop(columns=['species'])

In [8]:
# Preprocessing: standardize the selected numeric columns and one-hot encode the
# selected categorical ones. Columns not listed here (bill_length_mm and
# bill_depth_mm in this dataset) are dropped and never reach the classifier.
numeric_features = ['flipper_length_mm', 'body_mass_g']
categorical_features = ['island', 'sex']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features),
    ],
    remainder='drop',  # the library default, spelled out for the reader
)

In [9]:
# Split into training (80%) and test (20%) sets, stratified on the target so both
# sets keep the same class proportions; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [20]:
## Logistic Regression
# Pipeline stores the step objects it is given without copying them, so passing
# `preprocessor` directly would make every pipeline built from it share (and
# refit) one transformer instance. clone() gives this pipeline its own unfitted
# copy with identical parameters.
lr_pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                               ('classifier', LogisticRegression(random_state=1))])
lr_pipeline.fit(X_train, y_train)

In [11]:
## SVM (linear kernel)
# Pipeline stores the step objects it is given without copying them, so passing
# `preprocessor` directly would make every pipeline built from it share (and
# refit) one transformer instance. clone() gives this pipeline its own unfitted
# copy with identical parameters.
svm_pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                                ('classifier', SVC(kernel='linear', random_state=1))])
svm_pipeline.fit(X_train, y_train)


In [12]:
## Decision Tree
# Pipeline stores the step objects it is given without copying them, so passing
# `preprocessor` directly would make every pipeline built from it share (and
# refit) one transformer instance. clone() gives this pipeline its own unfitted
# copy with identical parameters.
dt_pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                               ('classifier', DecisionTreeClassifier(random_state=1))])
dt_pipeline.fit(X_train, y_train)

In [13]:
## KNN (3 nearest neighbors)
# Pipeline stores the step objects it is given without copying them, so passing
# `preprocessor` directly would make every pipeline built from it share (and
# refit) one transformer instance. clone() gives this pipeline its own unfitted
# copy with identical parameters.
knn_pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                                ('classifier', KNeighborsClassifier(n_neighbors=3))])
knn_pipeline.fit(X_train, y_train)

In [22]:
# Serialize each fitted pipeline to ../models/<name>.pck
# NOTE: pickle files should only be loaded from trusted sources, since unpickling
# can execute arbitrary code.
models_dir = Path('../models')
# Create the output directory when it is missing so the cell also works on a
# fresh checkout instead of failing with FileNotFoundError.
models_dir.mkdir(parents=True, exist_ok=True)

fitted_pipelines = {
    'lr': lr_pipeline,
    'svm': svm_pipeline,
    'dt': dt_pipeline,
    'knn': knn_pipeline,
}
for model_name, pipeline in fitted_pipelines.items():
    with open(models_dir / f'{model_name}.pck', 'wb') as f:
        pickle.dump(pipeline, f)