In [63]:
import pandas as pd
import seaborn as sns
import numpy as np

df = sns.load_dataset("penguins")

#borrar del dataset todas las filas que tengan algún valor 'NA'.
df = df.dropna()
print(df.head().T)

                           0          1          2          4          5
species               Adelie     Adelie     Adelie     Adelie     Adelie
island             Torgersen  Torgersen  Torgersen  Torgersen  Torgersen
bill_length_mm          39.1       39.5       40.3       36.7       39.3
bill_depth_mm           18.7       17.4       18.0       19.3       20.6
flipper_length_mm      181.0      186.0      195.0      193.0      190.0
body_mass_g           3750.0     3800.0     3250.0     3450.0     3650.0
sex                     Male     Female     Female     Female       Male


In [64]:
# Separar características (X) y variable objetivo (y)
X = df[["island", "sex", "bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]]
y = df["species"]

In [65]:
# Convertir las columnas categóricas a formato de diccionario para DictVectorizer
X_categorical = X[["island", "sex"]].to_dict(orient="records")
X_numerical = X[["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]].values

# Usar DictVectorizer para las columnas categóricas
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)
X_categorical_dv = dv.fit_transform(X_categorical)

# Imprimir los nombres de las columnas codificadas 
print("Columnas después de la codificación one-hot:\n", dv.get_feature_names_out()) 
print("Datos después de la codificación one-hot:\n", X_categorical_dv[:5])

# Escalar variables numéricas
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_numerical_sc = sc.fit_transform(X_numerical)

print("sin escalar numerical",X_numerical)
print("escalado numerical",X_numerical_sc)

# Concatenar columnas categóricas y numéricas
X_final = np.hstack([X_categorical_dv, X_numerical_sc])


Columnas después de la codificación one-hot:
 ['island=Biscoe' 'island=Dream' 'island=Torgersen' 'sex=Female' 'sex=Male']
Datos después de la codificación one-hot:
 [[0. 0. 1. 0. 1.]
 [0. 0. 1. 1. 0.]
 [0. 0. 1. 1. 0.]
 [0. 0. 1. 1. 0.]
 [0. 0. 1. 0. 1.]]
sin escalar numerical [[  39.1   18.7  181.  3750. ]
 [  39.5   17.4  186.  3800. ]
 [  40.3   18.   195.  3250. ]
 ...
 [  50.4   15.7  222.  5750. ]
 [  45.2   14.8  212.  5200. ]
 [  49.9   16.1  213.  5400. ]]
escalado numerical [[-0.89604189  0.7807321  -1.42675157 -0.56847478]
 [-0.82278787  0.11958397 -1.06947358 -0.50628618]
 [-0.67627982  0.42472926 -0.42637319 -1.1903608 ]
 ...
 [ 1.17338426 -0.74499437  1.50292796  1.91906927]
 [ 0.22108196 -1.20271231  0.78837197  1.23499466]
 [ 1.08181673 -0.54156417  0.85982757  1.48374906]]


DIVIDIR CONJUNTO DE ENTRENAMIENTO Y TEST

In [66]:
# Dividir datos en entrenamiento y prueba
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42, stratify=y)


----------ENTRENAR MODELOS---------

In [67]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=1, max_iter=500)
lr.fit(X_train, y_train)

In [68]:
from sklearn.svm import SVC

svm = SVC(kernel='linear', probability=True, random_state=1)
svm.fit(X_train, y_train)

In [69]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=1)
dt.fit(X_train, y_train)

In [70]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

In [71]:
import pickle

with open('../modelos/lr.pck', 'wb') as f:
    pickle.dump((dv, sc, lr), f)

with open('../modelos/svm.pck', 'wb') as f:
    pickle.dump((dv, sc, svm), f)

with open('../modelos/dt.pck', 'wb') as f:
    pickle.dump((dv, sc, dt), f)

with open('../modelos/knn.pck', 'wb') as f:
    pickle.dump((dv, sc, knn), f)