## 1) Preparación de los datos

In [176]:
import pandas as pd

# The CSV can alternatively be read from this URL:
# https://raw.githubusercontent.com/tnavarrete-iedib/bigdata-24-25/refs/heads/main/penguins_size.csv
penguins_csv = "../datasets/penguins_size.csv"
df = pd.read_csv(penguins_csv)

# Print the row count, then preview the first rows transposed
# (one line per column).
print(df.shape[0])
df.head().T

344


Unnamed: 0,0,1,2,3,4
species,Adelie,Adelie,Adelie,Adelie,Adelie
island,Torgersen,Torgersen,Torgersen,Torgersen,Torgersen
culmen_length_mm,39.1,39.5,40.3,,36.7
culmen_depth_mm,18.7,17.4,18.0,,19.3
flipper_length_mm,181.0,186.0,195.0,,193.0
body_mass_g,3750.0,3800.0,3250.0,,3450.0
sex,MALE,FEMALE,FEMALE,,FEMALE


In [177]:
# Drop every row that contains a NaN value.
df = df.dropna()

# Keep only the rows whose `sex` value is not the literal '.'.
has_valid_sex = df['sex'] != '.'
df = df[has_valid_sex]

# Preview the cleaned frame (transposed).
df.head().T

Unnamed: 0,0,1,2,4,5
species,Adelie,Adelie,Adelie,Adelie,Adelie
island,Torgersen,Torgersen,Torgersen,Torgersen,Torgersen
culmen_length_mm,39.1,39.5,40.3,36.7,39.3
culmen_depth_mm,18.7,17.4,18.0,19.3,20.6
flipper_length_mm,181.0,186.0,195.0,193.0,190.0
body_mass_g,3750.0,3800.0,3250.0,3450.0,3650.0
sex,MALE,FEMALE,FEMALE,FEMALE,MALE


In [178]:
# Target variable: species
from sklearn.preprocessing import LabelEncoder

# Distinct class labels before encoding.
print(df['species'].unique())

# Replace each species name with an integer code; `le` keeps the fitted
# encoder so the codes can be mapped back to names later.
le = LabelEncoder()
species_codes = le.fit_transform(df['species'])
df['species'] = species_codes

# Distinct class labels after encoding, plus a preview of the column.
print(df['species'].unique())
df['species'].head()

['Adelie' 'Chinstrap' 'Gentoo']
[0 1 2]


0    0
1    0
2    0
4    0
5    0
Name: species, dtype: int64

In [179]:
# Split the data: 80% training, 20% validation
from sklearn.model_selection import train_test_split

df_train, df_val = train_test_split(df, test_size=0.2, random_state=1)

# `pop` returns the target column and removes it from the frame in one step,
# leaving only the feature columns in df_train / df_val.
y_train = df_train.pop('species').values
y_val = df_val.pop('species').values

# Size of the training split and a transposed preview of its features.
print(df_train.shape[0])
df_train.head().T

266


Unnamed: 0,306,168,181,167,19
island,Biscoe,Dream,Dream,Dream,Torgersen
culmen_length_mm,43.4,50.3,52.8,50.5,46.0
culmen_depth_mm,14.4,20.0,20.0,19.6,21.5
flipper_length_mm,218.0,197.0,205.0,201.0,194.0
body_mass_g,4600.0,3300.0,4550.0,4050.0,4200.0
sex,FEMALE,MALE,MALE,MALE,MALE


## 2) Ingeniería de propiedades

In [180]:
# One-hot encoding: build the per-row dictionaries
categorical = ['sex', 'island']
numerical = ['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g']

# Same column selection (categorical first, then numerical) for both splits.
feature_columns = categorical + numerical

# One dict per row, keyed by column name.
train_dict = df_train[feature_columns].to_dict(orient='records')
val_dict = df_val[feature_columns].to_dict(orient='records')

# Show the first record of each split.
print(train_dict[0])
val_dict[0]

{'sex': 'FEMALE', 'island': 'Biscoe', 'culmen_length_mm': 43.4, 'culmen_depth_mm': 14.4, 'flipper_length_mm': 218.0, 'body_mass_g': 4600.0}


{'sex': 'MALE',
 'island': 'Biscoe',
 'culmen_length_mm': 41.6,
 'culmen_depth_mm': 18.0,
 'flipper_length_mm': 192.0,
 'body_mass_g': 3950.0}

In [181]:
from sklearn.feature_extraction import DictVectorizer

# Learn the feature vocabulary (numeric columns plus one-hot columns for the
# categorical values) from the TRAINING records only.
# Each call to `fit` replaces whatever was learnt before, so fitting again on
# the validation records would discard the training vocabulary and define the
# encoding from validation data. Features that were not seen during fit are
# ignored by `transform`, so the validation set needs no fit of its own.
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [182]:
# Convert both lists of row dictionaries into dense feature matrices with the
# already-fitted vectorizer.
X_train, X_val = (dv.transform(records) for records in (train_dict, val_dict))

# Show the first encoded row of each split.
print(X_train[0])
X_val[0]

[4.60e+03 1.44e+01 4.34e+01 2.18e+02 1.00e+00 0.00e+00 0.00e+00 1.00e+00
 0.00e+00]


array([3.95e+03, 1.80e+01, 4.16e+01, 1.92e+02, 1.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 1.00e+00])

In [183]:
# Display the feature names learnt by the vectorizer `dv`.
dv.get_feature_names_out()

array(['body_mass_g', 'culmen_depth_mm', 'culmen_length_mm',
       'flipper_length_mm', 'island=Biscoe', 'island=Dream',
       'island=Torgersen', 'sex=FEMALE', 'sex=MALE'], dtype=object)

In [184]:
# Standardize: mean = 0, standard deviation = 1
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training matrix only and then applied to both
# splits.
sc = StandardScaler().fit(X_train)
X_train_std = sc.transform(X_train)
# Despite its name, X_test_std holds the standardized validation split (X_val).
X_test_std = sc.transform(X_val)

# Preview the first five standardized rows of each split.
for scaled in (X_train_std, X_test_std):
    print(scaled[:5])

[[ 0.46895167 -1.40321023 -0.08036036  1.18032759  0.99250926 -0.75146915
  -0.39562828  0.99250926 -0.99250926]
 [-1.10774729  1.48583187  1.1685254  -0.27322398 -1.00754728  1.33072662
  -0.39562828 -1.00754728  1.00754728]
 [ 0.40830941  1.48583187  1.62102024  0.28050995 -1.00754728  1.33072662
  -0.39562828 -1.00754728  1.00754728]
 [-0.19811327  1.27947172  1.20472499  0.00364299 -1.00754728  1.33072662
  -0.39562828 -1.00754728  1.00754728]
 [-0.01618647  2.25968244  0.39023427 -0.4808742  -1.00754728 -0.75146915
   2.52762515 -1.00754728  1.00754728]]
[[-0.31939781  0.45403112 -0.40615665 -0.61930769  0.99250926 -0.75146915
  -0.39562828 -1.00754728  1.00754728]
 [ 0.10509807 -1.66116042 -0.00796119  0.48816018  0.99250926 -0.75146915
  -0.39562828  0.99250926 -0.99250926]
 [-0.74389368  0.76357135  1.05992664 -0.41165746 -1.00754728  1.33072662
  -0.39562828 -1.00754728  1.00754728]
 [-0.98646276  0.40244108  1.13232581 -0.75774117 -1.00754728  1.33072662
  -0.39562828  0.9925

## 3) Entrenamiento del modelo

In [185]:
# Logistic regression
from sklearn.linear_model import LogisticRegression

# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument -- confirm against the installed version before upgrading.
lr_params = dict(
    C=100.0,
    random_state=1,
    solver='lbfgs',
    multi_class='ovr',
)
lr = LogisticRegression(**lr_params)

# Train on the standardized training features.
lr.fit(X_train_std, y_train)



In [186]:
# Support vector machine
from sklearn.svm import SVC

# Linear-kernel SVM; probability=True makes predict_proba available.
svm = SVC(
    C=1.0,
    kernel='linear',
    probability=True,
    random_state=1,
)

# Train on the standardized training features.
svm.fit(X_train_std, y_train)

In [187]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier

# Gini-impurity tree limited to a depth of 4.
dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=4,
    random_state=1,
)

# Train on the standardized training features.
dt.fit(X_train_std, y_train)

In [188]:
# k-nearest neighbours
from sklearn.neighbors import KNeighborsClassifier

# Three neighbours; the Minkowski metric with p=2 is the Euclidean distance.
knn = KNeighborsClassifier(metric='minkowski', p=2, n_neighbors=3)

# Train on the standardized training features.
knn.fit(X_train_std, y_train)

## 4) Serialización del modelo

In [189]:
import pickle

# Each output file stores a (scaler, model) tuple, so the fitted scaler `sc`
# travels with every classifier.
# NOTE(review): the fitted DictVectorizer `dv` and LabelEncoder `le` are not
# saved here -- confirm that the code loading these files does not need them.
models_by_path = {
    '../models/lr.pck': lr,
    '../models/svm.pck': svm,
    '../models/dt.pck': dt,
    '../models/knn.pck': knn,
}

for model_path, model in models_by_path.items():
    with open(model_path, 'wb') as f:
        pickle.dump((sc, model), f)