Aquestes són les dades de cada individu:

    species (espècie de pingüí: Chinstrap, Adélie —escrit «Adelie» al dataset— o Gentoo)
    island (illa: Dream, Torgersen o Biscoe)
    bill_length_mm (longitud del bec en mm)
    bill_depth_mm (profunditat del bec en mm)
    flipper_length_mm (longitud de l'aleta en mm)
    body_mass_g (massa corporal en grams)
    sex (sexe: Male o Female)

Nota: els camps del dataset de Kaggle són lleugerament diferents: en lloc de bill_length_mm i bill_depth_mm, s'anomenen culmen_length_mm i culmen_depth_mm.


In [31]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the penguins dataset through seaborn. load_dataset already returns a
# pandas DataFrame, so no additional pd.DataFrame(...) conversion is needed.
df = sns.load_dataset("penguins")

# Drop every row that contains at least one missing value
df = df.dropna()

# Display the first rows of the cleaned DataFrame
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


In [32]:
# Show the data type of each column of the DataFrame
df.dtypes

species               object
island                object
bill_length_mm       float64
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
sex                   object
dtype: object

In [33]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible between runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Work on explicit copies: the 'species' column is overwritten below, and
# assigning into frames derived from `df` can raise pandas'
# SettingWithCopyWarning.
train_df = train_df.copy()
test_df = test_df.copy()

print(f'Training set size: {len(train_df)}')
print(f'Test set size: {len(test_df)}')

# Encode the target variable 'species' as integer labels. The encoder is
# fitted on the training split only and then reused on the test split, so
# both splits share the same label -> integer mapping.
label_encoder = LabelEncoder()
train_df['species'] = label_encoder.fit_transform(train_df['species'])
test_df['species'] = label_encoder.transform(test_df['species'])


Training set size: 266
Test set size: 67


In [34]:
# Show the first rows of the training set
train_df.head()


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
230,2,Biscoe,40.9,13.7,214.0,4650.0,Female
84,0,Dream,37.3,17.8,191.0,3350.0,Female
303,2,Biscoe,50.0,15.9,224.0,5350.0,Male
22,0,Biscoe,35.9,19.2,189.0,3800.0,Female
29,0,Biscoe,40.5,18.9,180.0,3950.0,Male


In [35]:
# Show the first rows of the test set
test_df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
30,0,Dream,39.5,16.7,178.0,3250.0,Female
317,2,Biscoe,46.9,14.6,222.0,4875.0,Female
79,0,Torgersen,42.1,19.1,195.0,4000.0,Male
201,1,Dream,49.8,17.3,198.0,3675.0,Female
63,0,Biscoe,41.1,18.2,192.0,4050.0,Male


In [36]:
# Build the feature matrices with a DictVectorizer, which expands the
# string-valued fields ('island', 'sex') into indicator columns.
# NOTE(review): a StandardScaler is created here but is not applied in this
# cell, so the numeric columns of X_train / X_test keep their original scale.
# Confirm whether scaling happens in a later step.
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler

# Feature names grouped by kind (numerical first, then categorical)
numerical_features = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
categorical_features = ['island', 'sex']

# Preprocessor instances
scaler = StandardScaler()
dict_vectorizer = DictVectorizer(sparse=False)

# One dictionary per row, with the target column 'species' removed
train_dicts = train_df.drop(columns='species').to_dict(orient='records')
test_dicts = test_df.drop(columns='species').to_dict(orient='records')

# Learn the feature layout from the training rows only, then apply that same
# layout to the test rows
X_train = dict_vectorizer.fit_transform(train_dicts)
X_test = dict_vectorizer.transform(test_dicts)

# Print the first five rows of each transformed matrix
print(X_train[:5])
print(X_test[:5])

[[1.37e+01 4.09e+01 4.65e+03 2.14e+02 1.00e+00 0.00e+00 0.00e+00 1.00e+00
  0.00e+00]
 [1.78e+01 3.73e+01 3.35e+03 1.91e+02 0.00e+00 1.00e+00 0.00e+00 1.00e+00
  0.00e+00]
 [1.59e+01 5.00e+01 5.35e+03 2.24e+02 1.00e+00 0.00e+00 0.00e+00 0.00e+00
  1.00e+00]
 [1.92e+01 3.59e+01 3.80e+03 1.89e+02 1.00e+00 0.00e+00 0.00e+00 1.00e+00
  0.00e+00]
 [1.89e+01 4.05e+01 3.95e+03 1.80e+02 1.00e+00 0.00e+00 0.00e+00 0.00e+00
  1.00e+00]]
[[1.670e+01 3.950e+01 3.250e+03 1.780e+02 0.000e+00 1.000e+00 0.000e+00
  1.000e+00 0.000e+00]
 [1.460e+01 4.690e+01 4.875e+03 2.220e+02 1.000e+00 0.000e+00 0.000e+00
  1.000e+00 0.000e+00]
 [1.910e+01 4.210e+01 4.000e+03 1.950e+02 0.000e+00 0.000e+00 1.000e+00
  0.000e+00 1.000e+00]
 [1.730e+01 4.980e+01 3.675e+03 1.980e+02 0.000e+00 1.000e+00 0.000e+00
  1.000e+00 0.000e+00]
 [1.820e+01 4.110e+01 4.050e+03 1.920e+02 1.000e+00 0.000e+00 0.000e+00
  0.000e+00 1.000e+00]]
