<a href="https://colab.research.google.com/github/pietrogad/ICON24-25/blob/main/Classificazione_Pinguini_DT_ANN_NB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
from google.colab import drive
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

In [20]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
path = "/content/drive/MyDrive/Pinguin/dataset/penguins_size.csv"

In [22]:
# ======================
# PREPARAZIONE DEI DATI
# ======================

dataset = pd.read_csv(path)

#feature selection - copertura dei valori NaN
dataset.drop(columns=["island","sex"], inplace=True)
dataset["body_mass_g"].fillna(np.mean(dataset["body_mass_g"]), inplace=True)
dataset["flipper_length_mm"].fillna(np.mean(dataset["flipper_length_mm"]), inplace=True)
dataset["culmen_depth_mm"].fillna(np.mean(dataset["culmen_depth_mm"]), inplace=True)
dataset["culmen_length_mm"].fillna(np.mean(dataset["culmen_length_mm"]), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset["body_mass_g"].fillna(np.mean(dataset["body_mass_g"]), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset["flipper_length_mm"].fillna(np.mean(dataset["flipper_length_mm"]), inplace=True)
The behavior will change in pandas 3.0. This inplace method will n

In [23]:
dataset.head()

Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g
0,Adelie,39.1,18.7,181.0,3750.0
1,Adelie,39.5,17.4,186.0,3800.0
2,Adelie,40.3,18.0,195.0,3250.0
3,Adelie,43.92193,17.15117,200.915205,4201.754386
4,Adelie,36.7,19.3,193.0,3450.0


In [24]:
# Preparazione dati
X = dataset.drop('species', axis=1).values
y = dataset['species'].values

In [25]:
# Codifica delle etichette
le = LabelEncoder()
y_encoded = le.fit_transform(y)
species_names = le.classes_
print(species_names)

['Adelie' 'Chinstrap' 'Gentoo']


In [26]:
# ======================
# CONFIGURAZINE K-FOLD
# ======================

n_splits = 5
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Strutture per memorizzare i risultati
results = {
    'Decision Tree': {'acc': [], 'f1': []},
    'Naive Bayes': {'acc': [], 'f1': []},
    'ANN': {'acc': [], 'f1': []}
}

In [27]:
# ======================
# K-FOLD CROSS VALIDATION
# ======================

print(f"INIZIO K-FOLD CROSS VALIDATION (k={n_splits})")
print("="*70)

for fold, (train_idx, test_idx) in enumerate(kfold.split(X)):
    print(f"\nFOLD {fold+1}/{n_splits}")
    print("-"*50)

    # Split dati
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]

    # Normalizzazione
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # ======================
    # 1. DECISION TREE
    # ======================

    dt_model = DecisionTreeClassifier(
        max_depth=3,
        min_samples_split=10,
        random_state=42
    )
    dt_model.fit(X_train_scaled, y_train)
    y_pred_dt = dt_model.predict(X_test_scaled)

    # Metriche Decision Tree
    dt_acc = accuracy_score(y_test, y_pred_dt)
    dt_f1 = f1_score(y_test, y_pred_dt, average='macro')
    results['Decision Tree']['acc'].append(dt_acc)
    results['Decision Tree']['f1'].append(dt_f1)

    print(f"Decision Tree - Precisione: {dt_acc:.4f}, F1: {dt_f1:.4f}")

    # ======================
    # 2. NAIVE BAYES
    # ======================

    nb_model = GaussianNB()
    nb_model.fit(X_train_scaled, y_train)
    y_pred_nb = nb_model.predict(X_test_scaled)

    # Metriche Naive Bayes
    nb_acc = accuracy_score(y_test, y_pred_nb)
    nb_f1 = f1_score(y_test, y_pred_nb, average='macro')
    results['Naive Bayes']['acc'].append(nb_acc)
    results['Naive Bayes']['f1'].append(nb_f1)

    print(f"Naive Bayes  - Precisione: {nb_acc:.4f}, F1: {nb_f1:.4f}")

    # ======================
    # 3. ANN
    # ======================

    # Preparazione dati per ANN
    y_train_cat = to_categorical(y_train)
    y_test_cat = to_categorical(y_test)

    # Creazione modello
    nn_model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(len(species_names), activation='softmax')
    ])

    # Compilazione
    nn_model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    # Addestramento
    nn_model.fit(
        X_train_scaled, y_train_cat,
        epochs=100,
        batch_size=16,
        validation_split=0.2,
        verbose=0
    )

    # Valutazione
    _, nn_acc = nn_model.evaluate(X_test_scaled, y_test_cat, verbose=0)
    y_pred_nn = nn_model.predict(X_test_scaled)
    y_pred_nn_classes = np.argmax(y_pred_nn, axis=1)
    nn_f1 = f1_score(y_test, y_pred_nn_classes, average='macro')
    results['ANN']['acc'].append(nn_acc)
    results['ANN']['f1'].append(nn_f1)

    print(f"ANN - Precisione: {nn_acc:.4f}, F1: {nn_f1:.4f}")

INIZIO K-FOLD CROSS VALIDATION (k=5)

FOLD 1/5
--------------------------------------------------
Decision Tree - Precisione: 0.9275, F1: 0.9227
Naive Bayes  - Precisione: 0.9565, F1: 0.9538


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
ANN - Precisione: 0.9710, F1: 0.9688

FOLD 2/5
--------------------------------------------------
Decision Tree - Precisione: 0.9565, F1: 0.9412
Naive Bayes  - Precisione: 0.9565, F1: 0.9308


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
ANN - Precisione: 0.9855, F1: 0.9890

FOLD 3/5
--------------------------------------------------
Decision Tree - Precisione: 0.9565, F1: 0.9508
Naive Bayes  - Precisione: 0.9565, F1: 0.9454


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
ANN - Precisione: 0.9855, F1: 0.9818

FOLD 4/5
--------------------------------------------------
Decision Tree - Precisione: 0.9565, F1: 0.9480
Naive Bayes  - Precisione: 0.9855, F1: 0.9827


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
ANN - Precisione: 0.9855, F1: 0.9827

FOLD 5/5
--------------------------------------------------
Decision Tree - Precisione: 0.9559, F1: 0.9505
Naive Bayes  - Precisione: 0.9853, F1: 0.9832


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
ANN - Precisione: 0.9706, F1: 0.9705
