<a href="https://colab.research.google.com/github/lolaprieto/luludelacream/blob/main/IAA_Clase_2_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Mount Google Drive at /content/drive; the dataset is read from that folder below.
from google.colab import drive
drive.mount('/content/drive')

## 1. Análisis Exploratorio de Datos

In [None]:
# Load the penguin measurements from the mounted Drive folder
data = pd.read_csv('/content/drive/MyDrive/penguins_size.csv')
# Preview five rows; the fixed seed keeps the preview identical across
# re-runs, matching the random_state=42 used elsewhere in this notebook.
data.sample(5, random_state=42)

In [None]:
# (rows, columns) of the table as loaded
data.shape

In [None]:
# Column names, non-null counts and dtypes
data.info()

In [None]:
# Summary statistics, rendered with two decimal places
data.describe().style.format("{:0.2f}")

**Datos Faltantes**

In [None]:
# Number of missing values in each column
data.isna().sum()

In [None]:
# Drop every row that has at least one missing value. Reassigning instead of
# using inplace=True avoids mutating the frame in place and keeps the cell
# safe to re-run.
data = data.dropna()

# Does the shape change?


**Especies**

In [None]:
# Number of rows per species
data.species.value_counts()

Hipótesis 1: La masa es un buen atributo para diferenciar las especies.

In [None]:
# Mean of body_mass_g within each species
data.groupby('species').body_mass_g.mean()

In [None]:
# Histogram of body mass over all rows; labels are set on the Axes that
# Series.hist returns.
mass_ax = data['body_mass_g'].hist(bins=20, rwidth=0.9)
mass_ax.set_xlabel('Peso (g)')
mass_ax.set_ylabel('Cuentas');

In [None]:
# Body-mass histogram split by species; labels go on the grid's single Axes
mass_grid = sns.displot(data=data, x='body_mass_g', hue='species', kind='hist', bins=20)
mass_grid.ax.set_xlabel('Peso (g)')
mass_grid.ax.set_ylabel('Cuentas');

Hipótesis 2: Las características del pico son buenos atributos para diferenciar las especies.

In [None]:
# Side-by-side histograms of the two culmen measurements, coloured by species
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
length_ax, depth_ax = axs

sns.histplot(data=data, x='culmen_length_mm', hue='species', ax=length_ax)
sns.histplot(data=data, x='culmen_depth_mm', hue='species', ax=depth_ax)


¿Y si las combinamos?

In [None]:
# Culmen length against culmen depth, one colour per species
sns.scatterplot(data=data, x='culmen_length_mm', y='culmen_depth_mm', hue='species')

Todas juntas

In [None]:
# Pairwise plots coloured by species, with histograms on the diagonal
sns.pairplot(data, hue='species', diag_kind='hist')

### Entrenamiento de un modelo de clasificación

**Dos atributos**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split # Mas sobre esto el martes!

In [None]:
# Split the table into the two culmen features and the species label
feature_columns = ['culmen_length_mm', 'culmen_depth_mm']
X = data[feature_columns]
y = data.species

In [None]:
# Split into training and test sets: 30% of the rows are held out for
# testing, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Create a decision tree limited to depth 3 (fixed seed) and fit it on the
# training split.
tree_model = DecisionTreeClassifier(random_state=42, max_depth=3)
tree_model.fit(X_train, y_train)

In [None]:
# Draw the fitted tree
from sklearn.tree import plot_tree
plt.figure(figsize=(20, 10))
# class_names must follow the order of tree_model.classes_ (the order the tree
# uses internally). y.unique() is in order of appearance in the full dataset,
# which can mislabel the leaves. Plain lists are passed because some
# scikit-learn releases reject a pandas Index / ndarray for these arguments.
plot_tree(tree_model,
          feature_names=list(X.columns),
          class_names=[str(label) for label in tree_model.classes_],
          filled=True, impurity=True, rounded=True)
plt.show()

**Fronteras de decisión**

In [None]:
# Plotting helper
def visualize_classifier(model, X, y, ax=None, proba=False):
    """Scatter a two-feature dataset and shade the model's decision regions.

    Parameters
    ----------
    model : fitted classifier
        Must expose ``classes_`` and ``predict``.
    X : pandas.DataFrame or array-like
        The two features are taken from the first two columns. When a
        DataFrame is given, its first two column names label the axes and are
        attached to the grid passed to ``model.predict``.
    y : pandas.Series or array-like
        Label of each row of ``X``.
    ax : matplotlib Axes, optional
        Axes to draw on; the current Axes is used when omitted.
    proba : bool, default False
        Kept for backward compatibility; currently unused.

    Returns
    -------
    None
    """
    # Local import: the notebook only imports matplotlib.pyplot at the top.
    from matplotlib.colors import ListedColormap

    # Capture the column names *before* converting to arrays, and only when
    # they exist, so plain ndarray input is also supported.
    if isinstance(X, pd.DataFrame):
        feature_names = list(X.columns[:2])
        X = X.values
    else:
        feature_names = None
        X = np.asarray(X)
    y = y.values if isinstance(y, pd.Series) else np.asarray(y)

    if ax is None:
        ax = plt.gca()

    classes = np.unique(model.classes_)
    tab10 = plt.cm.tab10.colors
    # One colour per class, shared by the points and the background regions.
    palette = [tab10[i % len(tab10)] for i in range(len(classes))]

    for color, label in zip(palette, classes):
        mask = y == label
        ax.scatter(X[mask, 0], X[mask, 1], s=30,
                   zorder=3, alpha=0.5, color=color, label=label)
    # The scatter calls carry labels; show them.
    ax.legend()

    ax.axis('tight')
    if feature_names is not None:
        ax.set_xlabel(feature_names[0])
        ax.set_ylabel(feature_names[1])
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # Evaluate the model on a 200x200 grid covering the visible area.
    xx, yy = np.meshgrid(np.linspace(*xlim, num=200), np.linspace(*ylim, num=200))
    grid = np.c_[xx.ravel(), yy.ravel()]
    if feature_names is not None:
        grid = pd.DataFrame(grid, columns=feature_names)
    predicted = np.asarray(model.predict(grid)).reshape(xx.shape)

    # Encode labels as class indices in a separate array. Rewriting the
    # prediction array in place could remap an already-encoded cell when the
    # labels are themselves integers.
    Z = np.zeros(xx.shape, dtype=np.float32)
    for i, label in enumerate(classes):
        Z[predicted == label] = i

    # A discrete colormap with fixed limits makes region i use palette[i],
    # i.e. the same colour as the points of class i.
    ax.pcolormesh(xx, yy, Z, cmap=ListedColormap(palette),
                  vmin=-0.5, vmax=len(classes) - 0.5, alpha=0.3)
    ax.set(xlim=xlim, ylim=ylim)

In [None]:
# Training points drawn over the tree's decision regions
visualize_classifier(tree_model, X_train, y_train)

**Evaluación del modelo**

In [None]:
# Predict on the held-out rows and compare against their true labels
y_test_pred = tree_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
# Show the accuracy obtained on the test set
print("Accuracy del modelo:", accuracy)

**Importancia de cada atributo**

In [None]:
# Bar chart of the tree's feature importances, highest first
importances = tree_model.feature_importances_
indices = np.argsort(importances)[::-1]
bar_positions = range(X.shape[1])
sorted_feature_names = np.array(X.columns)[indices]

importance_fig, importance_ax = plt.subplots()
importance_ax.set_title("Importancia de los atributos")
importance_ax.bar(bar_positions, importances[indices], align="center")
importance_ax.set_xticks(bar_positions)
importance_ax.set_xticklabels(sorted_feature_names, rotation=90)
plt.show()