# Actividad Práctica Experimental Nro 4


Esta práctica tiene como objetivo profundizar en el algoritmo de aprendizaje de árboles de decisión.



## Preparación del Entorno

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_wine
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import  classification_report, f1_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn import tree

## División del Conjunto de Datos

In [None]:
# Load the dataset bundle and wrap its feature matrix in a DataFrame whose
# columns carry the feature names shipped with the bundle.
data = load_wine()
df = pd.DataFrame(data=data.data, columns=data.feature_names)
df.head()

In [None]:
# Attach the target labels as a "class" column so features and label live in
# a single frame. NOTE: this mutates `df` in place.
df["class"] = data.target
df.head()

In [None]:
# (rows, columns) of the frame, counting the "class" column added above.
df.shape

In [None]:
# Per-column dtype and non-null count; a quick check for missing values.
df.info()

In [None]:
# Pairwise correlation of every column in `df` (the "class" column included),
# drawn as an annotated heatmap.
corr_matrix = df.corr()

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    square=True,
    cbar_kws={"shrink": 0.8},
    ax=ax,
)
ax.set_title("Matriz de Correlación")
plt.show()

In [None]:
# Separate the target column from the predictors.
y = df["class"]
x = df.drop(columns="class")

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## Construcción del Árbol de Decisión

In [None]:
# Baseline tree: entropy-based splits, depth capped at 4.
# random_state is pinned because scikit-learn randomly permutes the features
# at each split, so an unseeded tree (and every metric derived from it) can
# change from one run to the next.
model = DecisionTreeClassifier(criterion="entropy", max_depth=4, random_state=42)

## Entrenamiento del Modelo

In [None]:
# Fit the baseline tree on the training split only.
model.fit(x_train, y_train)

## Evaluación del Modelo

In [None]:
# Predict a class for every held-out sample; the bare name on the last line
# displays the resulting array.
y_pred = model.predict(x_test)
y_pred

### Matriz de Confusión

In [None]:
# Cross-tabulate true vs. predicted labels of the baseline tree and draw it.
disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_true=y_test, y_pred=y_pred)
)
disp.plot()
plt.show()

### Precisión y Recall

In [None]:
# Macro averaging: compute the score per class, then take the unweighted mean.
precision_original = precision_score(y_true=y_test, y_pred=y_pred, average="macro")
recall_original = recall_score(y_true=y_test, y_pred=y_pred, average="macro")

print(f"precision: {precision_original}")
print(f"recall: {recall_original}")

### F1 Score

In [None]:
# Macro-averaged F1 of the baseline tree on the held-out split.
f1_original = f1_score(y_true=y_test, y_pred=y_pred, average="macro")
print(f"F1 Score : {f1_original}")

In [None]:
# Per-class precision / recall / F1 table for the baseline tree.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Draw the fitted baseline tree.
# feature_names is taken from the training frame and converted to a plain
# list: some scikit-learn releases reject a pandas Index here, and the
# notebook-level name `x` is rebound to a NumPy array further down, which
# would break this cell when it is re-run.
plt.figure(figsize=(15, 15))
tree.plot_tree(
    model,
    filled=True,
    class_names=["0", "1", "2"],
    feature_names=list(x_train.columns),
)
plt.show()

## Optimización


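Se modifican los hiperparámetros del modelo para conseguir un modelo más acertado. Para ello se hace uso de Grid Search con validación cruzada (GridSearchCV), una técnica de búsqueda exhaustiva que evalúa cada combinación de hiperparámetros mediante validación cruzada y permite encontrar la mejor configuración para el árbol de decisión.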

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate values per hyperparameter; the search evaluates every combination.
params = dict(
    max_depth=[2, 3],
    criterion=['entropy', 'gini'],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# 5-fold cross-validated grid search over a class-balanced, seeded tree.
grid = GridSearchCV(
    estimator=DecisionTreeClassifier(class_weight="balanced", random_state=50),
    param_grid=params,
    cv=5,
)

In [None]:
# Run the search on the training split only, then report the winning
# combination and its cross-validated score.
grid.fit(X=x_train, y=y_train)
print("Mejores parametros:", grid.best_params_)
print("Mejores scores: ", grid.best_score_)

Se crea el modelo optimizado.

In [None]:
# Build the optimized tree from the configuration the grid search actually
# selected, instead of re-typing the hyperparameters by hand. Hand-copied
# values silently drift from `grid.best_params_` when the grid or the data
# changes, and they dropped the estimator settings used during the search
# (class_weight="balanced", random_state=50).
# get_params() on the refitted best estimator returns the full constructor
# configuration, so this is a fresh, unfitted tree with identical settings.
optimized_model = DecisionTreeClassifier(**grid.best_estimator_.get_params())

In [None]:
# Fit the optimized tree on the same training split as the baseline.
optimized_model.fit(x_train, y_train)

In [None]:
# Predictions of the optimized tree on the held-out split.
y_pred2 = optimized_model.predict(x_test)

In [None]:
# Cross-tabulate true vs. predicted labels of the optimized tree and draw it.
disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_true=y_test, y_pred=y_pred2)
)
disp.plot()
plt.show()

In [None]:
# Draw the fitted optimized tree.
# feature_names is taken from the training frame and converted to a plain
# list: some scikit-learn releases reject a pandas Index here, and the
# notebook-level name `x` is rebound to a NumPy array further down, which
# would break this cell when it is re-run.
plt.figure(figsize=(20, 20))
tree.plot_tree(
    optimized_model,
    filled=True,
    class_names=["0", "1", "2"],
    feature_names=list(x_train.columns),
)
plt.show()

In [None]:
# Macro-averaged metrics of the optimized tree, computed the same way as for
# the baseline so the two sets of numbers are directly comparable.
precision_optimized = precision_score(y_true=y_test, y_pred=y_pred2, average="macro")
recall_optimized = recall_score(y_true=y_test, y_pred=y_pred2, average="macro")
f1_optimized = f1_score(y_true=y_test, y_pred=y_pred2, average="macro")

print(f"precision: {precision_optimized}")
print(f"recall: {recall_optimized}")
print(f"F1 Score : {f1_optimized}")

In [None]:
# Per-class precision / recall / F1 table for the optimized tree.
# `labels` must hold the label VALUES found in y_test / y_pred2, which are
# integers here; the strings "0", "1", "2" do not match them. Display names
# go in `target_names`.
# NOTE(review): assumes the target values are exactly 0, 1 and 2 - confirm.
print(
    classification_report(
        y_test,
        y_pred2,
        labels=[0, 1, 2],
        target_names=["0", "1", "2"],
    )
)

Podemos observar una mejora del modelo del árbol después de la optimización.


### Comparación de métricas

In [None]:
# Grouped bar chart comparing the macro-averaged metrics of both trees.
metrics = ['F1 Score', 'Recall', 'Precision']
original_scores = [f1_original, recall_original, precision_original]
optimized_scores = [f1_optimized, recall_optimized, precision_optimized]

# Use a dedicated name for the bar positions. Rebinding `x` here would
# overwrite the feature DataFrame that earlier cells read (e.g. `x.columns`),
# so those cells would fail if re-run after this one.
bar_positions = np.arange(len(metrics))
width = 0.35

fig, ax = plt.subplots(figsize=(8, 5))
# Shift each series half a bar width so the pairs sit side by side.
ax.bar(bar_positions - width / 2, original_scores, width, label='Modelo Original')
ax.bar(bar_positions + width / 2, optimized_scores, width, label='Modelo Optimizado')

ax.set_ylabel('Valor')
ax.set_title('Comparación de métricas entre modelos')
ax.set_xticks(bar_positions)
ax.set_xticklabels(metrics)
ax.set_ylim(0, 1)  # all three metrics are bounded to [0, 1]
ax.legend()
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()