In [None]:
# Enhanced Decision Tree with Friendly Visualization
# Based on diabetes prediction dataset

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np

# Global plot appearance. The palette is set after the style sheet is applied,
# presumably so the style sheet's own colour cycle does not replace it.
plt.style.use("seaborn-v0_8")
sns.set_palette(palette="husl")

################################# DATA HANDLING ####################################################
# Names imposed on the CSV columns, in file order; 'label' is the target column.
col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']

# Load the dataset. header=None makes pandas treat EVERY line as data and
# label the columns with col_names.
pima = pd.read_csv("diabetes.csv", header=None, names=col_names)

# Copies of this file may ship with a header row. With header=None such a row
# would land in the frame as text, turn every column into strings and make
# model fitting fail later. If the first row holds a non-missing value that is
# not numeric, treat it as a header: drop it and convert the rest to numbers.
# A header-less file is left exactly as it was read.
if not pima.empty:
    first_row = pima.iloc[0]
    non_numeric = first_row.notna() & pd.to_numeric(first_row, errors="coerce").isna()
    if non_numeric.any():
        # pd.to_numeric raises on any remaining non-numeric cell, so bad data
        # surfaces here instead of deep inside the model code.
        pima = pima.iloc[1:].reset_index(drop=True).apply(pd.to_numeric)

# Human-readable feature names used only for visualization.
# 'skin' has no entry because it is not used as a model feature below.
feature_mapping = {
    'pregnant': 'Pregnancies',
    'insulin': 'Insulin Level',
    'bmi': 'BMI',
    'age': 'Age',
    'glucose': 'Glucose Level',
    'bp': 'Blood Pressure',
    'pedigree': 'Diabetes Pedigree'
}

# Model inputs (X) and target (y).
feature_cols = ['pregnant', 'insulin', 'bmi', 'age', 'glucose', 'bp', 'pedigree']
X = pima[feature_cols]
y = pima['label']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [None]:

################################# MACHINE LEARNING PART ####################################################
# Build a deliberately shallow tree so the plot stays readable.
# - criterion / max_depth are the two hyper-parameters to tune here; see
#   https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
#   The values below are a starting point, not a tuned optimum.
# - random_state pins the estimator's internal randomness so the reported
#   accuracy is the same on every Restart & Run All.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=1)
clf = clf.fit(X_train, y_train)

# Predict on the held-out rows and report the share of correct predictions.
y_pred = clf.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2%}")


In [None]:

#################################  VISUALIZATION ####################################################

# 1. Decision tree plot
# Translate the short column names into their display names, in feature order.
display_names = [feature_mapping[col] for col in feature_cols]
tree_title = ('🌳 Diabetes Prediction Decision Tree\n'
              f'Model Accuracy: {accuracy:.1%}')

# Draw on an explicit Figure/Axes pair instead of pyplot's implicit state.
tree_fig, tree_ax = plt.subplots(figsize=(20, 12))
plot_tree(
    clf,
    feature_names=display_names,
    class_names=['No Diabetes', 'Has Diabetes'],
    filled=True,       # colour the nodes
    rounded=True,      # rounded node boxes
    fontsize=12,
    proportion=True,   # show proportions rather than raw sample counts
    impurity=False,    # hide the impurity value in each node
    ax=tree_ax,
)
tree_ax.set_title(tree_title, fontsize=20, fontweight='bold', pad=20)

# Save a high-resolution copy next to the notebook, then display the figure.
tree_fig.tight_layout()
tree_fig.savefig('friendly_decision_tree.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:

# 2. Model Performance Visualization
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

# Confusion matrix of actual vs. predicted classes on the test set,
# rendered as an annotated heatmap.
class_labels = ['No Diabetes', 'Has Diabetes']
cm = confusion_matrix(y_test, y_pred)

cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # integer cell annotations
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    cbar_kws={'label': 'Count'},
    ax=cm_ax,
)
cm_ax.set_title('🎯 Model Performance: Confusion Matrix', fontsize=16, fontweight='bold', pad=20)
cm_ax.set_xlabel('Predicted', fontsize=12, fontweight='bold')
cm_ax.set_ylabel('Actual', fontsize=12, fontweight='bold')

# Save a high-resolution copy next to the notebook, then display the figure.
cm_fig.tight_layout()
cm_fig.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:

# 3. Summary statistics: accuracy, size of the fitted tree, and dataset sizes.
rule = "=" * 60
summary_lines = (
    "\n" + rule,
    "MODEL SUMMARY",
    rule,
    f"Accuracy: {accuracy:.1%}",
    f"Tree Depth: {clf.tree_.max_depth}",
    f"Number of Leaves: {clf.tree_.n_leaves}",
    f"Total Samples: {len(X)}",
    f"Test Samples: {len(X_test)}",
)
for line in summary_lines:
    print(line)

# 4. Classification report for the test-set predictions.
print("\n📋 Detailed Classification Report:")
report = classification_report(y_test, y_pred, target_names=['No Diabetes', 'Has Diabetes'])
print(report)