<a href="https://colab.research.google.com/github/masags17/Python/blob/main/heart_cleveland_upload.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime so files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the dataset folder; the CSV is read below by bare file name.
import os
os.chdir("/content/drive/MyDrive/Dataset/")
# IPython shell escape (not plain Python): upgrade scikit-learn before it is imported below.
!pip install --upgrade scikit-learn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.feature_selection import SelectKBest, f_regression, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.tree import plot_tree


# Linear regression on heart_cleveland_upload.csv
data = pd.read_csv('heart_cleveland_upload.csv')

# 1. Exploratory data analysis
# Text output first: per-column descriptive statistics, then the pairwise
# correlation matrix of the columns.
print("--- Análisis Exploratorio de Datos ---")
print("Resumen estadístico:")
summary_stats = data.describe()
print(summary_stats)

print("\nMatriz de correlación:")
correlations = data.corr()
print(correlations)

# One histogram per column, drawn on a single 10x10 figure.
hist_options = {"bins": 20, "figsize": (10, 10)}
data.hist(**hist_options)
plt.tight_layout()
plt.show()

# 2. Data preprocessing
# Separate the predictors from the 'condition' target column.
print("--- Preprocesamiento de Datos ---")
X = data.drop('condition', axis=1)
y = data['condition']

# 4. Train/test split
# The split is performed BEFORE fitting the scaler and the feature selector.
# Fitting either of them on the full dataset lets statistics of the test rows
# influence the training pipeline (data leakage) and makes the test metrics
# optimistic. Splitting the DataFrame yields the same row partition as
# splitting the scaled array, because the partition depends only on the number
# of rows and random_state.
print("--- División en Train y Test ---")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize with mean/std learned from the training rows only. The scaled
# arrays are wrapped back into DataFrames so the column names survive; without
# them the selector can only report generic names such as "x0", "x1", ...
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# 3. Feature selection
# Keep the 5 features with the highest univariate F-score, ranked on the
# training rows only; the test rows are merely reduced to the same columns.
print("--- Selección de Características ---")
selector = SelectKBest(f_regression, k=5)
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)

# 5. Model training
# Ordinary least-squares fit on the training split (fit() returns the estimator).
print("--- Entrenamiento del Modelo ---")
reg = LinearRegression().fit(X_train, y_train)

# 6. Model evaluation
# Predict on the held-out split and report mean absolute error and R^2.
print("--- Evaluación del Modelo ---")
y_pred = reg.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae:.3f}")
r2 = r2_score(y_test, y_pred)
print(f"R^2: {r2:.3f}")

# 7. Result plots
# Scatter of true vs. predicted values on a square axis, with the y = x
# reference line drawn on top.
print("--- Gráficas de Resultados ---")
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(y_test, y_pred)
ax.set_xlabel('True Values')
ax.set_ylabel('Predictions')
ax.axis('equal')
ax.axis('square')
# Clamp the lower bound of both axes to 0 while keeping the current upper bound.
ax.set_xlim([0, ax.get_xlim()[1]])
ax.set_ylim([0, ax.get_ylim()[1]])
_ = ax.plot([-100, 100], [-100, 100])
fig.tight_layout()
plt.show()

# Bar chart of the fitted coefficients, labelled with the names the selector
# reports for the features it kept.
selected_names = selector.get_feature_names_out()
coefs = pd.Series(reg.coef_, index=selected_names)
coefs.plot(kind='bar', figsize=(8, 5))
plt.tight_layout()
plt.show()

# 8. Interpretation and analysis of results
# Rank the features by absolute coefficient and show the three largest.
print("--- Interpretación y Análisis de Resultados ---")
print("Las variables más importantes son:")
top_coefs = coefs.abs().nlargest(3)
print(top_coefs)

# Logistic regression on heart_cleveland_upload.csv
# The CSV is re-read so this section does not depend on earlier changes to `data`.
data = pd.read_csv('heart_cleveland_upload.csv')

# 1. Exploratory data analysis
# Per-column descriptive statistics, followed by one histogram per column.
print("--- Análisis Exploratorio de Datos ---")
print("Resumen estadístico:")
summary_stats = data.describe()
print(summary_stats)

hist_options = {"bins": 20, "figsize": (10, 10)}
data.hist(**hist_options)
plt.tight_layout()
plt.show()

# 2. Data preprocessing
# Separate the predictors from the 'condition' target column.
print("--- Preprocesamiento de Datos ---")
X = data.drop('condition', axis=1)
y = data['condition']

# 4. Train/test split
# The split is performed BEFORE fitting the scaler and the feature selector.
# Fitting either of them on the full dataset lets statistics (and labels) of
# the test rows influence the training pipeline (data leakage) and makes the
# test metrics optimistic. Splitting the DataFrame yields the same row
# partition as splitting the scaled array, because the partition depends only
# on the number of rows and random_state.
print("--- División en Train y Test ---")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize with mean/std learned from the training rows only. The scaled
# arrays are wrapped back into DataFrames so the column names survive; without
# them the selector can only report generic names such as "x0", "x1", ...
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# 3. Feature selection
# Keep the 5 features with the highest ANOVA F-score, ranked on the training
# rows only; the test rows are merely reduced to the same columns.
print("--- Selección de Características ---")
selector = SelectKBest(f_classif, k=5)
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)

# 5. Model training
print("--- Entrenamiento del Modelo ---")
clf = LogisticRegression(C=1.0)
clf.fit(X_train, y_train)

# 6. Model evaluation
# Accuracy, precision, recall and F1 are threshold metrics and are computed
# from the hard class predictions. AUC is a ranking metric: it must be fed the
# continuous score of the positive class. Passing 0/1 predictions collapses the
# ROC curve to a single operating point and misreports the AUC.
print("--- Evaluación del Modelo ---")
y_pred = clf.predict(X_test)
# Column 1 of predict_proba belongs to clf.classes_[1], i.e. the larger label.
# NOTE(review): this assumes 'condition' is a binary label, which the
# binary-default precision/recall/F1 calls below already require.
y_score = clf.predict_proba(X_test)[:, 1]
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(f"Precision: {precision_score(y_test, y_pred):.3f}")
print(f"Recall: {recall_score(y_test, y_pred):.3f}")
print(f"F1-score: {f1_score(y_test, y_pred):.3f}")
print(f"AUC: {roc_auc_score(y_test, y_score):.3f}")

# 7. Result plots
print("--- Gráficas de Resultados ---")
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix of the hard predictions, drawn as an annotated heatmap with
# integer cell labels. heatmap() returns the Axes it drew on, which is then
# used to set the axis labels.
cm = confusion_matrix(y_test, y_pred)
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap=plt.cm.Blues)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('True')
heatmap_ax.figure.tight_layout()
plt.show()

from sklearn.metrics import roc_curve, auc

# The ROC curve sweeps a decision threshold over a continuous score, so it is
# built from the predicted probability of the positive class. With hard 0/1
# predictions there is only one threshold and the "curve" degenerates into two
# straight segments through a single operating point.
roc_scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, roc_scores)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:0.2f})')
# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.tight_layout()
plt.show()

# Calibration data: bucket the predicted positive-class probabilities into
# equal-width bins over [0, 1] and, per bin, compare the mean predicted
# probability with the mean of the true labels.
prob_pos = clf.predict_proba(X_test)[:, 1]

n_bins = 10
bin_edges = np.linspace(0., 1., n_bins + 1)
# One more slot than bins: digitize() places a value equal to the last edge
# (probability exactly 1.0) at index n_bins after the -1 shift.
n_slots = len(bin_edges)
bin_indices = np.digitize(prob_pos, bin_edges) - 1

# Per-slot sample count, sum of predicted probabilities and sum of true labels.
bin_total = np.bincount(bin_indices, minlength=n_slots)
bin_sums = np.bincount(bin_indices, weights=prob_pos, minlength=n_slots)
bin_true = np.bincount(bin_indices, weights=y_test, minlength=n_slots)

# Drop empty slots before dividing so no 0/0 is evaluated.
occupied = bin_total > 0
counts = bin_total[occupied]
prob_true = bin_true[occupied] / counts
prob_pred = bin_sums[occupied] / counts

# Calibration plot: mean predicted probability per bin against the empirical
# positive rate, with the diagonal as the perfectly-calibrated reference.
calib_fig, calib_ax = plt.subplots()
calib_ax.plot(prob_pred, prob_true, "s-")
calib_ax.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
calib_ax.set_xlabel("Predicted probability")
calib_ax.set_ylabel("Empirical probability")
calib_ax.set_ylim([-0.05, 1.05])
calib_ax.legend(loc="lower right")
calib_fig.tight_layout()
plt.show()

# 8. Interpretation and analysis of results
# Rank the selected features by the absolute value of their coefficient in the
# first (only, for a binary target) row of clf.coef_ and show the three largest.
print("--- Interpretación y Análisis de Resultados ---")
print("Las variables más importantes son:")
logit_coefs = pd.Series(clf.coef_[0], index=selector.get_feature_names_out())
top_logit_coefs = logit_coefs.abs().nlargest(3)
print(top_logit_coefs)