In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample

# 1. Cargar el dataset
df = pd.read_csv("defunciones_filtradas.csv")

# 2. Preprocesamiento
# Verificar balance de clases
print("Distribución de clases:")
print(df['causa'].value_counts(normalize=True))

# Separar características y objetivo
X = df.drop(columns=['causa'])
y = df['causa']

# Codificar variables categóricas
X = pd.get_dummies(X, drop_first=True)  # drop_first para reducir multicolinealidad

# Manejar valores faltantes (imputar con mediana para numéricas)
X = X.fillna(X.median())

# Codificar la variable objetivo
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Balancear clases si es necesario (oversampling de la clase minoritaria)
df_majority = df[df['causa'] == 'I219']
df_minority1 = df[df['causa'] == 'E149']
df_minority2 = df[df['causa'] == 'J189']

if len(df_majority) > len(df_minority1):
    df_minority1_upsampled = resample(df_minority1, replace=True, n_samples=len(df_majority), random_state=42)
    df_minority2_upsampled = resample(df_minority2, replace=True, n_samples=len(df_majority), random_state=42)
    df_balanced = pd.concat([df_majority, df_minority1_upsampled, df_minority2_upsampled])
    X_balanced = pd.get_dummies(df_balanced.drop(columns=['causa']), drop_first=True).fillna(X.median())
    y_balanced = le.fit_transform(df_balanced['causa'])
else:
    X_balanced = X
    y_balanced = y_encoded

# Escalar características
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_balanced)

# 3. División de datos (85/15)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_balanced, test_size=0.15, random_state=42)

# 4. Optimización de hiperparámetros con GridSearchCV
param_grid = {
    'hidden_layer_sizes': [(30,), (50,), (50, 30), (100, 50)],
    'learning_rate_init': [0.001, 0.01],
    'max_iter': [300, 500]
}
mlp = MLPClassifier(random_state=42)
grid_search = GridSearchCV(mlp, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Mejor modelo
best_model = grid_search.best_estimator_
print("Mejores hiperparámetros:", grid_search.best_params_)



  df = pd.read_csv("defunciones_filtradas.csv")


Distribución de clases:
causa
I219    0.435412
J189    0.328745
E149    0.235843
Name: proportion, dtype: float64
Mejores hiperparámetros: {'hidden_layer_sizes': (100, 50), 'learning_rate_init': 0.001, 'max_iter': 500}


In [4]:
# 5. Evaluación del modelo
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_)
print(f"Accuracy: {accuracy:.2f}")
print("Reporte de clasificación:")
print(report)



Accuracy: 0.63
Reporte de clasificación:
              precision    recall  f1-score   support

        E149       0.60      0.70      0.65      9313
        I219       0.57      0.49      0.53      9167
        J189       0.70      0.69      0.69      9419

    accuracy                           0.63     27899
   macro avg       0.63      0.63      0.62     27899
weighted avg       0.63      0.63      0.62     27899



In [5]:
# 6. Matriz de confusión normalizada
cm = confusion_matrix(y_test, y_pred)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
plt.figure(figsize=(8, 6))
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
plt.title("Matriz de Confusión Normalizada")
plt.xlabel("Etiqueta Predicha")
plt.ylabel("Etiqueta Real")
plt.savefig('confusion_matrix.png')
plt.close()

In [6]:

# 7. Curvas ROC
plt.figure(figsize=(10, 8))
y_test_bin = pd.get_dummies(y_test)
y_score = best_model.predict_proba(X_test)
for i, class_name in enumerate(le.classes_):
    fpr, tpr, _ = roc_curve(y_test_bin.iloc[:, i], y_score[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{class_name} (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('Tasa de Falsos Positivos')
plt.ylabel('Tasa de Verdaderos Positivos')
plt.title('Curvas ROC por Clase')
plt.legend(loc='best')
plt.savefig('roc_curves.png')
plt.close()



In [7]:
# 8. Importancia de características (permutación)
from sklearn.inspection import permutation_importance
perm_importance = permutation_importance(best_model, X_test, y_test, n_repeats=10, random_state=42)
sorted_idx = perm_importance.importances_mean.argsort()[-10:]  # Top 10 características
plt.figure(figsize=(10, 6))
plt.barh(range(10), perm_importance.importances_mean[sorted_idx], xerr=perm_importance.importances_std[sorted_idx])
plt.yticks(range(10), [X.columns[i] for i in sorted_idx])
plt.xlabel('Importancia (Permutación)')
plt.title('Top 10 Características Más Importantes')
plt.savefig('feature_importance.png')
plt.close()

print("Gráficos generados: 'confusion_matrix.png', 'roc_curves.png', 'feature_importance.png'")

Gráficos generados: 'confusion_matrix.png', 'roc_curves.png', 'feature_importance.png'


In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample

# 1. Cargar el dataset
df = pd.read_csv("defunciones_filtradas.csv")

# 2. Preprocesamiento
# Separar características y objetivo
X = df.drop(columns=['causa'])
y = df['causa']

# Codificar variables categóricas
X = pd.get_dummies(X, drop_first=True)

# Imputar valores faltantes con mediana
X = X.fillna(X.median())

# Codificar la variable objetivo
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Balancear clases (oversampling)
df_majority = df[df['causa'] == 'J189']  # Clase con más ejemplos (9419)
df_i219 = df[df['causa'] == 'I219']
df_e149 = df[df['causa'] == 'E149']

df_i219_upsampled = resample(df_i219, replace=True, n_samples=len(df_majority), random_state=42)
df_e149_upsampled = resample(df_e149, replace=True, n_samples=len(df_majority), random_state=42)
df_balanced = pd.concat([df_majority, df_i219_upsampled, df_e149_upsampled])

X_balanced = pd.get_dummies(df_balanced.drop(columns=['causa']), drop_first=True).fillna(X.median())
y_balanced = le.fit_transform(df_balanced['causa'])

# Escalar características
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_balanced)

# 3. División de datos (85/15)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_balanced, test_size=0.15, random_state=42)

# 4. Modelo con pesos ajustados para priorizar I219
# Asignar mayor peso a I219 (índice 1 después de LabelEncoder)
class_weights = {0: 1.0, 1: 1.5, 2: 1.0}  # I219 tiene peso mayor
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), learning_rate_init=0.001, max_iter=500, random_state=42)
mlp.fit(X_train, y_train)

# 5. Evaluación
y_pred = mlp.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_)
print(f"Accuracy: {accuracy:.2f}")
print("Reporte de clasificación:")
print(report)

# 6. Matriz de confusión normalizada
cm = confusion_matrix(y_test, y_pred)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
plt.figure(figsize=(8, 6))
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
plt.title("Matriz de Confusión Normalizada (Optimizada para I219)")
plt.xlabel("Etiqueta Predicha")
plt.ylabel("Etiqueta Real")
plt.savefig('confusion_matrix_i219.png')
plt.close()

print("Gráfico generado: 'confusion_matrix_i219.png'")

  df = pd.read_csv("defunciones_filtradas.csv")


Accuracy: 0.62
Reporte de clasificación:
              precision    recall  f1-score   support

        E149       0.59      0.67      0.63      7059
        I219       0.56      0.55      0.55      6926
        J189       0.71      0.64      0.67      7080

    accuracy                           0.62     21065
   macro avg       0.62      0.62      0.62     21065
weighted avg       0.62      0.62      0.62     21065

Gráfico generado: 'confusion_matrix_i219.png'


In [None]:
from sklearn.metrics import roc_curve, auc
import pandas as pd


y_test_bin = pd.get_dummies(y_test)
y_score = mlp.predict_proba(X_test)

# Calcular AUC por clase
for i, class_name in enumerate(le.classes_):
    fpr, tpr, _ = roc_curve(y_test_bin.iloc[:, i], y_score[:, i])
    roc_auc = auc(fpr, tpr)
    print(f"AUC para {class_name}: {roc_auc:.2f}")

AUC para E149: 0.81
AUC para I219: 0.75
AUC para J189: 0.84


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample

# 1. Cargar el dataset
df = pd.read_csv("defunciones_filtradas.csv")

# 2. Preprocesamiento
X = df.drop(columns=['causa'])
y = df['causa']

# Codificar variables categóricas
X = pd.get_dummies(X, drop_first=True)

# Imputar valores faltantes con mediana
X = X.fillna(X.median())

# Codificar la variable objetivo
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Balancear clases (oversampling)
df_majority = df[df['causa'] == 'J189']
df_i219 = df[df['causa'] == 'I219']
df_e149 = df[df['causa'] == 'E149']

df_i219_upsampled = resample(df_i219, replace=True, n_samples=len(df_majority), random_state=42)
df_e149_upsampled = resample(df_e149, replace=True, n_samples=len(df_majority), random_state=42)
df_balanced = pd.concat([df_majority, df_i219_upsampled, df_e149_upsampled])

X_balanced = pd.get_dummies(df_balanced.drop(columns=['causa']), drop_first=True).fillna(X.median())
y_balanced = le.fit_transform(df_balanced['causa'])

# Escalar características
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_balanced)

# 3. División de datos (85/15)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_balanced, test_size=0.15, random_state=42)

# 4. Modelo con pesos ajustados (peso menor para I219)
class_weights = {0: 1.0, 1: 1.2, 2: 1.0}  # Reducimos peso de I219 de 1.5 a 1.2
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), learning_rate_init=0.001, max_iter=600, random_state=42)
mlp.fit(X_train, y_train)

# 5. Evaluación
y_pred = mlp.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_)
print(f"Accuracy: {accuracy:.2f}")
print("Reporte de clasificación:")
print(report)

# 6. Matriz de confusión normalizada
cm = confusion_matrix(y_test, y_pred)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
plt.figure(figsize=(8, 6))
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
plt.title("Matriz de Confusión Normalizada (Peso Ajustado para I219)")
plt.xlabel("Etiqueta Predicha")
plt.ylabel("Etiqueta Real")
plt.savefig('confusion_matrix_balanced_i219.png')
plt.close()

print("Gráfico generado: 'confusion_matrix_balanced_i219.png'")

  df = pd.read_csv("defunciones_filtradas.csv")
