## IMPORTANCIA DE CARACTERÍSTICAS DEL MODELO LIGHTGBM

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from sklearn.metrics import (roc_curve, auc, roc_auc_score, precision_score, 
                          recall_score, f1_score, accuracy_score, confusion_matrix)
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os
import json

# Load the dataset and separate the target column ('Morosidad') from the predictors.
df_mora_prestamos = pd.read_csv('D:/Tesis/Definitivos/nuevo/df_mora_prestamospersonales.csv')
y = df_mora_prestamos['Morosidad']
X = df_mora_prestamos.drop(columns='Morosidad')

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Configure the model with the best hyperparameters.
# NOTE(review): LightGBM documents that row subsampling only takes effect when
# `subsample_freq` > 0; it is not set here, so `subsample=0.6` may be inert — confirm.
lgbm_params = dict(
    subsample=0.6,
    reg_lambda=0.1,
    reg_alpha=0.1,
    num_leaves=127,
    n_estimators=900,
    min_child_samples=10,
    max_depth=-1,
    learning_rate=0.05,
    colsample_bytree=0.6,
    random_state=42,
)
model = lgb.LGBMClassifier(**lgbm_params)

print("Entrenando modelo LightGBM...")
model.fit(X_train, y_train)

# Hard class predictions and the probability of the second class (column 1) on the test set.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Confusion matrix on the test set; unpacked as (tn, fp, fn, tp) to derive specificity.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
specificity = tn / (tn + fp)

# ROC AUC is computed from the predicted probabilities; the remaining scores
# use the hard class predictions.
metrics = dict(
    ROC_AUC=roc_auc_score(y_test, y_pred_proba),
    Precision=precision_score(y_test, y_pred),
    Recall=recall_score(y_test, y_pred),
    Specificity=specificity,
    F1_Score=f1_score(y_test, y_pred),
    Accuracy=accuracy_score(y_test, y_pred),
)

# Points of the ROC curve and the area under it (used by the ROC plot).
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Create a timestamped output directory for this run.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
result_dir = f'D:/Tesis/Definitivos/nuevo/lightgbm_analysis_results_{timestamp}'
os.makedirs(result_dir, exist_ok=True)

# Feature-importance table: raw importance from the fitted model, its share of
# the total expressed as a percentage, sorted from most to least important.
feature_importance = (
    pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_,
    })
    .assign(
        importance_percentage=lambda frame: (
            frame['importance'] / frame['importance'].sum()
        ) * 100
    )
    .sort_values('importance', ascending=False)
)

# 1. Horizontal bar chart of the 20 most important features
top_20_features = feature_importance.head(20)
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(x='importance_percentage', y='feature', data=top_20_features, ax=ax)
ax.set_title('Top 20 Características más Importantes - LightGBM', fontsize=14)
ax.set_xlabel('Importancia (%)', fontsize=12)
ax.set_ylabel('Característica', fontsize=12)
fig.tight_layout()
fig.savefig(f'{result_dir}/feature_importance_top20.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# 2. Confusion-matrix heatmap (rows: actual class, columns: predicted class)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Matriz de Confusión - LightGBM', fontsize=14)
ax.set_ylabel('Real', fontsize=12)
ax.set_xlabel('Predicho', fontsize=12)
fig.savefig(f'{result_dir}/confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# 3. ROC curve with the chance diagonal as reference
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, 'b-', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
ax.plot([0, 1], [0, 1], 'k--', lw=2)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('Tasa de Falsos Positivos', fontsize=12)
ax.set_ylabel('Tasa de Verdaderos Positivos', fontsize=12)
ax.set_title('Curva ROC - LightGBM', fontsize=14)
ax.legend(loc="lower right")
ax.grid(True)
fig.savefig(f'{result_dir}/roc_curve.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# 4. Bar chart comparing recall (sensitivity) and specificity
plt.figure(figsize=(8, 6))
metrics_plot = [metrics['Recall'], metrics['Specificity']]
plt.bar(['Recall (Sensibilidad)', 'Especificidad'], metrics_plot,
        color=['skyblue', 'lightgreen'])
plt.title('Recall vs Especificidad - LightGBM', fontsize=14)
# Both scores live in [0, 1] and the value labels are drawn just above each bar,
# so leave headroom above 1.0; with a hard cap at 1.0 the labels of scores close
# to 1 end up on top of the axes frame / title.
plt.ylim([0, 1.1])
for i, v in enumerate(metrics_plot):
    plt.text(i, v + 0.01, f'{v:.4f}', ha='center', va='bottom')
plt.savefig(f'{result_dir}/recall_specificity.png', dpi=300, bbox_inches='tight')
plt.close()

# Collect everything produced by this run into one JSON-serialisable dict.
# The hyperparameters are read back from the estimator instead of being typed
# a second time, so the saved report cannot drift from the model that was
# actually trained.
tuned_param_names = (
    'subsample',
    'reg_lambda',
    'reg_alpha',
    'num_leaves',
    'n_estimators',
    'min_child_samples',
    'max_depth',
    'learning_rate',
    'colsample_bytree',
)
fitted_params = model.get_params()

results = {
    'model_name': 'LightGBM',
    'hyperparameters': {name: fitted_params[name] for name in tuned_param_names},
    'metrics': metrics,
    'confusion_matrix': cm.tolist(),
    'feature_importance': {
        'features': feature_importance['feature'].tolist(),
        'importance_scores': feature_importance['importance'].tolist(),
        'importance_percentages': feature_importance['importance_percentage'].tolist()
    }
}

# Persist the report as JSON and the importance table as CSV.
with open(f'{result_dir}/model_results.json', 'w') as f:
    json.dump(results, f, indent=4)

feature_importance.to_csv(f'{result_dir}/feature_importance.csv', index=False)

# Console report: metrics, confusion matrix and the full importance ranking.
print("\nResultados del modelo LightGBM:")
print("=" * 40)

print("\nMétricas principales:")
print("\n".join(f"{name}: {score:.4f}" for name, score in metrics.items()))

print("\nMatriz de Confusión:")
print(cm)

print("\nImportancia de características (ordenadas de mayor a menor):")
print("=" * 80)
importance_report = feature_importance[['feature', 'importance_percentage']]
print(importance_report.to_string(
    formatters={'importance_percentage': lambda pct: f'{pct:.4f}%'}
))

print(f"\nResultados guardados en: {result_dir}")

Entrenando modelo LightGBM...
[LightGBM] [Info] Number of positive: 65670, number of negative: 1072714
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054905 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3625
[LightGBM] [Info] Number of data points in the train set: 1138384, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.057687 -> initscore=-2.793305
[LightGBM] [Info] Start training from score -2.793305

Resultados del modelo LightGBM:

Métricas principales:
ROC_AUC: 0.9998
Precision: 0.9921
Recall: 0.9787
Specificity: 0.9995
F1_Score: 0.9853
Accuracy: 0.9983

Matriz de Confusión:
[[459515    220]
 [   600  27545]]

Importancia de características (ordenadas de mayor a menor):
                               feature importance_percentage
26         Deuda_Sist_Financiero_tukey               9.2901%
25      

## IMPORTANCIA DE CARACTERÍSTICAS MEDIANTE LA APLICACIÓN DE LA TÉCNICA SHAP

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import shap
import matplotlib.pyplot as plt
from datetime import datetime
import os
import json

# Load the dataset and separate the target column ('Morosidad') from the predictors.
df_mora_prestamos = pd.read_csv('D:/Tesis/Definitivos/nuevo/df_mora_prestamospersonales.csv')
y = df_mora_prestamos['Morosidad']
X = df_mora_prestamos.drop(columns='Morosidad')

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Configure and train the model.
# NOTE(review): LightGBM documents that row subsampling only takes effect when
# `subsample_freq` > 0; it is not set here, so `subsample=0.6` may be inert — confirm.
lgbm_params = dict(
    subsample=0.6,
    reg_lambda=0.1,
    reg_alpha=0.1,
    num_leaves=127,
    n_estimators=900,
    min_child_samples=10,
    max_depth=-1,
    learning_rate=0.05,
    colsample_bytree=0.6,
    random_state=42,
)
model = lgb.LGBMClassifier(**lgbm_params)

print("Entrenando modelo LightGBM...")
model.fit(X_train, y_train)

# Create a timestamped output directory for this run.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
result_dir = f'D:/Tesis/Definitivos/nuevo/lightgbm_shap_analysis_{timestamp}'
os.makedirs(result_dir, exist_ok=True)

# Compute SHAP values for every row of X (train and test together).
print("Calculando valores SHAP...")
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# The code below needs a single (n_samples, n_features) matrix. Some shap
# releases return, for classifiers, either a list with one array per class or
# a 3-D array whose last axis indexes the class; in both cases keep the last
# class (the positive one for a binary target). A plain 2-D result is unchanged.
if isinstance(shap_values, list):
    shap_values = shap_values[-1]
shap_values = np.asarray(shap_values)
if shap_values.ndim == 3:
    shap_values = shap_values[:, :, -1]

# shap.summary_plot resizes the current figure unless plot_size=None is given
# (its default is plot_size="auto"), which silently discarded the figsize set
# here. Passing plot_size=None keeps the requested size.

# 1. Summary plot (bar): mean absolute SHAP value per feature
plt.figure(figsize=(12, len(X.columns) * 0.3))
shap.summary_plot(shap_values, X, plot_type="bar", plot_size=None, show=False)
plt.title("Importancia de Variables (SHAP)", pad=20)
plt.tight_layout()
plt.savefig(f'{result_dir}/shap_importance_bar.png', dpi=300, bbox_inches='tight')
plt.close()

# 2. Summary plot (beeswarm): per-sample SHAP values for each feature
plt.figure(figsize=(12, len(X.columns) * 0.3))
shap.summary_plot(shap_values, X, plot_size=None, show=False)
plt.title("Impacto de Variables en la Predicción (SHAP)", pad=20)
plt.tight_layout()
plt.savefig(f'{result_dir}/shap_importance_beeswarm.png', dpi=300, bbox_inches='tight')
plt.close()

# Mean absolute SHAP value per feature. Computed once and reused: the original
# evaluated np.abs(shap_values).mean(0) three times over the full SHAP matrix.
mean_abs_shap = np.abs(shap_values).mean(0)

feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': mean_abs_shap,
    # Share of the total mean-|SHAP| mass, as a percentage.
    'importance_percentage': 100 * mean_abs_shap / mean_abs_shap.sum()
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Save the ranking.
feature_importance.to_csv(f'{result_dir}/shap_feature_importance.csv', index=False)

# Save the raw per-row SHAP values for later analysis.
shap_values_df = pd.DataFrame(shap_values, columns=X.columns)
shap_values_df.to_csv(f'{result_dir}/shap_values.csv', index=False)

# Dependence plot for each of the 5 features with the highest SHAP importance.
print("Generando gráficos de dependencia para las variables más importantes...")
top_features = feature_importance['feature'].head(5).tolist()
for feature in top_features:
    # shap.dependence_plot opens its own figure unless an Axes is supplied.
    # Creating a figure beforehand and not passing it left one empty 10x6
    # figure open per feature (shown as "<Figure size 1000x600 with 0 Axes>")
    # and the requested size never reached the saved plot. Drawing on an
    # explicit Axes fixes both.
    fig, ax = plt.subplots(figsize=(10, 6))
    shap.dependence_plot(feature, shap_values, X, ax=ax, show=False)
    ax.set_title(f'Gráfico de Dependencia - {feature}')
    fig.tight_layout()
    fig.savefig(f'{result_dir}/dependence_plot_{feature}.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

# Console report of the SHAP-based ranking.
print("\nImportancia de variables según SHAP (ordenadas de mayor a menor):")
print("=" * 80)
shap_report = feature_importance[['feature', 'importance_percentage']]
print(shap_report.to_string(
    formatters={'importance_percentage': lambda pct: f'{pct:.4f}%'}
))

# JSON summary: the full ranking plus the names of the top features.
importance_payload = {
    'features': feature_importance['feature'].tolist(),
    'importance_scores': feature_importance['importance'].tolist(),
    'importance_percentages': feature_importance['importance_percentage'].tolist(),
}
results = {
    'feature_importance': importance_payload,
    'top_features': top_features,
}

with open(f'{result_dir}/shap_analysis_results.json', 'w') as f:
    json.dump(results, f, indent=4)

# Closing summary of where the outputs went and what was generated.
for message in (
    f"\nResultados guardados en: {result_dir}",
    "\nVisualizaciones generadas:",
    "1. shap_importance_bar.png - Importancia general de variables",
    "2. shap_importance_beeswarm.png - Distribución del impacto de variables",
    "3. Gráficos de dependencia para las top 5 variables",
):
    print(message)

Entrenando modelo LightGBM...
[LightGBM] [Info] Number of positive: 65670, number of negative: 1072714
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061054 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3625
[LightGBM] [Info] Number of data points in the train set: 1138384, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.057687 -> initscore=-2.793305
[LightGBM] [Info] Start training from score -2.793305
Calculando valores SHAP...




Generando gráficos de dependencia para las variables más importantes...

Importancia de variables según SHAP (ordenadas de mayor a menor):
                               feature importance_percentage
15                  SituacionPonderada              23.6216%
13  Plazo_Promedio_PrestamosPersonales              15.2884%
12    Plazo_Maximo_PrestamosPersonales               8.4148%
27        Acreditaciones_sueldos_tukey               5.4989%
14                   Sit_Maxima_CENDEU               4.9957%
18                 TasaBADLAR_Promedio               4.1462%
22             CA_acreditaciones_tukey               4.0009%
23            CA_SaldoPromedio_$_tukey               3.9327%
25                    TotalDeuda_tukey               3.3681%
21                      Tasa_Ponderada               3.2010%
24   Importe_PrestamosPersonales_tukey               3.0062%
19                           Localidad               2.5345%
26         Deuda_Sist_Financiero_tukey               2.4598%
10     

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

In [8]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import shap
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os
import json

# Load the dataset and separate the target column ('Morosidad') from the predictors.
df_mora_prestamos = pd.read_csv('D:/Tesis/Definitivos/nuevo/df_mora_prestamospersonales.csv')
y = df_mora_prestamos['Morosidad']
X = df_mora_prestamos.drop(columns='Morosidad')

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Configure and train the model.
# NOTE(review): LightGBM documents that row subsampling only takes effect when
# `subsample_freq` > 0; it is not set here, so `subsample=0.6` may be inert — confirm.
lgbm_params = dict(
    subsample=0.6,
    reg_lambda=0.1,
    reg_alpha=0.1,
    num_leaves=127,
    n_estimators=900,
    min_child_samples=10,
    max_depth=-1,
    learning_rate=0.05,
    colsample_bytree=0.6,
    random_state=42,
)
model = lgb.LGBMClassifier(**lgbm_params)

print("Entrenando modelo LightGBM...")
model.fit(X_train, y_train)

# Create a timestamped output directory for this run.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
result_dir = f'D:/Tesis/Definitivos/nuevo/lightgbm_analysis_{timestamp}'
os.makedirs(result_dir, exist_ok=True)

# Compute SHAP values for every row of X (train and test together).
print("Calculando valores SHAP...")
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# The plots below need a single (n_samples, n_features) matrix. Some shap
# releases return, for classifiers, either a list with one array per class or
# a 3-D array whose last axis indexes the class; in both cases keep the last
# class (the positive one for a binary target). A plain 2-D result is unchanged.
if isinstance(shap_values, list):
    shap_values = shap_values[-1]
shap_values = np.asarray(shap_values)
if shap_values.ndim == 3:
    shap_values = shap_values[:, :, -1]

# shap.summary_plot resizes the current figure unless plot_size=None is given
# (its default is plot_size="auto"), which silently discarded the figsize set
# here. Passing plot_size=None keeps the requested size.

# 1. Summary plot (bar): mean absolute SHAP value per feature
plt.figure(figsize=(12, len(X.columns) * 0.3))
shap.summary_plot(shap_values, X, plot_type="bar", plot_size=None, show=False)
plt.title("Importancia de Variables (SHAP)", pad=20)
plt.tight_layout()
plt.savefig(f'{result_dir}/shap_importance_bar.png', dpi=300, bbox_inches='tight')
plt.close()

# 2. Summary plot (beeswarm): per-sample SHAP values for each feature
plt.figure(figsize=(12, len(X.columns) * 0.3))
shap.summary_plot(shap_values, X, plot_size=None, show=False)
plt.title("Impacto de Variables en la Predicción (SHAP)", pad=20)
plt.tight_layout()
plt.savefig(f'{result_dir}/shap_importance_beeswarm.png', dpi=300, bbox_inches='tight')
plt.close()

# 3. Focused analysis of a hand-picked set of variables
variables_interes = ['TasaBADLAR_Promedio', 'InflaciónMensual', 'RemuneracionPromedio']

# SHAP dependence plot for each selected variable.
for variable in variables_interes:
    # shap.dependence_plot opens its own figure unless an Axes is supplied.
    # Creating a figure beforehand and not passing it left one empty 10x6
    # figure open per variable (shown as "<Figure size 1000x600 with 0 Axes>")
    # and the requested size never reached the saved plot. Drawing on an
    # explicit Axes fixes both.
    fig, ax = plt.subplots(figsize=(10, 6))
    shap.dependence_plot(variable, shap_values, X, ax=ax, show=False)
    ax.set_title(f'Gráfico de Dependencia SHAP - {variable}')
    fig.tight_layout()
    fig.savefig(f'{result_dir}/dependence_plot_{variable}.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

# Subset (selected variables + target) used for the correlation matrix and the
# descriptive statistics computed after these plots.
df_analisis = df_mora_prestamos[variables_interes + ['Morosidad']]

# 4. Scatter plot of each selected variable against Morosidad
for variable in variables_interes:
    plt.figure(figsize=(10, 6))

    sns.scatterplot(data=df_mora_prestamos, x=variable, y='Morosidad', alpha=0.5)

    # Degree-1 least-squares trend line. A straight line is fully determined by
    # two points, so draw it between the smallest and largest x value instead
    # of through every (unsorted) row: same line, but the dashed style renders
    # cleanly and only two points are plotted rather than one per row.
    x_values = df_mora_prestamos[variable]
    z = np.polyfit(x_values, df_mora_prestamos['Morosidad'], 1)
    p = np.poly1d(z)
    x_line = np.array([x_values.min(), x_values.max()])
    plt.plot(x_line, p(x_line), "r--", alpha=0.8)

    plt.title(f'Relación entre {variable} y Morosidad')
    plt.xlabel(variable)
    plt.ylabel('Morosidad')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f'{result_dir}/scatter_plot_{variable}.png', dpi=300, bbox_inches='tight')
    plt.close()

# 5. Correlation matrix of the selected variables and the target, as a heatmap
correlation_matrix = df_analisis.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.3f', ax=ax)
ax.set_title('Matriz de Correlación')
fig.tight_layout()
fig.savefig(f'{result_dir}/correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# 6. Time-series panels (only when the dataset has a 'Fecha' column)
if 'Fecha' in df_mora_prestamos.columns:
    # One panel per selected variable plus a final one for Morosidad. The panel
    # count is derived from the list rather than hard-coded to 4, so changing
    # `variables_interes` cannot cause an index error or leave blank panels.
    # The extra plt.figure() call that preceded plt.subplots() was dropped: it
    # only created an empty figure that was never closed.
    n_panels = len(variables_interes) + 1
    fig, axes = plt.subplots(n_panels, 1, figsize=(15, 5 * n_panels), squeeze=False)
    axes = axes.ravel()
    fig.suptitle('Evolución Temporal de Variables', fontsize=16)

    # Plot each selected variable over time.
    for idx, variable in enumerate(variables_interes):
        axes[idx].plot(df_mora_prestamos['Fecha'], df_mora_prestamos[variable], 'b-')
        axes[idx].set_title(f'Evolución de {variable}')
        axes[idx].grid(True)

    # Plot the target in the last panel.
    axes[-1].plot(df_mora_prestamos['Fecha'], df_mora_prestamos['Morosidad'], 'r-')
    axes[-1].set_title('Evolución de Morosidad')
    axes[-1].grid(True)

    fig.tight_layout()
    fig.savefig(f'{result_dir}/temporal_evolution.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

# Correlation of every analysed column with the target, plus descriptive
# statistics of the analysed subset, saved as JSON.
morosidad_corr = correlation_matrix['Morosidad']
stats = {
    'correlations': morosidad_corr.to_dict(),
    'variables_stats': df_analisis.describe().to_dict(),
}

with open(f'{result_dir}/analysis_stats.json', 'w') as f:
    json.dump(stats, f, indent=4)

# Console report.
print("\nCorrelaciones con Morosidad:")
print(morosidad_corr.sort_values(ascending=False))

print("\nResultados guardados en:", result_dir)
for message in (
    "\nArchivos generados:",
    "1. Gráficos SHAP",
    "2. Gráficos de dispersión",
    "3. Matriz de correlación",
    "4. Gráficos de evolución temporal (si aplica)",
    "5. Estadísticas descriptivas",
):
    print(message)


Entrenando modelo LightGBM...
[LightGBM] [Info] Number of positive: 65670, number of negative: 1072714
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.058305 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3625
[LightGBM] [Info] Number of data points in the train set: 1138384, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.057687 -> initscore=-2.793305
[LightGBM] [Info] Start training from score -2.793305
Calculando valores SHAP...





Correlaciones con Morosidad:
Morosidad               1.000000
RemuneracionPromedio    0.024374
TasaBADLAR_Promedio     0.023638
InflaciónMensual        0.018184
Name: Morosidad, dtype: float64

Resultados guardados en: D:/Tesis/Definitivos/nuevo/lightgbm_analysis_20250206_000038

Archivos generados:
1. Gráficos SHAP
2. Gráficos de dispersión
3. Matriz de correlación
4. Gráficos de evolución temporal (si aplica)
5. Estadísticas descriptivas


<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>