# 📊 NeoScore - Exploratory Data Analysis (EDA)

**Autor**: Luca Camus  
**Fecha**: Enero 2026  
**Objetivo**: Explorar la tabla `customer_features` para entender patrones de riesgo crediticio

## 1. Configuración

In [None]:
# Install the packages this notebook imports; --quiet suppresses pip's progress output
!pip install google-cloud-bigquery pandas matplotlib seaborn --quiet

In [None]:
# Authenticate the current Colab user
# (presumably so the BigQuery client created below can pick up the credentials — confirm)
from google.colab import auth
auth.authenticate_user()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery

# Global look of every figure in the notebook: style sheet, colour palette,
# default figure size and base font size.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 12,
})

# BigQuery client bound to the project that hosts the analysis tables
PROJECT_ID = 'scoring-bancario'
client = bigquery.Client(project=PROJECT_ID)

print('‚úÖ Configuraci√≥n completa')

## 2. Cargar Datos desde BigQuery

In [None]:
# Pull the complete customer_features table into memory as a DataFrame
query = (
    "\n"
    "SELECT *\n"
    "FROM `scoring-bancario.analisis_bancario.customer_features`\n"
)

query_job = client.query(query)
df = query_job.to_dataframe()

# Report the size of what was loaded (rows = customers, columns = features)
n_clientes, n_features = df.shape
print(f'üìä Dataset cargado: {n_clientes:,} clientes x {n_features} features')

In [None]:
# Preview: first five rows of the loaded table
df.head(n=5)

In [None]:
# Dataset overview: column names, dtypes, non-null counts and memory usage
df.info()

## 3. Estadísticas Descriptivas

In [None]:
# Summary statistics of the numeric columns, transposed so each feature is a row
df.describe().transpose()

In [None]:
# Missing-value report: per-column count of nulls and their share of all rows (%)
null_counts = df.isna().sum()
null_pct = null_counts.div(len(df)).mul(100)
null_df = pd.DataFrame({'nulos': null_counts, 'porcentaje': null_pct})

# Show only the columns that actually have nulls, worst first
has_nulls = null_df['nulos'].gt(0)
null_df.loc[has_nulls].sort_values(by='porcentaje', ascending=False)

## 4. Análisis de la Variable Objetivo: `high_risk_flag`

In [None]:
# Class balance of the target variable.
# value_counts() orders its result by frequency, not by label, so both series
# are re-sorted by label (0 first, then 1). Without this, the hard-coded
# 'Bajo Riesgo' / 'Alto Riesgo' labels and colours used for the charts below
# would be attached to the wrong class whenever class 1 is the majority.
risk_counts = df['high_risk_flag'].value_counts().sort_index()
risk_pct = df['high_risk_flag'].value_counts(normalize=True).sort_index() * 100

print('üìä Distribuci√≥n de Riesgo:')
print(f'   Bajo riesgo (0): {risk_counts[0]:,} ({risk_pct[0]:.2f}%)')
print(f'   Alto riesgo (1): {risk_counts[1]:,} ({risk_pct[1]:.2f}%)')

# Two views of the same counts: absolute (bar chart) and relative (pie chart)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Labels and colours are positional: index 0 -> low risk, index 1 -> high risk
risk_labels = ['Bajo Riesgo', 'Alto Riesgo']
colors = ['#2ecc71', '#e74c3c']

# Bar chart with the exact count written above each bar
axes[0].bar(risk_labels, risk_counts.values, color=colors)
axes[0].set_title('Distribuci√≥n de Clientes por Riesgo', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Cantidad de Clientes')
for i, v in enumerate(risk_counts.values):
    axes[0].text(i, v + 500, f'{v:,}', ha='center', fontweight='bold')

# Pie chart; the high-risk slice is slightly pulled out for emphasis
axes[1].pie(risk_counts.values, labels=risk_labels,
            autopct='%1.1f%%', colors=colors, explode=[0, 0.05],
            shadow=True, startangle=90)
axes[1].set_title('Proporci√≥n de Riesgo', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

## 5. 🎯 SEGMENTACIÓN: High Risk vs Low Risk (CRÍTICO para modelado)

**Objetivo**: Verificar si hay diferencia visual clara entre grupos. Si la hay, el modelo funcionará.

In [None]:
# High-risk vs. low-risk comparison in three panels:
# mean balance, mean spend, and the balance distribution per group.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
ax_balance, ax_spend, ax_box = axes

# Row subsets per risk class (the metric-comparison cell reads these too)
low_risk = df.loc[df['high_risk_flag'] == 0]
high_risk = df.loc[df['high_risk_flag'] == 1]

# Shared by both bar charts; position 0 is the flag=0 group, position 1 the flag=1 group
group_labels = ['Bajo Riesgo', 'Alto Riesgo']
colors = ['#2ecc71', '#e74c3c']

# --- Panel 1: mean balance per risk group --------------------------------
balance_by_risk = df.groupby('high_risk_flag')['avg_balance'].mean()
bars = ax_balance.bar(group_labels, balance_by_risk.values, color=colors)
ax_balance.set_title('Balance Promedio por Nivel de Riesgo', fontsize=14, fontweight='bold')
ax_balance.set_ylabel('Balance Promedio (INR)')
for bar, mean_balance in zip(bars, balance_by_risk.values):
    x_center = bar.get_x() + bar.get_width() / 2
    ax_balance.text(x_center, bar.get_height() + 1000, f'{mean_balance:,.0f}',
                    ha='center', fontweight='bold', fontsize=12)

# Relative gap of the low-risk mean over the high-risk mean, shown as a badge
mean_low = balance_by_risk[0]
mean_high = balance_by_risk[1]
diff_pct = (mean_low - mean_high) / mean_high * 100
ax_balance.annotate(f'Diferencia: {diff_pct:.1f}%', xy=(0.5, 0.95), xycoords='axes fraction',
                    ha='center', fontsize=12, color='navy', fontweight='bold',
                    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

# --- Panel 2: mean spend per risk group ----------------------------------
spend_by_risk = df.groupby('high_risk_flag')['avg_spend'].mean()
bars = ax_spend.bar(group_labels, spend_by_risk.values, color=colors)
ax_spend.set_title('Gasto Promedio por Nivel de Riesgo', fontsize=14, fontweight='bold')
ax_spend.set_ylabel('Gasto Promedio (INR)')
for bar, mean_spend in zip(bars, spend_by_risk.values):
    x_center = bar.get_x() + bar.get_width() / 2
    ax_spend.text(x_center, bar.get_height() + 50, f'{mean_spend:,.0f}',
                  ha='center', fontweight='bold', fontsize=12)

# --- Panel 3: balance box plot per risk group ----------------------------
# Balances are capped at the 95th percentile so extreme values do not flatten the boxes
df_clipped = df.copy()
balance_cap = df_clipped['avg_balance'].quantile(0.95)
df_clipped['avg_balance_clipped'] = df_clipped['avg_balance'].clip(upper=balance_cap)
df_clipped.boxplot(column='avg_balance_clipped', by='high_risk_flag', ax=ax_box)
ax_box.set_title('Distribuci√≥n de Balance por Riesgo', fontsize=14, fontweight='bold')
ax_box.set_xlabel('Alto Riesgo (0=No, 1=S√≠)')
ax_box.set_ylabel('Balance (INR) - Clipped 95%')
plt.suptitle('')  # blank out the figure-level title

plt.tight_layout()
plt.show()

# Text summary; a gap above 20% in absolute value is reported as a clear difference
if abs(diff_pct) > 20:
    verdict = "‚úÖ HAY DIFERENCIA CLARA"
else:
    verdict = "‚ö†Ô∏è Diferencia moderada"
print('\nüìä INTERPRETACI√ìN:')
print(f'   ‚Ä¢ Balance promedio BAJO RIESGO: {mean_low:,.0f} INR')
print(f'   ‚Ä¢ Balance promedio ALTO RIESGO: {mean_high:,.0f} INR')
print(f'   ‚Ä¢ Diferencia: {diff_pct:.1f}% - {verdict}')
print('\n   ‚Üí Si hay diferencia visual clara, el modelo funcionar√°.')

In [None]:
# Table of group means for the key metrics, plus the relative gap of the
# low-risk mean over the high-risk mean (in % of the high-risk mean).
separator = '=' * 60
print('\nüìä COMPARACI√ìN COMPLETA HIGH RISK vs LOW RISK:')
print(separator)

metrics = ['avg_balance', 'avg_spend', 'total_transactions', 'spend_to_balance_ratio', 'age']
for metric in metrics:
    low_val = low_risk[metric].mean()
    high_val = high_risk[metric].mean()
    if high_val != 0:
        diff = (low_val - high_val) / high_val * 100
    else:
        diff = 0  # a zero high-risk mean would make the ratio undefined
    print(f'{metric:25} | Low Risk: {low_val:>12,.2f} | High Risk: {high_val:>12,.2f} | Diff: {diff:>+.1f}%')

print(separator)

## 6. Distribución del Credit Score

In [None]:
# Preliminary credit score: overall histogram (left) and per-risk-group box plot (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_hist, ax_box = axes

score = df['preliminary_credit_score']
score_mean = score.mean()
score_median = score.median()

# Histogram with dashed reference lines at the mean and the median
ax_hist.hist(score, bins=20, color='steelblue', edgecolor='white')
ax_hist.axvline(score_mean, color='red', linestyle='--', label=f'Media: {score_mean:.1f}')
ax_hist.axvline(score_median, color='orange', linestyle='--', label=f'Mediana: {score_median:.1f}')
ax_hist.set_title('Distribuci√≥n del Credit Score', fontsize=14, fontweight='bold')
ax_hist.set_xlabel('Score')
ax_hist.set_ylabel('Frecuencia')
ax_hist.legend()

# Score distribution split by the risk flag
df.boxplot(column='preliminary_credit_score', by='high_risk_flag', ax=ax_box)
ax_box.set_title('Score por Nivel de Riesgo', fontsize=14, fontweight='bold')
ax_box.set_xlabel('Alto Riesgo (0=No, 1=S√≠)')
ax_box.set_ylabel('Credit Score')
plt.suptitle('')  # blank out the figure-level title

plt.tight_layout()
plt.show()

## 7. Análisis Demográfico

In [None]:
# Demographics: age histogram (left) and customers per gender (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Age distribution, restricted to plausible ages (0 < age < 100)
age_is_valid = df['age'].notna() & (df['age'] > 0) & (df['age'] < 100)
df_age = df[age_is_valid]
edad_media = df_age['age'].mean()
axes[0].hist(df_age['age'], bins=30, color='coral', edgecolor='white')
axes[0].axvline(edad_media, color='red', linestyle='--', label=f'Media: {edad_media:.1f}')
axes[0].set_title('Distribuci√≥n de Edad', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Edad')
axes[0].set_ylabel('Frecuencia')
axes[0].legend()

# Check the age clean-up. The mean of the already-filtered ages cannot reveal
# bad values on its own, so count the non-null ages the filter had to discard
# and only report success when there are none.
n_edades_invalidas = int((df['age'].notna() & ~age_is_valid).sum())
if n_edades_invalidas == 0:
    print(f'‚úÖ Edad promedio: {edad_media:.1f} a√±os - Limpieza de fechas 1/1/1800 funcion√≥ correctamente')
else:
    print(f'Edad promedio: {edad_media:.1f} - ATENCION: {n_edades_invalidas:,} registros con edad '
          f'fuera del rango (0, 100) fueron excluidos; revisar la limpieza de fechas')

# Gender distribution. value_counts() drops missing values by default, which
# made the 'No especificado' fallback unreachable; dropna=False keeps them as
# their own bar so customers without a recorded gender are not silently omitted.
gender_counts = df['gender'].value_counts(dropna=False)
gender_labels = gender_counts.index.fillna('No especificado')
axes[1].bar(gender_labels, gender_counts.values, color=['#3498db', '#e91e63', '#95a5a6'])
axes[1].set_title('Distribuci√≥n por G√©nero', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Cantidad')
for i, v in enumerate(gender_counts.values):
    axes[1].text(i, v + 500, f'{v:,}', ha='center')

plt.tight_layout()
plt.show()

In [None]:
# Share of high-risk customers within each gender, in percent
gender_groups = df.groupby('gender')
risk_by_gender = gender_groups['high_risk_flag'].mean().mul(100)

print('üìä Tasa de Alto Riesgo por G√©nero:')
print(risk_by_gender.round(decimals=2))

## 8. Análisis de Variables Financieras

In [None]:
# 2x2 grid of histograms for the main financial variables
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Three panels share one recipe: cap the column at its 95th percentile, then
# draw a 50-bin histogram. Each entry is (axis, column, colour, title, x label).
clipped_panels = [
    (axes[0, 0], 'avg_balance', '#2ecc71',
     'Distribuci√≥n de Balance Promedio', 'Balance (INR)'),
    (axes[0, 1], 'avg_spend', '#e74c3c',
     'Distribuci√≥n de Gasto Promedio', 'Gasto (INR)'),
    (axes[1, 0], 'total_transactions', '#3498db',
     'Distribuci√≥n de Total de Transacciones', 'N√∫mero de Transacciones'),
]
for ax, column, bar_color, title, xlabel in clipped_panels:
    p95 = df[column].quantile(0.95)
    ax.hist(df[column].clip(upper=p95), bins=50, color=bar_color, edgecolor='white')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel(xlabel)

# Spend-to-balance ratio: nulls removed, values capped at 1, with a reference
# line at ratio = 1
ax_ratio = axes[1, 1]
df_ratio = df[df['spend_to_balance_ratio'].notna()]
ratio_capped = df_ratio['spend_to_balance_ratio'].clip(upper=1)
ax_ratio.hist(ratio_capped, bins=50, color='#9b59b6', edgecolor='white')
ax_ratio.axvline(1, color='red', linestyle='--', label='Ratio = 1 (Riesgo)')
ax_ratio.set_title('Ratio Gasto/Balance', fontsize=12, fontweight='bold')
ax_ratio.set_xlabel('Ratio')
ax_ratio.legend()

plt.tight_layout()
plt.show()

## 9. Matriz de Correlación

In [None]:
# Seleccionar variables num√©ricas relevantes
numeric_cols = ['age', 'avg_balance', 'total_spend', 'avg_spend', 'std_spend',
                'total_transactions', 'days_active', 'transaction_frequency',
                'spend_to_balance_ratio', 'spend_volatility', 
                'high_risk_flag', 'preliminary_credit_score']

# Calcular correlaci√≥n
corr_matrix = df[numeric_cols].corr()

# Visualizar
plt.figure(figsize=(14, 10))
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='RdYlGn', center=0,
            fmt='.2f', linewidths=0.5, vmin=-1, vmax=1)
plt.title('Matriz de Correlaci√≥n de Features', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Rank every variable by its correlation with the target, most positive first
risk_column = corr_matrix['high_risk_flag']
corr_with_risk = risk_column.sort_values(ascending=False)

print('üìä Correlaci√≥n con Alto Riesgo:')
print(corr_with_risk)

## 10. ⚠️ IMPORTANTE: Variables para Modelado (Evitar Leakage)

**DATA LEAKAGE**: NO usar `preliminary_credit_score` como variable de entrada para predecir `high_risk_flag`.

El score fue calculado usando la misma lógica que define el riesgo, por lo que usarlo sería "hacer trampa".

In [None]:
# Columns ALLOWED as model inputs, organised by theme. The flat list below
# keeps the themes in this order.
_FEATURE_GROUPS = {
    'demographics': ['age'],
    'balance': ['avg_balance', 'last_balance', 'min_balance', 'max_balance'],
    'spend': ['total_spend', 'avg_spend', 'max_spend', 'min_spend', 'std_spend'],
    'activity': [
        'total_transactions',
        'days_active',
        'unique_transaction_days',
        'transaction_frequency',
    ],
    'derived_ratios': [
        'spend_to_balance_ratio',
        'spend_volatility',
        'avg_daily_transactions',
        'avg_daily_spend',
    ],
}
FEATURES_PERMITIDAS = [name for group in _FEATURE_GROUPS.values() for name in group]

# Columns FORBIDDEN as model inputs because they leak the target
FEATURES_PROHIBIDAS = [
    'preliminary_credit_score',  # computed with the same logic as high_risk_flag
    'high_risk_flag',            # this is the target itself, not a feature
]

# Print both lists, one entry per line
print('‚úÖ FEATURES PERMITIDAS PARA MODELADO:')
print(f'   Total: {len(FEATURES_PERMITIDAS)} variables')
print('\n'.join(f'   ‚Ä¢ {feature}' for feature in FEATURES_PERMITIDAS))

print('\n‚ùå FEATURES PROHIBIDAS (LEAKAGE):')
print('\n'.join(f'   ‚ö†Ô∏è {feature}' for feature in FEATURES_PROHIBIDAS))

## 11. Insights Clave 📝

In [None]:
# Final recap. Every figure is computed once up front and then interpolated
# into a single report template.
banner = '=' * 60
print(banner)
print('üìä RESUMEN DE INSIGHTS - NeoScore EDA')
print(banner)

high_risk_rate = df["high_risk_flag"].mean()
score = df["preliminary_credit_score"]
gender_share = df["gender"].value_counts(normalize=True)

# Relative gap of the low-risk mean balance over the high-risk mean balance (%)
balance_means = df.groupby("high_risk_flag")["avg_balance"].mean()
balance_gap_pct = (balance_means[0] - balance_means[1]) / balance_means[1] * 100
if abs(balance_gap_pct) > 20:
    verdict = "‚úÖ HAY DIFERENCIA CLARA - El modelo funcionar√°"
else:
    verdict = "‚ö†Ô∏è Diferencia moderada"

print(f'''
1. DISTRIBUCI√ìN DE RIESGO:
   - Clientes de alto riesgo: {(high_risk_rate * 100):.1f}%
   - Clientes de bajo riesgo: {((1 - high_risk_rate) * 100):.1f}%

2. CREDIT SCORE:
   - Score promedio: {score.mean():.1f}
   - Score mediano: {score.median():.1f}

3. DEMOGRAF√çA:
   - Edad promedio: {df["age"].mean():.1f} a√±os ‚úÖ (limpieza fechas OK)
   - Clientes masculinos: {(gender_share.get("M", 0) * 100):.1f}%
   - Clientes femeninos: {(gender_share.get("F", 0) * 100):.1f}%

4. COMPORTAMIENTO FINANCIERO:
   - Balance promedio: {df["avg_balance"].mean():,.0f} INR
   - Gasto promedio: {df["avg_spend"].mean():,.0f} INR
   - Transacciones promedio: {df["total_transactions"].mean():.1f}

5. SEGMENTACI√ìN (High vs Low Risk):
   - Diferencia en balance: {balance_gap_pct:.1f}%
   ‚Üí {verdict}

6. FEATURES M√ÅS CORRELACIONADAS CON RIESGO:
''')

# Five features with the largest absolute correlation to the target
# (the target's own row is removed first)
abs_corr = corr_with_risk.drop('high_risk_flag').abs()
top_corr = abs_corr.sort_values(ascending=False).head(5)
for feat, val in top_corr.items():
    print(f'   - {feat}: {val:.3f}')

print('\n' + banner)

In [None]:
print('\nüéâ EDA completado!')
print('\nPr√≥ximos pasos:')
print('1. Crear notebook 04_modeling.ipynb para entrenamiento de modelos')
print('2. Usar SOLO las features permitidas (evitar leakage)')
print('3. Implementar Logistic Regression, Random Forest, XGBoost')
print('4. Evaluar con ROC-AUC, Gini, KS')