# 04_EXPLORE_VINTAGE_CURVES.ipynb
================================

Objetivo: Explorar y validar el datamart de VINTAGE CURVES
- ¿Cuál es la estructura de datos de cohortes?
- ¿Cuánto tarda cada cohort en pagar su préstamo?
- ¿Cómo evoluciona el recovery rate por mes?
- ¿Hay diferencias por banda de riesgo?
- ¿Cuál es el patrón de evolución de ingresos?

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

# Notebook-wide pandas display settings: no limit on displayed columns or
# output width, and cell text truncated at 50 characters.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', None),
    ('display.max_colwidth', 50),
):
    pd.set_option(option_name, option_value)

In [None]:
# Load data from the exports folder.
# When the current working directory path contains 'data_validation', the
# project root is taken to be two levels above it; otherwise the working
# directory itself is used as the root.
working_dir = Path.cwd()
if 'data_validation' in str(working_dir):
    base_path = working_dir.parent.parent
else:
    base_path = working_dir
data_path = base_path.joinpath('data', 'exports')

print(f"üìÅ Working dir: {Path.cwd()}")
print(f"üìÇ Base path: {base_path}")
print(f"üìç Data path: {data_path}")

# Load the vintage curves workbook into a DataFrame.
vintage_file = data_path / 'vintage_curves.xlsx'
vintage_curves = pd.read_excel(vintage_file)

print(f"\n‚úÖ Vintage Curves cargadas: {len(vintage_curves):,} registros")

In [None]:
# General overview of the vintage curves dataset: size, columns, a sample of
# rows, dtypes and null counts.
separator = "=" * 80
print(separator)
print("üìä DATASET: VINTAGE CURVES (Cohort Analysis)")
print(separator)

# 1. How many records are there?
print(f"\n1Ô∏è‚É£ Total de registros: {len(vintage_curves):,}")
print(f"   (Cada registro = 1 snapshot de cohort a cierta edad en meses)")

# 2. Which columns are available? (numbered from 1)
print(f"\n2Ô∏è‚É£ Columnas disponibles ({len(vintage_curves.columns)}):")
for position, column_name in enumerate(vintage_curves.columns, start=1):
    print(f"   {position:2d}. {column_name}")

# 3. First rows
print(f"\n3Ô∏è‚É£ Primeros 10 registros:")
display(vintage_curves.head(10))

# 4. Data types
print(f"\n4Ô∏è‚É£ Tipos de datos:")
print(vintage_curves.dtypes)

# 5. Null values: count and percentage of rows per column; only columns with
#    at least one null are listed.
print(f"\n5Ô∏è‚É£ Valores nulos:")
nulls = vintage_curves.isnull().sum()
nulls_pct = nulls.div(len(vintage_curves)).mul(100).round(2)
null_df = pd.DataFrame({'Nulls': nulls, 'Percentage': nulls_pct})
columns_with_nulls = null_df[null_df['Nulls'] > 0]
print(columns_with_nulls)
if columns_with_nulls.empty:
    print("   ‚úÖ Sin valores nulos")

In [None]:
# 6. COHORT structure
print(f"\n6Ô∏è‚É£ ESTRUCTURA DE COHORTES:")

# Later cells test `age_col` and `risk_col` for truthiness, so both names must
# exist even when the detection below finds no matching column (otherwise
# those cells fail with NameError instead of taking their fallback branch).
age_col = None
risk_col = None

if 'vintage_month' in vintage_curves.columns:
    n_vintage_months = vintage_curves['vintage_month'].nunique()
    print(f"   - Meses de vintage: {n_vintage_months}")
    print(f"   Vintage months disponibles:")
    print(vintage_curves['vintage_month'].unique())
else:
    print("   ‚ö†Ô∏è No hay columna 'vintage_month'")

# Find the age / months-since-origination column by name.
# NOTE: this is a plain substring match, and 'vintage_month' itself matches
# both 'month' and 'age' (vint-AGE), so the cohort key is always a candidate.
age_cols = [col for col in vintage_curves.columns if any(x in col.lower() for x in ['month', 'age', 'days'])]
print(f"\n   - Posibles columnas de edad: {age_cols}")

if age_cols:
    # Preference order: a name mentioning 'since'/'origination', then any
    # candidate other than the cohort key, and only as a last resort the first
    # candidate (which keeps `age_col` defined whenever a candidate exists).
    since_origination = [c for c in age_cols if 'since' in c.lower() or 'origination' in c.lower()]
    not_cohort_key = [c for c in age_cols if c != 'vintage_month']
    age_col = (since_origination or not_cohort_key or age_cols)[0]

    print(f"\n   Usando columna de edad: '{age_col}'")
    n_ages = vintage_curves[age_col].nunique()
    max_age = vintage_curves[age_col].max()
    print(f"   - Edades √∫nicas: {n_ages}")
    print(f"   - Edad m√°xima (meses): {max_age}")
    print(f"   - Rango de edades: {sorted(vintage_curves[age_col].unique())}")

# 7. Risk segments: first column whose name mentions 'risk' or 'segment'.
risk_cols = [col for col in vintage_curves.columns if 'risk' in col.lower() or 'segment' in col.lower()]
if risk_cols:
    risk_col = risk_cols[0]
    print(f"\n   - Columna de riesgo: '{risk_col}'")
    print(f"   Segmentos disponibles:")
    print(vintage_curves[risk_col].unique())
else:
    print(f"\n   ‚ö†Ô∏è No hay columna de riesgo/segmento")

In [None]:
# 8. EXAMPLE: follow one specific cohort through time
print(f"\n8Ô∏è‚É£ EJEMPLO: Seguimiento de una COHORT espec√≠fica")

if 'vintage_month' in vintage_curves.columns and age_col:
    # The first vintage in sorted order serves as the example cohort; its rows
    # are ordered by the age column.
    first_vintage = sorted(vintage_curves['vintage_month'].unique())[0]
    in_first_cohort = vintage_curves['vintage_month'] == first_vintage
    cohort_data = vintage_curves.loc[in_first_cohort].sort_values(age_col)

    print(f"\n   Cohort: {first_vintage}")
    print(f"   Registros para esta cohort: {len(cohort_data)}")

    # Show only columns whose name hints at age or performance; when nothing
    # matches, fall back to the first ten columns.
    evolution_keywords = ['age', 'month', 'since', 'principal', 'revenue', 'recovery', 'default']
    cols_to_show = [
        name for name in cohort_data.columns
        if any(keyword in name.lower() for keyword in evolution_keywords)
    ]
    if not cols_to_show:
        cols_to_show = cohort_data.columns[:10]

    print(f"\n   Evoluci√≥n de la cohort a trav√©s del tiempo:")
    display(cohort_data[cols_to_show].head(10))
    print(f"\n   üí° Interpretaci√≥n:")
    print(f"   - Cada fila = snapshot de la cohort en un mes espec√≠fico")
    print(f"   - Las m√©tricas muestran evoluci√≥n acumulada desde originaci√≥n")
    print(f"   - Permite ver c√≥mo diferentes cohortes se desempe√±an")

In [None]:
# 9. PERFORMANCE METRICS
print(f"\n9Ô∏è‚É£ METRICAS DE PERFORMANCE ACUMULADAS:")

# Classify columns by substrings of their lower-cased names.
named_columns = [(name, name.lower()) for name in vintage_curves.columns]
recovery_cols = [name for name, lowered in named_columns if 'recovery' in lowered or 'principal' in lowered]
default_cols = [name for name, lowered in named_columns if 'default' in lowered]
revenue_cols = [name for name, lowered in named_columns if 'revenue' in lowered]

print(f"   - Columnas de recovery/principal: {recovery_cols}")
print(f"   - Columnas de default: {default_cols}")
print(f"   - Columnas de revenue: {revenue_cols}")

# For each family only the first matching column is summarised.
if recovery_cols:
    recovery_col = recovery_cols[0]
    print(f"\n   üìä {recovery_col}:")
    recovery_values = vintage_curves[recovery_col]
    print(f"   - M√≠nimo: {recovery_values.min():.4f}")
    print(f"   - M√°ximo: {recovery_values.max():.4f}")
    print(f"   - Promedio: {recovery_values.mean():.4f}")

if default_cols:
    default_col = default_cols[0]
    print(f"\n   üìä {default_col}:")
    default_values = vintage_curves[default_col]
    print(f"   - M√≠nimo: {default_values.min():.4f}")
    print(f"   - M√°ximo: {default_values.max():.4f}")
    print(f"   - Promedio: {default_values.mean():.4f}")

if revenue_cols:
    revenue_col = revenue_cols[0]
    print(f"\n   üìä {revenue_col}:")
    revenue_values = vintage_curves[revenue_col]
    print(f"   - Total: ${revenue_values.sum():,.2f}")
    print(f"   - Promedio: ${revenue_values.mean():,.2f}")

In [None]:
# 10. COMPARISON BY RISK BAND
print(f"\nüîü PERFORMANCE POR BANDA DE RIESGO:")

if risk_col and recovery_cols:
    # Per risk band: min / max / mean of the recovery column, plus a volume
    # figure. The aggregation spec may only name columns that exist, since
    # groupby().agg() raises KeyError for a missing column key.
    recovery_stats = ['min', 'max', 'mean']
    if 'total_loans_in_cohort' in vintage_curves.columns:
        agg_spec = {
            recovery_col: recovery_stats,
            'total_loans_in_cohort': 'sum',
        }
    else:
        # No loan-count column: report the number of non-null recovery
        # observations per band instead.
        agg_spec = {recovery_col: recovery_stats + ['count']}

    risk_summary = vintage_curves.groupby(risk_col).agg(agg_spec)
    print(risk_summary)
else:
    print("   ‚ö†Ô∏è Falta informaci√≥n de riesgo o recovery")

In [None]:
# 11. VINTAGE ANALYSIS (newest cohort vs oldest cohort)
print(f"\n1Ô∏è‚É£1Ô∏è‚É£ VINTAGE M√ÅS NUEVA vs M√ÅS VIEJA:")

if 'vintage_month' in vintage_curves.columns:
    # First and last vintage in sorted order, with their rows.
    vintages = sorted(vintage_curves['vintage_month'].unique())
    oldest_vintage, newest_vintage = vintages[0], vintages[-1]
    oldest_data = vintage_curves.loc[vintage_curves['vintage_month'] == oldest_vintage]
    newest_data = vintage_curves.loc[vintage_curves['vintage_month'] == newest_vintage]

    # Same report for both ends: row count, maximum age and, when a recovery
    # column was detected, the maximum of that column over the vintage's rows.
    for heading, vintage_id, vintage_rows in (
        ("M√ÅS VIEJA", oldest_vintage, oldest_data),
        ("M√ÅS NUEVA", newest_vintage, newest_data),
    ):
        print(f"\n   Vintage {heading}: {vintage_id}")
        print(f"   - Registros: {len(vintage_rows)}")
        print(f"   - Max age: {vintage_rows[age_col].max()} meses")
        if recovery_cols:
            print(f"   - Recovery rate final: {vintage_rows[recovery_col].max():.4f}")

In [None]:
# 12. AGE DISTRIBUTION
print(f"\n1Ô∏è‚É£2Ô∏è‚É£ DISTRIBUCI√ìN DE EDADES EN DATASET:")

if age_col:
    # Number of rows per distinct age value, ordered by age.
    age_distribution = vintage_curves[age_col].value_counts().sort_index()
    print(f"   Registros por edad (meses):")
    print(age_distribution)

    observed_ages = age_distribution.index
    youngest_age, oldest_age = observed_ages.min(), observed_ages.max()
    mean_rows_per_age = age_distribution.mean()

    print(f"\n   üìä Estructura:")
    print(f"   - Edades m√≠nima: {youngest_age} meses")
    print(f"   - Edades m√°xima: {oldest_age} meses")
    print(f"   - Registros por edad: {mean_rows_per_age:.1f} (promedio)")

In [None]:
# Closing summary: cohort counts plus a checklist of follow-up validations.
banner = "=" * 80
print("\n" + banner)
print("‚úÖ CONCLUSIONES - VINTAGE CURVES")
print(banner)

if 'vintage_month' not in vintage_curves.columns:
    # Without the cohort key there is nothing to summarise.
    print("\n‚ö†Ô∏è Falta informaci√≥n para conclusiones")
else:
    n_vintages = vintage_curves['vintage_month'].nunique()

    print(f"""
üìä ESTRUCTURA DE COHORTES:
   Total registros: {len(vintage_curves):,}
   Cohortes (vintage months): {n_vintages}
   Edad m√°xima (meses): {vintage_curves[age_col].max()}
   
üìà TIPO DE AN√ÅLISIS:
   - Permite analizar evoluci√≥n de cada cohort a lo largo del tiempo
   - Cada fila = 1 snapshot de la cohort en cierta edad
   - Compara desempe√±o entre cohortes originadas en diferentes meses
   
‚ö†Ô∏è VALIDACI√ìN:
   - Verificar que recovery rates sean mon√≥tonos (no decrecen con edad)
   - Verificar que cohortes m√°s viejas tengan m√°s edades que m√°s nuevas
   - Comparar performance vs tabla de loans (¬øson consistentes?)
""")