# 05_VALIDATE_TRANSFORMATIONS

**Objetivo:** Validar que las transformaciones de dbt son correctas para Q1 2025
- Filtrar datos a Q1 2025 (Enero, Febrero, Marzo)
- Validar deduplicación
- Validar cálculos de revenue
- Validar manejo de préstamos sin pagos

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Locate the project root: start at the current working directory and climb
# towards the filesystem root until a directory containing 'data/raw' is found.
# If no ancestor has it, the loop stops at the root and the existence flags
# printed below show False.
base_path = Path.cwd()
while not (base_path / 'data' / 'raw').exists() and base_path.parent != base_path:
    base_path = base_path.parent

raw_path = base_path / 'data' / 'raw'          # raw CSV inputs
exports_path = base_path / 'data' / 'exports'  # Excel exports read by the later cells

# Banner plus a quick sanity check that both directories exist.
# The accented characters and emoji below were mis-encoded (UTF-8 bytes read
# as Mac Roman) and are restored here.
print("="*80)
print("🔍 VALIDACIÓN DE TRANSFORMACIONES DBT - Q1 2025")
print("="*80)
print(f"\n📁 Paths:")
print(f"   Raw:     {raw_path.exists()}")
print(f"   Exports: {exports_path.exists()}")

In [None]:
# Load the raw CSVs and the processed loans export, then restrict both loan
# tables to loans whose vintage month falls in Q1 2025 (Jan-Mar).
print("="*80)
print("📥 CARGANDO Y FILTRANDO DATOS A Q1 2025")
print("="*80)

# === LOAD RAW ===
loans_raw = pd.read_csv(raw_path / 'AE_challenge_loans.csv')
repayments_raw = pd.read_csv(raw_path / 'AE_challenge_repayments.csv')

# === LOAD PROCESSED ===
loans_processed_full = pd.read_excel(exports_path / 'loans.xlsx')

print(f"\n📊 DATOS CARGADOS (antes de filtrar):")
print(f"   Raw loans:       {len(loans_raw):,} filas")
print(f"   Raw repayments:  {len(repayments_raw):,} filas")
print(f"   Processed loans: {len(loans_processed_full):,} filas")

# === FILTER TO Q1 2025 ===
# Choose the date column of the processed export. A disbursement column is
# preferred because the raw side is filtered on 'disbursed_date' below; a
# generic '*date*' column is only a fallback. Previously the first column
# matching either pattern was taken, so an unrelated date column appearing
# earlier in the sheet could silently define the vintage.
disbursement_cols = [c for c in loans_processed_full.columns if 'disburs' in str(c).lower()]
generic_date_cols = [c for c in loans_processed_full.columns if 'date' in str(c).lower()]
date_candidates = disbursement_cols + generic_date_cols
if not date_candidates:
    # Fail with an actionable message instead of a bare IndexError.
    raise ValueError(
        "No se encontró columna de fecha en loans.xlsx. "
        f"Columnas disponibles: {loans_processed_full.columns.tolist()}"
    )
date_col = date_candidates[0]
print(f"\n   Columna de fecha: {date_col}")

loans_processed_full[date_col] = pd.to_datetime(loans_processed_full[date_col])
loans_processed_full['vintage_month'] = loans_processed_full[date_col].dt.to_period('M')

# Keep ONLY Q1 2025
q1_periods = [pd.Period('2025-01', 'M'), pd.Period('2025-02', 'M'), pd.Period('2025-03', 'M')]
loans_processed = loans_processed_full[loans_processed_full['vintage_month'].isin(q1_periods)].copy()

print(f"\n📊 DESPUÉS DE FILTRAR A Q1 2025:")
print(f"   Processed loans: {len(loans_processed):,} filas")
print(f"\n   Distribución por mes:")
print(loans_processed['vintage_month'].value_counts().sort_index())

# Apply the same Q1 2025 filter to the raw loans so both sides are comparable
loans_raw['disbursed_date'] = pd.to_datetime(loans_raw['disbursed_date'])
loans_raw['vintage_month'] = loans_raw['disbursed_date'].dt.to_period('M')
loans_raw_q1 = loans_raw[loans_raw['vintage_month'].isin(q1_periods)].copy()

print(f"\n   Raw loans Q1 2025: {loans_raw_q1['loan_id'].nunique():,} préstamos únicos")

In [None]:
# TEST 1: the processed table must hold exactly one row per raw Q1 2025 loan.
print("="*80)
print("TEST 1: ELIMINACIÓN DE DUPLICADOS")
print("="*80)

prestamos_unicos_raw = loans_raw_q1['loan_id'].nunique()
prestamos_processed = len(loans_processed)
# Rows in processed whose loan_id already appeared in an earlier row.
duplicados_processed = int(loans_processed['loan_id'].duplicated().sum())

print(f"\n📊 COMPARACIÓN Q1 2025:")
print(f"   Raw (filas totales):        {len(loans_raw_q1):,}")
print(f"   Raw (préstamos únicos):     {prestamos_unicos_raw:,}")
print(f"   Processed (filas):          {prestamos_processed:,}")
print(f"   Processed (IDs duplicados): {duplicados_processed:,}")
print(f"   Duplicados eliminados:      {len(loans_raw_q1) - prestamos_unicos_raw:,}")

# Matching counts alone do not prove deduplication: a duplicated loan and a
# missing loan cancel each other out. Require loan_id to be unique as well.
test1_pass = bool(
    prestamos_processed == prestamos_unicos_raw
    and loans_processed['loan_id'].is_unique
)

print(f"\n🔍 VALIDACIÓN:")
print(f"   ¿Una fila por préstamo? {test1_pass}")

if test1_pass:
    print(f"\n✅ TEST 1: PASS")
else:
    print(f"\n❌ TEST 1: FAIL")
    print(f"   Diferencia: {abs(prestamos_processed - prestamos_unicos_raw):,}")

In [None]:
# TEST 2: total revenue in the processed table must match the revenue
# recomputed from the raw repayments of the same Q1 2025 loans (within 0.1%).
print("="*80)
print("TEST 2: CÁLCULO DE REVENUE")
print("="*80)

# Loan IDs of the Q1 2025 cohort
q1_loan_ids = set(loans_processed['loan_id'].unique())

# Keep only repayments that belong to Q1 2025 loans
repayments_q1 = repayments_raw[repayments_raw['loan_id'].isin(q1_loan_ids)].copy()

# Recompute revenue from raw as the sum of these six components.
# DataFrame.sum(axis=1) skips NaN, so a missing component counts as 0 instead
# of turning the whole row into NaN (which chained `+` did, silently dropping
# that row's other components from the total).
revenue_components = [
    'interestamount_trans',
    'feesamount_trans',
    'penaltyamount_trans',
    'taxoninterestamount_trans',
    'taxonfeesamount_trans',
    'taxonpenaltyamount_trans',
]
repayments_q1['revenue_calculated'] = repayments_q1[revenue_components].sum(axis=1)

revenue_from_raw = repayments_q1['revenue_calculated'].sum()

# Find the revenue column in processed (first column whose name contains 'revenue')
revenue_cols = [c for c in loans_processed.columns if 'revenue' in c.lower()]
print(f"\n📋 Columnas de revenue: {revenue_cols}")

if len(revenue_cols) > 0:
    revenue_col = revenue_cols[0]
    revenue_from_processed = loans_processed[revenue_col].sum()
    revenue_diff = abs(revenue_from_raw - revenue_from_processed)

    print(f"\n💰 REVENUE Q1 2025:")
    print(f"   Calculado desde raw:    ${revenue_from_raw:,.2f}")
    print(f"   En processed ({revenue_col}): ${revenue_from_processed:,.2f}")
    print(f"   Diferencia:             ${revenue_diff:,.2f}")

    # Relative difference against the raw total. When the raw total is 0 there
    # is no meaningful percentage: treat it as a match only if processed is
    # also 0. The previous fallback reported 0% (PASS) for any processed value
    # whenever the raw total was not positive.
    if revenue_from_raw != 0:
        diff_pct = revenue_diff / abs(revenue_from_raw) * 100
    else:
        diff_pct = 0.0 if revenue_diff == 0 else float('inf')
    test2_pass = bool(diff_pct < 0.1)

    print(f"\n🔍 VALIDACIÓN:")
    print(f"   Diferencia: {diff_pct:.4f}%")

    if test2_pass:
        print(f"\n✅ TEST 2: PASS")
    else:
        print(f"\n❌ TEST 2: FAIL")
else:
    # Without a revenue column nothing can be validated: report failure and
    # leave neutral values for the later cells that read these names.
    print("\n⚠️ No se encontró columna de revenue")
    print(f"   Columnas disponibles: {loans_processed.columns.tolist()}")
    revenue_col = None
    revenue_from_processed = 0
    test2_pass = False

In [None]:
# TEST 3: loans with no repayments must carry zero revenue, while loans with
# repayments must carry positive revenue in aggregate.
print("="*80)
print("TEST 3: PRÉSTAMOS SIN PAGOS")
print("="*80)

# Split processed loans by whether any raw repayment references them
loans_with_payments_ids = set(repayments_q1['loan_id'].unique())
has_payments = loans_processed['loan_id'].isin(loans_with_payments_ids)
loans_without_payments = loans_processed[~has_payments]
loans_with_payments_df = loans_processed[has_payments]

total_q1 = len(loans_processed)
n_con_pagos = len(loans_with_payments_df)
n_sin_pagos = len(loans_without_payments)
# Guard the percentages: an empty Q1 selection used to raise ZeroDivisionError.
pct_con_pagos = n_con_pagos / total_q1 * 100 if total_q1 else 0.0
pct_sin_pagos = n_sin_pagos / total_q1 * 100 if total_q1 else 0.0

print(f"\n📊 DISTRIBUCIÓN:")
print(f"   Total préstamos Q1:    {total_q1:,}")
print(f"   CON pagos:             {n_con_pagos:,} ({pct_con_pagos:.1f}%)")
print(f"   SIN pagos:             {n_sin_pagos:,} ({pct_sin_pagos:.1f}%)")

if revenue_col:
    rev_sin_pagos = loans_without_payments[revenue_col].sum()
    rev_con_pagos = loans_with_payments_df[revenue_col].sum()

    print(f"\n💰 REVENUE:")
    print(f"   Sin pagos: ${rev_sin_pagos:,.2f}")
    print(f"   Con pagos: ${rev_con_pagos:,.2f}")

    test3_pass = bool((rev_sin_pagos == 0) and (rev_con_pagos > 0))

    print(f"\n🔍 VALIDACIÓN:")
    print(f"   ¿Revenue sin pagos = 0? {rev_sin_pagos == 0}")
    print(f"   ¿Revenue con pagos > 0? {rev_con_pagos > 0}")

    if test3_pass:
        print(f"\n✅ TEST 3: PASS")
    else:
        print(f"\n❌ TEST 3: FAIL")
else:
    # No revenue column: nothing to check here. Test 2 sets revenue_col to
    # None only on the path where it also sets test2_pass = False, so the
    # overall summary still reports a failure.
    test3_pass = True
    print("\n⚠️ Skipped - no revenue column")

In [None]:
# TEST 4: the loan count in the aggregated export must equal the number of
# Q1 2025 loans in the detail table.
print("="*80)
print("TEST 4: COHORTS/AGREGACIONES")
print("="*80)

# The test only passes when a comparison was actually made and matched.
# Previously a load error, or a missing loan-count column, was reported as
# PASS even though nothing had been validated.
test4_pass = False

# A notebook cell is the top-level boundary, so any error is caught and
# reported; the cell then marks the test as failed rather than passed.
try:
    agg = pd.read_excel(exports_path / 'fct_agg_performance.xlsx')

    # Filter to Q1 2025 when a month/vintage column exists (first match is used)
    month_cols = [c for c in agg.columns if 'month' in c.lower() or 'vintage' in c.lower()]
    if len(month_cols) > 0:
        month_col = month_cols[0]
        agg[month_col] = pd.to_datetime(agg[month_col]).dt.to_period('M')
        agg_q1 = agg[agg[month_col].isin(q1_periods)]
        print(f"\n📊 Agregaciones filtradas a Q1: {len(agg_q1)} filas")
    else:
        agg_q1 = agg
        print(f"\n📊 Agregaciones: {len(agg_q1)} filas")

    print(f"   Columnas: {agg_q1.columns.tolist()}")
    display(agg_q1.head())  # `display` is provided by the IPython/Jupyter session

    # Validate totals against the first column that looks like a loan count
    loan_cols = [c for c in agg_q1.columns if 'loan' in c.lower() and ('count' in c.lower() or 'total' in c.lower())]
    if len(loan_cols) > 0:
        agg_total = agg_q1[loan_cols[0]].sum()
        print(f"\n💡 Total loans en agregación: {agg_total:,}")
        print(f"   Total loans en detalle:    {len(loans_processed):,}")
        test4_pass = bool(agg_total == len(loans_processed))
    else:
        print("\n⚠️ No se encontró columna de conteo de loans; no se pudo validar totales")

    if test4_pass:
        print(f"\n✅ TEST 4: PASS")
    else:
        print(f"\n❌ TEST 4: FAIL")

except Exception as e:
    print(f"\n⚠️ Error cargando agregaciones: {e}")
    print(f"\n❌ TEST 4: FAIL")
    test4_pass = False

In [None]:
# Final summary: one line per test (icon, name, detail) and an overall verdict.
print("="*80)
print("📋 RESUMEN FINAL - Q1 2025")
print("="*80)

# (display name, pass flag, detail line) for each test, in execution order.
all_tests = [
    ("Eliminación de duplicados", test1_pass, f"{prestamos_processed:,} préstamos únicos"),
    ("Cálculo de revenue", test2_pass, f"${revenue_from_processed:,.2f}"),
    ("Préstamos sin pagos", test3_pass, f"{len(loans_without_payments):,} con revenue=$0"),
    ("Cohorts/Agregaciones", test4_pass, "Totales coinciden"),
]

print()
for i, (name, passed, detail) in enumerate(all_tests, 1):
    status = "✅" if passed else "❌"
    print(f"{status} TEST {i}: {name}")
    print(f"         {detail}\n")

# Overall verdict: every individual test must have passed.
all_passed = all(passed for _, passed, _ in all_tests)

print("="*80)
if all_passed:
    print("🎉 TODAS LAS VALIDACIONES PASARON")
else:
    print("⚠️ REVISAR TESTS FALLIDOS")
print("="*80)