# 01_EXPLORE_CUSTOMERS.ipynb
=========================

Objetivo: Explorar y entender el dataset de clientes
- ¬øQu√© informaci√≥n tenemos?
- ¬øCu√°ntos clientes hay?
- ¬øC√≥mo se distribuyen por segmento de riesgo?
- ¬øCu√°l es el costo de adquisici√≥n (CAC)?

In [5]:
import pandas as pd
import numpy as np
from pathlib import Path



In [6]:
# Verificar ubicaci√≥n actual
print(f"Working directory: {Path.cwd()}")

# Usar path absoluto
base_path = Path.cwd().parent if 'analisis_adhoc' in str(Path.cwd()) else Path.cwd()
data_path = base_path / 'data' / 'raw'

print(f"Data path: {data_path}")
print(f"Existe? {data_path.exists()}")

# Cargar datos
customers = pd.read_csv(data_path / 'AE_challenge_customer.csv')

print(f"\n‚úÖ Datos cargados: {len(customers):,} clientes")


Working directory: c:\Users\mijai\kueski-analytics-engineer-mijail
Data path: c:\Users\mijai\kueski-analytics-engineer-mijail\data\raw
Existe? True

‚úÖ Datos cargados: 4,500 clientes


In [9]:

print("="*80)
print("üìä DATASET: CUSTOMERS")
print("="*80)


# 1. ¬øCu√°ntos clientes hay?
print(f"\n1Ô∏è‚É£ Total de clientes: {len(customers):,}")

# 2. ¬øQu√© columnas tengo?
print(f"\n2Ô∏è‚É£ Columnas disponibles:")
for col in customers.columns:
    print(f"   - {col}")

# 3. Ver primeros registros
print(f"\n3Ô∏è‚É£ Primeros 10 clientes:")
display(customers.head(10))

# 4. ¬øHay datos faltantes?
print(f"\n4Ô∏è‚É£ Valores nulos por columna:")
print(customers.isnull().sum())


üìä DATASET: CUSTOMERS

1Ô∏è‚É£ Total de clientes: 4,500

2Ô∏è‚É£ Columnas disponibles:
   - user_id
   - acquisition_date
   - acquisition_cost
   - channel
   - risk_band_production
   - city
   - state

3Ô∏è‚É£ Primeros 10 clientes:


Unnamed: 0,user_id,acquisition_date,acquisition_cost,channel,risk_band_production,city,state
0,643da317c01c1a74c1471eee,2025-02-22,0.0,Online,5.0,Saltillo,Coahuila de Zaragoza
1,64766c3ef6f61287fcc5238c,2025-02-04,0.0,Online,3.0,Puebla,Puebla
2,64ee8f076a795f24b4a472ec,2025-03-24,0.0,Online,4.1,Culiac√°n,Sinaloa
3,669f3087a4bce160d9894b1f,2025-01-30,0.0,Online,3.0,Tec√°mac,Estado de M√©xico
4,66bbd6c2b788989833df35d0,2025-02-16,89.85,Online,5.0,Carmen,Campeche
5,66bfbf6535abf773de7a7a63,2025-03-09,0.0,Online,4.2,Cintalapa,Chiapas
6,66ed7f3ccee259b10724a747,2025-01-17,0.0,Online,3.0,Tec√°mac,Estado de M√©xico
7,6710639760f781c7dc36e55c,2025-02-06,0.0,Online,3.0,Ixtlahuac√°n de los Membrillos,Jalisco
8,673f8a6bb4b56269e3ca3465,2025-02-11,73.8345,Online,5.0,Centla,Tabasco
9,6756551f12376575c387554d,2025-02-01,0.0,Online,4.2,Gustavo A. Madero,Ciudad de M√©xico



4Ô∏è‚É£ Valores nulos por columna:
user_id                 0
acquisition_date        0
acquisition_cost        0
channel                 0
risk_band_production    2
city                    0
state                   0
dtype: int64


In [11]:

# 5. ¬øQu√© es 'risk_band_production'?
print(f"\n5Ô∏è‚É£ Bandas de riesgo (risk_band_production):")
print(customers['risk_band_production'].value_counts().sort_index())
print(f"\nDistribuci√≥n porcentual:")
print((customers['risk_band_production'].value_counts(normalize=True) * 100).round(2).sort_index())

# 6. ¬øCu√°nto cuesta adquirir un cliente (CAC)?
print(f"\n6Ô∏è‚É£ Costo de Adquisici√≥n (CAC):")
print(f"   - Promedio: ${customers['acquisition_cost'].mean():,.2f}")
print(f"   - Mediana: ${customers['acquisition_cost'].median():,.2f}")
print(f"   - M√≠nimo: ${customers['acquisition_cost'].min():,.2f}")
print(f"   - M√°ximo: ${customers['acquisition_cost'].max():,.2f}")

# 7. ¬øEl CAC var√≠a por risk_band?
print(f"\n7Ô∏è‚É£ CAC por banda de riesgo:")
print(customers.groupby('risk_band_production')['acquisition_cost'].agg(['count', 'mean', 'median']).round(2))

# 8. ¬øCu√°ndo se adquirieron?
customers['acquisition_date'] = pd.to_datetime(customers['acquisition_date'])
customers['mes'] = customers['acquisition_date'].dt.to_period('M')

print(f"\n8Ô∏è‚É£ Clientes adquiridos por mes:")
print(customers['mes'].value_counts().sort_index())

print("\n" + "="*80)
print("‚úÖ CONCLUSIONES")
print("="*80)
print(f"""
Total clientes: {len(customers):,}
Bandas de riesgo: {customers['risk_band_production'].nunique()}
CAC promedio: ${customers['acquisition_cost'].mean():,.2f}
Per√≠odo: {customers['mes'].min()} a {customers['mes'].max()}
""")


5Ô∏è‚É£ Bandas de riesgo (risk_band_production):
risk_band_production
1                  17
2                 211
3                1316
4.1               749
4.2              1104
5                1081
missing_score      20
Name: count, dtype: int64

Distribuci√≥n porcentual:
risk_band_production
1                 0.38
2                 4.69
3                29.26
4.1              16.65
4.2              24.54
5                24.03
missing_score     0.44
Name: proportion, dtype: float64

6Ô∏è‚É£ Costo de Adquisici√≥n (CAC):
   - Promedio: $51.02
   - Mediana: $0.00
   - M√≠nimo: $0.00
   - M√°ximo: $1,200.00

7Ô∏è‚É£ CAC por banda de riesgo:
                      count    mean  median
risk_band_production                       
1                        17  157.72     0.0
2                       211   86.52     0.0
3                      1316   70.99     0.0
4.1                     749   54.65     0.0
4.2                    1104   41.96     0.0
5                      1081   25.02     0