# Model Pareto NBD 

## Explicación detallada 

Instalar previamente desde el notebook
```bash 
!pip install lifetimes==0.11.3 mlflow==3.1.1
```


In [3]:
from pathlib import Path
import pandas as pd
import numpy as np
import mlflow
from lifetimes import BetaGeoFitter, GammaGammaFitter

### Carga del Parquet Limpio 

In [4]:
DATA_FILE = Path("../data/processed/clv.parquet")
df = pd.read_parquet(DATA_FILE)

print(df.shape)
df.head()

(805549, 9)


Unnamed: 0,Invoice,InvoiceDate,Customer ID,Country,StockCode,Description,Quantity,Price,Sales
0,489434,2009-12-01 07:45:00,13085,United Kingdom,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,6.95,83.4
1,489434,2009-12-01 07:45:00,13085,United Kingdom,79323P,PINK CHERRY LIGHTS,12,6.75,81.0
2,489434,2009-12-01 07:45:00,13085,United Kingdom,79323W,WHITE CHERRY LIGHTS,12,6.75,81.0
3,489434,2009-12-01 07:45:00,13085,United Kingdom,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2.1,100.8
4,489434,2009-12-01 07:45:00,13085,United Kingdom,21232,STRAWBERRY CERAMIC TRINKET BOX,24,1.25,30.0


### Cálculo de variables RFM 

In [5]:
# 1. Calculamos fecha de referencia (cierre del dataset)
analysis_date = df["InvoiceDate"].max() + pd.Timedelta(days=1)

# 2. Agregamos por cliente
rfm = (
    df.groupby("Customer ID")
      .agg(
          frequency   = ("Invoice", "nunique"),
          recency     = ("InvoiceDate", lambda x: (x.max() - x.min()).days),
          T           = ("InvoiceDate", lambda x: (analysis_date - x.min()).days),
          monetary    = ("Sales", "mean"),   # avg sales per transacción
      )
      .astype({"frequency":"int32", "recency":"int32", "T":"int32"})
)

rfm.head()

Unnamed: 0_level_0,frequency,recency,T,monetary
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12346,12,400,726,2281.072353
12347,8,402,404,22.266087
12348,5,362,438,39.596078
12349,4,570,589,25.3068
12350,1,0,310,19.670588


### Entrenar BetaGeoFitter

In [6]:
bgf = BetaGeoFitter(penalizer_coef=0.001)
bgf.fit(rfm["frequency"], rfm["recency"], rfm["T"])

<lifetimes.BetaGeoFitter: fitted with 5878 subjects, a: 0.22, alpha: 37.25, b: 0.65, r: 1.30>

### Pronóstico de transacciones a 6 meses

In [7]:
rfm["pred_purchases_6m"] = bgf.conditional_expected_number_of_purchases_up_to_time(
    180, rfm["frequency"], rfm["recency"], rfm["T"]
)

rfm.head()

Unnamed: 0_level_0,frequency,recency,T,monetary,pred_purchases_6m
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12346,12,400,726,2281.072353,0.094033
12347,8,402,404,22.266087,3.519997
12348,5,362,438,39.596078,1.996069
12349,4,570,589,25.3068,1.371208
12350,1,0,310,19.670588,0.018451


### Valor Monetario con GammaGamma

In [8]:
ggf = GammaGammaFitter(penalizer_coef=0.001)
ggf.fit(rfm["frequency"], rfm["monetary"])

rfm["exp_avg_sales"] = ggf.conditional_expected_average_profit(
    rfm["frequency"], rfm["monetary"]
)

# CLV a 6 m = ventas esperadas × ticket esperado
rfm["clv_6m"] = rfm["pred_purchases_6m"] * rfm["exp_avg_sales"]
rfm.head()

Unnamed: 0_level_0,frequency,recency,T,monetary,pred_purchases_6m,exp_avg_sales,clv_6m
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12346,12,400,726,2281.072353,0.094033,2253.590452,211.912989
12347,8,402,404,22.266087,3.519997,22.433763,78.96677
12348,5,362,438,39.596078,1.996069,39.361902,78.569084
12349,4,570,589,25.3068,1.371208,25.527372,35.00335
12350,1,0,310,19.670588,0.018451,21.195733,0.391081


### Registrar en MLflow

In [9]:
mlflow.set_experiment("clv_pareto_nbd")

with mlflow.start_run(run_name="pareto_nbd_6m"):
    mlflow.log_param("time_horizon_days", 180)
    mlflow.log_param("penalizer", 0.001)
    mlflow.sklearn.log_model(bgf, artifact_path="bgf_model")
    mlflow.sklearn.log_model(ggf, artifact_path="ggf_model")
    mlflow.log_metric("rmse_pred_6m", np.sqrt(((rfm["pred_purchases_6m"] - rfm["frequency"])**2).mean()))
    mlflow.log_artifact(DATA_FILE, artifact_path="data_version")

2025/07/20 10:32:25 INFO mlflow.tracking.fluent: Experiment with name 'clv_pareto_nbd' does not exist. Creating a new experiment.


In [14]:
!mlflow ui

[2025-07-20 10:44:29 -0600] [67416] [INFO] Starting gunicorn 23.0.0
[2025-07-20 10:44:29 -0600] [67416] [INFO] Listening at: http://127.0.0.1:5000 (67416)
[2025-07-20 10:44:29 -0600] [67416] [INFO] Using worker: sync
[2025-07-20 10:44:29 -0600] [67417] [INFO] Booting worker with pid: 67417
[2025-07-20 10:44:30 -0600] [67418] [INFO] Booting worker with pid: 67418
[2025-07-20 10:44:30 -0600] [67419] [INFO] Booting worker with pid: 67419
[2025-07-20 10:44:30 -0600] [67420] [INFO] Booting worker with pid: 67420
^C

Aborted!
[2025-07-20 10:44:54 -0600] [67419] [INFO] Worker exiting (pid: 67419)
[2025-07-20 10:44:54 -0600] [67420] [INFO] Worker exiting (pid: 67420)
[2025-07-20 10:44:54 -0600] [67417] [INFO] Worker exiting (pid: 67417)
[2025-07-20 10:44:54 -0600] [67418] [INFO] Worker exiting (pid: 67418)


### Guardar tabla CLV 

In [12]:
OUTPUT = Path("../data/processed/clv_predictions.parquet")
rfm.reset_index().to_parquet(OUTPUT, index=False)
print("CLV table saved →", OUTPUT)

CLV table saved → ../data/processed/clv_predictions.parquet


#### Source run ID 

03d67cdd60c64c709ca01225464b82e5