In [None]:
import pandas as pd
import numpy as np

# Charger les données propres
df = pd.read_excel("clean_data/transactions_excluding_returns.xlsx")


df.head()



Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,__source_sheet,UnitPrice,is_cancel,Amount,is_qty_outlier,is_price_outlier,is_outlier
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,Year 2009-2010,0,False,0,False,False,False
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,Year 2009-2010,0,False,0,False,False,False
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,Year 2009-2010,0,False,0,False,False,False
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom,Year 2009-2010,0,False,0,False,False,False
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,Year 2009-2010,0,False,0,False,False,False


In [None]:
df.columns = [c.strip() for c in df.columns]

rename_map = {
    "InvoiceNo": "Invoice",
    "Invoice Number": "Invoice",
    "InvoiceDate": "InvoiceDate",
    "Customer ID": "CustomerID",
    "Customer Id": "CustomerID",
    "UnitPrice": "Price",
}

df = df.rename(columns=rename_map)

# Créer Amount si absent
if "Amount" not in df.columns:
    df["Amount"] = df["Quantity"] * df["Price"]

df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])
df = df.dropna(subset=["CustomerID"])


In [None]:
as_of = df["InvoiceDate"].max()

rfm = df.groupby("CustomerID").agg(
    Recency=("InvoiceDate", lambda s: (as_of - s.max()).days),
    Frequency=("Invoice", "nunique"),
    Monetary=("Amount", "sum")
).reset_index()

rfm.head()


Unnamed: 0,CustomerID,Recency,Frequency,Monetary
0,12346.0,325,17,0
1,12347.0,1,8,0
2,12348.0,74,5,0
3,12349.0,18,5,0
4,12350.0,309,1,0


In [None]:
# Classement
rfm["R_rank"] = rfm["Recency"].rank(method="first", ascending=True)
rfm["F_rank"] = rfm["Frequency"].rank(method="first", ascending=True)
rfm["M_rank"] = rfm["Monetary"].rank(method="first", ascending=True)

# Convertir en quintiles
rfm["R"] = pd.qcut(rfm["R_rank"], 5, labels=[5,4,3,2,1]).astype(int)
rfm["F"] = pd.qcut(rfm["F_rank"], 5, labels=[1,2,3,4,5]).astype(int)
rfm["M"] = pd.qcut(rfm["M_rank"], 5, labels=[1,2,3,4,5]).astype(int)

rfm["RFM_Score"] = rfm["R"]*100 + rfm["F"]*10 + rfm["M"]


In [None]:
rfm["Segment"] = np.where(
    (rfm["R"]>=4) & (rfm["F"]>=4) & (rfm["M"]>=4), "Champions",
    np.where((rfm["R"]>=4) & (rfm["F"]>=3), "Loyal",
    np.where((rfm["R"]<=2) & (rfm["F"]>=4), "At-risk",
    np.where((rfm["R"]>=4) & (rfm["F"]<=2), "Promising", "Others")))
)

rfm["Segment"].value_counts()


Segment
Others       3188
Loyal        1309
Champions     608
Promising     458
At-risk       373
Name: count, dtype: int64

In [None]:
df["InvoiceMonth"] = df["InvoiceDate"].dt.to_period("M")
cust_month = df.groupby(["InvoiceMonth", "CustomerID"])["Amount"].sum().reset_index()

ARPU = cust_month["Amount"].mean()
ARPU


np.float64(0.0)

In [None]:
active = cust_month.groupby("InvoiceMonth")["CustomerID"].nunique().sort_index()

ratios = []
vals = active.values
for i in range(len(vals)-1):
    if vals[i] > 0:
        ratios.append(vals[i+1] / vals[i])

r = np.mean(ratios)
r


np.float64(1.0147779969726132)

In [None]:
def clv_closed(ARPU, r, d=0.01):
    return ARPU * (r / (1 + d - r))

CLV_global = clv_closed(ARPU, r)
CLV_global



np.float64(-0.0)

In [None]:
cust_seg = cust_month.merge(rfm[["CustomerID","Segment"]], on="CustomerID", how="left")

ARPU_seg = cust_seg.groupby("Segment")["Amount"].mean()

CLV_seg = ARPU_seg.apply(lambda x: clv_closed(x, r))

CLV_seg


Segment
At-risk     -0.0
Champions   -0.0
Loyal       -0.0
Others      -0.0
Promising   -0.0
Name: Amount, dtype: float64

In [None]:
rfm.to_excel("clean_data/customers_rfm.xlsx", index=False)
rfm.to_parquet("clean_data/customers_rfm.parquet", index=False)

print("Fichiers sauvegardés dans clean_data/")


Fichiers sauvegardés dans clean_data/
