In [1]:
import pandas as pd


In [2]:
# Load the orders table from disk and preview its first rows.
orders = pd.read_csv(filepath_or_buffer="../data/orders.csv")
orders.head(n=5)


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [3]:
# Load the prior order lines (one row per product in an order) and preview them.
prior = pd.read_csv(filepath_or_buffer="../data/order_products__prior.csv")
prior.head(n=5)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [4]:
# Per-customer aggregates: number of distinct orders and the mean of
# `days_since_prior_order` (missing values are skipped by the mean).
# `as_index=False` keeps `user_id` as a regular column in the result.
user_orders = orders.groupby("user_id", as_index=False).agg(
    nb_orders=("order_id", "nunique"),
    avg_days_between_orders=("days_since_prior_order", "mean"),
)

user_orders.head()


Unnamed: 0,user_id,nb_orders,avg_days_between_orders
0,1,11,19.0
1,2,15,16.285714
2,3,13,12.0
3,4,6,17.0
4,5,5,11.5


In [7]:
# Load the train order lines (one row per product in an order).
train = pd.read_csv(filepath_or_buffer="../data/order_products__train.csv")

In [8]:
# Build the target column and attach the customer id to every train order line.
#
# NOTE(review): every train row is labelled 1, including rows whose `reordered`
# flag is 0 -- confirm this is the intended definition of "repurchased".
#
# The cell is written to be safe to re-run: any `target` / `user_id` columns
# left over from a previous execution are dropped first, so a second run does
# not produce `user_id_x` / `user_id_y` and break the cells that follow.
train = (
    train
    .drop(columns=["target", "user_id"], errors="ignore")
    .assign(target=1)
    # orders -> customers join; `validate` raises if an order_id appears more
    # than once in `orders`, which would otherwise silently duplicate rows.
    .merge(
        orders[["order_id", "user_id"]],
        on="order_id",
        how="left",
        validate="many_to_one",
    )
)

train.head()


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,target,user_id
0,1,49302,1,1,1,112108
1,1,11109,2,1,1,112108
2,1,10246,3,0,1,112108
3,1,49683,4,0,1,112108
4,1,43633,5,1,1,112108


In [9]:
# Attach the per-customer aggregates to each train row, then replace every
# remaining missing value (in any column) with 0.
dataset = pd.merge(train, user_orders, how="left", on="user_id").fillna(0)
dataset.head()


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,target,user_id,nb_orders,avg_days_between_orders
0,1,49302,1,1,1,112108,4,10.333333
1,1,11109,2,1,1,112108,4,10.333333
2,1,10246,3,0,1,112108,4,10.333333
3,1,49683,4,0,1,112108,4,10.333333
4,1,43633,5,1,1,112108,4,10.333333


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


In [13]:
# Draw as many prior order lines as there are train rows (fixed seed) to serve
# as negatives, attach the customer id of each order, and label them 0.
# NOTE(review): the sample is taken from all of `prior`, with no filter on
# `reordered`, so it can include lines flagged as reorders -- confirm intent.
negatives = (
    prior
    .sample(n=len(train), random_state=42)
    .merge(orders[["order_id", "user_id"]], on="order_id", how="left")
    .assign(target=0)
)


In [14]:
# Keep exactly the columns of `train`, in the same order, so both frames line up.
negatives = negatives.loc[:, train.columns]


In [15]:
# Stack positives and negatives, attach the per-customer aggregates, and
# replace every remaining missing value (in any column) with 0.
dataset = (
    pd.concat([train, negatives], ignore_index=True)
    .merge(user_orders, on="user_id", how="left")
    .fillna(0)
)

# Class balance check.
dataset["target"].value_counts()


1    1384617
0    1384617
Name: target, dtype: int64

In [16]:
# Features: the two per-customer aggregates. Label: the binary target column.
X = dataset.loc[:, ["nb_orders", "avg_days_between_orders"]]
y = dataset.loc[:, "target"]

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [17]:
# Fit a logistic regression on the training split (`fit` returns the estimator).
model = LogisticRegression(max_iter=500).fit(X_train, y_train)

# Score the held-out rows with the second `predict_proba` column and report ROC AUC.
y_pred = model.predict_proba(X_test)[:, 1]
roc_auc_score(y_true=y_test, y_score=y_pred)


0.7327660569166496

In [18]:
# Score every row of the dataset with the fitted model (second `predict_proba` column).
dataset["score"] = model.predict_proba(X)[:, 1]

# Flag the rows whose score is at or above the 0.9 quantile of all scores.
threshold = dataset["score"].quantile(q=0.9)
dataset["targeted"] = dataset["score"].ge(threshold).astype(int)

# Mean target among non-targeted (0) and targeted (1) rows.
# NOTE(review): `X` includes the rows the model was fitted on, so this
# comparison is not restricted to held-out data -- confirm that is intended.
dataset.groupby("targeted")["target"].agg("mean")


targeted
0    0.467521
1    0.791301
Name: target, dtype: float64