Группируем транзакции клиента по месяцам, выделяем target = кол-во транзакций в след. месяце

In [2]:
import pandas as pd
import numpy as np

df = pd.read_parquet(
    "../data/test_datasets/mbd_dataset/detail/trx/fold=0"
)
df["event_time"] = pd.to_datetime(df["event_time"])
df["year_month"] = df["event_time"].dt.to_period("M")

monthly = (
    df
    .groupby(["client_id", "year_month"])
    .agg(
        total_amount=("amount", "sum"),
        transaction_count=("amount", "count"),
        avg_transaction=("amount", "mean"),
    )
    .reset_index()
    .sort_values(["client_id", "year_month"])
)

monthly["target_next_month"] = (
    monthly
    .groupby("client_id")["total_amount"]
    .shift(-1)
)

monthly = monthly.dropna(subset=["target_next_month"])

In [3]:
monthly.head()

Unnamed: 0,client_id,year_month,total_amount,transaction_count,avg_transaction,target_next_month
0,00098f117ba54c5f21436d0687943b7140c356299f64cf...,2021-09,25214.736328,1,25214.736328,113284.0625
1,00098f117ba54c5f21436d0687943b7140c356299f64cf...,2021-12,113284.0625,1,113284.0625,15687.231445
2,00098f117ba54c5f21436d0687943b7140c356299f64cf...,2022-02,15687.231445,1,15687.231445,61252.972656
3,00098f117ba54c5f21436d0687943b7140c356299f64cf...,2022-04,61252.972656,1,61252.972656,100706.890625
4,00098f117ba54c5f21436d0687943b7140c356299f64cf...,2022-06,100706.890625,2,50353.445312,125656.695312


Посмотрим среднее кол-во транзакций в месяц для клиентов

In [4]:
avg_transactions_per_month = (
    monthly
    .groupby("client_id")["transaction_count"]
    .mean()
    .reset_index(name="avg_transactions_per_month")
)

#avg_transactions_per_month.head()
avg_transactions_per_month.describe()

Unnamed: 0,avg_transactions_per_month
count,19416.0
mean,17.575188
std,57.554588
min,1.0
25%,3.214286
50%,9.0
75%,23.173913
max,7039.043478


Вычислим лаги, rolling

In [5]:
for lag in [1, 3, 6]:
    monthly[f"amount_lag_{lag}"] = (
        monthly.groupby("client_id")["total_amount"].shift(lag)
    )
    monthly[f"count_lag_{lag}"] = (
        monthly.groupby("client_id")["transaction_count"].shift(lag)
    )

monthly["amount_roll_3m"] = (
    monthly.groupby("client_id")["total_amount"]
    .rolling(3).mean().reset_index(level=0, drop=True)
)

monthly["amount_roll_6m"] = (
    monthly.groupby("client_id")["total_amount"]
    .rolling(6).mean().reset_index(level=0, drop=True)
)

In [6]:
monthly = monthly.dropna()

In [7]:
monthly.to_csv("../data/final_dataset/processed/monthly_features.csv", index=False)


In [8]:
monthly.to_parquet("../data/final_dataset/processed/monthly_features.parquet", index=False)


In [9]:
monthly.to_pickle("../data/final_dataset/processed/monthly_features.pkl")
