In [1]:
import pandas as pd

In [2]:
# Load the preprocessed table from disk and preview its first rows.
data = pd.read_csv("data_preprocessed.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,pharmacy,card_num,receipt_date,product,cashier_num,receipt_num,application,retail_price,receipt_unique,month
0,3,0,2363656,2018-01-05,784213,0,1,379517,530.74,2018-01-05 1 0,1
1,4,0,2363656,2018-01-05,784213,0,1,379517,530.74,2018-01-05 1 0,1
2,7,0,2364030,2018-01-04,768150,0,3,379753,79.31,2018-01-04 3 0,1
3,8,0,2364030,2018-01-04,778137,0,3,379753,217.95,2018-01-04 3 0,1
4,9,0,2364030,2018-01-15,772668,0,4,379956,745.22,2018-01-15 4 0,1


In [3]:
# Inspect the three columns that the aggregation further down groups by.
data[["month", "card_num", "application"]]

Unnamed: 0,month,card_num,application
0,1,2363656,379517
1,1,2363656,379517
2,1,2364030,379753
3,1,2364030,379753
4,1,2364030,379956
...,...,...,...
3330950,6,10392076,379411
3330951,6,10541072,379411
3330952,6,10545842,379741
3330953,6,10882494,379699


In [4]:
# Count rows for every (month, card_num, application) combination.
# After this step the "Unnamed: 0" column holds that per-group row count.
data1 = data.groupby(["month", "card_num", "application"], as_index=False)[
    "Unnamed: 0"
].count()

In [5]:
# Every combination present in data1 gets target = 1
# (presumably the positive label for a later model — TODO confirm).
data1.loc[:, "target"] = 1

In [6]:
# Display the aggregated table together with the new target column.
data1

Unnamed: 0.1,month,card_num,application,Unnamed: 0,target
0,1,2,379411,1,1
1,1,2,379440,1,1
2,1,2,379497,1,1
3,1,2,379525,5,1
4,1,2,379646,1,1
...,...,...,...,...,...
2667416,8,403410874,379982,1,1
2667417,8,403410984,379641,1,1
2667418,8,403410984,379860,1,1
2667419,8,403410984,379896,1,1


In [4]:
import xgboost

In [2]:
!pip3 install -target=xgboost xgboost

Collecting xgboost
  Using cached xgboost-1.6.1-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (1.7 MB)
Collecting scipy
  Using cached scipy-1.8.1-cp39-cp39-macosx_12_0_universal2.macosx_10_9_x86_64.whl (55.6 MB)
Collecting numpy
  Using cached numpy-1.22.4-cp39-cp39-macosx_10_15_x86_64.whl (17.7 MB)
Installing collected packages: numpy, scipy, xgboost
Successfully installed numpy-1.22.4 scipy-1.8.1 xgboost-1.6.1


In [15]:
# (rows, columns) of the full table.
data.shape

(3330955, 11)

In [5]:
# Hold out rows whose receipt date falls in month 8 (August) as the test set;
# every other month is training data.
# The date column is parsed once and reused for both masks instead of being
# parsed separately for each one.
receipt_month = pd.to_datetime(data["receipt_date"]).dt.month
train_data = data[receipt_month != 8]
test_data = data[receipt_month == 8]

In [6]:
# Number of distinct cards and distinct applications in the training split.
train_data["card_num"].nunique(), train_data["application"].nunique()

(121803, 632)

In [8]:
# Same counts over the full table, for comparison with the training split.
data["card_num"].nunique(), data["application"].nunique()

(122868, 634)

In [10]:
# Preview the first rows of the training split.
train_data.head()

Unnamed: 0.1,Unnamed: 0,pharmacy,card_num,receipt_date,product,cashier_num,receipt_num,application,retail_price,receipt_unique,month
0,3,0,2363656,2018-01-05,784213,0,1,379517,530.74,2018-01-05 1 0,1
1,4,0,2363656,2018-01-05,784213,0,1,379517,530.74,2018-01-05 1 0,1
2,7,0,2364030,2018-01-04,768150,0,3,379753,79.31,2018-01-04 3 0,1
3,8,0,2364030,2018-01-04,778137,0,3,379753,217.95,2018-01-04 3 0,1
4,9,0,2364030,2018-01-15,772668,0,4,379956,745.22,2018-01-15 4 0,1


In [11]:
def transform_data(data):
    """Build one row of aggregate features per ``card_num``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``card_num``, ``month``, ``receipt_unique``,
        ``product``, ``receipt_date``, ``application`` and ``retail_price``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``card_num`` with two-level column labels:

        * overall statistics: distinct counts of ``receipt_unique``,
          ``product``, ``receipt_date`` and ``application``, plus
          mean/std/min/max of ``retail_price``;
        * per-month statistics (``*_month`` columns), computed across the
          months present in ``data``; a month in which a card has no rows
          counts as 0.

        Remaining NaN values (e.g. ``std`` for a card with a single row) are
        replaced with 0.
    """
    dataset = data.groupby("card_num").agg(
        {
            "receipt_unique": "nunique",
            "product": "nunique",
            "receipt_date": "nunique",
            "application": "nunique",
            "retail_price": ["mean", "std", "min", "max"],
        }
    )

    # One grouping shared by every per-month feature below, so each
    # card x month table is built once instead of once per statistic.
    by_card_month = data.groupby(["card_num", "month"])

    def per_month(values):
        # Pivot to a card_num x month table; missing card/month pairs become 0.
        return values.unstack().fillna(0)

    # Number of rows (receipt line items) per month: mean, min, max.
    rows_per_month = per_month(by_card_month.size())
    dataset["mean_purch_month"] = rows_per_month.mean(axis=1)
    dataset["min_purch_month"] = rows_per_month.min(axis=1)
    dataset["max_purch_month"] = rows_per_month.max(axis=1)

    # Monthly average retail price: mean, min, max across months.
    # NOTE(review): the original comment described this as the monthly purchase
    # *sum*, but the code averages retail_price within each month. Behavior is
    # kept as-is; confirm which one is intended.
    price_per_month = per_month(by_card_month["retail_price"].mean())
    dataset["mean_price_month"] = price_per_month.mean(axis=1)
    dataset["min_price_month"] = price_per_month.min(axis=1)
    dataset["max_price_month"] = price_per_month.max(axis=1)

    # Number of distinct receipt dates (visit days) per month: mean and max.
    days_per_month = per_month(by_card_month["receipt_date"].nunique())
    dataset["mean_days_month"] = days_per_month.mean(axis=1)
    dataset["max_days_month"] = days_per_month.max(axis=1)

    # Mean number of distinct applications bought per month
    # (always the same one vs. many different ones).
    apps_per_month = per_month(by_card_month["application"].nunique())
    dataset["mean_app_month"] = apps_per_month.mean(axis=1)

    # Finally fill the gaps: the standard deviation is undefined for cards
    # that have only a single purchase in the given data.
    return dataset.fillna(0)

In [13]:
# Build the per-card feature tables for the full data and for each split.
gb_all, gb_train, gb_test = (
    transform_data(frame) for frame in (data, train_data, test_data)
)

# Summary statistics of the training-split features.
gb_train.describe()

Unnamed: 0_level_0,receipt_unique,product,receipt_date,application,retail_price,retail_price,retail_price,retail_price,mean_purch_month,min_purch_month,max_purch_month,mean_price_month,min_price_month,max_price_month,mean_days_month,max_days_month,mean_app_month
Unnamed: 0_level_1,nunique,nunique,nunique,nunique,mean,std,min,max,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
count,121803.0,121803.0,121803.0,121803.0,121803.0,121803.0,121803.0,121803.0,121803.0,121803.0,121803.0,121803.0,121803.0,121803.0,121803.0,121803.0,121803.0
mean,8.921045,18.96019,8.637718,14.420105,598.206869,594.62042,70.474931,2215.61558,3.479172,0.38059,9.761919,365.349459,34.731689,1050.275155,1.23396,3.11182,2.789109
std,10.278539,29.074792,7.7999,12.432415,389.538027,537.920922,166.32147,2078.93224,8.109767,3.676885,15.418914,298.351721,114.815445,984.80804,1.114271,1.994148,3.349421
min,1.0,1.0,1.0,1.0,11.74,0.0,0.01,11.74,0.142857,0.0,1.0,1.677143,0.0,11.74,0.142857,1.0,0.142857
25%,4.0,8.0,4.0,7.0,379.48061,304.161994,10.5,1039.59,1.285714,0.0,5.0,173.089452,0.0,547.138333,0.571429,2.0,1.142857
50%,6.0,14.0,6.0,11.0,513.353587,457.000572,31.06,1631.27,2.285714,0.0,8.0,296.135816,0.0,797.22,0.857143,3.0,2.0
75%,11.0,23.0,11.0,18.0,704.16342,705.869603,72.78,2682.07,4.142857,0.0,12.0,471.690361,0.0,1220.404762,1.571429,4.0,3.428571
max,1310.0,2734.0,207.0,361.0,15955.75,12918.760073,15955.75,48870.37,1265.285714,672.0,2370.0,10044.897571,4397.273333,44236.72,29.571429,31.0,219.714286
