In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, CatBoostClassifier

#from lifetimes.utils import summary_data_from_transaction_data
#from lifetimes import BetaGeoFitter


# Чтение данных

In [2]:
train = pd.read_csv('train.csv').set_index("user_id")
train

Unnamed: 0_level_0,target,time
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,0,77
13,0,86
37,0,89
41,0,57
42,0,84
...,...,...
561824,0,91
562043,0,75
562312,0,91
562721,0,29


In [3]:
report_dates = pd.read_csv('report_dates.csv', parse_dates=['report_dt']).set_index("report")
report_dates["report_dt"] = report_dates["report_dt"].astype('datetime64[ns]')
report_dates

Unnamed: 0_level_0,report_dt
report,Unnamed: 1_level_1
1,2022-07-31 03:00:00
2,2022-08-31 03:00:00
3,2022-09-30 03:00:00
4,2022-10-31 03:00:00
5,2022-11-30 03:00:00
6,2022-12-31 03:00:00
7,2023-01-31 03:00:00
8,2023-02-28 03:00:00
9,2023-03-31 03:00:00
10,2023-04-30 03:00:00


In [4]:
%%time

clients = pd.read_csv('clients.csv').set_index("user_id")
#clients.replace({'employee_count_nm':{'ОТ 101 ДО 500':4,'БОЛЕЕ 1001':6,'ОТ 501 ДО 1000':5,'ДО 10':0,
#                                      'ОТ 11 ДО 50':2,'ОТ 51 ДО 100':3,'БОЛЕЕ 500':5,'ОТ 11 ДО 30':2,
#                                      'ОТ 31 ДО 50':2}}, inplace=True)
clients.replace({'employee_count_nm':{'ОТ 101 ДО 500':4,'БОЛЕЕ 1001':5,'ОТ 501 ДО 1000':5,'ДО 10':1,
                                      'ОТ 11 ДО 50':2,'ОТ 51 ДО 100':3,'БОЛЕЕ 500':5,'ОТ 11 ДО 30':2,
                                      'ОТ 31 ДО 50':2}}, inplace=True)
#clients.replace({'employee_count_nm':{'ОТ 101 ДО 500':6,'БОЛЕЕ 1001':9,'ОТ 501 ДО 1000':7,'ДО 10':1,
#                                      'ОТ 11 ДО 50':3,'ОТ 51 ДО 100':5,'БОЛЕЕ 500':8,'ОТ 11 ДО 30':2,
#                                      'ОТ 31 ДО 50':4}}, inplace=True)
clients = clients.join(report_dates, on="report", how="left")
clients

CPU times: user 66.2 ms, sys: 28.9 ms, total: 95.2 ms
Wall time: 105 ms


Unnamed: 0_level_0,report,employee_count_nm,bankemplstatus,customer_age,report_dt
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,2,4.0,0,3,2022-08-31 03:00:00
9,1,5.0,0,3,2022-07-31 03:00:00
13,6,5.0,0,2,2022-12-31 03:00:00
37,5,5.0,0,2,2022-11-30 03:00:00
41,1,4.0,0,2,2022-07-31 03:00:00
...,...,...,...,...,...
562043,12,,0,2,2023-06-30 03:00:00
562205,12,,0,1,2023-06-30 03:00:00
562312,12,,0,0,2023-06-30 03:00:00
562721,12,,0,2,2023-06-30 03:00:00


In [5]:
clients["employee_count_nm"].value_counts()

employee_count_nm
5.0    26310
4.0    14362
2.0     7751
3.0     7314
1.0     3797
Name: count, dtype: int64

In [6]:
%%time
transactions = pd.read_csv('transactions.csv.zip', 
                           parse_dates=['transaction_dttm'], 
                           low_memory=False, compression='zip')
transactions

CPU times: user 17.8 s, sys: 1.05 s, total: 18.9 s
Wall time: 18.9 s


Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm
0,3,3,1,-183.883957,2022-01-28 12:05:33
1,3,3,1,-3206.437012,2022-01-28 12:52:30
2,3,16,1,-153866.890625,2022-02-16 14:45:56
3,3,56,1,-15144.601562,2022-03-09 19:58:29
4,3,0,1,5297.908691,2022-03-12 18:11:31
...,...,...,...,...,...
13075018,562740,155,1,-2484.366211,2023-03-20 11:52:09
13075019,562740,9,1,-187.658463,2023-03-20 12:10:22
13075020,562740,1,1,-891.933350,2023-03-20 15:53:37
13075021,562740,13,1,-464.467316,2023-03-20 15:54:49


# Расчет дополнительных свойств

In [7]:
def logsumabs(x):
    return (np.log(x.abs().sum()))
temp_tran = transactions.groupby(['user_id','currency_rk'])['transaction_amt'].agg([logsumabs])
temp_tran = temp_tran.unstack('currency_rk').fillna(0)
temp_tran.columns = ['logsumabs_0','logsumabs_1','logsumabs_2','logsumabs_3']
temp_tran

Unnamed: 0_level_0,logsumabs_0,logsumabs_1,logsumabs_2,logsumabs_3
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,0.00000,12.789712,0.0,0.0
9,0.00000,12.686752,0.0,0.0
13,9.28478,12.825245,0.0,0.0
37,0.00000,12.775178,0.0,0.0
41,0.00000,11.595303,0.0,0.0
...,...,...,...,...
562043,0.00000,10.294896,0.0,0.0
562205,0.00000,10.701664,0.0,0.0
562312,0.00000,9.827568,0.0,0.0
562721,0.00000,12.357880,0.0,0.0


In [8]:
%%time
transactions['transaction_dttm'] = transactions['transaction_dttm'].astype('datetime64[ns]')
transactions['dweek'] = transactions['transaction_dttm'].dt.dayofweek# .day//8
transactions['date'] = transactions['transaction_dttm'].dt.date.astype('datetime64[ns]') # .day//8
transactions['sp'] = np.where( transactions['transaction_amt']>0,transactions['transaction_amt'],0)
transactions['sm'] = np.where( transactions['transaction_amt']<0,transactions['transaction_amt'],0)
transactions['ss'] = transactions['sp'] - transactions['sm'] 
transactions

CPU times: user 3.42 s, sys: 368 ms, total: 3.79 s
Wall time: 3.65 s


Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm,dweek,date,sp,sm,ss
0,3,3,1,-183.883957,2022-01-28 12:05:33,4,2022-01-28,0.000000,-183.883957,183.883957
1,3,3,1,-3206.437012,2022-01-28 12:52:30,4,2022-01-28,0.000000,-3206.437012,3206.437012
2,3,16,1,-153866.890625,2022-02-16 14:45:56,2,2022-02-16,0.000000,-153866.890625,153866.890625
3,3,56,1,-15144.601562,2022-03-09 19:58:29,2,2022-03-09,0.000000,-15144.601562,15144.601562
4,3,0,1,5297.908691,2022-03-12 18:11:31,5,2022-03-12,5297.908691,0.000000,5297.908691
...,...,...,...,...,...,...,...,...,...,...
13075018,562740,155,1,-2484.366211,2023-03-20 11:52:09,0,2023-03-20,0.000000,-2484.366211,2484.366211
13075019,562740,9,1,-187.658463,2023-03-20 12:10:22,0,2023-03-20,0.000000,-187.658463,187.658463
13075020,562740,1,1,-891.933350,2023-03-20 15:53:37,0,2023-03-20,0.000000,-891.933350,891.933350
13075021,562740,13,1,-464.467316,2023-03-20 15:54:49,0,2023-03-20,0.000000,-464.467316,464.467316


In [9]:
transactions = transactions.join(clients ,on="user_id", how="left")
transactions['diff_days'] = (transactions['report_dt']-transactions['transaction_dttm']).dt.days
transactions['diff_days2'] = (transactions['report_dt']-pd.Timedelta("100 days")-transactions['transaction_dttm']).dt.days
transactions['m'] = (transactions['report_dt'].dt.year-
                     transactions['transaction_dttm'].dt.year)*12+(transactions['report_dt'].dt.month-
                                                                   transactions['transaction_dttm'].dt.month)
transactions['d'] = (transactions['report_dt'].dt.date-transactions['transaction_dttm'].dt.date)/pd.Timedelta("1 day")
transactions['W'] = transactions['d']//7
transactions

Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm,dweek,date,sp,sm,ss,report,employee_count_nm,bankemplstatus,customer_age,report_dt,diff_days,diff_days2,m,d,W
0,3,3,1,-183.883957,2022-01-28 12:05:33,4,2022-01-28,0.000000,-183.883957,183.883957,2,4.0,0,3,2022-08-31 03:00:00,214,114,7,215.0,30.0
1,3,3,1,-3206.437012,2022-01-28 12:52:30,4,2022-01-28,0.000000,-3206.437012,3206.437012,2,4.0,0,3,2022-08-31 03:00:00,214,114,7,215.0,30.0
2,3,16,1,-153866.890625,2022-02-16 14:45:56,2,2022-02-16,0.000000,-153866.890625,153866.890625,2,4.0,0,3,2022-08-31 03:00:00,195,95,6,196.0,28.0
3,3,56,1,-15144.601562,2022-03-09 19:58:29,2,2022-03-09,0.000000,-15144.601562,15144.601562,2,4.0,0,3,2022-08-31 03:00:00,174,74,5,175.0,25.0
4,3,0,1,5297.908691,2022-03-12 18:11:31,5,2022-03-12,5297.908691,0.000000,5297.908691,2,4.0,0,3,2022-08-31 03:00:00,171,71,5,172.0,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13075018,562740,155,1,-2484.366211,2023-03-20 11:52:09,0,2023-03-20,0.000000,-2484.366211,2484.366211,12,,0,0,2023-06-30 03:00:00,101,1,3,102.0,14.0
13075019,562740,9,1,-187.658463,2023-03-20 12:10:22,0,2023-03-20,0.000000,-187.658463,187.658463,12,,0,0,2023-06-30 03:00:00,101,1,3,102.0,14.0
13075020,562740,1,1,-891.933350,2023-03-20 15:53:37,0,2023-03-20,0.000000,-891.933350,891.933350,12,,0,0,2023-06-30 03:00:00,101,1,3,102.0,14.0
13075021,562740,13,1,-464.467316,2023-03-20 15:54:49,0,2023-03-20,0.000000,-464.467316,464.467316,12,,0,0,2023-06-30 03:00:00,101,1,3,102.0,14.0


In [10]:
count_trans = transactions.groupby(['user_id']
    ).agg({
        'transaction_amt':[
            "sum",
            "max",
            "min",
          #  "median",
            "count",
            #"last"
            ],
        "mcc_code":[
            #"last",
            "nunique"
            ],
        "currency_rk":[
#            "min",
#            "max",
            "nunique"
            ],
        'date':[
            "min",
            "max",
            "nunique",
#            "count"
            ],
        'diff_days':[
            "min",
            "max",
            ],
#        'diff_days2':[
#            "min",
#            "max",
#            ],
        "sp":"sum",
        "sm":"sum",
        "ss":"sum"
        })
count_trans.columns = count_trans.columns.map('_'.join).map(lambda x: "count_trans_"+str(x))
count_trans["count_trans_dates"]=(count_trans["count_trans_date_max"]-count_trans["count_trans_date_min"]).dt.days
count_trans

Unnamed: 0_level_0,count_trans_transaction_amt_sum,count_trans_transaction_amt_max,count_trans_transaction_amt_min,count_trans_transaction_amt_count,count_trans_mcc_code_nunique,count_trans_currency_rk_nunique,count_trans_date_min,count_trans_date_max,count_trans_date_nunique,count_trans_diff_days_min,count_trans_diff_days_max,count_trans_sp_sum,count_trans_sm_sum,count_trans_ss_sum,count_trans_dates
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
3,13706.416641,104011.960938,-153866.890625,11,4,1,2022-01-28,2022-05-14,8,108,214,186108.229797,-172401.813156,358510.042953,106
9,-323434.666813,-45.579891,-90147.617188,90,22,1,2021-10-20,2022-04-19,54,102,283,0.000000,-323434.666813,323434.666813,181
13,-124717.379150,70322.828125,-58740.300781,22,4,2,2022-03-23,2022-09-07,18,114,282,128766.684326,-253484.063477,382250.747803,168
37,-331859.599463,5487.140625,-35782.984375,315,28,1,2022-02-19,2022-08-17,129,104,283,10738.788574,-342598.388037,353337.176611,179
41,-108586.614166,-290.766998,-16841.208984,16,5,1,2021-11-16,2022-04-18,12,103,256,0.000000,-108586.614166,108586.614166,153
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562043,-29581.256115,-40.769005,-13020.519531,37,12,1,2022-10-06,2023-02-07,23,142,266,0.000000,-29581.256115,29581.256115,124
562205,-40491.908630,1595.461060,-6220.171387,151,15,1,2022-09-22,2023-03-20,85,102,280,1968.906334,-42460.814964,44429.721298,179
562312,-18537.821270,-28.292030,-1372.377075,56,7,1,2022-09-22,2023-02-27,39,122,280,0.000000,-18537.821270,18537.821270,158
562721,-164004.761685,5412.773926,-18981.269531,85,11,1,2022-09-22,2023-03-16,55,105,280,34391.163893,-198395.925579,232787.089472,175


In [11]:
wsum = transactions.pivot_table(values="transaction_amt",
    index="user_id",
    columns="W",
    #aggfunc='sum',
    #aggfunc='count',
    aggfunc=['sum','count'],
    fill_value=0,)
wsum.columns = list( map(lambda x: "wsum"+str(x), wsum.columns ) )
wsum

Unnamed: 0_level_0,"wsum('sum', 14.0)","wsum('sum', 15.0)","wsum('sum', 16.0)","wsum('sum', 17.0)","wsum('sum', 18.0)","wsum('sum', 19.0)","wsum('sum', 20.0)","wsum('sum', 21.0)","wsum('sum', 22.0)","wsum('sum', 23.0)",...,"wsum('count', 31.0)","wsum('count', 32.0)","wsum('count', 33.0)","wsum('count', 34.0)","wsum('count', 35.0)","wsum('count', 36.0)","wsum('count', 37.0)","wsum('count', 38.0)","wsum('count', 39.0)","wsum('count', 40.0)"
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.000000,109398.959961,28335.613281,37991.929688,0.000000,0.000000,5083.818176,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
9,-7288.300293,0.000000,0.000000,-6001.185471,-4870.835632,0.000000,-4699.906494,-152710.580532,-50746.869873,0.000000,...,2,6,2,2,3,0,1,4,10,4
13,0.000000,0.000000,-5588.771484,0.000000,0.000000,-10642.210938,-5500.924805,0.000000,0.000000,-17234.970703,...,0,1,0,2,1,0,1,0,2,1
37,0.000000,-2249.624603,-2430.073532,-33632.361629,-10154.717846,-9635.188953,-10085.129723,-6322.290035,-13862.676928,-39768.943199,...,23,9,10,6,10,9,12,5,4,7
41,-6226.305176,-1819.140625,0.000000,0.000000,0.000000,-17708.008820,0.000000,0.000000,0.000000,-17190.941895,...,1,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562043,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-142.056610,0.000000,0.000000,0.000000,...,8,6,12,3,0,0,0,2,0,0
562205,-7326.375214,-380.008850,-1062.786255,-1604.581909,-1879.877398,-2470.596825,-1697.879690,-3913.901699,-597.464371,-905.776569,...,4,4,2,15,8,3,4,1,4,4
562312,0.000000,0.000000,0.000000,-1485.868103,-540.218151,-618.548569,0.000000,-384.006012,-978.268524,0.000000,...,4,0,2,1,12,1,1,0,4,3
562721,0.000000,-20904.990967,0.000000,0.000000,0.000000,-307.612793,-575.461670,-5032.040283,-11202.968933,-9636.037415,...,8,2,8,0,15,8,5,0,1,1


In [None]:
wsum["wsum('count', 14.0)"]

In [12]:
dsum = transactions.pivot_table(values="transaction_amt",
    index="user_id",
    columns="d",
    #aggfunc='sum',
    #aggfunc='count',
    aggfunc=['sum','count'],
    fill_value=0,)
dsum.columns = list( map(lambda x: "dsum"+str(x), dsum.columns ) )
dsum

Unnamed: 0_level_0,"dsum('sum', 102.0)","dsum('sum', 103.0)","dsum('sum', 104.0)","dsum('sum', 105.0)","dsum('sum', 106.0)","dsum('sum', 107.0)","dsum('sum', 108.0)","dsum('sum', 109.0)","dsum('sum', 110.0)","dsum('sum', 111.0)",...,"dsum('count', 275.0)","dsum('count', 276.0)","dsum('count', 277.0)","dsum('count', 278.0)","dsum('count', 279.0)","dsum('count', 280.0)","dsum('count', 281.0)","dsum('count', 282.0)","dsum('count', 283.0)","dsum('count', 284.0)"
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,109398.959961,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
9,0.000000,-7288.300293,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,2,1,1,1,1,1,1,1,0,1
13,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,1,0
37,0.000000,0.000000,0.000000,-2030.448853,-219.175751,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,1,0,2,0,2,2,1,2
41,0.000000,0.000000,-6226.305176,0.000000,-1819.140625,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562043,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
562205,-422.024323,-324.048462,-6580.302429,0.000000,0.000000,-9.379028,-690.546448,0.000000,0.000000,319.916626,...,1,0,1,1,0,3,1,0,0,0
562312,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,1,0,0,1,2,0,0,0
562721,0.000000,0.000000,0.000000,0.000000,-3608.912842,0.000000,-17296.078125,0.000000,0.000000,0.000000,...,0,1,0,0,0,0,1,0,0,0


In [13]:
# Сумма положительный транзакций
#sum_trans_p = transactions[transactions['transaction_amt']>0].groupby(['user_id']
#    ).agg({'transaction_amt':["count","mean"]})#.rename(columns={"transaction_amt":"sum_trans_p"})
#sum_trans_p.columns = sum_trans_p.columns.map('_'.join).map(lambda x: "sump_"+str(x))
#sum_trans_p

In [14]:
# Сумма отрицательных транзакций
#sum_trans_m = transactions[transactions['transaction_amt']<0].groupby(['user_id']
#    ).agg({'transaction_amt':["count","mean"]})#.rename(columns={"transaction_amt":"sum_trans_p"})
#sum_trans_m.columns = sum_trans_m.columns.map('_'.join).map(lambda x: "summ_"+str(x))
#sum_trans_m

In [15]:
msump = transactions.pivot_table(values="sp",                              
    index="user_id",
    columns="m",
    aggfunc='sum',
    fill_value=0,)
msump.columns = list( map(lambda x: "msump"+str(x), msump.columns ) )
msump

Unnamed: 0_level_0,msump3,msump4,msump5,msump6,msump7,msump8,msump9
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,137734.573242,43075.747864,5297.908691,0.000000,0.000000,0.000000,0.000000
9,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
13,10805.421875,10952.112305,0.000000,86198.161377,10038.188965,10772.799805,0.000000
37,0.000000,0.000000,5487.140625,5251.647949,0.000000,0.000000,0.000000
41,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...
562043,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
562205,319.916626,0.000000,0.000000,0.000000,0.000000,0.000000,1648.989708
562312,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
562721,0.000000,5607.572266,7369.252441,4826.499023,13006.508034,3581.332129,0.000000


In [16]:
msumm = transactions.pivot_table(values="sm",
    index="user_id",
    columns="m",
    aggfunc='sum',
    fill_value=0,)
msumm.columns = list( map(lambda x: "msumm"+str(x), msumm.columns ) )
msumm

Unnamed: 0_level_0,msumm3,msumm4,msumm5,msumm6,msumm7,msumm8,msumm9
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,0.000000,0.000000,-15144.601562,-153866.890625,-3390.320969,0.000000,0.000000
9,-13289.485764,-104223.187939,-109800.056656,-54146.384140,-18500.321304,-12387.577271,-11087.653740
13,-16394.193359,-27095.248047,-27650.769531,-76186.732422,-33908.388672,-53024.051758,-19224.679688
37,-32719.820328,-41364.700729,-67081.604048,-53322.076138,-69769.034760,-50056.823341,-28284.328693
41,-8045.445801,-17708.008820,-30354.633301,-17462.199585,-18175.117676,-16841.208984,0.000000
...,...,...,...,...,...,...,...
562043,0.000000,-142.056610,0.000000,-2984.231926,-11080.026783,-15374.940796,0.000000
562205,-10452.779190,-7546.495855,-6601.707232,-4821.073084,-6082.155629,-5607.360139,-1349.243835
562312,0.000000,-2644.634823,-1362.274536,-3165.544842,-3938.301605,-4495.630554,-2931.434910
562721,-20904.990967,-12082.776123,-27648.169678,-64254.888449,-30059.568512,-42680.736959,-764.794891


In [17]:
msum = transactions.pivot_table(values="transaction_amt",
    index="user_id",
    columns=["m","currency_rk"],
    #aggfunc='sum',
    aggfunc='count',
    fill_value=0,)
msum.columns = list( map(lambda x: "msum"+str(x), msum.columns ) )
msum

Unnamed: 0_level_0,"msum(3, 0)","msum(3, 1)","msum(3, 2)","msum(3, 3)","msum(4, 0)","msum(4, 1)","msum(4, 2)","msum(4, 3)","msum(5, 0)","msum(5, 1)",...,"msum(7, 2)","msum(7, 3)","msum(8, 0)","msum(8, 1)","msum(8, 2)","msum(8, 3)","msum(9, 0)","msum(9, 1)","msum(9, 2)","msum(9, 3)"
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0,3,0,0,0,3,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
9,0,8,0,0,0,15,0,0,0,9,...,0,0,0,9,0,0,0,14,0,0
13,0,2,0,0,0,3,0,0,0,2,...,0,0,1,3,0,0,0,1,0,0
37,0,15,0,0,0,77,0,0,0,67,...,0,0,0,37,0,0,0,10,0,0
41,0,2,0,0,0,4,0,0,0,3,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562043,0,0,0,0,0,1,0,0,0,0,...,0,0,0,4,0,0,0,0,0,0
562205,0,12,0,0,0,29,0,0,0,33,...,0,0,0,23,0,0,0,8,0,0
562312,0,0,0,0,0,10,0,0,0,5,...,0,0,0,14,0,0,0,7,0,0
562721,0,2,0,0,0,5,0,0,0,10,...,0,0,0,28,0,0,0,2,0,0


In [18]:
smc = transactions.pivot_table(values="sm",
    index="user_id",
    columns=["m","currency_rk"],
    aggfunc='count',
    fill_value=0,)
smc.columns = list( map(lambda x: "smc"+str(x), smc.columns ) )
smc

Unnamed: 0_level_0,"smc(3, 0)","smc(3, 1)","smc(3, 2)","smc(3, 3)","smc(4, 0)","smc(4, 1)","smc(4, 2)","smc(4, 3)","smc(5, 0)","smc(5, 1)",...,"smc(7, 2)","smc(7, 3)","smc(8, 0)","smc(8, 1)","smc(8, 2)","smc(8, 3)","smc(9, 0)","smc(9, 1)","smc(9, 2)","smc(9, 3)"
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0,3,0,0,0,3,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
9,0,8,0,0,0,15,0,0,0,9,...,0,0,0,9,0,0,0,14,0,0
13,0,2,0,0,0,3,0,0,0,2,...,0,0,1,3,0,0,0,1,0,0
37,0,15,0,0,0,77,0,0,0,67,...,0,0,0,37,0,0,0,10,0,0
41,0,2,0,0,0,4,0,0,0,3,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562043,0,0,0,0,0,1,0,0,0,0,...,0,0,0,4,0,0,0,0,0,0
562205,0,12,0,0,0,29,0,0,0,33,...,0,0,0,23,0,0,0,8,0,0
562312,0,0,0,0,0,10,0,0,0,5,...,0,0,0,14,0,0,0,7,0,0
562721,0,2,0,0,0,5,0,0,0,10,...,0,0,0,28,0,0,0,2,0,0


In [19]:
spc = transactions.pivot_table(values="sp",
    index="user_id",
    columns="m",
    aggfunc='count',
    fill_value=0,)
spc.columns = list( map(lambda x: "spc"+str(x), spc.columns ) )
spc

Unnamed: 0_level_0,spc3,spc4,spc5,spc6,spc7,spc8,spc9
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,3,3,2,1,2,0,0
9,8,15,9,23,12,9,14
13,2,3,2,6,4,4,1
37,15,77,67,54,55,37,10
41,2,4,3,4,2,1,0
...,...,...,...,...,...,...,...
562043,0,1,0,5,27,4,0
562205,12,29,33,20,26,23,8
562312,0,10,5,6,14,14,7
562721,2,5,10,20,18,28,2


In [20]:
#spdc = transactions.pivot_table(values="sp",
#    index="user_id",
#    columns="dweek",
#    aggfunc='count',
#    fill_value=0,)
#spdc.columns = list( map(lambda x: "spdc"+str(x), spdc.columns ) )
#spdc

#spds = transactions.pivot_table(values="sp",
#    index="user_id",
#    columns="dweek",
#    aggfunc='sum',
#    fill_value=0,)
#spds.columns = list( map(lambda x: "spds"+str(x), spds.columns ) )
#spds

In [21]:
#smdc = transactions.pivot_table(values="sm",
#    index="user_id",
#    columns="dweek",
#    aggfunc='count',
#    fill_value=0,)
#smdc.columns = list( map(lambda x: "smdc"+str(x), smdc.columns ) )
#smdc

#smds = transactions.pivot_table(values="sm",
#    index="user_id",
#    columns="dweek",
#    aggfunc='sum',
#    fill_value=0,)
#smds.columns = list( map(lambda x: "smds"+str(x), smds.columns ) )
#smds

In [22]:
# Максимальная дата до отчета, по клиентам
#max_day_trans = transactions.groupby(['user_id'])[['diff_days']].max().rename(columns={"diff_days":"max_day_trans"})
#max_day_trans

In [23]:
# Минимальная дата до отчета, по клиентам
#min_day_trans = transactions.groupby(['user_id']
#            )[['diff_days']].min().rename(columns={"diff_days":"min_day_trans"})
#min_day_trans

In [24]:
mcc_trans = transactions.pivot_table(    
    values="transaction_amt",
    index="user_id",
    columns="mcc_code",
    aggfunc="sum",
    #aggfunc=["sum","median"],
    #aggfunc=[logsumabs],
    fill_value=0,)
mcc_trans.columns = list( map(lambda x: "mcc"+str(x), mcc_trans.columns ) )
mcc_trans

Unnamed: 0_level_0,mcc0,mcc1,mcc2,mcc3,mcc4,mcc6,mcc7,mcc8,mcc9,mcc10,...,mcc372,mcc382,mcc389,mcc392,mcc407,mcc412,mcc413,mcc424,mcc434,mcc449
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,186108.229797,0.000000,0.000000,-3390.320969,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.000000,-20556.742374,-8324.033737,-192534.730225,-1664.872650,-2037.417374,0.000000,-5204.533508,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.000000,0.000000,0.000000,-160818.399902,0.000000,0.000000,-5997.509766,0.000000,10805.421875,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,10738.788574,-35943.518551,-26588.514793,-214788.286194,-126.869576,-653.359955,0.000000,-1031.058945,-26518.769531,-924.123138,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41,0.000000,-740.059052,0.000000,-70219.408691,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562043,0.000000,-7254.742481,0.000000,-13020.519531,0.000000,-1303.106201,-786.304443,-870.556900,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
562205,-10868.047920,-12028.663424,-1157.922058,0.000000,-129.403091,-6800.897217,0.000000,-2645.133282,0.000000,-770.019485,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
562312,0.000000,-6974.722969,-352.799313,0.000000,-1372.377075,-2114.061485,0.000000,-988.508408,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
562721,34391.163893,-5350.361526,-1197.671670,-167531.345428,0.000000,0.000000,0.000000,-80.179829,-11395.134621,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
mcc_transc = transactions.pivot_table(    
    values="transaction_amt",
    index="user_id",
    columns="mcc_code",
    aggfunc='count',
    fill_value=0,)
mcc_transc.columns = list( map(lambda x: "mccc"+str(x), mcc_transc.columns ) )
mcc_transc

Unnamed: 0_level_0,mccc0,mccc1,mccc2,mccc3,mccc4,mccc6,mccc7,mccc8,mccc9,mccc10,...,mccc372,mccc382,mccc389,mccc392,mccc407,mccc412,mccc413,mccc424,mccc434,mccc449
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,7,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,27,11,5,3,2,0,6,0,0,...,0,0,0,0,0,0,0,0,0,0
13,0,0,0,14,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
37,2,97,129,28,1,2,0,7,1,4,...,0,0,0,0,0,0,0,0,0,0
41,0,2,0,6,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562043,0,19,0,1,0,1,3,4,0,0,...,0,0,0,0,0,0,0,0,0,0
562205,30,33,2,0,1,10,0,6,0,3,...,0,0,0,0,0,0,0,0,0,0
562312,0,24,3,0,1,6,0,4,0,0,...,0,0,0,0,0,0,0,0,0,0
562721,19,12,6,29,0,0,0,2,4,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
mcc_transmc = transactions.pivot_table(    
    values="transaction_amt",
    index="user_id",
    #columns=["m","mcc_code"],
    columns=["mcc_code","m"],
    #aggfunc=['sum','median'],
    aggfunc='sum',
    fill_value=0,
)
mcc_transmc.columns = list( map(lambda x: "mccmc"+str(x), mcc_transmc.columns ) )
mcc_transmc

Unnamed: 0_level_0,"mccmc(0, 3)","mccmc(0, 4)","mccmc(0, 5)","mccmc(0, 6)","mccmc(0, 7)","mccmc(0, 8)","mccmc(0, 9)","mccmc(1, 3)","mccmc(1, 4)","mccmc(1, 5)",...,"mccmc(412, 8)","mccmc(413, 3)","mccmc(413, 4)","mccmc(413, 5)","mccmc(413, 6)","mccmc(413, 7)","mccmc(413, 9)","mccmc(424, 6)","mccmc(434, 4)","mccmc(449, 7)"
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,137734.573242,43075.747864,5297.908691,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-5323.413376,-2728.293545,-172.254761,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,0.000000,0.000000,5487.140625,5251.647949,0.000000,0.000000,0.000000,-2098.237946,-7730.614010,-4955.007622,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-290.766998,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562043,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
562205,-7609.368988,-1207.410591,-1161.791405,-568.072906,-1511.607269,-458.786469,1648.989708,0.000000,-108.493008,-2985.051830,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
562312,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-1993.233418,-353.968170,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
562721,0.000000,5607.572266,7369.252441,4826.499023,13006.508034,3581.332129,0.000000,0.000000,0.000000,-1540.192139,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
%%time
date_trans = transactions.groupby("user_id").agg({"date":"unique"}).explode("date").reset_index()#.sort(["user_id","date"])
date_trans["interval"]=date_trans.groupby("user_id")["date"].diff()
date_trans = date_trans.dropna() 
date_trans["interval"]=date_trans["interval"].dt.days
date_trans = date_trans.groupby("user_id").agg({"interval":["last","max","median","mean"]})
date_trans.columns = list( map(lambda x: "date_"+str(x), date_trans.columns ) )
date_trans

CPU times: user 28.7 s, sys: 533 ms, total: 29.2 s
Wall time: 29.2 s


Unnamed: 0_level_0,"date_('interval', 'last')","date_('interval', 'max')","date_('interval', 'median')","date_('interval', 'mean')"
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,9,30,18.0,15.142857
9,17,22,2.0,3.415094
13,2,21,9.0,9.882353
37,1,7,1.0,1.398438
41,2,31,4.0,13.909091
...,...,...,...,...
562043,54,54,1.5,5.636364
562205,1,8,1.0,2.130952
562312,2,37,2.0,4.157895
562721,2,25,1.0,3.240741


In [28]:
%%time
summary = summary_data_from_transaction_data(transactions, 'user_id', 'transaction_dttm', freq="D", monetary_value_col="transaction_amt" )
summary.columns = list( map(lambda x: "summary_"+str(x), summary.columns ) )
summary

# similar API to scikit-learn and lifelines.
bgf = BetaGeoFitter(penalizer_coef=0.00001)
bgf.fit(summary['summary_frequency'], summary['summary_recency'], summary['summary_T'])
print(bgf)

t = 1
summary['predicted_purchases1'] = bgf.conditional_expected_number_of_purchases_up_to_time(t, 
                                                                                         summary['summary_frequency'], 
                                                                                         summary['summary_recency'], 
                                                                                         summary['summary_T']
                                                                                        )
t = 30
summary['predicted_purchases30'] = bgf.conditional_expected_number_of_purchases_up_to_time(t, 
                                                                                         summary['summary_frequency'], 
                                                                                         summary['summary_recency'], 
                                                                                         summary['summary_T']
                                                                                        )
t = 91
summary['predicted_purchases91'] = bgf.conditional_expected_number_of_purchases_up_to_time(t, 
                                                                                         summary['summary_frequency'], 
                                                                                         summary['summary_recency'], 
                                                                                         summary['summary_T']
                                                                                        )
t = 101
summary['predicted_purchases101'] = bgf.conditional_expected_number_of_purchases_up_to_time(t, 
                                                                                         summary['summary_frequency'], 
                                                                                         summary['summary_recency'], 
                                                                                         summary['summary_T']
                                                                                        )
summary

NameError: name 'summary_data_from_transaction_data' is not defined

In [29]:
%%time
summaryW = summary_data_from_transaction_data(transactions, 'user_id', 'transaction_dttm', freq="W", monetary_value_col="transaction_amt" )
summaryW.columns = list( map(lambda x: "summaryW_"+str(x), summaryW.columns ) )
summaryW

# similar API to scikit-learn and lifelines.
bgf = BetaGeoFitter(penalizer_coef=0.00001)
bgf.fit(summaryW['summaryW_frequency'], summaryW['summaryW_recency'], summaryW['summaryW_T'])
print(bgf)

t = 1
summaryW['predicted_purchases1W'] = bgf.conditional_expected_number_of_purchases_up_to_time(t, 
                                                                                         summaryW['summaryW_frequency'], 
                                                                                         summaryW['summaryW_recency'], 
                                                                                         summaryW['summaryW_T']
                                                                                        )
t = 4
summaryW['predicted_purchases30W'] = bgf.conditional_expected_number_of_purchases_up_to_time(t, 
                                                                                         summaryW['summaryW_frequency'], 
                                                                                         summaryW['summaryW_recency'], 
                                                                                         summaryW['summaryW_T']
                                                                                        )
t = 13
summaryW['predicted_purchases91W'] = bgf.conditional_expected_number_of_purchases_up_to_time(t, 
                                                                                         summaryW['summaryW_frequency'], 
                                                                                         summaryW['summaryW_recency'], 
                                                                                         summaryW['summaryW_T']
                                                                                        )
t = 14
summaryW['predicted_purchases101W'] = bgf.conditional_expected_number_of_purchases_up_to_time(t, 
                                                                                         summaryW['summaryW_frequency'], 
                                                                                         summaryW['summaryW_recency'], 
                                                                                         summaryW['summaryW_T']
                                                                                        )
summaryW
#summaryh = summary_data_from_transaction_data(transactions, 'user_id', 'transaction_dttm', freq="h", monetary_value_col="transaction_amt" )
#summaryh.columns = list( map(lambda x: "summaryh_"+str(x), summaryh.columns ) )
#summaryh

#summarym = summary_data_from_transaction_data(transactions, 'user_id', 'transaction_dttm', freq="m", monetary_value_col="transaction_amt" )
#summarym.columns = list( map(lambda x: "summarym_"+str(x), summarym.columns ) )
#summarym

NameError: name 'summary_data_from_transaction_data' is not defined

# Итоговая сборка

In [38]:
df = (clients
                # .join(count_trans, on="user_id", how="left"
                #).join(sum_trans, on="user_id", how="left"
                #).join(sum_trans_p, on="user_id", how="left"
                #).join(sum_trans_m, on="user_id", how="left"
                #).join(temp_tran, on="user_id", how="left"
                #).join(max_day_trans, on="user_id", how="left"
                #).join(min_day_trans, on="user_id", how="left"
    
                #).join(msump, on="user_id", how="left"
                ).join(msumm, on="user_id", how="left"
                #).join(msum, on="user_id", how="left"
                #).join(spc, on="user_id", how="left"
                #).join(smc, on="user_id", how="left"
    
                #).join(spdc, on="user_id", how="left"
                #).join(smdc, on="user_id", how="left"
                #).join(spds, on="user_id", how="left"
                #).join(smds, on="user_id", how="left"
                #).join(mcc_trans, on="user_id", how="left"
                ).join(mcc_transc, on="user_id", how="left"
                #).join(mcc_transmc, on="user_id", how="left"
                ).join(date_trans, on="user_id", how="left"
                #).join(summary, on="user_id", how="left"
                #).join(summaryW, on="user_id", how="left"
                #).join(summaryh, on="user_id", how="left"
                #).join(summarym, on="user_id", how="left"
                ).join(wsum, on="user_id", how="left"
                ).join(dsum, on="user_id", how="left"
                       
                ).join(train, on="user_id", how="left"
                )
     )
df['time'] = df['time'].fillna(-1)
df['time'] = df['time'].astype(np.int32)
df['target'] = df['target'].fillna(-1)
df['target'] = df['target'].astype(np.int8)

#df['employee_count_nm'] = df['employee_count_nm'].astype("str")

df

Unnamed: 0_level_0,report,employee_count_nm,bankemplstatus,customer_age,report_dt,count_trans_transaction_amt_sum,count_trans_transaction_amt_max,count_trans_transaction_amt_min,count_trans_transaction_amt_count,count_trans_mcc_code_nunique,...,"dsum('count', 277.0)","dsum('count', 278.0)","dsum('count', 279.0)","dsum('count', 280.0)","dsum('count', 281.0)","dsum('count', 282.0)","dsum('count', 283.0)","dsum('count', 284.0)",target,time
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,2,4.0,0,3,2022-08-31 03:00:00,13706.416641,104011.960938,-153866.890625,11,4,...,0,0,0,0,0,0,0,0,0,77
9,1,5.0,0,3,2022-07-31 03:00:00,-323434.666813,-45.579891,-90147.617188,90,22,...,1,1,1,1,1,1,0,1,-1,-1
13,6,5.0,0,2,2022-12-31 03:00:00,-124717.379150,70322.828125,-58740.300781,22,4,...,0,0,0,0,0,0,1,0,0,86
37,5,5.0,0,2,2022-11-30 03:00:00,-331859.599463,5487.140625,-35782.984375,315,28,...,1,0,2,0,2,2,1,2,0,89
41,1,4.0,0,2,2022-07-31 03:00:00,-108586.614166,-290.766998,-16841.208984,16,5,...,0,0,0,0,0,0,0,0,0,57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562043,12,,0,2,2023-06-30 03:00:00,-29581.256115,-40.769005,-13020.519531,37,12,...,0,0,0,0,0,0,0,0,0,75
562205,12,,0,1,2023-06-30 03:00:00,-40491.908630,1595.461060,-6220.171387,151,15,...,1,1,0,3,1,0,0,0,-1,-1
562312,12,,0,0,2023-06-30 03:00:00,-18537.821270,-28.292030,-1372.377075,56,7,...,1,0,0,1,2,0,0,0,0,91
562721,12,,0,2,2023-06-30 03:00:00,-164004.761685,5412.773926,-18981.269531,85,11,...,0,0,0,0,1,0,0,0,0,29


# просмотр корреляции

In [39]:
%%time
#cor=df.corr().abs().unstack().sort_values(ascending=False).dropna().reset_index()
#cor[cor["level_0"]!=cor["level_1"]].head(60)

CPU times: user 11 µs, sys: 0 ns, total: 11 µs
Wall time: 21 µs


# train_test_split

In [40]:
X = df[df['time']!=-1][df.columns[:-2]].copy()
y = df[df['time']!=-1][['target']].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=33)
X_train

Unnamed: 0_level_0,report,employee_count_nm,bankemplstatus,customer_age,report_dt,count_trans_transaction_amt_sum,count_trans_transaction_amt_max,count_trans_transaction_amt_min,count_trans_transaction_amt_count,count_trans_mcc_code_nunique,...,"dsum('count', 275.0)","dsum('count', 276.0)","dsum('count', 277.0)","dsum('count', 278.0)","dsum('count', 279.0)","dsum('count', 280.0)","dsum('count', 281.0)","dsum('count', 282.0)","dsum('count', 283.0)","dsum('count', 284.0)"
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
159794,9,4.0,0,0,2023-03-31 03:00:00,-336541.865234,6412.399902,-96322.601562,19,3,...,0,0,1,0,0,0,0,0,0,0
285771,7,1.0,0,3,2023-01-31 03:00:00,238796.985224,113105.960938,-26977.500000,182,17,...,1,4,1,1,0,0,0,0,0,0
300185,12,3.0,0,1,2023-06-30 03:00:00,-349157.463165,-213.163147,-56259.628906,48,3,...,0,0,0,0,0,1,0,1,0,0
164093,7,5.0,0,0,2023-01-31 03:00:00,-197241.848011,6340.907715,-27578.789062,384,26,...,3,4,1,6,3,4,1,0,0,0
478970,3,,0,1,2022-09-30 03:00:00,-23757.087471,-27.060711,-2689.582520,89,12,...,0,0,1,1,0,0,0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237286,6,5.0,0,1,2022-12-31 03:00:00,96337.004486,90323.328125,-109855.257812,188,33,...,2,0,0,0,1,0,0,1,3,5
4847,1,5.0,0,3,2022-07-31 03:00:00,-158876.900894,18805.158203,-26947.216797,170,27,...,1,1,2,1,1,1,0,0,1,0
334641,12,,0,3,2023-06-30 03:00:00,-244767.816072,9369.488281,-22261.675781,110,11,...,0,4,1,0,2,1,0,1,1,2
20795,1,5.0,0,3,2022-07-31 03:00:00,3347.806419,50652.371094,-10710.046875,64,11,...,0,0,0,0,0,0,0,0,0,0


# Подбор гиперпараметров catboost (iterations, learning_rate, depth, l2_leaf_reg) по графикам

In [None]:
%%time
cat = CatBoostClassifier(
    iterations=13000,    
    learning_rate= 0.00291, 
    depth=10,
    l2_leaf_reg=3,
    custom_metric=['AUC',"Accuracy","Precision","F1"], 
    eval_metric="AUC", 
    random_seed=63+9+4, 
    task_type="GPU", devices='0:1',  #закомментировать строку если отсутствует или настроен GPU 
)
#cat_features = ["bankemplstatus","customer_age", "employee_count_nm"]
cat_features = ["bankemplstatus","customer_age"]
cat.fit(X_train,y_train, eval_set=(X_test, y_test), cat_features=cat_features, verbose=False, plot=True)
cat.best_score_ #0.7766878604888916 -> 0,7768357456 на LB
#0.7766611576080322
#0.7770650386810303
#0.7769526243209839
#0.7775381803512573
#0.7740592
#0.7736922
#0.774281621
#0.7746870219707489
#0.7716798
#0.7775722 d9
#0.77497756 d10
#0.7768340110778809 d8
#0.77380365 d11
#0.7766144871711731 d7
#0.7768862545490265 d6 i10000
#0.7769359052 d6 i14000 l1
#0.7769371569156647 d6 i14000 l2
#0.7773089706897736 d6 i14000 l3
#0.777213454246521 d6 i14000 l4

#0.775470376 d6 i12000 lr0.00691
#0.7744327783584595 d5 i12000 lr0.00691
#0.7745981812477112 d5 i16000 lr0.00691
#0.7747015655040741 d5 i16000 lr0.00691 +currency_rk min max
#0.7756061554 d7 i16000 lr0.00691 
#0.7780717 d7 i16000 lr0.00691 +mcc_transc
#0.7783268 d7 i16000 lr0.00691 +msum 
#0.778216213 d7 i16000 lr0.00691 +temp_tran
#0.778244555 d7 i16000 lr0.00691 -temp_tran
#0.7782809734344482 d8 i12000 lr0.00691 
#0.7784516215324402 d9 i12000 lr0.00691 
#0.77721777 d10 i10000 lr0.00691 
#0.77579286 d11 i10000 lr0.00691 
#0.7787954807281494 d9 i10000 lr0.00691 
#0.7784642279148102 d9 i10000 lr0.00691 +summaryW
#0.7770140171051025 d9 i10000 lr0.00691 -summary
#0.7785941958427429 d9 i10000 lr0.00691 +summary -summaryW
#0.7770629525184631 d9 i10000 lr0.00691 +summarym
#0.778128445148468 d9 i10000 lr0.00691 +summaryh -summarym
#0.7776791155338287 d9 i10000 lr0.00691 +summaryh +summarym
#0.7782390117645264 d9 i10000 lr0.00691 -summaryh -summarym
#0.7782014906406403 d9 i10000 lr0.00491
#0.7778907120227814 d9 i10000 lr0.00391
#0.77840855717659 d9 i10000 lr0.00791
#0.7783661484718323 d9 i8000 lr0.00891
#0.7784208 d9 i8000 lr0.00991
#0.7784562110900879 d9 i5000 lr0.01091
#0.7770333886146545 d9 i5000 lr0.01291
#0.7786065340042114 d9 i5000 lr0.01191
#0.7777549624443054 d9 i5000 lr0.01391
#0.7783006727695465 d9 i5000 lr0.01491
#0.7764320672 d10 i4000 lr0.01491
#0.7767527997493744 d8 i4000 lr0.01491 l2_leaf_reg=3
#0.7785560488700867 d9 i4000 lr0.01491 l2_leaf_reg=2.5
#0.7775040864944458 d9 i4000 lr0.01491 l2_leaf_reg=2
#0.7781819105148315 d9 i4000 lr0.01491 l2_leaf_reg=3.5
#0.7781339883804321 d9 i4000 lr0.01191 l2_leaf_reg=2.5
#0.7767664194107056 d9 i4000 lr0.01291 l2_leaf_reg=2.5
#0.777866542339325 d9 i4000 lr0.01191 l2_leaf_reg=3

#0.7776371836662292
#0.7785174250602722 i4000 lr0.011291 d10 l2_leaf_reg3


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


In [34]:
predict = cat.predict_proba(df[df['time']==-1][df.columns[:-2]])[:,1]
submit = df[df['time']==-1].reset_index()[['user_id']].copy()
submit['predict'] = predict
submit.to_csv(f'submission0014_{cat.best_score_["validation"]["AUC"]}.csv',index=False) 
submit

Unnamed: 0,user_id,predict
0,9,0.022137
1,61,0.020554
2,62,0.038717
3,80,0.006026
4,88,0.121730
...,...,...
31995,561362,0.052726
31996,561419,0.057142
31997,561895,0.046536
31998,561908,0.058459


In [35]:
#Значимость признаков
cat.get_feature_importance( prettified=True).head(60), cat.get_feature_importance( prettified=True).iloc[60:120], cat.get_feature_importance( prettified=True).iloc[-480:-420], cat.get_feature_importance( prettified=True).iloc[-420:-360], cat.get_feature_importance( prettified=True).iloc[-360:-300], cat.get_feature_importance( prettified=True).iloc[-300:-240], cat.get_feature_importance( prettified=True).iloc[-240:-180], cat.get_feature_importance( prettified=True).iloc[-180:-120], cat.get_feature_importance( prettified=True).iloc[-120:-60], cat.get_feature_importance( prettified=True).tail(60)

(                           Feature Id  Importances
 0                   employee_count_nm     5.146794
 1                        customer_age     2.961307
 2                  count_trans_ss_sum     1.751914
 3                              msumm3     1.542449
 4                   count_trans_dates     1.540593
 5           count_trans_diff_days_min     1.460714
 6     count_trans_transaction_amt_max     1.283239
 7        count_trans_mcc_code_nunique     1.240690
 8     count_trans_transaction_amt_min     1.214834
 9                               mccc0     1.193723
 10                             msumm4     1.193145
 11           date_('interval', 'max')     1.097067
 12                             report     1.094730
 13                              mccc3     1.071651
 14                  wsum('sum', 15.0)     0.978252
 15                          report_dt     0.957587
 16    count_trans_transaction_amt_sum     0.879130
 17                               mcc3     0.823970
 18         

# Расчет по выбранным гиперпараметрам на полных данных

In [36]:
X_pred = df[df['time']==-1][df.columns[:-2]].copy()
for i in range(20):
    cat = CatBoostClassifier(
        iterations=6000,    
        learning_rate= 0.006291, 
        depth=10,
        l2_leaf_reg=3,
        custom_metric=['AUC',"Accuracy","Precision","F1"], 
        eval_metric="AUC", 
        random_seed=63+9+i, 
        task_type="GPU", devices='0:1'  #закомментировать строку если отсутствует или настроен GPU 
    )
    cat_features = ["bankemplstatus","customer_age"]
    cat.fit(X,y, eval_set=(X_test, y_test), cat_features=cat_features, verbose=False, plot=True)
    print(i, cat.best_score_)
    
    predict = cat.predict_proba(X_pred)[:,1]
    submit = df[df['time']==-1].reset_index()[['user_id']].copy()
    submit['predict'] = predict
    submit.to_csv(f'submission0014_{i}.csv',index=False)    

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


KeyboardInterrupt: 