In [35]:
import pandas as pd
import numpy as np
import sklearn as sk

In [36]:
# Load the raw training transactions (one row per transaction) and show the
# frame as the cell output.
x_raw = pd.read_csv("data/transactions_train.csv")
x_raw

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341
...,...,...,...,...
26450572,43300,727,25,7.602
26450573,43300,727,15,3.709
26450574,43300,727,1,6.448
26450575,43300,727,11,24.669


In [37]:
# Load the training targets; the displayed frame has a `client_id` column and
# a `bins` label column.
y_raw = pd.read_csv("data/train_target.csv")
y_raw

Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3
...,...,...
29995,14303,1
29996,22301,2
29997,25731,0
29998,16820,3


In [38]:
from functools import reduce

def preprocess_x(x, target):
    """Build one row of per-client transaction features for every row of ``target``.

    Parameters
    ----------
    x : pandas.DataFrame
        Transaction log containing at least the columns ``client_id``,
        ``small_group`` and ``amount_rur``.
    target : pandas.DataFrame
        Frame with a ``client_id`` column that defines which clients (and in
        which order) appear in the result. A ``bins`` column, if present, is
        ignored.

    Returns
    -------
    pandas.DataFrame
        Exactly one row per row of ``target``, in ``target`` order, with
        ``client_id`` and ``bins`` removed. Columns, in order:

        * ``amount_rur`` -- number of transactions of the client (the name is
          inherited from the counted column and kept for backward
          compatibility),
        * ``min``, ``max``, ``mean``, ``std``, ``median`` of ``amount_rur``,
        * ``small_group_<id>`` -- number of transactions of the client in each
          ``small_group`` value that occurs in ``x``.

    Notes
    -----
    Features are LEFT-joined onto ``target``. Callers pair the returned rows
    with labels/ids by position, so a client without any transaction must not
    be silently dropped: such a client gets 0 for every count column and NaN
    for the amount statistics. When every client has transactions the result
    is identical to an inner join.

    Progress messages are printed to stdout after each stage.
    """
    # A single groupby object serves both the count and the statistics.
    amounts = x.groupby('client_id')['amount_rur']
    counts = amounts.count().reset_index()
    print("transactions counted")

    agg_features = amounts.agg(["min", "max", "mean", "std", "median"]).reset_index()
    # Column order after the merge: client_id, amount_rur (count), then stats.
    agg_features = pd.merge(counts, agg_features, on="client_id")
    print("aggregate features collected")

    # Wide table: rows are clients, columns are small_group ids, values are
    # transaction counts; client/group pairs that never occur become 0.
    groups = x.groupby(['client_id', 'small_group'])['amount_rur'].count()
    groups = groups.reset_index().pivot(index='client_id', columns='small_group', values='amount_rur').fillna(0)
    groups.columns = [f"small_group_{i}" for i in groups.columns]
    group_columns = list(groups.columns)
    # Expose client_id as a regular column so every join below is column-on-column.
    groups = groups.reset_index()
    print("groups found")

    to_merge = [agg_features, groups]
    print("dataframe list completed")

    # Start from the target's client list exactly once; joining the target in
    # repeatedly would multiply rows for any duplicated client_id.
    features = target.drop(columns="bins", errors="ignore")
    for block in to_merge:
        features = features.merge(block, on="client_id", how="left")

    # Clients absent from `x` have no transactions: their counts are 0, while
    # the amount statistics are genuinely undefined and stay NaN.
    count_columns = ["amount_rur"] + group_columns
    features = features.fillna({name: 0 for name in count_columns})
    features = features.drop("client_id", axis=1)
    print("data ready")

    return features

# Build the training feature matrix from the raw transactions; `y_raw`
# supplies the list of clients (its `bins` column is dropped inside
# preprocess_x).
x = preprocess_x(x_raw, y_raw)
x

transactions counted
aggregate features collected
groups found
dataframe list completed
data ready


Unnamed: 0,amount_rur,min,max,mean,std,median,small_group_0,small_group_1,small_group_2,small_group_3,...,small_group_192,small_group_193,small_group_195,small_group_196,small_group_197,small_group_198,small_group_199,small_group_200,small_group_202,small_group_203
0,870,0.074,1227.314,34.774725,72.037354,21.6995,0.0,174.0,2.0,64.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,818,0.550,1210.506,52.015367,106.540962,20.1500,1.0,187.0,61.0,47.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,782,0.043,782.641,34.325852,59.927450,16.9020,0.0,372.0,0.0,72.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,976,0.043,109.590,16.160990,14.224936,12.0730,0.0,359.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,784,0.432,541.165,15.929050,35.473591,8.7065,0.0,378.0,0.0,150.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,825,1.786,9919.347,345.456274,1009.576868,104.3690,0.0,284.0,26.0,31.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29996,703,0.043,2136.877,49.715962,116.602464,25.6450,2.0,168.0,9.0,64.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29997,1096,0.181,884.970,49.338005,80.490725,22.8215,0.0,266.0,8.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29998,762,0.388,2959.186,42.643615,136.489534,18.3190,2.0,340.0,3.0,79.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# Keep the labels as a one-column DataFrame (double brackets select a frame,
# not a Series).
# NOTE(review): `y` is later paired with `x` purely by row position, which
# assumes preprocess_x returned one row per `y_raw` row in the same order --
# confirm.
y = y_raw[["bins"]]
y

Unnamed: 0,bins
0,2
1,0
2,2
3,1
4,3
...,...
29995,1
29996,2
29997,0
29998,3


In [40]:
# Load the raw test transactions (same columns as the training transactions
# in the displayed output).
x_raw_test = pd.read_csv("data/transactions_test.csv")
x_raw_test

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,46445,3,0,19.555
1,46445,3,1,27.774
2,46445,4,0,18.114
3,46445,4,1,22.183
4,46445,5,2,45.795
...,...,...,...,...
17667323,14864,727,1,21.824
17667324,14864,728,36,84.900
17667325,14864,728,3,2.748
17667326,14864,728,1,47.369


In [41]:
# Load the list of test clients; the displayed frame has only a `client_id`
# column (no labels).
id_test = pd.read_csv("data/test.csv")
id_test

Unnamed: 0,client_id
0,28571
1,27046
2,13240
3,19974
4,10505
...,...
19995,2565
19996,31255
19997,31539
19998,4288


In [42]:
# Build the test features with the same function used for training;
# `id_test` has no `bins` column, which preprocess_x tolerates.
x_test = preprocess_x(x_raw_test, id_test)

transactions counted
aggregate features collected
groups found
dataframe list completed
data ready


In [43]:
# Keep only the features that exist in BOTH matrices, in training-column
# order, so that transformers and the model see identical columns for train
# and test (some small_group ids occur in only one of the two datasets).
common_features = [name for name in x.columns if name in x_test.columns]
x = x.loc[:, common_features]
x_test = x_test.loc[:, common_features]

In [44]:
# Display the training features after restricting them to the columns shared
# with the test set.
x

Unnamed: 0,amount_rur,min,max,mean,std,median,small_group_0,small_group_1,small_group_2,small_group_3,...,small_group_190,small_group_191,small_group_192,small_group_193,small_group_195,small_group_196,small_group_197,small_group_198,small_group_200,small_group_202
0,870,0.074,1227.314,34.774725,72.037354,21.6995,0.0,174.0,2.0,64.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,818,0.550,1210.506,52.015367,106.540962,20.1500,1.0,187.0,61.0,47.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,782,0.043,782.641,34.325852,59.927450,16.9020,0.0,372.0,0.0,72.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,976,0.043,109.590,16.160990,14.224936,12.0730,0.0,359.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,784,0.432,541.165,15.929050,35.473591,8.7065,0.0,378.0,0.0,150.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,825,1.786,9919.347,345.456274,1009.576868,104.3690,0.0,284.0,26.0,31.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29996,703,0.043,2136.877,49.715962,116.602464,25.6450,2.0,168.0,9.0,64.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29997,1096,0.181,884.970,49.338005,80.490725,22.8215,0.0,266.0,8.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29998,762,0.388,2959.186,42.643615,136.489534,18.3190,2.0,340.0,3.0,79.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
# Display the test features after restricting them to the columns shared
# with the training set.
x_test

Unnamed: 0,amount_rur,min,max,mean,std,median,small_group_0,small_group_1,small_group_2,small_group_3,...,small_group_190,small_group_191,small_group_192,small_group_193,small_group_195,small_group_196,small_group_197,small_group_198,small_group_200,small_group_202
0,718,0.078,306.882,42.488974,43.659666,28.0745,0.0,278.0,13.0,35.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1010,0.043,1469.007,38.988135,93.600961,17.0350,9.0,193.0,68.0,34.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,942,1.078,3902.918,53.302683,198.583630,16.4635,0.0,227.0,3.0,165.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,852,0.432,5865.551,53.252924,273.597147,18.8575,42.0,305.0,12.0,70.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1109,0.043,1921.341,81.950972,119.408720,51.6070,0.0,516.0,56.0,162.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,1000,0.051,3202.483,35.397209,139.556590,12.5435,0.0,322.0,25.0,38.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19996,1026,0.129,646.914,16.928943,36.466146,6.6810,1.0,239.0,1.0,115.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19997,1146,0.043,674.429,23.913882,38.592933,14.3180,0.0,345.0,0.0,53.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19998,814,0.229,1007.405,29.628898,66.501257,13.7380,0.0,278.0,28.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
from sklearn.preprocessing import QuantileTransformer
from sklearn.decomposition import PCA

# Feature scaling + decorrelation, fitted on the training features:
#   1. QuantileTransformer maps each column through its empirical quantiles
#      (default output distribution is uniform);
#   2. PCA (all components kept) rotates the result onto principal axes.
# random_state pins any internal subsampling of the quantile estimation so a
# fresh run reproduces the same values (same seed as used for the split/model).
qt = QuantileTransformer(random_state=1337)
pca = PCA()
x_q = qt.fit_transform(x)
# PCA must consume `x_q` from the line above; the name `xq` is never defined
# in this notebook, so using it fails with NameError on a fresh kernel.
x_pca = pca.fit_transform(x_q)
# NOTE: after PCA the columns are principal components; the original feature
# names are reused only as labels and no longer describe the column contents.
# NOTE: this overwrites `x`, so the cell is not idempotent -- re-running it
# without re-running the cells above would transform already-transformed data.
x = pd.DataFrame(x_pca, index=x.index, columns=x.columns)

x

Unnamed: 0,amount_rur,min,max,mean,std,median,small_group_0,small_group_1,small_group_2,small_group_3,...,small_group_190,small_group_191,small_group_192,small_group_193,small_group_195,small_group_196,small_group_197,small_group_198,small_group_200,small_group_202
0,-0.692751,-0.348713,0.507952,-0.125851,-0.482752,-0.772513,-0.559274,-0.670380,-0.149303,-0.289686,...,-0.000420,0.000750,-0.000268,-0.000071,0.000174,0.000353,0.001021,1.238515e-04,-0.000056,0.000123
1,-0.043938,1.026978,0.011085,-0.591965,-0.360262,-0.386394,-0.026186,-0.251784,-0.510719,-0.116335,...,0.000344,-0.000294,-0.000766,-0.000105,0.000818,-0.000139,-0.000243,8.742263e-07,-0.000344,-0.000167
2,-0.692218,-0.567780,0.658670,0.124920,0.101400,-0.496257,0.301417,0.480242,-0.582623,-0.396388,...,0.000683,0.000500,-0.000073,-0.000517,0.000331,0.000153,0.000176,-6.921305e-05,0.000009,0.000369
3,-1.906614,0.288190,0.470342,-0.825581,0.355124,0.500159,-0.136486,0.114200,-0.026538,0.073139,...,-0.000332,0.000193,0.000638,-0.001093,0.000038,0.000425,-0.000202,-1.842209e-04,-0.000116,0.000092
4,-2.177758,-0.244619,0.031539,-0.556798,0.546788,-0.413245,-0.381144,0.113215,-0.088755,0.067087,...,0.001282,0.000420,-0.000757,0.000200,0.000061,0.000267,0.000066,-4.965710e-04,0.000379,-0.000070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,1.449796,-0.047618,-1.064153,-0.761315,0.525053,-0.919151,0.185089,-0.058149,0.113738,-0.285162,...,0.000655,-0.000533,-0.000889,0.001470,-0.002172,-0.000881,0.000624,2.172715e-04,0.000441,0.000512
29996,0.533823,-0.036677,0.736390,-0.175445,-0.391625,-0.315904,0.152372,0.179715,0.059714,0.273634,...,0.000963,0.001214,0.001331,0.001206,-0.000251,-0.001056,0.000610,7.568111e-04,-0.000815,-0.000343
29997,0.604973,-0.906314,0.430884,-0.191885,0.127356,0.001590,-0.321392,-0.018722,0.203163,-0.704567,...,-0.001394,-0.000808,0.000383,0.000245,-0.000247,0.000225,-0.000095,4.335275e-04,-0.000555,0.000016
29998,-0.357439,-0.945480,-0.226037,-0.022544,-0.198178,0.519456,0.940858,-0.081188,-0.606621,0.150328,...,-0.000260,0.001118,0.000822,0.000456,-0.000517,0.001362,-0.000098,3.737469e-04,0.000058,-0.000362


In [47]:
# Apply the already-fitted transformers to the test features (transform only;
# nothing is refitted on test data).
x_test_q = qt.transform(x_test)
x_test_pca = pca.transform(x_test_q)
# NOTE(review): the columns now hold PCA components; the original feature
# names are kept only as labels.
x_test = pd.DataFrame(x_test_pca, index=x_test.index, columns=x_test.columns)
x_test

Unnamed: 0,amount_rur,min,max,mean,std,median,small_group_0,small_group_1,small_group_2,small_group_3,...,small_group_190,small_group_191,small_group_192,small_group_193,small_group_195,small_group_196,small_group_197,small_group_198,small_group_200,small_group_202
0,-0.323566,-0.952893,-0.341474,0.470961,-0.542213,-0.676000,-0.351316,-0.022591,0.010323,-0.160189,...,0.001097,0.001509,-0.000239,0.000218,-0.000805,0.000245,0.000228,-0.000528,-0.000279,0.000182
1,1.465315,0.235186,0.830920,-0.413691,-1.391389,-0.732697,0.108948,0.577110,0.747819,0.474863,...,-0.000203,-0.000970,-0.002247,-0.000114,-0.000174,-0.002070,-0.001111,-0.000484,-0.000033,0.001098
2,-0.043317,-0.765519,-0.715493,0.040745,0.023031,0.787362,-0.594561,-0.150240,0.286523,-0.591446,...,-0.000094,0.000119,-0.001291,0.001004,-0.000628,-0.000249,-0.000899,-0.000510,0.000476,0.000491
3,0.280984,-0.792924,-0.174370,-0.295852,-0.794862,-0.196353,0.670175,0.270288,-0.035511,0.909812,...,-0.000032,-0.000470,-0.000231,-0.000176,0.000109,0.000305,-0.000723,0.000680,0.000891,0.000168
4,0.753490,0.730819,-1.108887,-0.131872,-0.021237,0.401859,0.725720,0.410197,-0.721629,0.227947,...,-0.001140,-0.000334,0.000898,-0.001168,0.000346,0.000979,-0.000278,0.000386,-0.000076,-0.000198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,1.004759,0.206075,0.229791,-0.170527,-1.126021,0.092039,0.391404,0.478269,-0.315829,0.069720,...,-0.001006,-0.000809,-0.000524,-0.000904,0.000262,0.000112,-0.000737,0.000707,0.000183,0.000867
19996,-0.172251,-1.341429,0.653445,-0.446015,-0.066372,0.524419,0.035239,0.419492,-0.238754,0.215668,...,0.001044,0.000999,0.000617,-0.000104,-0.001474,0.000341,0.001013,-0.006203,-0.000038,0.000601
19997,-1.095629,0.584785,2.017268,0.556660,-0.039211,-0.313786,0.495197,-0.631891,-0.069236,-0.562778,...,0.001955,-0.000932,0.001000,-0.000731,0.000145,0.000010,0.000492,-0.000236,-0.000088,-0.000017
19998,0.548787,0.229574,0.669073,0.577147,0.644641,0.676952,0.295551,-0.209577,-0.751253,-0.034911,...,-0.000907,0.000238,0.000155,-0.001374,0.000051,-0.000674,-0.000006,0.000695,-0.000244,0.000272


In [48]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for validation; the fixed random_state makes the
# split reproducible. The split is not stratified by class.
x_train, x_val, y_train, y_val = train_test_split(x, y, random_state=1337, test_size=0.1)

In [50]:
%%time
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
# CatBoost configuration: many shallow trees with a small learning rate,
# trained on GPU, for a 4-class problem. With use_best_model the final model
# is truncated to the iteration that scored best on the eval set.
parameters = dict(
    iterations=5000,
    learning_rate=0.01,
    depth=3,
    task_type="GPU",
    gpu_ram_part=0.8,
    eval_metric="Accuracy",
    use_best_model=True,
    random_seed=1337,
    classes_count=4,
)
model = CatBoostClassifier(**parameters)
model.fit(x_train, y_train, eval_set=(x_val, y_val))
# Validation accuracy in percent, two decimals (cell output).
# NOTE: the same validation split also selected the best iteration, so this
# figure is an optimistic estimate of performance on unseen data.
round(100 * accuracy_score(y_val, model.predict(x_val)), 2)

0:	learn: 0.4895926	test: 0.4840000	best: 0.4840000 (0)	total: 3.84ms	remaining: 19.2s
1:	learn: 0.4899259	test: 0.4846667	best: 0.4846667 (1)	total: 6.86ms	remaining: 17.1s
2:	learn: 0.4919259	test: 0.4866667	best: 0.4866667 (2)	total: 9.87ms	remaining: 16.4s
3:	learn: 0.4923333	test: 0.4856667	best: 0.4866667 (2)	total: 12.9ms	remaining: 16.1s
4:	learn: 0.4923333	test: 0.4843333	best: 0.4866667 (2)	total: 16ms	remaining: 16s
5:	learn: 0.4830370	test: 0.4770000	best: 0.4866667 (2)	total: 19ms	remaining: 15.8s
6:	learn: 0.4921852	test: 0.4866667	best: 0.4866667 (2)	total: 22ms	remaining: 15.7s
7:	learn: 0.4917778	test: 0.4880000	best: 0.4880000 (7)	total: 25ms	remaining: 15.6s
8:	learn: 0.4965556	test: 0.4913333	best: 0.4913333 (8)	total: 28.1ms	remaining: 15.6s
9:	learn: 0.4973704	test: 0.4936667	best: 0.4936667 (9)	total: 31.1ms	remaining: 15.5s
10:	learn: 0.4942963	test: 0.4903333	best: 0.4936667 (9)	total: 34.1ms	remaining: 15.5s
11:	learn: 0.4964444	test: 0.4936667	best: 0.4936667

59.47