In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

## Targets

In [2]:
%%time

import bisect
import numpy as np

df_public = pq.read_table('data/public_train.pqt').to_pandas().sort_values(by='user_id')

def age_bucket(x):
    return bisect.bisect_left([18,25,35,45,55,65], x)

y_age = df_public['age']
y_age = np.array(list(map(age_bucket, y_age)))
y_gender = np.array(df_public['is_male'])

CPU times: user 169 ms, sys: 40.4 ms, total: 209 ms
Wall time: 239 ms


## Add features

In [3]:
%%time
X_embeddings0 = pd.read_csv('./data/coles_finetuned_split_0.csv')
X_embeddings1 = pd.read_csv('./data/coles_finetuned_split_1.csv')
X_embeddings2 = pd.read_csv('./data/coles_finetuned_split_2.csv')
X_embeddings3 = pd.read_csv('./data/coles_finetuned_split_3.csv')
X_embeddings4 = pd.read_csv('./data/coles_finetuned_split_4.csv')

X_embeddings0.columns = list(map(lambda x: x + "_0" if x.startswith('embed') else x, X_embeddings0.columns))
X_embeddings1.columns = list(map(lambda x: x + "_1" if x.startswith('embed') else x, X_embeddings1.columns))
X_embeddings2.columns = list(map(lambda x: x + "_2" if x.startswith('embed') else x, X_embeddings2.columns))
X_embeddings3.columns = list(map(lambda x: x + "_3" if x.startswith('embed') else x, X_embeddings3.columns))
X_embeddings4.columns = list(map(lambda x: x + "_4" if x.startswith('embed') else x, X_embeddings4.columns))


X_aggregates = pd.read_csv('./data/aggregates_pivot.csv')
X_factors = pd.read_csv('./data/user_factors_128.csv')
X_mlm = pd.read_csv('./data/mlm_512.csv')

# X_sup = pd.read_csv('./data/coles_sup.csv')
# X_sup.columns = list(map(lambda x: x + "_sup" if x.startswith('embed') else x, X_sup.columns))
X_tab = pd.read_csv('./data/tabformer.csv')


X = df_public
X = X.merge(X_embeddings0, on="user_id", how='inner')
X = X.merge(X_embeddings1, on="user_id", how='inner')
X = X.merge(X_embeddings2, on="user_id", how='inner')
X = X.merge(X_embeddings3, on="user_id", how='inner')
X = X.merge(X_embeddings4, on="user_id", how='inner')

#X = X.merge(X_sup, on="user_id", how='inner')
X = X.merge(X_tab, on="user_id", how='inner')


X = X.merge(X_mlm, on="user_id", how='inner')
X = X.merge(X_factors, on="user_id", how='left').fillna(0)
X = X.merge(X_aggregates, on="user_id", how='inner')

del X['user_id'], X['age'], X['is_male']

CPU times: user 8min 55s, sys: 1min 58s, total: 10min 54s
Wall time: 11min 26s


In [4]:
X.head()

Unnamed: 0,embed_0_0,embed_1_0,embed_2_0,embed_3_0,embed_4_0,embed_5_0,embed_6_0,embed_7_0,embed_8_0,embed_9_0,...,Чукотский АО_y,Ямало-Ненецкий АО_y,Ярославская область_y,phablet,plain,smartphone,tablet,Android,Apple iOS,iOS
0,-0.122672,-0.999904,-0.119468,-0.871612,-0.007279,-0.462623,-0.150636,-0.105602,-0.602177,-0.498653,...,0.0,0.0,0.0,0.0,0.0,193.0,0.0,193.0,0.0,0.0
1,0.092888,-0.997484,-0.07478,-0.995615,0.005366,-0.857133,-0.35626,-0.167777,-0.777232,-0.622301,...,0.0,0.0,0.0,0.0,0.0,1047.0,0.0,1047.0,0.0,0.0
2,-0.1627,-0.963253,-0.240695,-0.999872,0.00932,-0.423705,-0.79975,-0.159654,-0.804487,0.277693,...,0.0,0.0,0.0,0.0,0.0,411.0,0.0,411.0,0.0,0.0
3,-0.077096,-0.591143,-0.364717,-0.931921,-0.007163,-0.722793,-0.16189,-0.1515,-0.886268,-0.43794,...,0.0,0.0,0.0,0.0,0.0,275.0,0.0,275.0,0.0,0.0
4,-0.032359,-0.996381,0.147097,-0.998264,0.002351,-0.112643,-0.819566,-0.249439,-0.7735,-0.77286,...,0.0,0.0,0.0,0.0,0.0,777.0,0.0,777.0,0.0,0.0


## Gender

In [22]:
cat_features =  ['region_name', 'city_name', 'cpe_manufacturer_name', 'cpe_model_name', 'cpe_type_cd', 'cpe_model_os_type', 'part_of_day', 'price']

In [23]:
# %%time

from catboost import CatBoostClassifier, metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
not_na_gender = (y_gender != 'NA') & (y_gender != None)
x_train, x_test_gender, y_train, y_test_gender = train_test_split(X[not_na_gender], y_gender[not_na_gender], test_size = 0.1, random_state = 0)

clf_gender = CatBoostClassifier(
    iterations=5000,
    custom_metric=[metrics.AUC()],
    use_best_model=True,
    random_seed=42,
    task_type="GPU", 
    devices='0:1')

In [24]:
clf_gender.fit(x_train, y_train, metric_period=100, early_stopping_rounds=100, eval_set=(x_test_gender, y_test_gender), cat_features=cat_features)

Learning rate set to 0.02297
0:	learn: 0.6790017	test: 0.6795982	best: 0.6795982 (0)	total: 67.9ms	remaining: 5m 39s
100:	learn: 0.4140102	test: 0.4279079	best: 0.4279079 (100)	total: 6.15s	remaining: 4m 58s
200:	learn: 0.4064506	test: 0.4220849	best: 0.4220849 (200)	total: 12.3s	remaining: 4m 52s
300:	learn: 0.4039787	test: 0.4208190	best: 0.4208190 (300)	total: 18.2s	remaining: 4m 43s
400:	learn: 0.4020856	test: 0.4201670	best: 0.4201670 (400)	total: 23.9s	remaining: 4m 34s
500:	learn: 0.4004906	test: 0.4195583	best: 0.4195385 (497)	total: 29.6s	remaining: 4m 25s
600:	learn: 0.3989273	test: 0.4192272	best: 0.4192272 (600)	total: 35.1s	remaining: 4m 16s
700:	learn: 0.3973750	test: 0.4189225	best: 0.4189225 (700)	total: 40.6s	remaining: 4m 8s
800:	learn: 0.3959797	test: 0.4187508	best: 0.4187149 (787)	total: 46s	remaining: 4m 1s
900:	learn: 0.3946265	test: 0.4185835	best: 0.4185620 (886)	total: 51.7s	remaining: 3m 55s
1000:	learn: 0.3933784	test: 0.4184308	best: 0.4184308 (1000)	total:

<catboost.core.CatBoostClassifier at 0x7fd58690a910>

In [25]:
print(f'GINI по полу {2 * roc_auc_score(y_test_gender, clf_gender.predict_proba(x_test_gender)[:,1]) - 1:2.3f}')

GINI по полу 0.781


# Age

In [29]:
%%time

from sklearn.metrics import classification_report

not_na_age = ~np.isnan(y_age)
x_train, x_test_age, y_train, y_test_age = train_test_split(X[not_na_age], y_age[not_na_age], test_size = 0.1, random_state = 0)

clf_age = CatBoostClassifier(iterations=5000,
    custom_metric=[metrics.Accuracy()],
    use_best_model=True,
    random_seed=42,
    task_type="GPU", 
    devices='0:1')

CPU times: user 18.8 s, sys: 22.7 s, total: 41.5 s
Wall time: 41.5 s


In [30]:
clf_age.fit(x_train, y_train, metric_period=100, eval_set=(x_test_age, y_test_age), early_stopping_rounds=100, cat_features=cat_features)

Learning rate set to 0.090642
0:	learn: 1.8340702	test: 1.8354721	best: 1.8354721 (0)	total: 240ms	remaining: 19m 59s
100:	learn: 1.1932220	test: 1.2140742	best: 1.2140742 (100)	total: 18.2s	remaining: 14m 43s
200:	learn: 1.1734614	test: 1.2051717	best: 1.2051717 (200)	total: 35.8s	remaining: 14m 13s
300:	learn: 1.1595028	test: 1.2015165	best: 1.2015165 (300)	total: 52.7s	remaining: 13m 43s
400:	learn: 1.1476223	test: 1.1990616	best: 1.1990616 (400)	total: 1m 9s	remaining: 13m 16s
500:	learn: 1.1370048	test: 1.1976479	best: 1.1976479 (500)	total: 1m 25s	remaining: 12m 50s
600:	learn: 1.1270815	test: 1.1965501	best: 1.1965501 (600)	total: 1m 42s	remaining: 12m 28s
700:	learn: 1.1175003	test: 1.1957686	best: 1.1957686 (700)	total: 1m 58s	remaining: 12m 7s
800:	learn: 1.1080121	test: 1.1952501	best: 1.1952091 (788)	total: 2m 14s	remaining: 11m 47s
900:	learn: 1.0995073	test: 1.1944556	best: 1.1944381 (899)	total: 2m 30s	remaining: 11m 26s
1000:	learn: 1.0908983	test: 1.1940210	best: 1.193

<catboost.core.CatBoostClassifier at 0x7fd5a24d1190>

In [31]:
print(classification_report(y_test_age, clf_age.predict(x_test_age), \
                             target_names = ['<18','18-25','25-34', '35-44', '45-54', '55-65', '65+']))

              precision    recall  f1-score   support

         <18       0.00      0.00      0.00       113
       18-25       0.56      0.46      0.50      3300
       25-34       0.55      0.62      0.58      8690
       35-44       0.45      0.55      0.50      7748
       45-54       0.41      0.32      0.36      4275
       55-65       0.45      0.32      0.38      2343
         65+       0.43      0.04      0.08       531

    accuracy                           0.49     27000
   macro avg       0.41      0.33      0.34     27000
weighted avg       0.49      0.49      0.48     27000



In [32]:
0.784 + 2*0.48

1.744

# Train on full dataset

In [13]:
%%time
clf_gender = CatBoostClassifier(iterations=3000,
                                random_seed=3, 
                                task_type="GPU", 
                                devices='0:1')
not_na_gender = (y_gender != 'NA') & (y_gender != None)
clf_gender.fit(X[not_na_gender], y_gender[not_na_gender], metric_period=100, cat_features=cat_features)

clf_age = CatBoostClassifier(iterations=3000, 
                             random_seed=3, 
                             task_type="GPU", 
                             devices='0:1')
not_na_age = ~np.isnan(y_age)
clf_age.fit(X[not_na_age], y_age[not_na_age], metric_period=100, cat_features=cat_features)

Learning rate set to 0.025482
0:	learn: 0.6781661	total: 814ms	remaining: 1h 7m 48s
200:	learn: 0.4080304	total: 2m 6s	remaining: 50m 16s
300:	learn: 0.4053764	total: 3m 5s	remaining: 48m 15s
400:	learn: 0.4033105	total: 4m 5s	remaining: 46m 58s
500:	learn: 0.4012767	total: 5m 3s	remaining: 45m 23s
600:	learn: 0.3987184	total: 6m	remaining: 43m 57s
700:	learn: 0.3964618	total: 6m 56s	remaining: 42m 31s
800:	learn: 0.3943319	total: 7m 52s	remaining: 41m 15s
900:	learn: 0.3924352	total: 8m 44s	remaining: 39m 45s
1000:	learn: 0.3906321	total: 9m 35s	remaining: 38m 20s
1100:	learn: 0.3889595	total: 10m 26s	remaining: 36m 59s
1200:	learn: 0.3873098	total: 11m 17s	remaining: 35m 43s
1300:	learn: 0.3856382	total: 12m 8s	remaining: 34m 31s
1400:	learn: 0.3841834	total: 12m 57s	remaining: 33m 18s
1500:	learn: 0.3825928	total: 13m 49s	remaining: 32m 12s
1600:	learn: 0.3810565	total: 14m 38s	remaining: 31m 4s
1700:	learn: 0.3795614	total: 15m 28s	remaining: 30m 1s
1800:	learn: 0.3780708	total: 16

<catboost.core.CatBoostClassifier at 0x7fd66fb99160>

# Score submit 

In [14]:
%%time

df_submit = pq.read_table('data/submit_2.pqt').to_pandas().sort_values(by='user_id')

X_submit = df_submit

X_submit = X_submit.merge(X_embeddings0, on="user_id", how='left')
X_submit = X_submit.merge(X_embeddings1, on="user_id", how='left')
X_submit = X_submit.merge(X_embeddings2, on="user_id", how='left')
X_submit = X_submit.merge(X_embeddings3, on="user_id", how='left')
X_submit = X_submit.merge(X_embeddings4, on="user_id", how='left')

# X_submit = X_submit.merge(X_sup, on="user_id", how='left')
X_submit = X_submit.merge(X_tab, on="user_id", how='left')

X_submit = X_submit.merge(X_mlm, on="user_id", how='left')
X_submit = X_submit.merge(X_factors, on="user_id", how='left').fillna(0)
X_submit = X_submit.merge(X_aggregates, on="user_id", how='inner')

CPU times: user 32.5 s, sys: 40 s, total: 1min 12s
Wall time: 1min 12s


In [15]:
list(X_submit['user_id']) == list(df_submit['user_id'])

True

In [16]:
del X_submit['user_id']

In [33]:
df_submit['age'] = clf_age.predict(X_submit)
df_submit['is_male'] = clf_gender.predict_proba(X_submit)[:, 1]
df_submit.head()

Unnamed: 0,user_id,age,is_male
0,6,2,0.067422
7,7,3,0.949316
9,9,2,0.058905
10,10,3,0.02865
4,11,5,0.890009


In [34]:
df_submit.to_csv(f'data/submissions/submission_pivot2.csv', index = False)

In [21]:
df_submit.head()

Unnamed: 0,user_id,age,is_male
0,6,2,0.0699
7,7,2,0.936878
9,9,2,0.062676
10,10,3,0.035354
4,11,5,0.893796


In [26]:
df_submit.loc[df_submit.age==0, 'age'] = 1

In [27]:
df_submit = pd.read_csv(f'data/submissions/submission_split3.csv')

In [30]:
df_submit.loc[df_submit.age==0, 'age']

Series([], Name: age, dtype: int64)

## Blend all submissions

In [131]:
!ls data/submissions/all

submission.csv		   submission_cat4.csv	     submission_pivot.csv
submission_3_layers.csv    submission_cat5.csv	     submission_pivot2.csv
submission_blnd.csv	   submission_cat7.csv	     submission_pivot3.csv
submission_blnd1.csv	   submission_cat_suf_1.csv  submission_split1.csv
submission_blnd2.csv	   submission_cat_suf_3.csv  submission_split2.csv
submission_blnd3.csv	   submission_catboost.csv   submission_split3.csv
submission_blnd4.csv	   submission_danet1.csv     submission_total.csv
submission_blnd_final.csv  submission_danet2.csv     submission_xlarge.csv
submission_cat.csv	   submission_ft.csv	     submission_xlarge_1.csv
submission_cat1.csv	   submission_ft2.csv	     submission_xlarge_2.csv
submission_cat2.csv	   submission_ft3.csv
submission_cat3.csv	   submission_large.csv


In [132]:
import glob
submissions = glob.glob("data/submissions/all/*") 

In [133]:
from tqdm import tqdm

ages = []
genders = []

df_ids = pd.read_csv(f'data/submissions/submission.csv').sort_values(by='user_id')

for sub in tqdm(submissions):
    df = pd.read_csv(f'{sub}')
    assert(all(df['user_id'].values == df_ids['user_id'].values ))
    ages.append(df['age'].values)
    genders.append(df['is_male'].values)

100%|███████████████████████████████████████████████████████████████████████████████| 34/34 [00:01<00:00, 28.86it/s]


In [134]:
np.stack(ages).T.shape

(144724, 34)

In [135]:
ages_amax = np.apply_along_axis(lambda x: np.argmax(np.bincount(x)), axis=1, arr=np.stack(ages).T)

In [136]:
gender_avg = np.apply_along_axis(lambda x: np.mean(x), axis=1, arr=np.stack(genders).T)

In [137]:
df_ids['age'] = ages_amax
df_ids['is_male'] = gender_avg

In [138]:
df_ids.to_csv(f'data/submissions/submission_final_max.csv', index = False)