In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm

## Targets

In [2]:
%%time

import bisect
import numpy as np

df_public = pq.read_table('data/public_train.pqt').to_pandas().sort_values(by='user_id')
df_submit = pq.read_table('data/submit_2.pqt').to_pandas().sort_values(by='user_id')

CPU times: user 95.6 ms, sys: 14.6 ms, total: 110 ms
Wall time: 88.9 ms


## Features

In [3]:
%%time
df_emb0 = pd.read_csv('./data/coles_finetuned_split_0.csv')
df_emb1 = pd.read_csv('./data/coles_finetuned_split_1.csv')
df_emb2 = pd.read_csv('./data/coles_finetuned_split_2.csv')
df_emb3 = pd.read_csv('./data/coles_finetuned_split_3.csv')
df_emb4 = pd.read_csv('./data/coles_finetuned_split_4.csv')
df_agg = pd.read_csv('./data/aggregates_pivot.csv')
df_fac = pd.read_csv('./data/user_factors.csv')
df_mlm = pd.read_csv('./data/mlm_512.csv')
df_tab = pd.read_csv('./data/tabformer.csv')
df_sup = pd.read_csv('./data/coles_sup.csv')
df_suf = pd.read_csv('./data/coles_512_shuffle.csv')


data = [df_emb0, df_emb1, df_emb2, df_emb3, df_emb4, df_fac, df_tab, df_sup, df_suf]

CPU times: user 7min 42s, sys: 41.1 s, total: 8min 23s
Wall time: 8min 23s


## Gender blended featues

In [5]:
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_predict

def get_blender_features(df_public, df_submit, df_features, fname):
    y = df_public['age_q'].astype(int) + 7 * df_public['is_male'].astype(int)
    X = df_public.merge(df_features, on="user_id", how='left')
    user_ids = X['user_id']
    
    del X['user_id'], X['age'], X['age_q'], X['is_male']
    
    
    clf_gender = CatBoostClassifier(iterations=3000, random_seed=42, 
                                    metric_period=1000, 
                                    logging_level='Silent',
                                    task_type="GPU", 
                                    devices='0:1'
                                   )
    
    # Cross val predict 
    pred = cross_val_predict(clf_gender, X, y, cv=3, method='predict_proba')
    
    df_train = pd.DataFrame(pred, columns=[f"{fname}_{i}" for i in range(pred.shape[1])])
    df_train['user_id'] = user_ids
    
    # Fit clf
    clf_gender.fit(X, y)
    
    # Predict submit
    X = df_submit.merge(df_features, on="user_id", how='left').fillna(0)
    user_ids = X['user_id']
    del X['user_id']
    
    pred = clf_gender.predict_proba(X)
    df_test = pd.DataFrame(pred, columns=[f"{fname}_{i}" for i in range(pred.shape[1])])
    df_test['user_id'] = user_ids
       
    return df_train, df_test

In [6]:
def age_bucket(x):
    return bisect.bisect_left([18,25,35,45,55,65], x)

# Fill na with most common age and gender
df_public['age_q'] = list(map(age_bucket, df_public['age']))
df_public.loc[(df_public['age_q'] == 'NA') | df_public['age_q'].isnull(), 'age_q']  = 2
df_public.loc[(df_public['is_male'] == 'NA') | df_public['is_male'].isnull(), 'is_male'] = 1 

df_public_blend = df_public.copy()[['user_id']]
df_submit_blend = df_submit.copy()[['user_id']]

df_public_blend = df_public_blend.merge(df_agg, on='user_id', how='left')   
df_submit_blend = df_submit_blend.merge(df_agg, on='user_id', how='left')   

for i, d in tqdm(enumerate(data)):
    df_train, df_test = get_blender_features(df_public, df_submit, d, fname=f"f{i}")
    df_public_blend = df_public_blend.merge(df_train, on='user_id', how='left')
    df_submit_blend = df_submit_blend.merge(df_test, on='user_id', how='left')

9it [1:24:54, 566.05s/it]


In [7]:
df_public_blend.to_csv("./data/public_blend_final.csv", index=False)
df_submit_blend.to_csv("./data/submit_blend_final.csv", index=False)

# Downstream

In [8]:
cat_features = ['region_name', 'city_name', 'cpe_manufacturer_name', 'cpe_model_name', 'cpe_type_cd', 'cpe_model_os_type', 'part_of_day', 'price']

# Gender

In [9]:
df_public = pq.read_table('data/public_train.pqt').to_pandas().sort_values(by='user_id')


X = df_public.merge(df_public_blend, on="user_id", how='left')

del X['user_id'], X['age'], X['is_male']

In [10]:
def age_bucket(x):
    return bisect.bisect_left([18,25,35,45,55,65], x)

y_age = df_public['age']
y_age = np.array(list(map(age_bucket, y_age)))
y_gender = np.array(df_public['is_male'])

In [11]:
# %%time

from catboost import CatBoostClassifier, metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

not_na_gender = (y_gender != 'NA') & (y_gender != None)
x_train, x_test_gender, y_train, y_test_gender = train_test_split(X[not_na_gender], y_gender[not_na_gender], test_size = 0.1, random_state = 0)

clf_gender = CatBoostClassifier(
    iterations=5000,
    custom_metric=[metrics.AUC()],
    use_best_model=True,
    random_seed=42, 
    task_type="GPU", devices='0:1')

In [12]:
clf_gender.fit(x_train, y_train, metric_period=100, early_stopping_rounds=100, eval_set=(x_test_gender, y_test_gender), cat_features=cat_features)

Learning rate set to 0.02297
0:	learn: 0.6787711	test: 0.6789478	best: 0.6789478 (0)	total: 32.2ms	remaining: 2m 40s
100:	learn: 0.4090546	test: 0.4206461	best: 0.4206461 (100)	total: 2.63s	remaining: 2m 7s
200:	learn: 0.4021053	test: 0.4150179	best: 0.4150179 (200)	total: 5.17s	remaining: 2m 3s
300:	learn: 0.4001286	test: 0.4137307	best: 0.4137307 (300)	total: 7.72s	remaining: 2m
400:	learn: 0.3989285	test: 0.4131698	best: 0.4131675 (399)	total: 10.3s	remaining: 1m 58s
500:	learn: 0.3978109	test: 0.4127446	best: 0.4127446 (500)	total: 12.9s	remaining: 1m 55s
600:	learn: 0.3967959	test: 0.4124665	best: 0.4124595 (591)	total: 15.3s	remaining: 1m 52s
700:	learn: 0.3957652	test: 0.4122551	best: 0.4122551 (700)	total: 17.7s	remaining: 1m 48s
800:	learn: 0.3949396	test: 0.4121038	best: 0.4121013 (798)	total: 20s	remaining: 1m 44s
900:	learn: 0.3942261	test: 0.4121566	best: 0.4120743 (823)	total: 22.5s	remaining: 1m 42s
bestTest = 0.4120743144
bestIteration = 823
Shrink model to first 824 it

<catboost.core.CatBoostClassifier at 0x7f026ba02b20>

In [13]:
print(f'GINI по полу {2 * roc_auc_score(y_test_gender, clf_gender.predict_proba(x_test_gender)[:,1]) - 1:2.3f}')

GINI по полу 0.787


# Age

In [14]:
%%time

from sklearn.metrics import classification_report

not_na_age = ~np.isnan(y_age) 
x_train, x_test_age, y_train, y_test_age = train_test_split(X[not_na_age], y_age[not_na_age], test_size = 0.1, random_state = 0)

clf_age = CatBoostClassifier(iterations=5000,
    custom_metric=[metrics.Accuracy()],
    use_best_model=True,
    random_seed=42, task_type="GPU", devices='0:1')

CPU times: user 4.6 s, sys: 3.57 s, total: 8.17 s
Wall time: 7.12 s


In [15]:
clf_age.fit(x_train, y_train, metric_period=100, eval_set=(x_test_age, y_test_age), early_stopping_rounds=100, cat_features=cat_features)

Learning rate set to 0.090642
0:	learn: 1.8256770	test: 1.8267010	best: 1.8267010 (0)	total: 53ms	remaining: 4m 24s
100:	learn: 1.1802840	test: 1.1980619	best: 1.1980619 (100)	total: 3.11s	remaining: 2m 30s
200:	learn: 1.1696044	test: 1.1936234	best: 1.1936234 (200)	total: 5.91s	remaining: 2m 21s
300:	learn: 1.1627221	test: 1.1917731	best: 1.1917731 (300)	total: 8.64s	remaining: 2m 14s
400:	learn: 1.1573185	test: 1.1908105	best: 1.1908105 (400)	total: 11.4s	remaining: 2m 10s
500:	learn: 1.1528642	test: 1.1901746	best: 1.1901746 (500)	total: 14s	remaining: 2m 5s
600:	learn: 1.1489613	test: 1.1897725	best: 1.1897710 (598)	total: 16.5s	remaining: 2m
700:	learn: 1.1453607	test: 1.1893414	best: 1.1893374 (698)	total: 19.1s	remaining: 1m 56s
800:	learn: 1.1420315	test: 1.1890541	best: 1.1890378 (798)	total: 21.5s	remaining: 1m 52s
900:	learn: 1.1388866	test: 1.1889693	best: 1.1889675 (896)	total: 24.1s	remaining: 1m 49s
1000:	learn: 1.1358632	test: 1.1888130	best: 1.1887883 (991)	total: 26.6

<catboost.core.CatBoostClassifier at 0x7f02c00ff700>

In [17]:
print(classification_report(y_test_age, clf_age.predict(x_test_age), \
                            target_names = ['<18','18-25','25-34', '35-44', '45-54', '55-65', '65+']))

              precision    recall  f1-score   support

         <18       0.00      0.00      0.00       113
       18-25       0.57      0.45      0.51      3300
       25-34       0.55      0.63      0.59      8690
       35-44       0.45      0.56      0.50      7748
       45-54       0.42      0.31      0.36      4275
       55-65       0.44      0.33      0.38      2343
         65+       0.62      0.04      0.08       531

    accuracy                           0.50     27000
   macro avg       0.44      0.33      0.34     27000
weighted avg       0.49      0.50      0.49     27000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
0.788 + 2*0.48

# Train on full dataset

In [18]:
%%time
clf_gender = CatBoostClassifier(iterations=3000,random_seed=0, task_type="GPU", devices='0:1')
not_na_gender = (y_gender != 'NA') & (y_gender != None)
clf_gender.fit(X[not_na_gender], y_gender[not_na_gender], metric_period=100, cat_features=cat_features)

Learning rate set to 0.009269
0:	learn: 0.6872385	total: 26.9ms	remaining: 1m 20s
100:	learn: 0.4499512	total: 2.48s	remaining: 1m 11s
200:	learn: 0.4151900	total: 4.81s	remaining: 1m 6s
300:	learn: 0.4071764	total: 7.18s	remaining: 1m 4s
400:	learn: 0.4043465	total: 9.66s	remaining: 1m 2s
500:	learn: 0.4028827	total: 12.2s	remaining: 1m
600:	learn: 0.4019410	total: 14.8s	remaining: 59s
700:	learn: 0.4012853	total: 17.3s	remaining: 56.6s
800:	learn: 0.4007371	total: 19.8s	remaining: 54.2s
900:	learn: 0.4003217	total: 22.2s	remaining: 51.8s
1000:	learn: 0.3999302	total: 24.7s	remaining: 49.4s
1100:	learn: 0.3995715	total: 27.2s	remaining: 46.9s
1200:	learn: 0.3992247	total: 29.7s	remaining: 44.4s
1300:	learn: 0.3988823	total: 32s	remaining: 41.7s
1400:	learn: 0.3984853	total: 34.4s	remaining: 39.2s
1500:	learn: 0.3980644	total: 36.8s	remaining: 36.7s
1600:	learn: 0.3976859	total: 39.1s	remaining: 34.2s
1700:	learn: 0.3973236	total: 41.4s	remaining: 31.6s
1800:	learn: 0.3970120	total: 43

<catboost.core.CatBoostClassifier at 0x7f0281d5cd90>

In [19]:
%%time
clf_age = CatBoostClassifier(iterations=3000, random_seed=0, task_type="GPU", devices='0:1')
not_na_age = ~np.isnan(y_age) & (y_age != 0)
clf_age.fit(X[not_na_age], y_age[not_na_age], metric_period=100, cat_features=cat_features)

Learning rate set to 0.081374
0:	learn: 1.7083706	total: 40.9ms	remaining: 2m 2s
100:	learn: 1.1675246	total: 2.87s	remaining: 1m 22s
200:	learn: 1.1580254	total: 5.46s	remaining: 1m 16s
300:	learn: 1.1521440	total: 8.06s	remaining: 1m 12s
400:	learn: 1.1476413	total: 10.5s	remaining: 1m 7s
500:	learn: 1.1434085	total: 12.9s	remaining: 1m 4s
600:	learn: 1.1398736	total: 15.3s	remaining: 1m
700:	learn: 1.1364936	total: 17.7s	remaining: 57.9s
800:	learn: 1.1336724	total: 20s	remaining: 54.9s
900:	learn: 1.1306032	total: 22.3s	remaining: 52s
1000:	learn: 1.1280392	total: 24.6s	remaining: 49.1s
1100:	learn: 1.1254274	total: 26.8s	remaining: 46.2s
1200:	learn: 1.1230501	total: 29.1s	remaining: 43.6s
1300:	learn: 1.1204887	total: 31.4s	remaining: 41s
1400:	learn: 1.1181876	total: 33.7s	remaining: 38.4s
1500:	learn: 1.1159490	total: 36s	remaining: 36s
1600:	learn: 1.1133869	total: 38.3s	remaining: 33.5s
1700:	learn: 1.1109227	total: 40.6s	remaining: 31s
1800:	learn: 1.1086981	total: 42.9s	rem

<catboost.core.CatBoostClassifier at 0x7f026d606a90>

# Score submit 

In [20]:
%%time

df_submit = pq.read_table('data/submit_2.pqt').to_pandas().sort_values(by='user_id')
X_submit = df_submit.merge(df_submit_blend, on="user_id")

CPU times: user 3.18 s, sys: 2.8 s, total: 5.98 s
Wall time: 4.63 s


In [21]:
list(X_submit['user_id']) == list(df_submit['user_id'])

True

In [22]:
del X_submit['user_id']

In [23]:
df_submit['age'] = clf_age.predict(X_submit)
df_submit['is_male'] = clf_gender.predict_proba(X_submit)[:, 1]
df_submit.head()

Unnamed: 0,user_id,age,is_male
0,6,2,0.084305
7,7,3,0.933996
9,9,2,0.063537
10,10,3,0.031888
4,11,5,0.927905


In [24]:
df_submit.to_csv(f'data/submission_blnd_final.csv', index = False)

In [25]:
df_submit['age'].value_counts()

2    52885
3    51510
4    16992
1    13809
5     9273
6      255
Name: age, dtype: int64

In [68]:
sum(df_public.age < 18)

351