In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

## Handcrafted aggregates

In [2]:
%%time
# Read the whole competition dataset from Parquet and convert it to a single
# in-memory pandas DataFrame (the recorded run below took over a minute of wall time).
df = pq.read_table('data/competition_data_final_pqt').to_pandas()

CPU times: user 1min 48s, sys: 1min 53s, total: 3min 42s
Wall time: 1min 12s


In [3]:
# Preview the first rows to check the column layout of the raw table.
df.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098


In [4]:
# Replace the raw price with a quantile-bin code (0 = lowest bin, up to 10 bins).
# Missing prices are set to 0 first, so they fall into the lowest bin.
# duplicates='drop' keeps qcut from raising when several quantile edges coincide
# (e.g. many rows sharing the filled-in 0); with distinct edges the result is unchanged.
# NOTE: `price` is overwritten in place, so re-running this cell bins the bin codes again.
df['price'] = pd.qcut(df['price'].fillna(0), 10, labels=False, duplicates='drop')

In [None]:
%%time

import numpy as np

def most_frequent(x):
    """Return the most frequent non-null value of a Series.

    Args:
        x: pandas Series (one group's values when used as a groupby aggregator).

    Returns:
        The first entry of ``x.mode()`` (modes come back sorted, so ties resolve
        to the smallest value), or None when `x` has no non-null values.
    """
    modes = x.mode()
    # mode() ignores nulls, so an empty or all-null group gives an empty result;
    # indexing that with [0] would raise instead of yielding a missing value.
    if modes.empty:
        return None
    return modes.iloc[0]

# Copy the device-model column so the same values can feed two aggregations:
# the per-user mode (under 'cpe_model_name') and the distinct count (under 'unique_devices').
df['unique_devices'] = df['cpe_model_name']

names = ['region_name', 'city_name', 'cpe_manufacturer_name', 'cpe_model_name', 'cpe_type_cd', 'cpe_model_os_type', 'part_of_day', 'price']

# Per-user aggregation spec: the mode of every column in `names`, the total request
# count and the number of distinct device models.
# The 'sum' / 'nunique' aliases select pandas' built-in groupby reductions instead of
# passing a NumPy function and a Python lambda through agg.
aggs = {n: most_frequent for n in names}
aggs['request_cnt'] = 'sum'
aggs['unique_devices'] = 'nunique'

df_agg = df.groupby(['user_id']).agg(aggs).reset_index().sort_values(by='user_id')

# Distribution of each user's dominant part of day.
df_agg.part_of_day.value_counts()

In [None]:
# Preview the per-user aggregate table.
df_agg.head()

In [None]:
# How many users have 1, 2, ... distinct device models.
df_agg.unique_devices.value_counts()

## Gender holidays

In [None]:
%%time

from datetime import datetime

def gh(d):
    """Tag a date that falls on one of the two gender holidays.

    Args:
        d: any object with a ``strftime`` method (e.g. ``datetime.date``).

    Returns:
        '8mar' for March 8, '23feb' for February 23, otherwise None.
    """
    tag_by_month_day = {'03-08': '8mar', '02-23': '23feb'}
    # dict.get falls back to None for every other day of the year.
    return tag_by_month_day.get(d.strftime('%m-%d'))

# Tag every event row with its holiday label (None on ordinary days), one value at a time.
df['gh'] = df['date'].map(gh)

In [None]:
# Number of event rows per holiday tag (rows without a tag are not counted).
df['gh'].value_counts()

In [None]:
%%time

# Total request count per user on each gender holiday; a user with activity on only
# one of the two holidays gets 0 for the other.
df_gh = (
    pd.pivot_table(df, values='request_cnt', index=['user_id'], columns=['gh'], aggfunc='sum')
    .reset_index()
    .fillna(0)
)
df_gh.columns = df_gh.columns.get_level_values(0)

# A holiday with no rows at all in the data produces no pivot column; add it as
# all-zero so the share computation below does not fail with a KeyError.
for holiday in ('8mar', '23feb'):
    if holiday not in df_gh.columns:
        df_gh[holiday] = 0.0

# Turn counts into shares of the user's holiday traffic; the +1 keeps the
# denominator non-zero and damps the shares of users with very few requests.
total_cnt_8_23 = df_gh['8mar'] + df_gh['23feb'] + 1.
df_gh['8mar'] /= total_cnt_8_23
df_gh['23feb'] /= total_cnt_8_23

df_gh.head(10)

## Part of day

In [None]:
%%time

# Total request count per user for each part of day; user / part-of-day
# combinations with no rows become 0.
# aggfunc='sum' names the built-in reduction directly instead of passing the NumPy callable.
df_pod = (
    pd.pivot_table(df, values='request_cnt', index=['user_id'], columns=['part_of_day'], aggfunc='sum')
    .reset_index()
    .fillna(0)
)
df_pod.columns = df_pod.columns.get_level_values(0)

In [None]:
# Convert the four part-of-day counts into shares of the user's total traffic.
# The denominator carries a +1, so it is never zero.
# NOTE: the columns are divided in place, so running this cell twice divides twice.
total_pod = df_pod['day'] + df_pod['evening'] + df_pod['morning'] + df_pod['night'] + 1
for part in ('day', 'evening', 'morning', 'night'):
    df_pod[part] /= total_pod

df_pod.head(10)

## Pivots

In [None]:
from tqdm import tqdm
cols = ["region_name", "city_name", "cpe_manufacturer_name", "cpe_model_name", "cpe_type_cd", "cpe_model_os_type"]
# Print the number of distinct values of each categorical column, i.e. how many
# columns the corresponding pivot will have.
# nunique(dropna=False) counts inside pandas and treats missing as a single extra
# category, instead of materialising every row as a Python object in a set.
for c in tqdm(cols):
    print(c + ":", df[c].nunique(dropna=False))

In [None]:
%%time
def _mean_requests_by(column):
    """Pivot `df` to one row per user and one column per distinct value of `column`.

    Each cell holds the mean `request_cnt` over the user's rows with that value;
    user/value combinations that never occur are filled with 0. `user_id` is
    returned as a regular column.
    """
    return (
        pd.pivot_table(df, values='request_cnt', index=['user_id'], columns=[column], aggfunc='mean')
        .reset_index()
        .fillna(0)
    )


# One wide table per categorical column. The df_city statement previously lacked the
# ')' closing pivot_table(...), which made the whole cell a SyntaxError.
df_region = _mean_requests_by('region_name')
df_city = _mean_requests_by('city_name')
df_manuf = _mean_requests_by('cpe_manufacturer_name')
df_model = _mean_requests_by('cpe_model_name')
df_type = _mean_requests_by('cpe_type_cd')
df_os = _mean_requests_by('cpe_model_os_type')

## Merge

In [None]:
%%time
# Start from the handcrafted aggregates and left-join the two share tables on user_id.
df_all = df_agg
for share_table in (df_gh, df_pod):
    df_all = df_all.merge(share_table, on='user_id', how='left')
# Every value still missing at this point (e.g. a user absent from a share table) becomes -1.
df_all = df_all.fillna(-1)

# Left-join the six mean-request pivots, always in this order.
# NOTE(review): when two joined tables carry the same column label, merge appends
# _x / _y suffixes (a later cell reads 'Чукотский АО_y') - presumably from overlapping
# category values; consider prefixing pivot columns with their source column.
for wide_table in (df_region, df_city, df_manuf, df_model, df_type, df_os):
    df_all = df_all.merge(wide_table, on='user_id', how='left')

df_all.head()

In [None]:
# (rows, columns) of the merged feature table.
df_all.shape

In [None]:
# Persist the merged per-user feature table; index=False leaves the row index out of the file.
df_all.to_csv('./data/aggregates_pivot.csv', index=False)

## Downstream

In [None]:
%%time

import bisect
import numpy as np

# Labelled training users (read from Parquet), ordered by user_id.
df_public = (
    pq.read_table('data/public_train.pqt')
    .to_pandas()
    .sort_values(by='user_id')
)
# Per-user feature table written by the merge step above.
df_embeds = pd.read_csv('./data/aggregates_pivot.csv')

def age_bucket(x):
    """Map an age to its bucket index (0..6).

    Boundaries are upper-inclusive: x <= 18 -> 0, 18 < x <= 25 -> 1,
    25 < x <= 35 -> 2, 35 < x <= 45 -> 3, 45 < x <= 55 -> 4,
    55 < x <= 65 -> 5, x > 65 -> 6.

    Args:
        x: age as a number, or a missing value (None / NaN).

    Returns:
        The integer bucket index, or NaN when `x` is missing.
    """
    # Every comparison against NaN is False, so bisect would silently put a missing
    # age into bucket 0; return NaN instead so a later np.isnan filter can drop it.
    if pd.isna(x):
        return float('nan')
    return bisect.bisect_left([18, 25, 35, 45, 55, 65], x)

# Join labels and features first, then read the targets off the joined frame.
# The inner join drops users that have no row in df_embeds; taking the labels from
# df_public before the join would then leave y_age / y_gender longer than X and
# shifted against its rows.
X = df_public.merge(df_embeds, on="user_id", how='inner')

y_age = np.array(list(map(age_bucket, X['age'])))
y_gender = np.array(X['is_male'])

# Identifier and target columns must not be used as features.
X = X.drop(columns=['user_id', 'age', 'is_male'])

## Gender

In [None]:
# Columns passed to CatBoost as categorical features.
cat_features = [
    'region_name',
    'city_name',
    'cpe_manufacturer_name',
    'cpe_model_name',
    'cpe_type_cd',
    'cpe_model_os_type',
    'part_of_day',
    'price',
]

In [None]:
# %%time

from catboost import CatBoostClassifier, metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
# Keep only rows with a known gender label. pd.notna rejects None and NaN alike
# (a plain `!= None` comparison lets NaN through); 'NA' strings are excluded as well.
not_na_gender = pd.notna(y_gender) & (y_gender != 'NA')
x_train, x_test_gender, y_train, y_test_gender = train_test_split(
    X[not_na_gender], y_gender[not_na_gender], test_size=0.1, random_state=42)

# Gender classifier; the 10% hold-out doubles as the eval set that use_best_model selects on.
clf_gender = CatBoostClassifier(
    iterations=1000,
    custom_metric=[metrics.AUC()],
    use_best_model=True,
    random_seed=42,
    task_type="GPU",
    devices='0:1')
clf_gender.fit(x_train, y_train, metric_period=100, eval_set=(x_test_gender, y_test_gender), cat_features=cat_features)

In [None]:
# Gini = 2 * ROC AUC - 1, scored on the column-1 probabilities for the hold-out rows.
gender_scores = clf_gender.predict_proba(x_test_gender)[:, 1]
gender_auc = roc_auc_score(y_test_gender, gender_scores)
print(f'GINI по полу {2 * gender_auc - 1:2.3f}')

## Age

In [None]:
%%time

from sklearn.metrics import classification_report

# Drop rows whose age bucket is NaN, then hold out 10% for evaluation.
not_na_age = np.logical_not(np.isnan(y_age))
x_train, x_test_age, y_train, y_test_age = train_test_split(
    X[not_na_age],
    y_age[not_na_age],
    test_size=0.1,
    random_state=42,
)

# Age-bucket classifier; the hold-out doubles as the eval set that use_best_model selects on.
clf_age = CatBoostClassifier(
    iterations=1000,
    custom_metric=[metrics.Accuracy()],
    use_best_model=True,
    random_seed=42,
    task_type="GPU",
    devices='0:1',
)
clf_age.fit(
    x_train,
    y_train,
    metric_period=100,
    eval_set=(x_test_age, y_test_age),
    cat_features=cat_features,
)

In [None]:
# Display names for buckets 0..6. They follow the upper-inclusive boundaries
# [18, 25, 35, 45, 55, 65] used by bisect_left in age_bucket, so bucket 2 is ages
# 26-35 (not 25-34) and so on; written for whole-year ages.
age_bucket_names = ['<=18', '19-25', '26-35', '36-45', '46-55', '56-65', '66+']
print(classification_report(y_test_age, clf_age.predict(x_test_age),
                            target_names=age_bucket_names))

In [None]:
import pandas as pd
# Reload the saved feature table from disk for inspection.
df_embeds = pd.read_csv('./data/aggregates_pivot.csv')

In [None]:
# Preview the reloaded feature table.
df_embeds.head()

In [None]:
# Count users with a positive vs. non-positive value in this column.
# NOTE(review): the `_y` suffix is what pandas merge appends to a right-hand column whose
# label also exists on the left - presumably a label collision between merged tables; verify.
(df_embeds['Чукотский АО_y']>0).value_counts()