In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import joblib as jb

import json
import tqdm

import numba
import dask
import xgboost
from dask.diagnostics import ProgressBar
ProgressBar().register()

## Carrega os df's estruturados de view


In [None]:
# Melted "view" events of the training sequences; the preview is the cell output.
train = pd.read_parquet(path="./data/22_train_view_melted.parquet")
train.head(n=5)

In [None]:
# Melted "view" events of the test sequences; the preview is the cell output.
test = pd.read_parquet(path="./data/22_test_view_melted.parquet")
test.head(n=5)

## Gera três dicionários em que a chave é o item_id e os valores são title, price e domain_id

In [None]:
# Item catalogue: one row per item with (at least) item_id, title, price and domain_id.
item_data = pd.read_parquet(path="./data/item_data.parquet")
item_data.head()

# One {item_id: value} dictionary per attribute. Each is built the same way:
# keep the distinct (item_id, attribute) pairs, index them by item_id, and
# squeeze the single remaining column into a Series before converting it.
item_title_map, item_price_map, item_domain_map = (
    item_data[['item_id', attribute]]
    .drop_duplicates()
    .set_index("item_id")
    .squeeze()
    .to_dict()
    for attribute in ('title', 'price', 'domain_id')
)

## Importa df's de busca e adiciona features

In [None]:
# Search events of the training sequences. `event_info` is treated as text below
# (its length and its space-separated tokens are measured).
train_search = pd.read_parquet("./data/22_train_search_melted.parquet")

# Per-event text statistics. The vectorised `.str` accessors yield NaN for a
# missing `event_info` instead of raising like a Python-level `x.split` would.
train_search['search_len'] = train_search['event_info'].str.len()
train_search['num_words'] = train_search['event_info'].str.split(" ").str.len()

# One row per sequence, computed in a single groupby pass.
# sort=False keeps the sequences in order of first appearance.
search_features = (
    train_search
    .groupby('seq_index', sort=False)
    .agg(
        n_searches=('event_info', 'size'),            # number of search events
        n_unique_searches=('event_info', 'nunique'),  # distinct `event_info` values
        avg_search_seqpos=('seq_pos', 'mean'),        # mean `seq_pos` of the searches
        avg_search_len=('search_len', 'mean'),        # mean length in characters
        avg_search_words=('num_words', 'mean'),       # mean number of tokens
    )
    .reset_index()
)
search_features.head()

In [None]:
# Search events of the test sequences. `event_info` is treated as text below
# (its length and its space-separated tokens are measured).
test_search = pd.read_parquet("./data/22_test_search_melted.parquet")

# Per-event text statistics. The vectorised `.str` accessors yield NaN for a
# missing `event_info` instead of raising like a Python-level `x.split` would.
test_search['search_len'] = test_search['event_info'].str.len()
test_search['num_words'] = test_search['event_info'].str.split(" ").str.len()

# One row per sequence, computed in a single groupby pass.
# sort=False keeps the sequences in order of first appearance.
search_features_test = (
    test_search
    .groupby('seq_index', sort=False)
    .agg(
        n_searches=('event_info', 'size'),            # number of search events
        n_unique_searches=('event_info', 'nunique'),  # distinct `event_info` values
        avg_search_seqpos=('seq_pos', 'mean'),        # mean `seq_pos` of the searches
        avg_search_len=('search_len', 'mean'),        # mean length in characters
        avg_search_words=('num_words', 'mean'),       # mean number of tokens
    )
    .reset_index()
)
search_features_test.head()

# feature engineering


In [None]:
# View-based features for every training row. `assign` evaluates the keyword
# arguments in order, so the ratio columns can read the counts created above them.
train = train.assign(
    # sum of `viewed` over the whole sequence
    n_views=lambda df: df.groupby("seq_index")['viewed'].transform('sum'),
    # sum of `viewed` for this item inside the sequence
    n_views_this=lambda df: df.groupby(["seq_index", 'event_info'])['viewed'].transform("sum"),
    # sum of `viewed` for this item's domain inside the sequence
    n_views_this_domain=lambda df: df.groupby(["seq_index", 'item_domain'])['viewed'].transform("sum"),
    # number of distinct items / domains in the sequence
    unique_items_viewed=lambda df: df.groupby("seq_index")['event_info'].transform('nunique'),
    unique_domains_viewed=lambda df: df.groupby("seq_index")['item_domain'].transform("nunique"),
    # share of this item within the sequence and within its own domain
    n_views_this_ratio=lambda df: df['n_views_this'] / df['n_views'],
    n_views_this_ratio_domain=lambda df: df['n_views_this'] / df['n_views_this_domain'],
)

# Attach the per-sequence search statistics; rows without a match get NaN.
train2 = train.merge(search_features, on='seq_index', how='left')


In [None]:
# View-based features for every test row. `assign` evaluates the keyword
# arguments in order, so the ratio columns can read the counts created above them.
test = test.assign(
    # sum of `viewed` over the whole sequence
    n_views=lambda df: df.groupby("seq_index")['viewed'].transform('sum'),
    # sum of `viewed` for this item inside the sequence
    n_views_this=lambda df: df.groupby(["seq_index", 'event_info'])['viewed'].transform("sum"),
    # sum of `viewed` for this item's domain inside the sequence
    n_views_this_domain=lambda df: df.groupby(["seq_index", 'item_domain'])['viewed'].transform("sum"),
    # number of distinct items / domains in the sequence
    unique_items_viewed=lambda df: df.groupby("seq_index")['event_info'].transform("nunique"),
    unique_domains_viewed=lambda df: df.groupby("seq_index")['item_domain'].transform("nunique"),
    # share of this item within the sequence and within its own domain
    n_views_this_ratio=lambda df: df['n_views_this'] / df['n_views'],
    n_views_this_ratio_domain=lambda df: df['n_views_this'] / df['n_views_this_domain'],
)

# Attach the per-sequence search statistics; rows without a match get NaN.
test2 = test.merge(search_features_test, on='seq_index', how='left')

In [None]:
# Keep a single row (the first one) per (sequence, item) pair.
train2 = train2.drop_duplicates(subset=['seq_index', "event_info"], keep='first')

In [None]:
# Keep a single row (the first one) per (sequence, item) pair.
test2 = test2.drop_duplicates(subset=['seq_index', "event_info"], keep='first')

## Gera a variável resposta (rank)

1. Atribui 1 se o domain_id do item visto é o mesmo do item comprado; caso contrário, 0
2. Soma 12 se o item visto é o próprio item comprado

In [None]:
# Ranking label: 1 when the row's domain equals the bought domain, plus 12 more
# when the row's item is the bought item itself.
train2['y_rank'] = (
    train2['bought_domain'].eq(train2['item_domain']).astype(int)
    + 12 * train2['bought_id'].eq(train2['event_info']).astype(int)
)

In [None]:
# Persist both feature tables as uncompressed parquet files.
train2.to_parquet(path="./data/22c_train_melt_with_features.parquet", compression=None, engine='fastparquet')
test2.to_parquet(path="./data/22c_test_melt_with_features.parquet", compression=None, engine='fastparquet')

# stack gen

- **Extrai o domain_id dos best sellers**
- `pad`: se faltam k produtos para completar o top 10, completa a lista com os k primeiros best sellers
- `pad_str`: idem `pad`, mas para o domain_id
- `ndcg_vec`: calcula o NDCG@10 a partir de um vetor de predição

In [None]:
%%time
# Position discounts log(1 + rank) for ranks 1..10.
log_pos = np.log1p(np.arange(10) + 1)

# Item ids used to complete recommendation lists that have fewer than ten entries.
best_sellers = [
    1587422, 1803710, 10243, 548905, 1906937,
    716822, 1361154, 1716388, 725371, 859574,
]

# Domain of each fallback item, in the same order as `best_sellers`.
best_sellers_domain = list(map(item_domain_map.__getitem__, best_sellers))

def pad(lst, fill=None):
    """Return `lst` completed to ten item ids, as a NumPy array.

    Parameters
    ----------
    lst : sequence of item ids
        Ranked candidates. The caller's object is never modified.
    fill : sequence of item ids, optional
        Fallback ids appended, in order, until ten entries are reached.
        Defaults to the module-level `best_sellers`.

    Returns
    -------
    numpy.ndarray
        `lst` followed by the first `10 - len(lst)` fallback ids. Inputs that
        already hold ten or more entries are returned unchanged in content.
        An empty `lst` yields the first ten fallback ids.
    """
    if fill is None:
        fill = best_sellers
    shortfall = max(0, 10 - len(lst))
    # Build a new list: `lst += ...` would extend the caller's list in place.
    return np.array(list(lst) + list(fill[:shortfall]))

def pad_str(lst, fill=None):
    """Return `lst` completed to ten domain ids, as a new list.

    Parameters
    ----------
    lst : sequence of domain ids
        Domains of the ranked candidates. The caller's object is never modified.
    fill : sequence of domain ids, optional
        Fallback domains appended, in order, until ten entries are reached.
        Defaults to the module-level `best_sellers_domain`.

    Returns
    -------
    list
        `lst` followed by the first `10 - len(lst)` fallback domains. Always a
        fresh list, so mutating the result cannot alter the shared fallback.
    """
    if fill is None:
        fill = best_sellers_domain
    shortfall = max(0, 10 - len(lst))
    return list(lst) + list(fill[:shortfall])

def ndcg_vec(ytrue, ypred, ytrue_domain, ypred_domain):
    """Mean NDCG of ranked recommendation lists.

    Relevance of a recommended item is 12 when it is the bought item, 1 when it
    only shares the bought item's domain, and 0 otherwise. An exact hit is NOT
    additionally credited with the domain point, so a perfect list scores
    exactly 1.

    Parameters
    ----------
    ytrue : array-like, shape (n,)
        Bought item id of each sequence.
    ypred : array-like, shape (n, k)
        Recommended item ids, best first.
    ytrue_domain : array-like, shape (n,)
        Domain of the bought item of each sequence.
    ypred_domain : array-like, shape (n, k)
        Domains of the recommended items, aligned with `ypred`.

    Returns
    -------
    float
        DCG divided by the ideal DCG, averaged over the n sequences.
    """
    ypred = np.asarray(ypred)
    ypred_domain = np.asarray(ypred_domain)
    k = ypred.shape[1]
    # Position discount log(1 + rank), rank = 1..k.
    discount = np.log1p(np.arange(1, k + 1))

    # Compare every recommended slot with the sequence's target (column broadcast).
    item_hit = ypred == np.asarray(ytrue).reshape(-1, 1)
    domain_hit = ypred_domain == np.asarray(ytrue_domain).reshape(-1, 1)

    relevance = np.where(item_hit, 12.0, domain_hit.astype(float))
    dcg = (relevance / discount).sum(axis=1)

    # Ideal list: the bought item first, same-domain items in every other slot.
    ideal_relevance = np.ones(k)
    ideal_relevance[0] = 12.
    idcg = (ideal_relevance / discount).sum()

    return (dcg / idcg).mean()

In [None]:
# NOTE(review): debugging leftover. `ts` and `tr` are first bound by the
# GroupKFold loop in a later cell, so this cell raises NameError when the
# notebook is executed top to bottom on a fresh kernel.
ts.shape,tr.shape

In [None]:
# NOTE(review): debugging leftover. `p` is only assigned in later cells (the
# cross-validation loop and the test prediction), so this cell raises NameError
# when the notebook is executed top to bottom on a fresh kernel.
p.shape

In [None]:
# NOTE(review): debugging leftover. `stack_p` and `ts` are created by the
# cross-validation cell below, so this cell raises NameError when the notebook
# is executed top to bottom on a fresh kernel.
stack_p.iloc[ts].shape

In [None]:
from sklearn.model_selection import GroupKFold
from cuml.preprocessing import TargetEncoder

# Model inputs and hyper-parameters are the same for every fold, so they are
# defined once instead of being rebuilt on each iteration.
features = ['item_price', 'seq_pos', 'n_views',
       'n_views_this', 'n_views_this_domain', 'unique_items_viewed',
       'unique_domains_viewed', 'item_domain_cuml', 'event_info_cuml', 'n_searches', 'n_unique_searches',
       'avg_search_seqpos', 'avg_search_len', 'avg_search_words', 'n_views_this_ratio', 'n_views_this_ratio_domain', 'viewed']

params = [0.027652448846980884, 6, 1.5196450924014913, 0.15061222682840253, 0.4999203983793246]
learning_rate, max_depth, min_child_weight, subsample, colsample_bytree = params

# Two folds split by sequence: all rows of a seq_index land in the same fold.
kf = GroupKFold(n_splits=2)

# Out-of-fold predictions, one per train2 row. The '22c' column is created up
# front: a frame without columns has nowhere to store the fold predictions, and
# selecting columns=['22c'] from it afterwards can only produce NaN.
stack_p = pd.DataFrame(index=train2.index, columns=['22c'], dtype=np.float64)

for tr, ts in kf.split(train2, groups=train2['seq_index']):
    # sort_values returns a new frame, so the encoded columns added below are not
    # written into a slice of train2. The stable sort keeps every sequence's rows
    # contiguous and in seq_index order, which is the order `groups` is built in.
    Xtr = train2.iloc[tr].sort_values('seq_index', kind='mergesort')
    # Xval keeps train2's row order so that `p` lines up with the positions in `ts`.
    Xval = train2.iloc[ts].copy()

    # ran once, for the first iteration of this loop, to reuse the "same" data in the stack
    #joblib.dump(Xtr['seq_index'].unique(), "./valid/fold1.pkl.z")
    #joblib.dump(Xval['seq_index'].unique(), "./valid/fold2.pkl.z")

    # Target-encode the item and its domain against `has_bought`; the encoder is
    # fitted on the training fold only and then applied to the validation fold.
    tgt_cuml = TargetEncoder(n_folds=5, smooth=5e-2)
    for c in ['item_domain', 'event_info']:
        Xtr[c+"_cuml"] = tgt_cuml.fit_transform(Xtr[c], Xtr['has_bought'])
        Xval[c+"_cuml"] = tgt_cuml.transform(Xval[c])

    Xtrr, ytr = Xtr[features], Xtr['y_rank']
    Xvall = Xval[features]

    # Query sizes: number of rows of each sequence, in sorted seq_index order.
    groups = Xtr.groupby('seq_index').size().values

    mdl = xgboost.XGBRanker(seed=0, tree_method='gpu_hist', gpu_id=0, n_estimators=1000,
                               learning_rate=learning_rate, max_depth=max_depth, min_child_weight=min_child_weight,
                                subsample=subsample, colsample_bytree=colsample_bytree, objective='rank:pairwise')

    mdl.fit(Xtrr, ytr, group=groups)

    p = mdl.predict(Xvall)
    stack_p.iloc[ts, 0] = p

    # Rank the validation candidates of every sequence by predicted score.
    preds = Xval[['seq_index', 'has_bought', 'item_domain', 'bought_domain', 'event_info', 'bought_id']].copy()
    preds['p'] = p
    preds = preds.sort_values('p', ascending=False).drop_duplicates(subset=['seq_index', 'event_info'])

    # Target item / domain of each sequence (taken from the sequence's first row).
    ytrue = preds.groupby("seq_index")['bought_id'].apply(lambda x: x.iloc[0]).values
    ytrue_domain = preds.groupby("seq_index")['bought_domain'].apply(lambda x: x.iloc[0]).values

    # Top-10 items and their domains per sequence, padded to ten entries.
    ypred = preds.groupby("seq_index")['event_info'].apply(lambda x: pad(x.iloc[:10].tolist()))
    ypred = np.array(ypred.tolist())

    ypred_domain = preds.groupby("seq_index")['item_domain'].apply(lambda x: pad_str(x.iloc[:10].tolist()))
    ypred_domain = np.array(ypred_domain.tolist())

    print(ndcg_vec(ytrue, ypred, ytrue_domain, ypred_domain))

# Save the out-of-fold predictions (single column '22c') for the stacking stage.
stack_p.to_parquet("./stack_2f/22c.parquet", engine='fastparquet', compression=None)


# test

In [None]:
from cuml.preprocessing import TargetEncoder

# train2 and test2 were already de-duplicated per (seq_index, event_info) in
# earlier cells, so they are used directly here.

# groupby() reports the query sizes in sorted seq_index order, while the ranker
# consumes them in row order. A stable sort makes the two orders agree and is a
# no-op when train2 is already ordered by sequence.
train2 = train2.sort_values('seq_index', kind='mergesort')
groups = train2.groupby('seq_index').size().values

# Target-encode the item and its domain against `has_bought`: fitted on the full
# training table, then applied to the test table.
tgt_cuml = TargetEncoder(n_folds=5, smooth=5e-2)
for c in ['item_domain', 'event_info']:
    train2[c+"_cuml"] = tgt_cuml.fit_transform(train2[c], train2['has_bought'])
    test2[c+"_cuml"] = tgt_cuml.transform(test2[c])

In [None]:
# Columns fed to the ranker (view counts, search statistics, target encodings).
features = [
    'item_price', 'seq_pos', 'n_views',
    'n_views_this', 'n_views_this_domain', 'unique_items_viewed',
    'unique_domains_viewed', 'item_domain_cuml', 'event_info_cuml',
    'n_searches', 'n_unique_searches',
    'avg_search_seqpos', 'avg_search_len', 'avg_search_words',
    'n_views_this_ratio', 'n_views_this_ratio_domain', 'viewed',
]

# Hyper-parameters, in the order they are unpacked on the next line.
params = [0.027652448846980884, 6, 1.5196450924014913, 0.15061222682840253, 0.4999203983793246]
learning_rate, max_depth, min_child_weight, subsample, colsample_bytree = params

# Pairwise ranker trained on the GPU with a fixed seed.
mdl = xgboost.XGBRanker(
    objective='rank:pairwise',
    n_estimators=1000,
    learning_rate=learning_rate,
    max_depth=max_depth,
    min_child_weight=min_child_weight,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    tree_method='gpu_hist',
    gpu_id=0,
    seed=0,
)

# Fit on the whole training table; `groups` holds the number of rows per sequence.
mdl.fit(train2[features], train2['y_rank'], group=groups)

In [None]:
# Preview of the model inputs for the test table.
test2.loc[:, features].head(n=5)

In [None]:
# Ranking score of every test row.
p = mdl.predict(test2.loc[:, features])

In [None]:
# Store the test scores as a single-column ('22c') uncompressed parquet file.
pd.DataFrame({'22c': p}).to_parquet(path="./stack_2f/22c_test.parquet", compression=None, engine='fastparquet')

In [None]:
# Candidates of every test sequence ordered by descending score, keeping one row
# per (sequence, item) pair.
preds = (
    test2[['seq_index', 'event_info']]
    .assign(p=p)
    .sort_values('p', ascending=False)
    .drop_duplicates(subset=['seq_index', 'event_info'])
)

In [None]:
def pad(lst):
    """Return `lst` completed to ten item ids by repeating its first entry.

    NOTE: this shares its name with the `pad` defined in the stacking section
    and replaces it once this cell has run.

    Parameters
    ----------
    lst : sequence of item ids
        Ranked candidates, best first. The caller's object is never modified.

    Returns
    -------
    numpy.ndarray
        `lst` followed by copies of `lst[0]` until ten entries are reached.
        Inputs with ten or more entries keep their content. An empty `lst`
        yields the ten hard-coded fallback ids.
    """
    pad_candidates = [1587422, 1803710,   10243,  548905, 1906937,  716822, 1361154, 1716388,  725371,  859574]
    if len(lst) == 0:
        return np.array(pad_candidates)
    shortfall = max(0, 10 - len(lst))
    # Build a new list: `lst += ...` would extend the caller's list in place.
    return np.array(list(lst) + [lst[0]] * shortfall)
# Ten item ids per test sequence: the best-scored candidates, padded by `pad`.
ypred = preds.groupby("seq_index")['event_info'].apply(lambda items: pad(items.head(10).tolist()))
# Sequence ids, in the same order as the rows of the matrix built next.
seq_index = ypred.index
ypred = np.array(ypred.to_list())


In [None]:
# Submission matrix with 177070 rows of ten item ids each; row i belongs to
# sequence i.
ypred_final = np.zeros(shape=(177070, 10))
ypred_final[seq_index] = ypred

# Sequences that received no prediction above. Their rows keep the initial
# zeros, because the best-seller fallback below is commented out.
no_views = np.setdiff1d(np.arange(ypred_final.shape[0]), seq_index)
#ypred_final[no_views, :] = np.array([1587422, 1803710,   10243,  548905, 1906937,  716822, 1361154, 1716388,  725371,  859574])
ypred_final = ypred_final.astype(int)

In [None]:
# Repeated products are allowed within a row.
pd.DataFrame(ypred_final).to_csv(path_or_buf="./subs/22c.csv", header=False, index=False)

In [None]:
# Largest sequence id present in the test view table.
test.loc[:, 'seq_index'].max()

In [None]:
# Line count of the submission file.
!wc -l ./subs/22c.csv

In [None]:
# First lines of the submission file.
!head ./subs/22c.csv