In [1]:
%cd ..

/notebooks/v2


In [2]:
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation and dtype warnings raised by the
# libraries used in the cells below.
import warnings
warnings.filterwarnings('ignore')

In [3]:
from lib.base import *
from lib.data.splitter.sequential2 import Sequential2Splitter
from lib.run.args import Args
from lib.data.datamodule.test2 import Test2
from lib.run.model import Model

Global seed set to 0


In [4]:
import sys
import pickle
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
import gc


# The production entry point takes these from the command line
# (data, output_path = sys.argv[1:]); for local runs they are pinned here.
data = 'data'
output_path = 'predictions.npz'

# Bank-side features: per user and MCC code, the sum / mean / count of the
# transaction amount. Missing (user, mcc) combinations become 0.
transactions = pd.read_csv(f'{data}/tr.csv')
bankclient_embed = transactions.pivot_table(
    index='user_id',
    values=['transaction_amt'],
    columns=['mcc_code'],
    aggfunc=['sum', 'mean', 'count'],
).fillna(0)
# Flatten the 3-level column labels to "<aggfunc>-<mcc_code>".
bankclient_embed.columns = [
    f'{str(label[0])}-{str(label[2])}' for label in bankclient_embed.columns
]

# The raw table is no longer needed; release it before loading the next one.
del transactions
gc.collect()

# Clickstream-side features: per user and category id, the number of events.
clickstream = pd.read_csv(f'{data}/cl.csv')
clickstream_embed = clickstream.pivot_table(
    index='user_id',
    values=['timestamp'],
    columns=['cat_id'],
    aggfunc=['count'],
).fillna(0)
clickstream_embed.columns = [
    f'{str(label[0])}-{str(label[2])}' for label in clickstream_embed.columns
]
# Extra row indexed 0. It used to be filled with np.empty, i.e. uninitialised
# memory, which made the features (and therefore the model score) of this row
# differ from run to run. Zeros are deterministic and match the fillna(0)
# convention used for every other missing value.
clickstream_embed.loc[0] = np.zeros(len(clickstream_embed.columns))

del clickstream
gc.collect()

def _narrowed_dtypes(frame):
    """Map every column of `frame` to the narrower dtype it should be cast to.

    * float64 -> float32
    * int64   -> int16, but only when all values fit into the int16 range;
                 otherwise the column stays int64, because a blind cast
                 would silently wrap large values
    * any other dtype -> object (unchanged from the previous behaviour)

    Returns a dict {column label: dtype name} suitable for DataFrame.astype.
    """
    int16_range = np.iinfo(np.int16)
    narrowed = {}
    for column, dtype in zip(frame.columns.tolist(), frame.dtypes.tolist()):
        if dtype == 'int64':
            values = frame[column]
            fits = values.empty or (
                values.min() >= int16_range.min
                and values.max() <= int16_range.max
            )
            narrowed[column] = 'int16' if fits else 'int64'
        elif dtype == 'float64':
            narrowed[column] = 'float32'
        else:
            narrowed[column] = 'object'
    return narrowed


# Shrink both feature tables before they are merged batch by batch.
dtype_clickstream = _narrowed_dtypes(clickstream_embed)
clickstream_embed = clickstream_embed.astype(dtype_clickstream)

dtype_bankclient = _narrowed_dtypes(bankclient_embed)
bankclient_embed = bankclient_embed.astype(dtype_bankclient)

# Candidate pools: the distinct index values of both feature tables.
list_of_rtk = clickstream_embed.index.unique().tolist()
list_of_bank = bankclient_embed.index.unique().tolist()

# One row per bank id; each row carries the complete rtk candidate list so
# that a later explode('rtk') yields every (bank, rtk) pair.
submission = pd.DataFrame(list_of_bank, columns=['bank'])
submission['rtk'] = submission['bank'].apply(lambda _bank_id: list_of_rtk)

# Feature names, in the order they are passed to the model.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("full_list_of_features", "rb") as features_file:
    full_list_of_features = pickle.load(features_file)

# Restore the CatBoost classifier from its binary dump.
model = CatBoostClassifier()
model.load_model('model.cbm', format='cbm')

submission_ready = []

batch_size = 200
# Ceiling division, so no empty trailing batch is produced when the number of
# bank ids is an exact multiple of batch_size.
num_of_batches = -(-len(list_of_bank) // batch_size)

# Score given to the extra rtk == 0. row appended for every bank id. It is
# larger than any probability, so that row always ranks first.
zero_rtk_score = 3.8
# Number of ranked rtk candidates kept per bank id.
top_k = 100

for batch_no in tqdm(range(num_of_batches)):
    bank_ids = list_of_bank[batch_no * batch_size:(batch_no + 1) * batch_size]

    # Cross every bank id of the batch with every rtk candidate, then attach
    # the bank features and the clickstream features.
    part_of_submit = submission[submission['bank'].isin(bank_ids)].explode('rtk')
    part_of_submit = part_of_submit.merge(
        bankclient_embed, how='left', left_on='bank', right_index=True,
    ).merge(
        clickstream_embed, how='left', left_on='rtk', right_index=True,
    ).fillna(0)

    # Features the model expects but the current data lacks are zero-filled.
    # The loop variable has its own name so it no longer shadows the batch
    # counter.
    for feature in full_list_of_features:
        if feature not in part_of_submit.columns:
            part_of_submit[feature] = 0

    # Probability of the positive class for every (bank, rtk) pair.
    part_of_submit['predicts'] = model.predict_proba(
        part_of_submit[full_list_of_features],
    )[:, 1]
    part_of_submit = part_of_submit[['bank', 'rtk', 'predicts']]

    zeros_part = pd.DataFrame(bank_ids, columns=['bank'])
    zeros_part['rtk'] = 0.
    zeros_part['predicts'] = zero_rtk_score

    part_of_submit = pd.concat((part_of_submit, zeros_part))

    # Rank the candidates of each bank id by descending score, collect them
    # into one list per bank id and keep the first top_k entries.
    part_of_submit = part_of_submit.sort_values(
        by=['bank', 'predicts'], ascending=False).reset_index(drop=True)
    part_of_submit = part_of_submit.pivot_table(index='bank', values='rtk', aggfunc=list)
    part_of_submit['rtk'] = part_of_submit['rtk'].apply(lambda ranked: ranked[:top_k])
    part_of_submit['bank'] = part_of_submit.index
    part_of_submit = part_of_submit[['bank', 'rtk']]
    submission_ready.extend(part_of_submit.values)

# Rows of [bank id, ranked rtk list], stored as an object array.
submission_final = np.array(submission_ready, dtype=object)

print(submission_final.shape)
print(submission_final)
np.savez(output_path, submission_final)


100%|██████████| 15/15 [04:03<00:00, 16.25s/it]

(2930, 2)
[['000932580e404dafbecd5916d4640938'
  list([0.0, 0, '0ec32896a78c469983aba2fe4fb66b97', '02981ce603e140fa9840fddcafe530b1', '0c52e5318c6c4725a5e070630d45b09c', '00b9ef5ab6c04fcd969bab43a898f14e', '107542f0bfcb476f899907b17ffdcd99', '1f2100d56e9c4eddb049cf126841b7dc', '151845e35a554fd39fc5fbe4477aee88', '19b1fb5be667430585ec03d052a7cd84', '0cbc645a2f25461a9b52d520d36a3981', '029f234dfb78484a9e8918f97b08b218', '0ac4431a741746d8873ef46e214e355f', '15621f1f7d854966b10cc32438a7f804', '08b940b588284877828b052f121eaf53', '18f83b3b7f694c0395a047bc693bafa8', '1dd9bc0185b544ee87e585047f235b12', '1345f78619ca44e0a9e264968cd729eb', '026bf40a4c0d4f0bb564d40098e087c7', '010fa6cf11f24bb49c86464c5c413f31', '06fb680c5b3d42df9bb55e60d48d03b9', '1a6a5b4fd85f49b9a2fe676d825b2b91', '1d9278bdb853412ea61f2de3b0670e1c', '1e8e3e1a150b4ea09856c82dc97b6961', '0150bb90129545e889f32003335d27a3', '0078f349d197481c9e2fb43b0a6d1b84', '1d1838a3cc6b4812990f0eff4858247a', '04c17584d2994f7ea64ba76963c01ac9', '




In [44]:
# Read the saved array and show how many candidates its first row carries.
# The context manager closes the .npz archive once the array has been read
# (previously the archive handle was left open).
with np.load('npz/nn.npz', allow_pickle=True) as archive:
    nn = archive['arr_0']
len(nn[0][1])

100

In [43]:
# Read the saved array and show how many candidates its first row carries.
# The context manager closes the .npz archive once the array has been read
# (previously the archive handle was left open).
with np.load('npz/cb.npz', allow_pickle=True) as archive:
    cb = archive['arr_0']
len(cb[0][1])

100

In [5]:
# NOTE(review): always raises ZeroDivisionError. Looks like a deliberate stop
# so that "Run All" does not continue into the cells below - confirm or remove.
1/0

ZeroDivisionError: division by zero

In [1]:
# Scratch expression; it does not change any notebook state.
1/5

0.2

In [None]:
# Build the local transactions fixture: keep the rows of the first k distinct
# user ids (in order of first appearance) and store them as data/tr.csv.
# `io` is the project helper module, not the standard-library `io`.
k = 2930
tr = io.load('data/transactions.feather')
kept_users = tr.user_id.unique()[:k]
tr = tr[tr.user_id.isin(kept_users)]
# Fails when the source holds fewer than k distinct users.
assert tr.user_id.nunique() == k
io.save(tr, 'data/tr.csv')

In [None]:
# Build the local clickstream fixture: keep the rows of the first k distinct
# user ids (in order of first appearance) and store them as data/cl.csv.
# `io` is the project helper module, not the standard-library `io`.
k = 2463
cl = io.load('data/5000.feather')
kept_users = cl.user_id.unique()[:k]
cl = cl[cl.user_id.isin(kept_users)]
# Fails when the source holds fewer than k distinct users.
assert cl.user_id.nunique() == k
io.save(cl, 'data/cl.csv')

In [41]:
# Local stand-ins for the production command-line arguments. In production
# both paths come from sys.argv[1].split('--')[1:]: the data directory
# ('/data', holding 'clickstream.csv' and 'transactions.csv') and the
# prediction file ('/output/predictions.npz').
data_dir = Path('data')
pred_file = Path('predictions.npz')

# Encoders (project `Encoder` class) restored from the 'json' directory.
root = Path('json')
event_encoder = Encoder(root, 'event')
event_encoder.load()
uid_encoder = Encoder(root, 'uid')
uid_encoder.load()

uid, ts = 'user_id', 'timestamp'

# Clickstream: parse timestamps, prefix category ids with 'rtk_', then encode
# events and user ids. (The production file is 'clickstream.csv'.)
cl = pd.read_csv(data_dir / 'cl.csv')
cl[ts] = cl[ts].progress_apply(pd.Timestamp)
event = 'cat_id'
rtk_events = cl[event].apply(lambda code: f'rtk_{code}')
cl[event] = event_encoder.transform(rtk_events)
cl[uid] = uid_encoder.transform(cl[uid])

# Transactions: same treatment, with 'transaction_dttm' renamed to the shared
# timestamp column and MCC codes prefixed with 'bank_'. (The production file
# is 'transactions.csv'.)
tr = pd.read_csv(data_dir / 'tr.csv').rename(columns={'transaction_dttm': ts})
tr[ts] = tr[ts].progress_apply(pd.Timestamp)
event = 'mcc_code'
bank_events = tr[event].apply(lambda code: f'bank_{code}')
tr[event] = event_encoder.transform(bank_events)
tr[uid] = uid_encoder.transform(tr[uid])

def _cycle_to_length(ids, length):
    """Repeat `ids` cyclically and return exactly `length` items.

    Raises ValueError when items are requested from an empty list.
    """
    if length and not ids:
        raise ValueError('cannot pad an empty id list')
    return [ids[pos % len(ids)] for pos in range(length)]


# Sorted distinct (encoded) user ids on each side.
bank = sorted(tr.user_id.unique().tolist())
rtk = sorted(cl.user_id.unique().tolist())
max_len = max(len(bank), len(rtk))
df = pd.DataFrame(index=range(max_len))

splitter = Sequential2Splitter()
XC, YC, XT, YT = splitter.run(cl, tr)

# Both columns must have max_len rows, so the shorter id list is repeated
# from its start. The previous `(ids + ids)[:max_len]` form only produced
# enough rows while the longer list was at most twice the shorter one; for
# those inputs this yields the same values.
df['bank'] = _cycle_to_length(bank, max_len)
df['rtk'] = _cycle_to_length(rtk, max_len)

df = df.fillna(-1)

# The raw event tables are no longer needed once the tensors exist.
del cl, tr
gc.collect()

# Run configuration handed to the dataset, the model and the trainer.
# NOTE(review): the checkpoint is looked up under a.exp_name, which looks
# like a hash - presumably derived from these values, so keep the values and
# their order unchanged unless a new checkpoint is intended.
a = Args(
    splitter='Sequential',
    splitter_pp={
        'n_days_in_sample': 30,
        'bank_quantile': 0.9,
        'rtk_quantile': 0.9,
    },
    n_folds=3,  # 1000 == 'full train'
    fold=0,

    fit_limit=1.0,
    val_limit=1.0,

    batch_size=32,
    lr=2e-3,
    n_epochs=10,
    check_val_every_n_epoch=1,

    bb_pp={
        'block_size': 16,
        'hidden_size': 128,
        'intermediate_size': 128,
        'num_attention_heads': 1,
        'num_hidden_layers': 1,
        'num_random_blocks': 1,
    },

    loss='MarginLoss',
    loss_pp={},

    use_unmatched=False,

    miner=None,
    miner_pp={},

    avg_loss='mean',
    avg_pred='mean',
)
a.update()
# Sequence lengths are the second dimension of the splitter's X tensors.
a.bank_len, a.rtk_len = XT.shape[1], XC.shape[1]

def collate(DD):
    """Merge a sequence of per-sample dicts into one batch dict.

    The result always has the keys XT, XC, YT, YC, MT, MC, bank, rtk, M, in
    that order. Values found under the first six keys are joined with
    torch.cat; values found under bank / rtk / M are gathered into a list and
    converted with torch.tensor. A key missing from a sample is skipped for
    that sample, and any other key is ignored. torch.cat raises if one of the
    first six keys is missing from every sample.
    """
    samples = list(DD)
    cat_keys = ['XT', 'XC', 'YT', 'YC', 'MT', 'MC']
    scalar_keys = ['bank', 'rtk', 'M']

    batch = {}
    for key in cat_keys:
        batch[key] = torch.cat(
            [sample[key] for sample in samples if key in sample])
    for key in scalar_keys:
        batch[key] = torch.tensor(
            [sample[key] for sample in samples if key in sample])
    return batch

# Context object handed to the model: the two encoders plus the test
# datamodule (attached below).
c = Args()
c.event_encoder = event_encoder
c.uid_encoder = uid_encoder

# Inputs of the test datamodule: the pairing frame and the splitter tensors.
d = Args()
d.P = df
d.XT, d.XC = XT, XC
d.YT, d.YC = YT, YC
c.test = Test2(d, collate, a)

model = Model(a, c)

# Checkpoint callback: keeps the best checkpoint by 'R1' plus the last one.
checkpoint_cb = pl.callbacks.model_checkpoint.ModelCheckpoint(
    save_weights_only=False,
    filename='{R1} {MRR} {P}',
    monitor='R1',
    verbose=False,
    save_last=True,
    save_top_k=1,
    mode='max',
)
callbacks = [checkpoint_cb]

# Deterministic mode is switched off whenever a median aggregation is used.
uses_median = a.avg_loss == 'median' or a.avg_pred == 'median'

trainer = pl.Trainer(
    accumulate_grad_batches=a.acc_batches,
    check_val_every_n_epoch=a.check_val_every_n_epoch,
    num_sanity_val_steps=0,
    deterministic=not uses_median,
    benchmark=True,
    gpus=a.gpus,
    precision=a.precision,
    logger=pl.loggers.CSVLogger(str(a.log_dir), name=a.exp_name),
    callbacks=callbacks,
    max_epochs=a.n_epochs,
    limit_train_batches=a.fit_limit,
    limit_val_batches=a.val_limit,
)

# Checkpoint location: <log_dir>/<exp_name>/version_0/checkpoints/last.ckpt
p = a.log_dir / a.exp_name / 'version_0' / 'checkpoints'
a.ckpt = p / 'last.ckpt'
print(a.ckpt.stem)

# Run inference with the restored weights and assemble the prediction frame.
BB = trainer.predict(model, c.test, ckpt_path=a.ckpt)
pred = model.predict_epoch_end(BB)

# CSV target path; computed but not used further in this cell.
path = a.csv_dir / f'{a.ckpt.stem}.csv'
# NOTE: `c` is rebound here from the context object to a column name.
c = 'rtk_list'


def _with_zero_prefix(candidates):
    """Put 0.0 and 0 in front of the candidate list and keep 100 entries."""
    return ([0.0, 0] + candidates)[:100]


pred[c] = pred[c].apply(_with_zero_prefix)
print(pred.values)
np.savez(str(pred_file), pred.values)


100%|██████████| 14797017/14797017 [01:03<00:00, 234763.28it/s]
100%|██████████| 2611403/2611403 [00:10<00:00, 247308.54it/s]


Sequential2Splitter __init__...
Sequential2Splitter run...
Sequential2Splitter get_tensors...


100%|██████████| 2463/2463 [00:13<00:00, 183.21it/s]


Sequential2Splitter pad...


100%|██████████| 16636/16636 [00:00<00:00, 53048.18it/s]
100%|██████████| 16636/16636 [00:00<00:00, 22008.19it/s]


torch.Size([16636, 2256])
torch.Size([16636])
rtk_uids 2463
Sequential2Splitter get_tensors...


100%|██████████| 2930/2930 [00:22<00:00, 128.99it/s]


Sequential2Splitter pad...


100%|██████████| 37225/37225 [00:00<00:00, 336402.84it/s]
100%|██████████| 37225/37225 [00:00<00:00, 271802.67it/s]
Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Restoring states from the checkpoint path at log/f8dc2a144b2ebdfd78e0626fdb789a3b/version_0/checkpoints/last.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at log/f8dc2a144b2ebdfd78e0626fdb789a3b/version_0/checkpoints/last.ckpt


torch.Size([37225, 120])
torch.Size([37225])
bank_uids 2930
f8dc2a144b2ebdfd78e0626fdb789a3b
last
P: 2930


Predicting: 0it [00:00, ?it/s]

[['000932580e404dafbecd5916d4640938'
  list([0.0, 0, '0cc67621d9704242bc0a244798d5ef51', '07670bb423c543c08f7981ba0119981f', '013b1b6b156d4f30be91aa17c2198a7e', '013b1b6b156d4f30be91aa17c2198a7e', '0d13c4cf161e4d158c930c65521e645d', '0db38908c5314a51b54be097ad74bf84', '1d28ceb96bdd49129cc3b0a147e1face', '05ffb8c1f84243ab85b49b88074a7f4d', '05ffb8c1f84243ab85b49b88074a7f4d', '013999d43652464880e7b8096083a0a6', '013999d43652464880e7b8096083a0a6', '18e933e6039e45998718ff5fe5478bd8', '205beaa6086744958679d0e0d67c31f7', '1c3a8842e1af4056bd76b567f65217d5', '054f26620400439e9cd35b6a9dd39ebd', '054f26620400439e9cd35b6a9dd39ebd', '0ea647ec5ecf434fb943d1f730b5d13e', '19b0806d7bd8462fbdb77a748da6895c', '001bf72732ed4e68aceff74aeea6efc9', '001bf72732ed4e68aceff74aeea6efc9', '0962744bdb59443a8d933ae5eb8ca0b9', '0241a5e9968c4b62b279ef3864acfdd7', '0241a5e9968c4b62b279ef3864acfdd7', '05710b9d414044668d2941764a135ff7', '05710b9d414044668d2941764a135ff7', '157234c24fa543a6b890727769a9c47f', '08ea6814bc

In [42]:
# Move predictions.npz to npz/nn.npz. Raises if the source file is missing
# or the npz/ directory does not exist.
os.rename(
    'predictions.npz',
    'npz/nn.npz'
)

In [10]:
# Open the prediction archive. allow_pickle is required because the stored
# array has object dtype. The archive is left open on purpose: the next cell
# reads from `x`.
x = np.load('predictions.npz', allow_pickle=True)

In [11]:
# Display the array stored under 'arr_0', the name np.savez gives to the
# first positional array.
# NOTE(review): this prints the whole array; consider slicing before display.
x['arr_0']

array([['000932580e404dafbecd5916d4640938',
        list([0.0, 0, '0ec32896a78c469983aba2fe4fb66b97', '02981ce603e140fa9840fddcafe530b1', '0c52e5318c6c4725a5e070630d45b09c', '00b9ef5ab6c04fcd969bab43a898f14e', '107542f0bfcb476f899907b17ffdcd99', '1f2100d56e9c4eddb049cf126841b7dc', '151845e35a554fd39fc5fbe4477aee88', '19b1fb5be667430585ec03d052a7cd84', '0cbc645a2f25461a9b52d520d36a3981', '029f234dfb78484a9e8918f97b08b218', '0ac4431a741746d8873ef46e214e355f', '15621f1f7d854966b10cc32438a7f804', '08b940b588284877828b052f121eaf53', '18f83b3b7f694c0395a047bc693bafa8', '1dd9bc0185b544ee87e585047f235b12', '1345f78619ca44e0a9e264968cd729eb', '026bf40a4c0d4f0bb564d40098e087c7', '010fa6cf11f24bb49c86464c5c413f31', '06fb680c5b3d42df9bb55e60d48d03b9', '1a6a5b4fd85f49b9a2fe676d825b2b91', '1d9278bdb853412ea61f2de3b0670e1c', '1e8e3e1a150b4ea09856c82dc97b6961', '0150bb90129545e889f32003335d27a3', '0078f349d197481c9e2fb43b0a6d1b84', '1d1838a3cc6b4812990f0eff4858247a', '04c17584d2994f7ea64ba76963c01ac9'

In [None]:
pip install catboost

In [None]:
# Scratch expression; it does not change any notebook state.
1

In [None]:
# Scratch inspection of the transaction frame's column labels.
# NOTE(review): `tr` is deleted at the end of the inference cell, so this
# only works before that `del` has run.
tr.columns

In [None]:
# Scratch inspection of the clickstream frame's column labels.
# NOTE(review): `cl` is deleted at the end of the inference cell, so this
# only works before that `del` has run.
cl.columns

In [None]:
 # Scratch check: length of the rtk id list repeated six times and cut to max_len.
 len((rtk+rtk+rtk+rtk+rtk+rtk)[:max_len])

In [None]:
 # Scratch check: length of the bank id list repeated seven times and cut to max_len.
 len((bank+bank+bank+bank+bank+bank+bank)[:max_len])