In [None]:
# Mount Google Drive (Colab only) so the dataset and checkpoint paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

# load dependencies

In [None]:
import sys
# Install pytorch-lifestream (imported below as `ptls`). No version is pinned here.
!pip install pytorch-lifestream

In [None]:
# Pin pytorch-lightning and torchvision to fixed versions.
# NOTE(review): presumably for compatibility with the installed ptls release — confirm.
!pip install pytorch-lightning==1.8.5
!pip install torchvision==0.12.0

# import libraries

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.nn import functional as F
import random
import tqdm

# Run on the first CUDA GPU when torch can see one, otherwise on the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [None]:
# Seed every random number generator the notebook relies on, so that
# Restart & Run All reproduces the same run.
seed = 21
random.seed(seed)                 # drives the teacher-forcing coin flips in AeBaseline.forward
np.random.seed(seed)              # NumPy's global RNG
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # every visible GPU, not only the current one
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False  # autotuning may select different kernels between runs

# Dataloader

In [None]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices, SampleRandom
from ptls.frames import PtlsDataModule
from ptls.frames.supervised import seq_to_target
import ptls

# Data module feeding the autoencoder. The train and validation parts are cut
# from the same parquet file: both `parquet_file_scan` calls use valid_rate=0.05
# and differ only in `return_part`.
train_dl = PtlsDataModule(
    train_data=ColesDataset(
        data=ptls.data_load.datasets.AugmentationDataset(
            f_augmentations=[
                ptls.data_load.augmentations.DropoutTrx(trx_dropout=0.01),
            ],
            data=MemoryMapDataset(
                data=ptls.data_load.datasets.parquet_dataset.ParquetDataset(
                    # sequences shorter than 25 transactions are filtered out of the train part
                    i_filters=[SeqLenFilter(min_seq_len=25)],
                    data_files=ptls.data_load.datasets.parquet_file_scan(
                        file_path='drive/MyDrive/ptls-experiments/scenario_age_pred/data/train_trx_file.parquet',
                        valid_rate=0.05,
                        return_part='train',
                    ),
                ),
            ),
        ),
        splitter=SampleSlices(split_count=1, cnt_min=25, cnt_max=75),
    ),
    valid_data=ColesDataset(
        data=MemoryMapDataset(
            data=ptls.data_load.datasets.parquet_dataset.ParquetDataset(
                i_filters=[ptls.data_load.iterable_processing.FeatureFilter()],
                data_files=ptls.data_load.datasets.parquet_file_scan(
                    file_path='drive/MyDrive/ptls-experiments/scenario_age_pred/data/train_trx_file.parquet',
                    valid_rate=0.05,
                    return_part='valid',
                ),
            ),
        ),
        splitter=SampleSlices(split_count=1, cnt_min=25, cnt_max=50),
    ),
    train_batch_size=64,
    train_num_workers=8,
    valid_batch_size=256,
    valid_num_workers=16,
)

In [None]:
import torch.nn as nn
from ptls.nn import TrxEncoder
from ptls.data_load.padded_batch import PaddedBatch

# TrxEncoder configuration with embedding tables for the two categorical
# fields and the raw amount passed through unchanged ('identity').
trx_encoder_params = {
    'embeddings_noise': 0.0003,
    'numeric_values': {'amount_rur': 'identity'},
    'embeddings': {
        'trans_date': {'in': 800, 'out': 16},
        'small_group': {'in': 250, 'out': 16},
    },
    'use_batch_norm_with_lens': True,
    'norm_embeddings': False,
}

# Variant that declares every field as an 'identity' numeric value and turns
# both batch-norm options off.
trx_encoder_params_identity = {
    'numeric_values': dict.fromkeys(
        ['amount_rur', 'trans_date', 'small_group'], 'identity'
    ),
    'use_batch_norm_with_lens': False,
    'use_batch_norm': False,
    'norm_embeddings': False,
}

trx_enc = TrxEncoder(**trx_encoder_params)
id_trx_enc = TrxEncoder(**trx_encoder_params_identity)

In [None]:
# Feature names grouped by how the models consume them: categorical features
# go through embedding tables, numeric ones are used as raw values.
cat_feats = list(('trans_date', 'small_group'))
num_feats = list(('amount_rur',))
# Column order of every feature tensor: categorical first, then numeric.
all_feats = [*cat_feats, *num_feats]

In [None]:
# Training transactions; the vocabulary cell below reads one sequence per row from each categorical column.
train_df = pd.read_parquet('drive/MyDrive/ptls-experiments/scenario_age_pred/data/train_trx_file_cut.parquet')

In [None]:
# Test transactions; concatenated with `train_df` in the inference cell to embed every client.
test_df = pd.read_parquet('drive/MyDrive/ptls-experiments/scenario_age_pred/data/test_trx_file_cut.parquet')

In [None]:
# For each categorical feature: the sorted distinct values found in the
# training sequences, followed by the padding index 0. The models only use
# max(vocab[feat]) + 1 to size their embedding tables and output heads.
vocab = {}
for feat in cat_feats:
  seen_values = np.unique(np.concatenate(list(train_df[feat])))
  vocab[feat] = sorted(seen_values) + [0]

# Model structure

In [None]:
class EncoderRNN(nn.Module):
    """GRU encoder over sequences of mixed categorical and numeric features.

    Categorical columns are looked up in per-feature embedding tables, numeric
    columns are used as-is, and the concatenation is fed to a (optionally
    bidirectional, multi-layer) GRU.

    Args:
        hidden_size: width of the GRU hidden state.
        cat_feats: names of the categorical features (first columns of the input).
        num_feats: names of the numeric features (remaining columns of the input).
        vocab: maps each categorical feature to its known ids; only the maximum
            is used, to size the embedding table.
        dims_of_embeddings: either one int applied to every categorical feature
            or a dict ``{feature: embedding_dim}``.
        device: stored on the module; not used by the encoder itself.
        dec_hid_size: accepted for interface compatibility; not used.
        n_layers: number of stacked GRU layers.
        dropout: dropout probability applied to the embedded input.
        bidir: whether the GRU is bidirectional.
    """

    def __init__(self, hidden_size, cat_feats, num_feats, vocab, dims_of_embeddings, device, dec_hid_size, n_layers=1, dropout=0.2, bidir=False):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.device = device

        if isinstance(dims_of_embeddings, int):
            dims_of_embeddings = {feat: dims_of_embeddings for feat in cat_feats}

        # Each numeric feature contributes exactly one input channel.
        self.input_size = sum(dims_of_embeddings.values()) + len(num_feats)

        self.vocab = vocab
        self.cat_feats = cat_feats
        self.num_feats = num_feats
        self.all_feats = cat_feats + num_feats

        self.embedding_layers = nn.ModuleDict()
        for feat, dim_of_emb in dims_of_embeddings.items():
            # int(...) because the vocabulary may hold NumPy integer scalars.
            num_embeddings = int(max(vocab[feat])) + 1
            self.embedding_layers[feat] = nn.Embedding(num_embeddings, dim_of_emb, padding_idx=0)

        self.dirs = 2 if bidir else 1
        self.n_layers = n_layers
        self.gru = nn.GRU(self.input_size, hidden_size, batch_first=True, num_layers=n_layers, bidirectional=bidir)
        self.dropout = nn.Dropout(dropout)

    def get_embeddings(self, input_features):
        """Turn a (batch, seq, n_features) tensor into the GRU input.

        Column ``i`` of ``input_features`` belongs to ``self.all_feats[i]``.
        Returns a (batch, seq, input_size) tensor, or None when there are no
        features at all.
        """
        pieces = []
        for i, feat in enumerate(self.all_feats):
            column = input_features[:, :, i]
            if feat in self.cat_feats:
                pieces.append(self.embedding_layers[feat](column.type(torch.long)))
            else:
                pieces.append(column[:, :, None])
        return torch.cat(pieces, dim=2) if pieces else None

    def forward(self, input, hidden_0=None):
        """Encode a batch of sequences.

        Args:
            input: (batch, seq, n_features) tensor, columns ordered as ``all_feats``.
            hidden_0: optional initial GRU state of shape
                (n_layers * dirs, batch, hidden_size); None means zeros.

        Returns:
            ``(output, hidden_n)``: the per-step GRU outputs and the final
            hidden state after a ReLU.
        """
        embedded = self.dropout(self.get_embeddings(input))

        # hidden_0 used to be accepted but never handed to the GRU, so a
        # caller-supplied state was silently dropped. nn.GRU treats None as zeros.
        output, hidden_n = self.gru(embedded, hidden_0)

        hidden_n = torch.relu(hidden_n)

        return output, hidden_n

In [None]:
class BahdanauAttention(nn.Module):
    """Additive attention: score = Va(tanh(Wa(query) + Ua(keys)))."""

    def __init__(self, hidden_size):
        super().__init__()
        # Three projections: query, keys, and the scalar score.
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        """Attend over ``keys`` with ``query``.

        Args:
            query: (batch, 1, hidden) tensor, broadcast against the keys.
            keys: (batch, n_keys, hidden) tensor.

        Returns:
            ``(context, weights)``: the weighted sum of the keys, shape
            (batch, 1, hidden), and the softmax weights, shape (batch, 1, n_keys).
        """
        energy = torch.tanh(self.Wa(query) + self.Ua(keys))
        # (batch, n_keys, 1) -> (batch, 1, n_keys) so the softmax runs over the keys.
        scores = self.Va(energy).transpose(1, 2)
        weights = torch.softmax(scores, dim=-1)
        context = weights.bmm(keys)
        return context, weights

In [None]:
class DecoderRNN(nn.Module):
    """GRU decoder whose initial state is an attention read-out of the encoder.

    Each call embeds the given step(s), builds one attention context per GRU
    layer from the incoming hidden state and the encoder outputs, and runs the
    GRU starting from those contexts.
    """

    def __init__(self, hidden_size, cat_feats, num_feats, vocab, dims_of_embeddings, device, n_layers=1, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.device = device

        self.vocab = vocab
        self.cat_feats = cat_feats
        self.num_feats = num_feats
        self.all_feats = cat_feats + num_feats

        # Normalise the embedding spec to a {feature: dim} mapping.
        if isinstance(dims_of_embeddings, int):
            dims_of_embeddings = dict.fromkeys(cat_feats, dims_of_embeddings)
        elif isinstance(dims_of_embeddings, list):
            assert len(dims_of_embeddings) == len(self.all_feats)
            dims_of_embeddings = dict(zip(self.all_feats, dims_of_embeddings))

        # One extra input channel per numeric feature.
        self.input_size = sum(dims_of_embeddings.values()) + len(num_feats)

        self.embedding_layers = nn.ModuleDict({
            feat: nn.Embedding(max(vocab[feat]) + 1, dim_of_emb, padding_idx=0)
            for feat, dim_of_emb in dims_of_embeddings.items()
        })

        self.attention = BahdanauAttention(hidden_size)
        self.gru = nn.GRU(self.input_size, hidden_size, batch_first=True, num_layers=n_layers)
        self.dropout = nn.Dropout(dropout)
        self.softmax = nn.LogSoftmax(dim=1)

    def get_embeddings(self, input):
        """Embed a (batch, seq, n_features) tensor column by column.

        Categorical columns are cast to long and looked up, numeric columns
        are kept as a single channel; every piece is moved to ``self.device``.
        Returns None when there are no features.
        """
        parts = []
        for idx, feat in enumerate(self.all_feats):
            column = input[:, :, idx]
            if feat in self.cat_feats:
                indices = column.type(torch.long).to(self.device)
                parts.append(self.embedding_layers[feat](indices))
            else:
                parts.append(column[:, :, None].to(self.device))
        if not parts:
            return None
        return torch.cat(parts, dim=2)

    def forward(self, input_features, hidden, encoder_outputs):
        """Run the decoder GRU on ``input_features``.

        Args:
            input_features: (batch, seq, n_features) decoder input.
            hidden: (n_layers, batch, hidden_size) state used as attention queries.
            encoder_outputs: (batch, n_keys, width) attention keys; when
                ``width`` is twice the hidden size the two halves are averaged.

        Returns:
            ``(output, hidden)`` exactly as produced by the GRU.
        """
        embedded = self.get_embeddings(input_features).type(torch.float).to(self.device)

        width = hidden.shape[2]
        if encoder_outputs.shape[2] == width * 2:
            first_half = encoder_outputs[:, :, :width]
            second_half = encoder_outputs[:, :, width:]
            encoder_outputs = (first_half + second_half) / 2

        # One query per layer: (n_layers, batch, hidden) -> (batch, n_layers, hidden).
        queries = hidden.permute(1, 0, 2)
        contexts = [
            self.attention(queries[:, layer:layer + 1, :], encoder_outputs)[0]
            for layer in range(queries.shape[1])
        ]

        # With no keys to attend over, fall back to the incoming state.
        if encoder_outputs.shape[1] > 0:
            initial_state = torch.cat(contexts, dim=1).permute(1, 0, 2)
        else:
            initial_state = hidden

        embedded = self.dropout(embedded)
        output, hidden = self.gru(embedded, initial_state.contiguous())

        return output, hidden

In [None]:
class AeBaseline(nn.Module):
    """Sequence autoencoder: encode a sequence, then decode it step by step.

    The decoder output of every step goes through one linear head per feature:
    a classification head for each categorical feature and a scalar regression
    head for each numeric feature.

    Args:
        encoder: module called as ``encoder(input, hidden_0)`` that returns
            ``(outputs, hidden)`` and exposes ``dirs`` and ``n_layers``.
        decoder: module called as ``decoder(step_input, hidden, encoder_outputs)``
            that returns ``(output, hidden)`` and exposes ``hidden_size``.
        device: device on which decoder inputs are created.
        cat_feats: names of the categorical features (first input columns).
        num_feats: names of the numeric features (remaining input columns).
        vocab: maps each categorical feature to its known ids.
    """

    def __init__(self, encoder, decoder, device, cat_feats, num_feats, vocab):
        super(AeBaseline, self).__init__()

        self.enc = encoder
        self.dec = decoder
        self.device = device

        self.cat_feats = cat_feats
        self.num_feats = num_feats
        self.all_feats = cat_feats + num_feats
        self.vocab = vocab

        self.heads = nn.ModuleDict()
        for feat, value in self.vocab.items():
            # int(...) because the vocabulary may hold NumPy integer scalars.
            vocab_size = int(max(value)) + 1
            self.heads[feat] = nn.Linear(decoder.hidden_size, vocab_size)
        for feat in self.num_feats:
            self.heads[feat] = nn.Linear(decoder.hidden_size, 1)

    def new_ts_unit(self, output):
        """Turn a decoder output into the next decoder input.

        Args:
            output: (batch, steps, decoder_hidden) decoder output.

        Returns:
            ``(new_input, logits)``: ``new_input`` is a float tensor with one
            column per feature (arg-max id for categorical features, head
            output for numeric ones); ``logits`` lists the raw head outputs of
            the categorical features in ``cat_feats`` order.
        """
        columns = []
        logits = []

        # Use the features this model was built with, not same-named notebook
        # globals, so the model also works outside the notebook namespace.
        for feat in self.all_feats:
            predict = self.heads[feat](output)
            if feat in self.cat_feats:
                logits.append(predict)
                predict = predict.max(dim=2).indices[:, :, None]
            columns.append(predict.type(torch.float))

        new_input = torch.cat(columns, dim=2) if columns else None
        return new_input, logits

    def dict2tensor(self, a, all_feats):
        """Stack the per-feature payload of a batch into one float tensor.

        Args:
            a: batch as yielded by the dataloader; ``a[0]`` must expose
                ``payload`` (dict feature -> 2-D tensor) and ``seq_feature_shape``.
            all_feats: feature names, in the column order of the result.

        Returns:
            ``(input_features, n_steps)``: a float tensor of shape
            ``(*seq_feature_shape, len(all_feats))`` and the padded sequence
            length ``seq_feature_shape[1]``.
        """
        batch = a[0]
        # The shortest filled length used to be computed here with a per-row
        # argwhere loop, but the padded length was returned; the loop is gone.
        flat_columns = [
            torch.cat(tuple(batch.payload[feat])).reshape(1, -1)
            for feat in all_feats
        ]

        input_features = (
            torch.cat(flat_columns)
            .permute(-1, 0)
            .reshape(*batch.seq_feature_shape, len(all_feats))
            .type(torch.float)
        )

        return input_features, batch.seq_feature_shape[1]

    def forward(self, input_features, minim, hidden_0=None, teacher_forcing_ratio = 0.5):
        """Encode ``input_features`` and decode ``minim`` steps.

        Args:
            input_features: (batch, seq, n_features) float tensor.
            minim: number of steps to decode.
            hidden_0: optional initial state forwarded to the encoder.
            teacher_forcing_ratio: per-step probability of feeding the true
                step to the decoder instead of its own prediction.

        Returns:
            ``(predicts, logits_to_out)``: ``predicts`` is (batch, minim,
            n_features); ``logits_to_out`` has one entry per feature in
            ``all_feats`` order: (batch, minim, vocab) logits for categorical
            features and (batch, minim, 1) values for numeric ones.
        """
        encoder_outputs, hidden = self.enc(input_features, hidden_0)

        batch_size, len_seq, quantity_of_feats = input_features.shape

        # The first decoder input is an all-zero step.
        input_to_decoder = torch.zeros(batch_size, 1, quantity_of_feats).to(self.device)

        if self.enc.dirs == 2:
            # Average each layer's forward/backward state so the decoder gets
            # one state per layer: (layers * 2, B, H) -> (layers, B, H).
            hidden = hidden.reshape(self.enc.n_layers, 2, *hidden.shape[1:]).mean(dim=1).to(self.device)

        predicts = []
        all_logits = []
        for i in range(minim):
            # Step i attends only to the first i encoder outputs.
            output, hidden = self.dec(input_to_decoder, hidden, encoder_outputs[:, :i, :])
            input_new, logits = self.new_ts_unit(output)
            teacher_force = random.random() < teacher_forcing_ratio
            input_to_decoder = input_features[:, i, :][:, None, :].to(self.device) if teacher_force else input_new
            predicts.append(input_new)
            all_logits.append(logits)

        predicts = torch.cat(predicts, dim=1)

        logits_to_out = []
        for i, feat in enumerate(self.all_feats):
            if feat in self.cat_feats:
                # Entry i of each per-step list is this feature's logits,
                # because categorical features come first in all_feats.
                pred = torch.cat([probs[i] for probs in all_logits], dim=1)
            else:
                pred = predicts[:, :, i][:, :, None]
            logits_to_out.append(pred)

        return predicts, logits_to_out

# Model definition

In [None]:
# Embedding width for each categorical feature.
embedding_dims = dict(trans_date=32, small_group=32)
# The encoder and decoder GRUs use the same hidden width.
enc_hid_size = dec_hid_size = 512

enc = EncoderRNN(
    hidden_size=enc_hid_size,
    cat_feats=cat_feats,
    num_feats=num_feats,
    vocab=vocab,
    dims_of_embeddings=embedding_dims,
    device=device,
    dec_hid_size=dec_hid_size,
    n_layers=1,
    bidir=False,
)
dec = DecoderRNN(
    hidden_size=dec_hid_size,
    cat_feats=cat_feats,
    num_feats=num_feats,
    vocab=vocab,
    dims_of_embeddings=embedding_dims,
    device=device,
    n_layers=1,
)

# Full autoencoder, moved to the selected device.
model = AeBaseline(enc, dec, device, cat_feats, num_feats, vocab)
model = model.to(device)


# Train

## train epoch

In [None]:
def train_epoch(dataloader, model, optimizer, cat_loss, num_loss, teacher_forcing=0.5):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: iterable of batches accepted by ``model.dict2tensor``.
        model: autoencoder exposing ``dict2tensor``, ``all_feats``,
            ``cat_feats`` and ``device``; called as
            ``model(data, n_steps, teacher_forcing_ratio=...)``.
        optimizer: optimizer over the model parameters.
        cat_loss: loss for categorical features, called with
            (batch, classes, steps) logits and (batch, steps) integer targets.
        num_loss: element-wise loss for numeric features, called as
            ``num_loss(prediction, target)``; it must return one value per element.
        teacher_forcing: teacher-forcing probability passed to the model.

    Returns:
        The mean loss per sequence: every batch loss is weighted by its batch
        size before dividing by the number of sequences seen. ``nan`` when the
        dataloader yields no batches.
    """
    model.train()
    total_loss = 0.0
    data_length = 0
    for data in tqdm.tqdm(dataloader):
        optimizer.zero_grad()

        prepared_data, minim = model.dict2tensor(data, model.all_feats)
        # Use the device the model was built for instead of a notebook global.
        prepared_data = prepared_data.to(model.device)
        batch_size = prepared_data.shape[0]
        predicted, logits = model(prepared_data, minim, teacher_forcing_ratio=teacher_forcing)

        loss = 0
        for i, feat in enumerate(model.all_feats):
            if feat in model.cat_feats:
                target = prepared_data[:, :minim, i].type(torch.long)
                # (batch, steps, classes) -> (batch, classes, steps)
                logit_pred = logits[i].permute(0, 2, 1)
                loss += cat_loss(logit_pred, target)
            else:
                target = prepared_data[:, :minim, i][:, :, None]
                # Positions whose target is exactly 0 contribute no error.
                masked_target = target != 0
                pred = predicted[:, :, i][:, :, None]
                loss_temp = num_loss(pred, target) * masked_target
                loss += loss_temp.mean()

        loss.backward()
        optimizer.step()

        # `loss` is a per-batch mean, so weight it by the batch size; this is
        # what makes the final division by the sample count a true average.
        total_loss += loss.item() * batch_size
        data_length += batch_size

    if data_length == 0:
        return float('nan')
    return total_loss / data_length

## evaluate func

In [None]:
def evaluate(dataloader, model, cat_metric, num_metric):
    """Score the model's reconstructions without teacher forcing.

    Args:
        dataloader: iterable of batches accepted by ``model.dict2tensor``.
        model: autoencoder exposing ``dict2tensor``, ``all_feats``,
            ``cat_feats`` and ``device``.
        cat_metric: unused; kept for interface compatibility.
        num_metric: unused; kept for interface compatibility.

    Returns:
        dict feature -> score, averaged over every (sequence, step) position of
        the padded batches: the share of exactly matching ids for categorical
        features and the mean squared error for numeric ones. All values are
        ``nan`` when the dataloader yields no batches.
    """
    model.eval()
    metrics = {feat: 0 for feat in model.all_feats}
    count_of_samples = 0
    with torch.no_grad():
        for vdata in dataloader:
            vdata, minim = model.dict2tensor(vdata, model.all_feats)
            # Use the device the model was built for instead of a notebook global.
            vdata = vdata.to(model.device)
            predicted, logits = model(vdata, minim, teacher_forcing_ratio=0.0)
            count_of_samples += vdata.shape[0] * vdata.shape[1]
            for feat_idx, feat in enumerate(model.all_feats):
                target = vdata[:, :minim, feat_idx]
                guess = predicted[:, :, feat_idx]
                if feat in model.cat_feats:
                    metrics[feat] += (guess == target).sum().type(torch.float).item()
                else:
                    metrics[feat] += ((guess - target) ** 2).sum().type(torch.float).item()

    if count_of_samples == 0:
        return {feat: float('nan') for feat in metrics}
    return {feat: total / count_of_samples for feat, total in metrics.items()}

## train main

In [None]:
from torch.optim.lr_scheduler import StepLR
import matplotlib.pyplot as plt

learning_rate = 0.001

# Class index 0 is padding and is excluded from the categorical loss.
cat_loss = nn.CrossEntropyLoss(ignore_index=0)
# Element-wise MSE; train_epoch masks and averages it itself.
num_loss = nn.MSELoss(reduction='none')

n_epochs = 150
teacher_forcing = 0.5
steps_teacher_forcing = 4     # decay the teacher-forcing ratio every 4 epochs ...
gamma_teacher_forcing = 0.8   # ... by this factor
step_size_lr = 30
gamma_lr = 0.9025

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size_lr, gamma=gamma_lr)

min_loss = None
model_name = 'age_pred_model_gru.p'
checkpoint_dir = 'drive/MyDrive/ae_baseline_model/'

plot_losses = []
for epoch in range(1, n_epochs + 1):
  if epoch % steps_teacher_forcing == 0:
      teacher_forcing *= gamma_teacher_forcing

  loss = train_epoch(train_dl.train_dataloader(), model, optimizer, cat_loss, num_loss, teacher_forcing=teacher_forcing)

  # Save the "best" checkpoint for the first epoch as well; previously the
  # file was only written once a later epoch beat the first one.
  if min_loss is None or loss < min_loss:
    min_loss = loss
    torch.save(model.state_dict(), checkpoint_dir + 'best_' + model_name)

  scheduler.step()

  print(f"epoch {epoch}; loss {loss}")
  plot_losses.append(loss)
  # The latest weights are saved every epoch, whether or not they are the best.
  torch.save(model.state_dict(), checkpoint_dir + model_name)
  if epoch % 15 == 0:
    evaluate_metrics = evaluate(train_dl.val_dataloader(), model, None, None)
    print(evaluate_metrics)

plt.plot(plot_losses)

# Inference and embedding validation

## inference + saving embeddings

In [None]:
# NOTE(review): the training cell saves to 'drive/MyDrive/ae_baseline_model/[best_]age_pred_model_gru.p';
# confirm that 'ae_model.p' is the checkpoint intended here.
model_path = '/content/drive/MyDrive/ae_baseline_model/ae_model.p'
embeddings_path = '/content/drive/MyDrive/ae_data/ae_model.csv'

model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

all_df = pd.concat([train_df, test_df]).set_index('client_id')
all_df.index = all_df.index.astype(np.int64)

# Largest id each embedding table can look up. Larger ids (unseen in training)
# are mapped to the padding index 0 up front. This replaces the old bare
# `except:` retry, which also swallowed unrelated errors and cannot recover
# from an out-of-range index on a CUDA device.
max_cat_index = {feat: max(vocab[feat]) for feat in cat_feats}

embs_of_seqs = []
target_labels = []
with torch.no_grad():
  for i in tqdm.tqdm(range(all_df.shape[0])):
    row = all_df.iloc[i]
    # One sequence per client: (1, seq_len, n_features), columns ordered as all_feats.
    input_tensor = torch.zeros(1, row['small_group'].shape[0], len(all_feats))
    for j, feat in enumerate(all_feats):
      values = np.asarray(row[feat])
      if feat in max_cat_index:
        values = np.where(values <= max_cat_index[feat], values, 0)
      input_tensor[:, :, j] = torch.tensor(values)

    # The flattened final encoder state is the client's embedding.
    _, hidden = model.enc(input_tensor.to(device))
    hidden = hidden.reshape(-1).detach().cpu().numpy()

    target_labels.append(row['target'])
    embs_of_seqs.append(hidden)

df_of_embs = pd.DataFrame(embs_of_seqs)

In [None]:
# Per-client label table; the merge below reads its 'client_id', 'has_rare_small_group' and 'amount_group' columns.
targets = pd.read_csv(
    '/content/drive/MyDrive/ptls-experiments/scenario_age_pred/data/train_target_cut.csv'
    )

In [None]:
# Display the available label columns.
targets.columns

In [None]:
# Attach the two extra label columns to every client row. `how='left'` keeps all rows of
# `all_df`, so clients absent from `targets` get NaN in the new columns.
# NOTE(review): `all_df` is reassigned in place, so re-running this cell merges a second time.
all_df = all_df.merge(targets[['client_id','has_rare_small_group', 'amount_group']], on='client_id', how='left')

In [None]:
# Label every embedding row with its client id, join the label columns onto it
# and write the table out.
# NOTE(review): assumes the rows of `df_of_embs` are still in the same order as `all_df` — confirm.
label_columns = ['client_id' , 'target', 'has_rare_small_group', 'amount_group']
labels_per_client = all_df.reset_index()[label_columns]
df_of_embs = (
    df_of_embs
    .set_index(all_df.client_id)
    .merge(labels_per_client, how='left', on='client_id')
)
df_of_embs.to_csv(embeddings_path, index=False)

## emb validation

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.linear_model import LogisticRegression
import random

In [None]:
# Embedding table to validate, indexed by client id.
# NOTE(review): this path differs from `embeddings_path` written by the inference cell — confirm which file is intended.
df = pd.read_csv('/content/drive/MyDrive/ae_data/best_new_ae_512_tf0_max_epoch_age_pred_cut.csv', index_col='client_id')
# Client ids that form the held-out test split (see the split cell below).
test_ids = pd.read_csv('/content/drive/MyDrive/ptls-experiments/scenario_age_pred/data/test_ids_file.csv')
# Label columns stored next to the embeddings; they are dropped from the feature matrix below.
TARGET_COLS = ['target', 'has_rare_small_group', 'amount_group']

In [None]:
SEED = 21
np.random.seed(SEED)
random.seed(SEED)

# Keep only the label columns present in the table, in the table's column
# order; the label to predict is the third of them.
TARGET_COLS = [col for col in df.columns if col in TARGET_COLS]
TARGET_COLUMN = TARGET_COLS[2]

# True for rows whose client id is listed in the test ids file.
is_test_client = df.index.isin(test_ids['client_id'])

train_rows = df[~is_test_client]
df_train = train_rows.drop(columns=TARGET_COLS)
targets_train = train_rows[TARGET_COLUMN]

test_rows = df[is_test_client]
df_test = test_rows.drop(columns=TARGET_COLS)
targets_test = test_rows[TARGET_COLUMN]

In [None]:
# Sanity check: (number of training clients, number of embedding columns).
df_train.shape

In [None]:
def _auc_inputs(fitted_clf, X, binary):
  """Return the scores roc_auc_score needs for X.

  Binary problems use the probability of the second class; multi-class
  problems use the full probability matrix.
  """
  proba = fitted_clf.predict_proba(X)
  return proba[:, 1] if binary else proba


# A fixed random_state makes the folds independent of how much of the global
# NumPy RNG state earlier cells have consumed.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)
indexes = np.arange(len(targets_train))

arr_acc_test = []
arr_auc_test = []
arr_acc_val = []
arr_auc_val = []

# Linear probe on the embeddings: scale each column, then fit a one-vs-rest logistic regression.
clf = pipe = make_pipeline(
        MaxAbsScaler(),
        LogisticRegression(
            random_state=21,
            max_iter=1000000,
            multi_class='ovr'
        )
    )
binary_clf = targets_train.unique().shape[0] == 2

for train, val in kf.split(indexes, targets_train):
  X_train, X_val = df_train.iloc[train], df_train.iloc[val]
  y_train, y_val = targets_train.iloc[train], targets_train.iloc[val]
  clf.fit(X_train, y_train)

  # Every fold model is scored on its own validation fold and on the fixed test split.
  arr_acc_test.append(clf.score(df_test, targets_test))
  arr_acc_val.append(clf.score(X_val, y_val))

  pred_label_test = _auc_inputs(clf, df_test, binary_clf)
  pred_label_val = _auc_inputs(clf, X_val, binary_clf)

  arr_auc_test.append(roc_auc_score(targets_test, pred_label_test, average='macro', multi_class='ovr'))
  arr_auc_val.append(roc_auc_score(y_val, pred_label_val, average='macro', multi_class='ovr'))

arr_acc_test = np.array(arr_acc_test)
arr_auc_test = np.array(arr_auc_test)
arr_acc_val = np.array(arr_acc_val)
arr_auc_val = np.array(arr_auc_val)


# Final model: refit on the whole training split and score once on the test split.
clf.fit(df_train, targets_train)
acc = clf.score(df_test, targets_test)
pred_label = _auc_inputs(clf, df_test, binary_clf)

roc_auc = roc_auc_score(targets_test, pred_label, average='macro', multi_class='ovr')

print(TARGET_COLUMN)
print('Val:  acc_mean: {0}, auc_mean: {1}, acc_std: {2}, auc_std: {3}'.\
          format(arr_acc_val.mean(), arr_auc_val.mean(), arr_acc_val.std(), arr_auc_val.std()))
print('Test: acc_mean: {0}, auc_mean: {1}, acc_std: {2}, auc_std: {3}'.\
          format(arr_acc_test.mean(), arr_auc_test.mean(), arr_acc_test.std(), arr_auc_test.std()))
print('Full trained model: acc: {0}, auc: {1}'.format(acc, roc_auc))