In [1]:
import numpy as np
import random
import pandas as pd

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import os
from tqdm import tqdm
import _pickle as cPickle

from tensorboardX import SummaryWriter
from datetime import datetime
from sklearn.metrics import balanced_accuracy_score

# 3. NN training.

In [None]:
class SpyHunterDataset(Dataset):
    """Per-user transaction sequences with a binary fraud target.

    One item corresponds to one user: the user's transactions ordered by ``Datetime``
    (``Amount`` min-max normalized per user), the sequence length, and the user's target.

    Preprocessing is cached: when ``file_name`` exists it is unpickled and ``set_data`` is
    ignored; otherwise ``set_data`` is processed and the result is dumped to ``file_name``.
    The cache is never invalidated automatically -- delete the file if the data changes.
    NOTE: unpickling executes arbitrary code; only point ``file_name`` at files you created.

    Args:
        set_data: frame with the columns ``User``, ``Amount``, ``Datetime`` and
            ``IsFraud_target``. ``User`` is used as an array index, so it must hold
            non-negative integers.
        file_name: path of the pickle cache.
        mode: ``'train'`` balances clear/fraud users by downsampling; ``'eval'`` keeps all users.

    Raises:
        ValueError: for an unknown ``mode`` or an empty ``set_data``.
    """

    def __init__(self, set_data: pd.DataFrame, file_name: str, mode='train'):
        super(SpyHunterDataset, self).__init__()
        # Validate before doing any expensive work.
        if mode not in ('train', 'eval'):
            raise ValueError(f"mode must be 'train' or 'eval', got {mode!r}")

        # 1. Process the data or load the already processed version.
        if not os.path.exists(file_name):
            new_data = self._preprocess(set_data)
            with open(file_name, 'wb') as f:
                cPickle.dump(new_data, f)
        else:
            with open(file_name, 'rb') as f:
                new_data = cPickle.load(f)

        # The cache stores the normalized frame; a cache holding an already grouped
        # object (older layout) is accepted as is.
        normalized = new_data[0]
        if isinstance(normalized, pd.DataFrame):
            normalized = normalized.groupby('User')
        self.set_data_grouped_normalized = normalized  # pandas DataFrameGroupBy keyed by User
        self.set_data_clear_users: np.ndarray = new_data[1]
        self.set_data_fraud_users: np.ndarray = new_data[2]
        self.users_array_target: np.ndarray = new_data[3]
        self.users_array_n_transactions: np.ndarray = new_data[4]

        # 2. For the train set make the clear and fraud subgroups equal in size;
        # for valid and test it is not required.
        self.chosen_clear = self.set_data_clear_users
        self.chosen_fraud = self.set_data_fraud_users
        # The two groups usually differ in length, so they are concatenated (not stacked).
        self.chosen_all = np.concatenate((np.asarray(self.chosen_clear), np.asarray(self.chosen_fraud)))
        if mode == 'train':
            self.rebalance()
        # 3. Divide the users into N length-sorted bins to make sampling and training faster.
        self.order = self.chosen_all
        self.reshuflle()
        return

    @staticmethod
    def _preprocess(set_data):
        """Build the cacheable list [normalized frame, clear users, fraud users, targets, counts]."""
        if set_data is None or len(set_data) == 0:
            raise ValueError('set_data must contain at least one transaction')

        # 1.1. Sort transactions by datetime (stable, so ties keep their input order).
        set_data = set_data.sort_values('Datetime', kind='mergesort')
        # 1.2. Split users by target BEFORE the target column is dropped.
        # Assumes IsFraud_target is constant per user -- a user with both values would
        # land in both groups. TODO confirm against the data.
        is_fraud = set_data['IsFraud_target']
        clear_users = set_data.loc[is_fraud == 0, 'User'].unique()
        fraud_users = set_data.loc[is_fraud == 1, 'User'].unique()

        # 1.3. Min-max normalize Amount for each user individually (vectorized).
        normalized = set_data[['User', 'Amount']].copy()
        per_user_amount = normalized.groupby('User')['Amount']
        a_min = per_user_amount.transform('min')
        a_max = per_user_amount.transform('max')
        span = a_max - a_min
        # A user whose amounts are all equal has a zero span; map those amounts to 0
        # instead of producing NaN through 0 / 0.
        span = span.where(span != 0, 1)
        normalized['Amount'] = (normalized['Amount'] - a_min) / span

        # 1.4. Lookup array: user id -> target (-1 marks ids absent from the data).
        any_user_id_max = int(normalized['User'].max())
        users_array_target = np.full(any_user_id_max + 1, -1, dtype=np.int32)
        users_array_target[clear_users] = 0
        users_array_target[fraud_users] = 1
        # 1.5. Lookup array: user id -> number of (non-NaN) transactions.
        counts = normalized.groupby('User')['Amount'].count()
        users_array_n_transactions = np.full(any_user_id_max + 1, -1, dtype=np.int32)
        users_array_n_transactions[counts.index.to_numpy()] = counts.to_numpy()

        # 1.6. The plain frame (not the GroupBy object) is what gets pickled.
        return [normalized, clear_users, fraud_users, users_array_target, users_array_n_transactions]

    @staticmethod
    def _downsample(users, k):
        """Return ``users`` unchanged if it has at most ``k`` entries, else ``k`` random ones."""
        users = np.asarray(users)
        if len(users) <= k:
            return users
        # random.sample needs a real sequence, not a numpy array.
        return np.asarray(random.sample(users.tolist(), k), dtype=users.dtype)

    def rebalance(self, technique='Downsample'):
        """Re-draw ``chosen_clear`` / ``chosen_fraud`` so both groups have the same size.

        Since clear and fraud users are present in unequal amounts, stable binary
        prediction requires training in a balanced mode. Possible techniques:
        1. Downsample: remove Users from the bigger group. Used by default (only one implemented).
        2. Upsample: duplicate Users from the smaller group.
        3. Resample: upsample the smaller group and downsample the bigger group.

        ``chosen_all`` is refreshed as well; call ``reshuflle`` afterwards to rebuild ``order``.

        Raises:
            ValueError: if ``technique`` is not implemented.
        """
        if technique != 'Downsample':
            raise ValueError(f'Unsupported rebalancing technique: {technique!r}')

        k = min(len(self.set_data_clear_users), len(self.set_data_fraud_users))
        self.chosen_clear = self._downsample(self.set_data_clear_users, k)
        self.chosen_fraud = self._downsample(self.set_data_fraud_users, k)
        self.chosen_all = np.concatenate((self.chosen_clear, self.chosen_fraud))
        return None

    def reshuflle(self, n=10):
        """Sort the chosen users by sequence length, split into ``n`` bins, shuffle inside each bin.

        Batches drawn in order then contain sequences of similar length (little padding)
        while the users inside a bin still come in random order.
        - Smaller n: rougher bins, more diversity per batch, less computational gain.
        - Higher n: tighter bins, useful for validation since less zero padding is added.

        ``n`` is clamped to the number of chosen users. Every user belongs to exactly one bin.
        """
        values = self.users_array_n_transactions[self.chosen_all]
        indexes = np.argsort(values, kind='stable')
        self.order = self.chosen_all[indexes]

        total = len(self.order)
        n_bins = max(1, min(n, total))
        # Nearly equal bins that cover the whole array, including the remainder.
        edges = np.linspace(0, total, n_bins + 1).astype(int)
        print('Shuffling statistics')
        for i in range(n_bins):
            left, right = int(edges[i]), int(edges[i + 1])
            if left == right:
                continue
            # The bin is still sorted here, so its ends hold the shortest/longest sequence.
            print(f'--> Group {i:>2} '
                  f'MIN {self.users_array_n_transactions[self.order[left]]:>6} '
                  f'MAX {self.users_array_n_transactions[self.order[right - 1]]:>6}')
            permutation = random.sample(range(left, right), right - left)
            self.order[left:right] = self.order[permutation]
        print('Finished shuffling')
        return

    # Correctly spelled alias; the original name is kept for existing callers.
    reshuffle = reshuflle

    def __len__(self):
        """Number of users in the current order."""
        return len(self.order)

    def __getitem__(self, item):
        """Return ``(transactions frame of the user, its length, the user's target)``."""
        user_index = int(self.order[item])
        sequence = self.set_data_grouped_normalized.get_group(user_index)
        # An idea proposed by the teacher is to shorten the sequence, but it is not clear how.
        return sequence, len(sequence), self.users_array_target[user_index]



In [None]:
def transactions_collate_fn(data):
    """Collate ``(sequence, length, label)`` items into a left-zero-padded batch.

    1. Batch randomization is provided by the Dataset's own ordering.
    2. All sequences are padded up to the longest sequence in the batch.
    2.1. Why pad here: https://discuss.pytorch.org/t/how-to-use-collate-fn/27181/4
    2.2. Left zero padding is preferred over right zero padding to prevent gradient
    vanishing; see also https://arxiv.org/abs/1409.3215
    Another example:
    https://stackoverflow.com/questions/65279115/how-to-use-collate-fn-with-dataloaders

    Args:
        data: list of ``(sequence, length, label)`` tuples. ``sequence`` may be a
            DataFrame with an ``Amount`` column, a Series, an array-like or a tensor.
            The ``length`` entry is ignored; the actual sequence size is used.

    Returns:
        ``(sequences, labels)``: a float32 tensor of shape ``(batch, max_len)`` and a
        float32 tensor of shape ``(batch,)`` (float so it can feed BCE-style losses).
    """

    def _as_amounts(sequence):
        # Reduce any supported container to a flat float32 tensor of amounts.
        if isinstance(sequence, pd.DataFrame):
            sequence = sequence['Amount']
        if isinstance(sequence, torch.Tensor):
            return sequence.detach().to(torch.float32).reshape(-1)
        return torch.as_tensor(np.asarray(sequence, dtype=np.float32)).reshape(-1)

    sequences, _, labels = zip(*data)
    amounts = [_as_amounts(sequence) for sequence in sequences]
    max_len = max(amount.numel() for amount in amounts)

    sequences_new = torch.zeros((len(amounts), max_len))
    for row, amount in enumerate(amounts):
        # Write each sequence into its own row, right-aligned (zeros stay on the left).
        sequences_new[row, max_len - amount.numel():] = amount
    labels = torch.as_tensor(np.asarray(labels), dtype=torch.float32)
    return sequences_new, labels


def worker_init_fn(id):
    """Report which DataLoader worker started and the torch seed it was given.

    Background: https://github.com/pytorch/pytorch/issues/5059#issuecomment-624788105
    """
    seed = torch.initial_seed()
    banner = 'Initialize a worker #' + format(id, '<2')
    print(banner, seed, flush=True)

In [None]:
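# TODO: remaining pieces of the training pipeline.
# - DataLoader: a simple one is enough
# - TensorBoard logging
# - Neural network: LSTM with several layers and a fully connected (FC) layer at the end

# - Train loop
# - Validation loop
# - Test loop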

In [3]:
# Load the pre-split transactions (a dict with 'train', 'valid' and 'test' entries).
# NOTE: unpickling executes arbitrary code -- only load split files you produced yourself.
with open('../data/transactions_train_valid_test_splits_postprocessed.pickle', 'rb') as f:
    data = cPickle.load(f)

train_data = data['train']
valid_data = data['valid']
# `valid_date` is a misspelling that other cells still reference; keep it as an alias.
valid_date = valid_data
test_data = data['test']

In [107]:
# Build the training dataset, or load it from the pickle cache when the file exists.
ds = SpyHunterDataset(
    set_data=train_data,
    file_name='../data/train_dataset_data.pkl',
)

NameError: name 'SpyHunterDataset' is not defined

In [106]:
class MyRNNNet(nn.Module):
    """Multi-layer LSTM followed by a fully connected layer; emits one logit per sequence.

    Args:
        input_size: number of features per time step (1: the normalized amount).
        hidden_size: width of every LSTM layer.
        num_layers: number of stacked LSTM layers.
        dropout: dropout between LSTM layers (ignored by a single-layer LSTM).
    """

    def __init__(self, input_size=1, hidden_size=64, num_layers=2, dropout=0.0):
        super(MyRNNNet, self).__init__()
        self.lstm = nn.LSTM(input_size=input_size,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            batch_first=True,
                            # nn.LSTM warns when dropout is set on a single layer.
                            dropout=dropout if num_layers > 1 else 0.0)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, sequences):
        """Map a batch of sequences to raw logits.

        Args:
            sequences: tensor of shape ``(batch, seq_len)`` or ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch,)`` with un-normalized scores (apply a sigmoid for
            probabilities, or feed directly to ``BCEWithLogitsLoss``).
        """
        if sequences.dim() == 2:
            # Scalar feature per step: add the trailing feature axis the LSTM expects.
            sequences = sequences.unsqueeze(-1)
        outputs, _ = self.lstm(sequences)
        # Use the last time step; with left zero padding it is always a real transaction.
        return self.fc(outputs[:, -1, :]).squeeze(-1)

class MyTrainer:
    """Training / evaluation harness around ``MyRNNNet`` and ``SpyHunterDataset``.

    Reads the notebook globals ``train_data``, ``valid_date`` and ``test_data`` and logs
    scalars to TensorBoard under ``models/Run <timestamp>``.
    """
    overfit_one_batch = False

    # The following hyper-parameters are not referenced by the methods of this class.
    d_model = 512
    heads = 8

    beta1 = 0.9
    beta2 = 0.98
    # NOTE(review): 10e-9 equals 1e-8; if 10^-9 was intended this should be 1e-9.
    eps = 10e-9

    lr = 0.01
    weight_decay = 0.0001

    warmup_steps = 4000
    # LR Adjust
    coeffs1 = 1 / (d_model ** 0.5)
    coeffs2 = 1 / (warmup_steps ** 1.5)

    global_step = 1
    check_step = False

    epochs = 100
    num_workers = 4
    pin_memory = True
    shuffle = False
    train_bsize = 128
    eval_bsize = 64
    device = 'cuda:0'

    initial_epoch = -1
    best_acc_value = -1
    best_acc_epoch = -1

    def __init__(self):
        # Move the model first so the optimizer is built over the on-device parameters.
        self.rnn = MyRNNNet().to(self._runtime_device())
        self.opt = torch.optim.Adam(self.rnn.parameters(), lr=self.lr, weight_decay=self.weight_decay)

        # Validation must see every user, so do not rebalance it ('eval' mode).
        self.valid_ds = SpyHunterDataset(valid_date, file_name='valid_dataset_data.pkl', mode='eval')

        self.loss_fn = nn.BCEWithLogitsLoss()

        short = datetime.now().__str__()[:-7].replace(":", "-")
        self.log_dir = f'models/Run {short}'
        assert not os.path.exists(self.log_dir)

        self.tXw = SummaryWriter(self.log_dir)
        return

    def _runtime_device(self):
        """Return ``self.device``, or ``'cpu'`` when CUDA is requested but unavailable."""
        if str(self.device).startswith('cuda') and not torch.cuda.is_available():
            return 'cpu'
        return self.device

    def _make_loader(self, dataset, batch_size):
        """Build a DataLoader from the class-level loader settings."""
        on_cuda = str(self._runtime_device()).startswith('cuda')
        return DataLoader(dataset, batch_size=batch_size, shuffle=self.shuffle,
                          num_workers=self.num_workers, collate_fn=transactions_collate_fn,
                          # Pinned memory only helps host-to-GPU copies.
                          pin_memory=self.pin_memory and on_cuda, drop_last=False)

    @staticmethod
    def _balanced_accuracy(logits, labels):
        """Balanced accuracy of thresholded logits (logit > 0 <=> sigmoid > 0.5)."""
        y_pred = (logits.detach() > 0).long().cpu().numpy()
        y_true = labels.detach().long().cpu().numpy()
        return balanced_accuracy_score(y_true=y_true, y_pred=y_pred)

    def train(self):
        """Run one optimization pass over the (rebalanced) training set, logging per batch."""
        device = self._runtime_device()
        train_ds = SpyHunterDataset(train_data, file_name='train_dataset_data.pkl', mode='train')
        train_dloader = self._make_loader(train_ds, self.train_bsize)

        self.rnn.train()
        for data in tqdm(train_dloader):
            self.opt.zero_grad()

            sequences, labels = data
            sequences = sequences.to(device)
            # BCEWithLogitsLoss needs float targets of the same shape as the logits.
            labels = labels.to(device).float()

            outputs = self.rnn(sequences).reshape(-1)
            loss = self.loss_fn(outputs, target=labels)
            loss.backward()
            self.opt.step()

            batch_balanced_acc = self._balanced_accuracy(outputs, labels)
            self.tXw.add_scalar('Train/BatchLoss', loss.item(), self.global_step)
            self.tXw.add_scalar('Train/BalancedAcc', batch_balanced_acc, self.global_step)
            self.global_step += 1

        return

    def eval(self, eval_ds):
        """Evaluate the model on ``eval_ds`` without updating it.

        Returns:
            ``(mean_loss, balanced_accuracy)`` over the whole dataset, or ``None`` when
            the dataset yields no samples.
        """
        device = self._runtime_device()
        dloader = self._make_loader(eval_ds, self.eval_bsize)

        was_training = self.rnn.training
        self.rnn.eval()
        loss_sum, n_samples = 0.0, 0
        all_logits, all_labels = [], []
        try:
            with torch.no_grad():
                for sequences, labels in tqdm(dloader):
                    sequences = sequences.to(device)
                    labels = labels.to(device).float()

                    outputs = self.rnn(sequences).reshape(-1)
                    loss = self.loss_fn(outputs, target=labels)

                    # Weight by batch size so a short last batch does not skew the mean.
                    loss_sum += loss.item() * labels.numel()
                    n_samples += labels.numel()
                    all_logits.append(outputs.cpu())
                    all_labels.append(labels.cpu())
        finally:
            # Restore whichever mode the model was in before evaluation.
            self.rnn.train(was_training)

        if n_samples == 0:
            return None
        balanced_acc = self._balanced_accuracy(torch.cat(all_logits), torch.cat(all_labels))
        return loss_sum / n_samples, balanced_acc

    def test(self):
        """Evaluate on the test split; returns whatever ``eval`` returns."""
        self.test_ds = SpyHunterDataset(test_data, file_name='test_dataset_data.pkl', mode='eval')
        return self.eval(self.test_ds)

    def save(self, path):
        """Write model/optimizer state and progress counters to ``path``."""
        chkpt = {
            'rnn': self.rnn.state_dict(),
            'opt': self.opt.state_dict(),
            'global_step': int(self.global_step),
            # Plain Python numbers keep the checkpoint loadable with weights-only loading.
            'best_acc_value': float(self.best_acc_value),
            'best_acc_epoch': int(self.best_acc_epoch)
        }

        torch.save(chkpt, path)
        print('Saved a model to', path)
        return

    def load(self, path):
        """Restore the state written by ``save`` (also accepts the old 'transformer' key)."""
        chkpt = torch.load(path, map_location=self._runtime_device())

        # `save` stores the weights under 'rnn'; older checkpoints may use 'transformer'.
        model_state = chkpt['rnn'] if 'rnn' in chkpt else chkpt['transformer']
        self.rnn.load_state_dict(model_state)
        self.opt.load_state_dict(chkpt['opt'])
        self.global_step = chkpt['global_step']
        self.best_acc_value = chkpt['best_acc_value']
        self.best_acc_epoch = chkpt['best_acc_epoch']

        print('Loaded a model from', path)
        return



IndentationError: expected an indented block (223719256.py, line 3)