In [1]:
import numpy as np
import random
import pandas as pd

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import os
import shutil
import _pickle as cPickle

import matplotlib.pyplot as plt
import plotly
from plotly import graph_objects as go

# 3. NN training.

In [144]:
class SpyHunterDataset(Dataset):
    def __init__(self, set_data: pd.DataFrame, file_name: str, mode='train'):
        super(SpyHunterDataset, self).__init__()
        # 1. Process the data or load already processed.
        # 1.1. Group by User and Sort transactions by a datetime.
        # 1.2. Drop datetime since the data is ordered. Priority to the memory consumption.
        if not os.path.exists(file_name):
            # Sort by datetime
            set_data_grouped = set_data.sort_values('Datetime')
            # Remove the column just used by not choosing it
            set_data_grouped = set_data_grouped[['User', 'Amount']]
            # Group by User
            set_data_grouped = set_data_grouped.groupby('User')
            # Get labels in nd.arrays
            set_date_clear_users = set_data[set_data['IsFraud_target'] == 0]['User'].unique()
            set_data_fraud_users = set_data[set_data['IsFraud_target'] == 1]['User'].unique()
            new_data = [set_data_grouped, set_date_clear_users, set_data_fraud_users]
            with open(file_name, 'wb') as f:
                cPickle.dump(new_data, f)
        else:
            with open(file_name, 'rb') as f:
                new_data = cPickle.load(f)

        self.set_data_grouped = new_data[0]
        self.set_date_clear_users = new_data[1]
        self.set_data_fraud_users = new_data[2]

        # 2. For train set separate Users into clear and fraud subgroups to make them equal in size,
        # for valid and test it is not required since all the users have to be classified
        # 2.1. If train, balance the clear and fraud subgroups. More detail in the self.rebalance() method.
        assert mode in ('train', 'eval')
        self.chosen_clear = self.set_date_clear_users
        self.chosen_fraud = self.set_data_fraud_users
        if mode == 'train':
            self.rebalance()
        # 3. Divide the data into N bins to make sampling, training and finally a convergence faster.
        # More details in the self.reshuflle() method.


    def rebalance(self, technique='Downsample'):
        # Since clear and fraud users are presented in an unequal amount, for stable binary prediction
        # it is required to train in a balanced more. There are some techniques to achieve it.
        # 1. Downsample: remove Users from a bigger group. Used by default.
        # 2. Upsample: duplicate Users from a smaller group.
        # 3. Resample: another technique to upsample a smaller group and downsample a bigger group.
        # 4. Other weird sampling techniques.

        if technique == 'Downsample':
            n_clear = len(self.set_date_clear_users)
            n_fraud = len(self.set_data_fraud_users)
            if n_clear < n_fraud:
                self.chosen_clear = self.set_data_fraud_users
                self.chosen_fraud = random.sample(population=self.set_data_fraud_users, k=n_clear)
            else:
                self.chosen_fraud = self.set_data_fraud_users
                self.chosen_clear = random.sample(population=self.set_date_clear_users, k=n_fraud)
        else:
            assert False
        return None

    def reshuflle(self):
        # Divide all the sequences into N bins (define by data analysis or choose any number intuitively).
        # Smaller N - more rough, may be useful during training to increase a diversity
        # of samples in a batch. Especially useful when batch size is smaller, but may reduce
        # the computation gain. For big batches, shuffling will not increase diversity since samples may
        # continue to be in one pool.
        # Higher N - better accuracy estimation, may be useful during validation
        # since the impact of zeros at the start is reduced
        return

    def __len__(self):
        return len(self.groups)

    def __getitem__(self, item):
        # Return a sequence for a User under
        return self.groups['item']


def collate_fn():
    # 1. Handle randomization for dataloaders.
    # 2. Pad all the sequences up to len of the longest sequence in the batch.
    # 2.1. Why to pad in this method: https://discuss.pytorch.org/t/how-to-use-collate-fn/27181/4
    # 2.2. Left zero padding is preferred over right zero padding to prevent gradient vanishing,
    # seems to be equally important to the problem described in https://arxiv.org/abs/1409.3215

    return None

IndentationError: expected an indented block (592525440.py, line 46)

In [None]:
# dataset - padded sequences for all the users to allow fast batching, use collate_fn
# (first time in my practice)

# Varying batch length
# dataloader - simple
# tensorboard
# neural network - LSTM with several layers and FC in the end

# train loop
# valid loop
# test loop

In [2]:
with open('transactions_train_valid_test_splits_postprocessed.pickle', 'rb') as f:
    data = cPickle.load(f)

train_data = data['train']
valid_date = data['valid']
test_data = data['test']