# Setup

In [None]:
import torch
import pickle
import numpy as np
import pandas as pd
import torch.nn as nn
import matplotlib.pyplot as plt
from tqdm import tqdm
from google.colab import drive
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader



# Print floating point numbers in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

Google Drive is used as storage for the project's data and model checkpoints.

In [None]:
# Mount Google Drive into the Colab filesystem; the CSV files, pickled datasets
# and the model checkpoint are all read from / written to this location.
MOUNT_POINT = '/content/gdrive/'
drive.mount(MOUNT_POINT)

# Build the data directory from the absolute mount point. The previous relative
# 'gdrive/...' path only resolved while the working directory was /content, so
# any change of working directory silently broke every file access below.
# `path` stays a plain string with a trailing slash because other cells
# concatenate file names onto it.
path = MOUNT_POINT + 'MyDrive/Programming Projects/Embeddings/data/'

Mounted at /content/gdrive/


CUDA acceleration (if available)

In [None]:
# Use the first CUDA device when PyTorch can see one; otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

Hyperparameters

In [None]:
#HYPERPARAMETERS

# File the training loop overwrites with the latest checkpoint after every epoch.
CHECKPOINT_PATH = path + "model.pt"

# True: load the previously pickled datasets; False: rebuild them from the CSV
# files and pickle the result.
LOAD_PICKLED_DATASET = True

# Sub-sequence sampling: number of transactions per sub-sequence and number of
# sub-sequences drawn for every client.
SEQUENCE_LEN = 120
SEQUENCES_PER_CLIENT = 10

# Model sizes: encoded event vector, GRU hidden state, per-feature categorical embedding.
EVENT_ENCODING_DIMENSIONS = 32
RNN_HIDDEN_STATE_DIMENSIONS = 64
CATEGORICAL_EMBEDDING_DIMENSIONS = 8

# Optimisation settings.
EPOCHS = 64
BATCH_SIZE = 512
LEARNING_RATE = 1e-4

# Seed both random number generators used by the notebook. numpy drives the
# train/test split and the sub-sequence sampling; torch drives weight
# initialisation and DataLoader shuffling, which were previously unseeded and
# therefore different on every run.
SEED = 1871
np.random.seed(SEED)
torch.manual_seed(SEED)

# Data processing

Defining the class for our dataset. For each client we sample SEQUENCES_PER_CLIENT subsequences of SEQUENCE_LEN transactions each and use the client's age group as the target.

Subsequences are generated by choosing a random transaction in the client's list of transactions and taking it together with the SEQUENCE_LEN - 1 transactions that follow it. If fewer transactions remain, the rest of the subsequence is padded with zeros.

In [None]:
def subsequence_generator(sequence, pad_vector, to_generate=10, length=120):
    """Sample fixed-length windows from a sequence of events, padding short ones.

    Each window starts at a uniformly random position of ``sequence`` and holds
    that event plus up to ``length - 1`` events following it. Windows that run
    past the end of the sequence are filled up with ``pad_vector``.

    Args:
        sequence: 2-D array-like of shape (n_events, n_features).
        pad_vector: 1-D array of n_features values used for padding; its dtype
            determines the dtype of the result.
        to_generate: number of windows to sample.
        length: number of events in every returned window.

    Returns:
        Array of shape (to_generate, length, n_features). When ``sequence`` is
        empty, every window consists of padding only.
    """
    sequence = np.asarray(sequence)

    # Start from an all-padding result and overwrite the leading rows below.
    ret = np.tile(pad_vector, (to_generate, length, 1))

    if len(sequence) == 0:
        return ret

    # np.random.randint excludes `high`, so the upper bound must be
    # len(sequence): with len(sequence) - 1 the last event could never be
    # chosen and a single-event sequence raised "low >= high".
    start_idx = np.random.randint(0, len(sequence), size=to_generate)

    for i, idx in enumerate(start_idx):
        window = sequence[idx:idx + length]
        ret[i, :len(window)] = window

    return ret

In [None]:
class TransactionDataset(Dataset):
    """Per-client dataset of randomly sampled transaction sub-sequences.

    Every item is a pair ``(subsequences, target)`` where ``subsequences`` has
    shape (num_subsequences, subsequence_len, 4) with the feature columns
    ``small_group``, ``day_of_week``, ``month``, ``amount_rur`` (in that order)
    and ``target`` is the client's value from ``y.bins``.

    The three categorical columns are shifted so that their smallest real value
    is 1; the value 0 only appears in padding rows.

    Args:
        data: transactions with the columns ``client_id``, ``small_group``,
            ``trans_date`` (interpreted as a number of days) and ``amount_rur``.
            The frame is not modified.
        y: frame with the columns ``client_id`` and ``bins``.
        num_subsequences: number of sub-sequences sampled per client.
        subsequence_len: number of transactions per sub-sequence.
    """

    def __init__(self, data, y, num_subsequences=10, subsequence_len=120):
        self.clients = data['client_id'].unique()
        client_target_dict = pd.Series(y.bins.values, index=y.client_id).to_dict()

        self.target = np.vectorize(client_target_dict.get)(self.clients)

        # Derive the features on a new frame. Assigning into `data` would
        # modify the caller's DataFrame, so building a second dataset from the
        # same frame would shift `small_group` again.
        trans_date = pd.to_datetime(data['trans_date'], unit='d')
        feature_columns = ['small_group', 'day_of_week', 'month', 'amount_rur']
        features = data.assign(
            small_group=data['small_group'] + 1,  # allowing us to use "0" as padding for embeddings.
            day_of_week=trans_date.dt.dayofweek + 1,
            month=trans_date.dt.month,
        ).reindex(columns=['client_id'] + feature_columns)

        # Split the rows by client in a single pass instead of filtering the
        # whole frame once per client. groupby keeps the original row order
        # inside every group.
        sequences_by_client = {
            client_id: group.to_numpy()
            for client_id, group in features.groupby('client_id', sort=False)[feature_columns]
        }

        pad_sequence = np.array([0,0,0,0.])

        # Clients are processed in the order of `self.clients` so that
        # self.data[i] and self.target[i] belong to the same client.
        self.data = np.array([
            subsequence_generator(sequences_by_client[client_id], pad_sequence,
                                  num_subsequences, subsequence_len)
            for client_id in self.clients
        ])

    def __len__(self):
        """Number of clients in the dataset."""
        return self.target.shape[0]

    def __getitem__(self, idx):
        """Return the sampled sub-sequences and the target of client number `idx`."""
        return self.data[idx], self.target[idx]

The class for encoding each individual transaction (event).

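All categorical features are encoded by an embedding of size CATEGORICAL_EMBEDDING_DIMENSIONS.
All numerical features are batch-normalized and concatenated with the categorical embeddings; the result is passed through a linear layer, a sigmoid and a second batch normalization to produce an event vector of size EVENT_ENCODING_DIMENSIONS.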

In [None]:
class EventEncoder(nn.Module):
    """Turns a single event (numerical + categorical features) into one vector.

    Numerical features go through a non-affine batch normalisation, every
    categorical feature goes through its own embedding table (index 0 is the
    padding index), and the concatenation of all of them is mapped to
    ``out_dim`` values by Linear -> Sigmoid -> BatchNorm1d.

    Args:
        num_features: number of numerical features per event.
        cat_features: number of categorical features per event.
        cat_vocab_sizes: vocabulary size of each categorical feature, in column order.
        out_dim: size of the produced event vector.
        emb_dim: embedding size used for every categorical feature.
    """

    def __init__(self, num_features, cat_features, cat_vocab_sizes, out_dim, emb_dim=16):
        super().__init__()

        self.initial_norm = nn.BatchNorm1d(num_features, affine=False)

        # One embedding table per categorical column, created in column order.
        self.embeddings = nn.ModuleList([
            nn.Embedding(cat_vocab_sizes[column], emb_dim, padding_idx=0)
            for column in range(cat_features)
        ])
        self.cat_features = cat_features

        concat_width = num_features + cat_features * emb_dim
        self.process = nn.Sequential(
            nn.Linear(concat_width, out_dim),
            nn.Sigmoid(),
            nn.BatchNorm1d(out_dim),
        )

    def forward(self, numerical, categorical):
        """Encode a batch of events.

        Args:
            numerical: tensor of shape (n_events, num_features).
            categorical: integer tensor of shape (n_events, cat_features).

        Returns:
            Tensor of shape (n_events, out_dim).
        """
        # Normalised numerical block first, then the embeddings in column order.
        pieces = [self.initial_norm(numerical)]
        for column in range(self.cat_features):
            pieces.append(self.embeddings[column](categorical[:, column]))

        return self.process(torch.cat(pieces, 1))

In [None]:
class SequenceEncoder(nn.Module):
    """Classifies sequences of events with an event encoder followed by a GRU.

    Every event is encoded by an ``EventEncoder``, the encoded events of each
    sequence are fed to a GRU, and the GRU output at the last time step is
    mapped to ``classes`` scores by a linear layer.

    Args:
        num_features: number of numerical features per event.
        cat_features: number of categorical features per event.
        cat_vocab_sizes: vocabulary size of each categorical feature.
        sequence_len: number of events in every input sequence.
        classes: number of output classes.
        event_dim: size of an encoded event.
        hidden_size: size of the GRU hidden state.
        emb_dim: embedding size for the categorical features.
    """

    def __init__(self, num_features, cat_features, cat_vocab_sizes, sequence_len, classes, event_dim=32, hidden_size=32, emb_dim=16):

        super(SequenceEncoder, self).__init__()

        self.num_features = num_features
        self.cat_features = cat_features
        self.sequence_len = sequence_len
        self.event_dim = event_dim

        self.event_encoder = EventEncoder(num_features, cat_features, cat_vocab_sizes, event_dim, emb_dim)

        self.rnn = nn.GRU(event_dim, hidden_size, batch_first=True)

        self.classifier = nn.Linear(hidden_size, classes)

    def forward(self, numerical, categorical):
        """Score every sequence contained in the batch.

        Args:
            numerical: tensor of shape
                (batch_size, num_of_sequences, sequence_len, num_features).
            categorical: integer tensor of shape
                (batch_size, num_of_sequences, sequence_len, cat_features).

        Returns:
            Tensor of shape (batch_size * num_of_sequences, classes), ordered
            so that all sequences of the first batch element come first.
        """
        # Flatten to one row per event. reshape (unlike view) also accepts
        # non-contiguous inputs, e.g. column slices of a larger tensor, which
        # made view raise a RuntimeError.
        numerical = numerical.reshape(-1, self.num_features)
        categorical = categorical.reshape(-1, self.cat_features)

        # One encoded vector per event, then regroup the events into sequences.
        events = self.event_encoder(numerical, categorical)
        events = events.reshape(-1, self.sequence_len, self.event_dim)

        # Only the GRU output at the final time step is used for classification.
        outputs, _ = self.rnn(events)
        rnn_res = outputs[:, -1, :]

        return self.classifier(rnn_res)

In [None]:
# The raw frames are needed in both branches: the model setup reads the
# vocabulary size from `df` and the number of classes from `y`.
df = pd.read_csv(path + "transactions_train.csv")
y = pd.read_csv(path + "train_target.csv")

if LOAD_PICKLED_DATASET:
    # NOTE: unpickling executes code stored in the file, so only load dataset
    # files that were written by this notebook. The files are opened in `with`
    # blocks so the handles are closed again (they used to be left open).
    with open(path + "supervised_train_dataset.p", "rb") as dataset_file:
        train_dataset = pickle.load(dataset_file)
    with open(path + "supervised_test_dataset.p", "rb") as dataset_file:
        test_dataset = pickle.load(dataset_file)
else:
    # Split by client (30% of the clients go to the test set) so that no client
    # contributes transactions to both datasets.
    test_clients = np.random.choice(df.client_id.unique(), size=int(0.3 * df.client_id.nunique()), replace=False)
    train_clients = np.array(list(set(df.client_id.unique()) - set(test_clients)))
    train_idx = ~df.client_id.isin(test_clients)

    df_train = df[train_idx].copy()
    df_test = df[~train_idx].copy()

    train_dataset = TransactionDataset(df_train,
                                    y,
                                    num_subsequences=SEQUENCES_PER_CLIENT,
                                    subsequence_len=SEQUENCE_LEN)
    # Closing the file through `with` guarantees the pickle is completely
    # written before anything tries to read it back.
    with open(path + "supervised_train_dataset.p", "wb") as dataset_file:
        pickle.dump(train_dataset, dataset_file)

    test_dataset = TransactionDataset(df_test,
                                    y,
                                    num_subsequences=SEQUENCES_PER_CLIENT,
                                    subsequence_len=SEQUENCE_LEN)
    with open(path + "supervised_test_dataset.p", "wb") as dataset_file:
        pickle.dump(test_dataset, dataset_file)


train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)


# Sanity check: shapes of one batch and the sizes of both datasets.
train_features, train_labels = next(iter(train_dataloader))
print(train_features.size())
print(train_labels.size())
print(len(train_dataset), len(test_dataset))

torch.Size([256, 10, 120, 4])
torch.Size([256])
21000 9000


In [None]:
# Draw one more shuffled training batch and report its tensor shapes together
# with the number of clients in each dataset.
train_features, train_labels = next(iter(train_dataloader))
print(train_features.size(), train_labels.size(), sep="\n")
print(len(train_dataset), len(test_dataset))

torch.Size([256, 10, 120, 4])
torch.Size([256])
21000 9000


In [None]:
# Build the classifier and move it to the selected device in one expression
# (Module.to returns the module itself).
encoder = SequenceEncoder(
    num_features=1,
    cat_features=3,
    cat_vocab_sizes=[df['small_group'].max() + 2, 8, 13], # +1 for padding; + 1 to consider 0-indexing
    sequence_len=SEQUENCE_LEN,
    classes=y.bins.nunique(),
    event_dim=EVENT_ENCODING_DIMENSIONS,
    hidden_size=RNN_HIDDEN_STATE_DIMENSIONS,
    emb_dim=CATEGORICAL_EMBEDDING_DIMENSIONS,
).to(device)

loss_func = nn.CrossEntropyLoss()

# Marks which of the four feature columns of a batch are categorical: the first
# three are, the last one is the numerical feature.
categorical_mask = [True, True, True, False]

optimizer = torch.optim.AdamW(encoder.parameters(), lr=LEARNING_RATE)

In [None]:
train_loss_history = list()
val_loss_history = list()

train_accuracy_history = list()
val_accuracy_history = list()

# The column selectors never change, so build them once instead of per batch.
_categorical_columns = torch.tensor(categorical_mask, dtype=torch.bool)
_numerical_columns = ~_categorical_columns


def _run_epoch(dataloader, training):
    """Run one pass over `dataloader` and return (mean batch loss, mean batch accuracy).

    With ``training=True`` the encoder is put in train mode and the optimizer
    is stepped after every batch; otherwise the encoder is put in eval mode and
    no gradients are computed.
    """
    encoder.train(training)
    batch_losses = list()
    batch_accuracies = list()

    for sequences, labels in dataloader:
        # Cast on the CPU first so that less data is copied to the device.
        numerical = sequences[:, :, :, _numerical_columns].float().to(device)
        categorical = sequences[:, :, :, _categorical_columns].int().to(device)

        # The encoder returns one prediction per sub-sequence, ordered client
        # by client, so every client label is repeated once per sub-sequence.
        # torch.repeat_interleave does this explicitly; np.repeat on a tensor
        # only produced this result through numpy's fallback conversion path.
        labels = torch.repeat_interleave(labels, sequences.shape[1]).to(device)

        with torch.set_grad_enabled(training):
            logits = encoder(numerical, categorical)
            loss = loss_func(logits, labels)

        if training:
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        batch_losses.append(loss.item())
        predictions = torch.argmax(logits, dim=1)
        batch_accuracies.append(accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy()))

    return np.mean(batch_losses), np.mean(batch_accuracies)


for epoch in tqdm(range(EPOCHS)):

    train_loss, train_accuracy = _run_epoch(train_dataloader, training=True)
    train_loss_history.append(train_loss)
    train_accuracy_history.append(train_accuracy)

    val_loss, val_accuracy = _run_epoch(test_dataloader, training=False)
    val_loss_history.append(val_loss)
    val_accuracy_history.append(val_accuracy)

    print("\nEpoch {}\n Train Loss = {}\n Train Accuracy = {}\n Validation Loss = {}\n Validation Accuracy = {}".format(
        epoch, train_loss_history[-1], train_accuracy_history[-1], val_loss_history[-1], val_accuracy_history[-1]))

    # Overwrite the checkpoint after every epoch so an interrupted run can be
    # resumed from the most recent state.
    torch.save({
            'epoch': epoch,
            'model_state_dict': encoder.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': train_loss_history[-1],
            }, CHECKPOINT_PATH)



  0%|          | 0/64 [00:00<?, ?it/s][A[A

  2%|▏         | 1/64 [06:22<6:41:35, 382.47s/it][A[A


Epoch 0
 Train Loss = 1.383726446025343
 Train Accuracy = 0.2761342243975904
 Validation Loss = 1.3762504988246493
 Validation Accuracy = 0.29597873263888885




  3%|▎         | 2/64 [12:43<6:34:50, 382.10s/it][A[A


Epoch 1
 Train Loss = 1.3704510036721287
 Train Accuracy = 0.308179593373494
 Validation Loss = 1.365708480278651
 Validation Accuracy = 0.31115234374999995




  5%|▍         | 3/64 [19:03<6:27:54, 381.55s/it][A[A


Epoch 2
 Train Loss = 1.358531272554972
 Train Accuracy = 0.32190794427710845
 Validation Loss = 1.3480697439776526
 Validation Accuracy = 0.3309917534722222




  6%|▋         | 4/64 [25:10<6:17:10, 377.18s/it][A[A


Epoch 3
 Train Loss = 1.3351142104849758
 Train Accuracy = 0.34306758283132527
 Validation Loss = 1.309004889594184
 Validation Accuracy = 0.35876953125




  8%|▊         | 5/64 [31:32<6:12:03, 378.36s/it][A[A


Epoch 4
 Train Loss = 1.2776587124330452
 Train Accuracy = 0.38744823042168675
 Validation Loss = 1.2330971989366744
 Validation Accuracy = 0.4122395833333333


In [None]:
# One full-size figure per metric, each comparing the training curve with the
# validation curve over the epochs.
for train_curve, train_label, val_curve, val_label, y_label in (
    (train_loss_history, 'Training loss', val_loss_history, 'Validation loss', 'Loss'),
    (train_accuracy_history, 'Training accuracy', val_accuracy_history, 'Validation accuracy', 'Accuracy'),
):
    plt.figure(figsize=(20,13))

    plt.plot(train_curve, label=train_label)
    plt.plot(val_curve, label=val_label)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()
    plt.show()