In [None]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
from tqdm import tqdm
from google.colab import drive
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader


# Print NumPy floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive/.
drive.mount('/content/gdrive/')

KeyboardInterrupt: ignored

In [None]:
# Data directory on the mounted Google Drive. The path is anchored to the
# absolute mount point used by drive.mount('/content/gdrive/') so that loading
# (and the checkpoint path derived from it) does not depend on the kernel's
# current working directory. Keep the trailing slash: file names are appended
# to this string directly.
path = '/content/gdrive/MyDrive/Programming Projects/Embeddings/data/'

# df: transaction events; y: per-client targets.
df = pd.read_csv(path + "transactions_train.csv")
y = pd.read_csv(path + "train_target.csv")

In [None]:
# HYPERPARAMETERS

# Reproducibility: subsequence sampling uses np.random, while weight
# initialisation and DataLoader shuffling use torch's RNG. Seed both here so a
# fresh Restart & Run All produces the same dataset and the same training run.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Data shaping
SEQUENCE_LEN = 120             # length each sampled subsequence is padded/truncated to
TRANSACTIONS_PER_CLIENT = 10   # number of subsequences sampled per client

# Model sizes
EVENT_ENCODING_DIMENSIONS = 32     # per-event encoding size (event_dim)
RNN_HIDDEN_STATE_DIMENSIONS = 32   # GRU hidden size (hidden_size)
EMBEDDING_DIMENSIONS = 16          # embedding size per categorical feature (emb_dim)

# Optimisation
EPOCHS = 32
BATCH_SIZE = 644   # clients per batch; each client contributes TRANSACTIONS_PER_CLIENT subsequences
LEARNING_RATE = 1e-3

# Checkpoint file, overwritten at the end of every epoch by the training loop.
CHECKPOINT_PATH = path + "model.pt"

In [None]:
def subsequence_generator1(sequence, to_generate=10):
    """Randomly partition ``sequence`` into ``to_generate`` subsequences.

    Each element (row) of ``sequence`` is assigned, independently and uniformly
    at random, to one of ``to_generate`` buckets. The elements of a bucket keep
    their original relative order.

    Args:
        sequence: array-like whose first axis indexes the events.
        to_generate: number of subsequences (buckets) to produce.

    Returns:
        A list of ``to_generate`` NumPy arrays. Together they contain every
        element of ``sequence`` exactly once; individual arrays may be empty.
    """
    sequence = np.asarray(sequence)

    # One bucket label per element. ``size`` must be the number of elements,
    # not the sequence itself.
    idx = np.random.randint(0, to_generate, size=len(sequence))

    # Iterate over bucket ids 0..to_generate-1 (an int is not iterable).
    return [sequence[idx == i] for i in range(to_generate)]


def subsequence_generator2(sequence, pad_vector, to_generate=10, length=120):
    """Sample ``to_generate`` suffix windows of ``sequence``, right-padded to ``length``.

    For every sample a random start index is drawn; the window consists of up
    to ``length`` consecutive events beginning at that index. Windows shorter
    than ``length`` are filled at the end with ``pad_vector``.

    Args:
        sequence: 2-D array of shape (num_events, num_features).
        pad_vector: 1-D vector used for padded positions; it also determines
            the dtype and feature width of the result.
        to_generate: number of windows to sample.
        length: fixed number of time steps in every returned window.

    Returns:
        Array of shape (to_generate, length, len(pad_vector)).
    """
    sequence = np.asarray(sequence)

    # np.random.randint needs low < high. For sequences with fewer than two
    # events ``len(sequence) - 1`` is <= 0, which used to raise ValueError;
    # clamp so the only possible start is 0. For longer sequences the bound
    # (and therefore the random stream) is unchanged: starts lie in
    # [0, len(sequence) - 2].
    high = max(len(sequence) - 1, 1)
    start_idx = np.random.randint(0, high, size=to_generate)

    # Start from an all-padding tensor and overwrite the leading positions.
    ret = np.tile(pad_vector, (to_generate, length, 1))

    for i, idx in enumerate(start_idx):
        window = sequence[idx:idx + length]
        if len(window):  # an empty sequence leaves the row as pure padding
            ret[i, :len(window)] = window

    return ret

In [None]:
class TransactionDataset(Dataset):
    """Per-client dataset of randomly sampled, padded transaction subsequences.

    For every distinct ``client_id`` in ``data`` the client's events (all
    columns except ``client_id``) are passed to ``subsequence_generator2``,
    which returns ``num_subsequences`` windows of ``subsequence_len`` steps.
    The windows are sampled once, at construction time.

    Args:
        data: DataFrame with a ``client_id`` column, a ``small_group`` column
            and the remaining feature columns.
        y: DataFrame with ``client_id`` and ``bins`` (target) columns.
        num_subsequences: windows sampled per client.
        subsequence_len: time steps per window.

    Side effects:
        ``data['small_group']`` is incremented by 1 IN PLACE so that 0 can be
        used as the padding id. Code that later sizes the embedding table from
        ``data['small_group'].max()`` relies on this shift, and constructing
        the dataset again on the same frame shifts the column again.

    Items:
        ``dataset[i]`` is ``(windows, target)`` where ``windows`` has shape
        (num_subsequences, subsequence_len, num_feature_columns).
    """

    def __init__(self, data, y, num_subsequences=10, subsequence_len=120):
        # Clients in order of first appearance; targets are aligned to this order.
        self.clients = data['client_id'].unique()

        client_target_dict = pd.Series(y.bins.values, index=y.client_id).to_dict()
        self.target = np.vectorize(client_target_dict.get)(self.clients)

        # One zero per feature column (every column except client_id).
        pad_sequence = np.zeros(data.shape[1] - 1)

        data['small_group'] = data['small_group'] + 1 # allowing us to use "0" as padding

        # Group once instead of re-filtering the whole frame for every client,
        # which scaled as O(num_clients * num_rows). Rows keep their original
        # order inside each group.
        grouped = data.groupby('client_id', sort=False)

        self.data = list()
        for client in self.clients:
            sequence = grouped.get_group(client).drop(columns=['client_id']).to_numpy()
            self.data.append(subsequence_generator2(sequence, pad_sequence, num_subsequences, subsequence_len))
        self.data = np.array(self.data)

    def __len__(self):
        """Number of clients."""
        return self.target.shape[0]

    def __getitem__(self, idx):
        """Return the sampled windows and the target for client ``idx``."""
        return self.data[idx], self.target[idx]

In [None]:
class EventEncoder(nn.Module):
    """Encode a single event (numerical + categorical features) into a vector.

    Numerical features are batch-normalised (no affine parameters), every
    categorical feature is looked up in its own embedding table (index 0 is the
    padding index), and the concatenation is passed through two
    Linear -> Sigmoid -> BatchNorm1d stages.

    Args:
        num_features: number of numerical features per event.
        cat_features: number of categorical features per event.
        cat_vocab_sizes: embedding table size for each categorical feature.
        out_dim: size of the produced event encoding.
        emb_dim: embedding size used for every categorical feature.
    """

    def __init__(self, num_features, cat_features, cat_vocab_sizes, out_dim, emb_dim=16):
        super().__init__()

        self.initial_norm = nn.BatchNorm1d(num_features, affine=False)

        # The number of categorical columns is kept explicitly rather than
        # being read back from the ModuleList.
        self.cat_features = cat_features
        self.embeddings = nn.ModuleList(
            [nn.Embedding(cat_vocab_sizes[col], emb_dim, padding_idx=0) for col in range(cat_features)]
        )

        # Width of [normalised numerical | embedding_0 | embedding_1 | ...].
        concat_width = num_features + cat_features * emb_dim

        self.process1 = nn.Sequential(
            nn.Linear(concat_width, 32),
            nn.Sigmoid(),
            nn.BatchNorm1d(32),
        )
        self.process2 = nn.Sequential(
            nn.Linear(32, out_dim),
            nn.Sigmoid(),
            nn.BatchNorm1d(out_dim),
        )

    def forward(self, numerical, categorical):
        """Encode events.

        Args:
            numerical: tensor of shape (N, num_features).
            categorical: integer tensor of shape (N, cat_features).

        Returns:
            Tensor of shape (N, out_dim).
        """
        pieces = [self.initial_norm(numerical)]
        for col in range(self.cat_features):
            pieces.append(self.embeddings[col](categorical[:, col]))

        hidden = self.process1(torch.cat(pieces, 1))
        return self.process2(hidden)

In [None]:
class SequenceEncoder(nn.Module):
    """Classify fixed-length event sequences: EventEncoder -> GRU -> Linear.

    Every event is encoded independently by an ``EventEncoder``, the encoded
    events of each sequence are run through a single-layer GRU, and the GRU
    output at the last time step is mapped to class logits.

    Args:
        num_features: numerical features per event.
        cat_features: categorical features per event.
        cat_vocab_sizes: embedding table size for each categorical feature.
        sequence_len: number of time steps in every input sequence.
        classes: number of output classes.
        event_dim: size of the per-event encoding.
        hidden_size: GRU hidden state size.
        emb_dim: embedding size for categorical features.
    """

    def __init__(self, num_features, cat_features, cat_vocab_sizes, sequence_len, classes, event_dim=32, hidden_size=32, emb_dim=16):

        super(SequenceEncoder, self).__init__()

        self.num_features = num_features
        self.cat_features = cat_features
        self.sequence_len = sequence_len
        self.event_dim = event_dim

        self.event_encoder = EventEncoder(num_features, cat_features, cat_vocab_sizes, event_dim, emb_dim=emb_dim)

        self.rnn = nn.GRU(event_dim, hidden_size, batch_first=True)

        self.classifier = nn.Linear(hidden_size, classes)

    def forward(self, numerical, categorical):
        """Compute class logits for every sequence in the batch.

        Args:
            numerical: (batch_size, num_of_sequences, sequence_len, num_features)
            categorical: (batch_size, num_of_sequences, sequence_len, cat_features)

        Returns:
            Logits of shape (batch_size * num_of_sequences, classes); rows are
            ordered batch-major, i.e. all sequences of the first batch item
            come first.
        """
        # Flatten to one row per event. reshape (unlike view) also accepts
        # non-contiguous inputs, e.g. feature columns taken with a basic slice.
        numerical = numerical.reshape(-1, self.num_features)
        categorical = categorical.reshape(-1, self.cat_features)

        # One encoding per event: (batch_size * num_of_sequences * sequence_len, event_dim)
        events = self.event_encoder(numerical, categorical)
        events = events.reshape(-1, self.sequence_len, self.event_dim)

        # GRU output at the final time step of every sequence.
        # NOTE(review): if inputs are right-padded, this step is a padding
        # event for short sequences - confirm that is intended.
        rnn_res = self.rnn(events)[0][:, -1, :]

        out = self.classifier(rnn_res)

        return out

In [None]:
# Building the dataset is slow; avoid re-running this cell if possible.
train_dataset = TransactionDataset(
    df,
    y,
    num_subsequences=TRANSACTIONS_PER_CLIENT,
    subsequence_len=SEQUENCE_LEN,
)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)

# Sanity check: draw one batch and show the feature / label tensor sizes.
train_features, train_labels = next(iter(train_dataloader))
print(train_features.shape)
print(train_labels.shape)

KeyboardInterrupt: ignored

In [None]:
# Embedding table size for the single categorical column (+1 for padding).
# NOTE: TransactionDataset.__init__ shifts df['small_group'] by +1 in place,
# so this cell must run after the dataset has been built.
small_group_vocab = df['small_group'].max() + 1
num_classes = y.bins.nunique()

encoder = SequenceEncoder(
    num_features=2,
    cat_features=1,
    cat_vocab_sizes=[small_group_vocab],
    sequence_len=SEQUENCE_LEN,
    classes=num_classes,
    event_dim=EVENT_ENCODING_DIMENSIONS,
    hidden_size=RNN_HIDDEN_STATE_DIMENSIONS,
    emb_dim=EMBEDDING_DIMENSIONS,
)

optimizer = torch.optim.AdamW(encoder.parameters(), lr=LEARNING_RATE)
loss_func = nn.CrossEntropyLoss()

In [None]:
encoder.train()

# Layout of the last axis of every batch: True marks the categorical column,
# False marks numerical columns.
categorical_mask = [False, True, False]

# Loop-invariant column selectors, built once instead of once per batch.
is_categorical = torch.tensor(categorical_mask, dtype=torch.bool)
is_numerical = ~is_categorical


for epoch in tqdm(range(EPOCHS)):
    # Running totals so the reported metrics cover the whole epoch rather than
    # only the last (possibly small) batch.
    epoch_loss_sum = 0.0
    epoch_correct = 0
    epoch_seen = 0

    for sequences, labels in train_dataloader:
        # sequences: (batch, subsequences_per_client, sequence_len, columns)
        numerical = sequences[:, :, :, is_numerical]
        categorical = sequences[:, :, :, is_categorical]

        # Class logits, one row per (client, subsequence) in batch-major order.
        embeddings = encoder(numerical.float(), categorical.int())

        # Repeat every client's label once per subsequence. The count is read
        # from the batch itself, so it always matches the dataset instead of a
        # hard-coded 10.
        labels = torch.repeat_interleave(labels, sequences.shape[1])

        loss = loss_func(embeddings, labels)

        # Zero first: stale gradients left by an interrupted previous run of
        # this cell must not leak into the first update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        with torch.no_grad():
            batch_rows = labels.shape[0]
            epoch_loss_sum += loss.item() * batch_rows
            epoch_correct += (torch.argmax(embeddings, dim=1) == labels).sum().item()
            epoch_seen += batch_rows

    # max(..., 1) only guards against an empty dataloader.
    print("Epoch {}; Loss = {}; Train Accuracy = {}".format(
        epoch,
        epoch_loss_sum / max(epoch_seen, 1),
        epoch_correct / max(epoch_seen, 1),
    ))

    # Overwrite the checkpoint after every epoch; 'loss' is the last batch's
    # loss, detached from the autograd graph.
    torch.save({
            'epoch': epoch,
            'model_state_dict': encoder.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss.detach(),
            }, CHECKPOINT_PATH)

  3%|▎         | 1/32 [07:54<4:05:01, 474.25s/it]

Epoch 0; Loss = 1.3141871690750122; Accuracy = 0.37101063829787234


  6%|▋         | 2/32 [15:53<3:57:50, 475.68s/it]

Epoch 1; Loss = 1.2466553449630737; Accuracy = 0.4103723404255319


  9%|▉         | 3/32 [23:51<3:50:21, 476.59s/it]

Epoch 2; Loss = 1.1837794780731201; Accuracy = 0.44574468085106383


 12%|█▎        | 4/32 [31:52<3:42:59, 477.84s/it]

Epoch 3; Loss = 1.1774318218231201; Accuracy = 0.4422872340425532


 16%|█▌        | 5/32 [40:00<3:36:20, 480.76s/it]

Epoch 4; Loss = 1.1488063335418701; Accuracy = 0.4853723404255319


 19%|█▉        | 6/32 [48:08<3:29:20, 483.09s/it]

Epoch 5; Loss = 1.0911097526550293; Accuracy = 0.498936170212766


 22%|██▏       | 7/32 [56:16<3:21:52, 484.48s/it]

Epoch 6; Loss = 1.085792064666748; Accuracy = 0.5058510638297873


 25%|██▌       | 8/32 [1:04:30<3:14:59, 487.47s/it]

Epoch 7; Loss = 1.07914400100708; Accuracy = 0.511968085106383


 28%|██▊       | 9/32 [1:12:37<3:06:43, 487.11s/it]

Epoch 8; Loss = 1.1432442665100098; Accuracy = 0.4867021276595745


 31%|███▏      | 10/32 [1:20:33<2:57:26, 483.95s/it]

Epoch 9; Loss = 1.04641854763031; Accuracy = 0.5159574468085106


 34%|███▍      | 11/32 [1:28:29<2:48:32, 481.55s/it]

Epoch 10; Loss = 1.0724250078201294; Accuracy = 0.49361702127659574


 38%|███▊      | 12/32 [1:36:23<2:39:45, 479.30s/it]

Epoch 11; Loss = 1.0559850931167603; Accuracy = 0.5375


 41%|████      | 13/32 [1:44:21<2:31:34, 478.66s/it]

Epoch 12; Loss = 1.0522383451461792; Accuracy = 0.5212765957446809


 44%|████▍     | 14/32 [1:52:15<2:23:10, 477.27s/it]

Epoch 13; Loss = 1.0679794549942017; Accuracy = 0.4976063829787234


 47%|████▋     | 15/32 [2:00:07<2:14:50, 475.92s/it]

Epoch 14; Loss = 1.0492112636566162; Accuracy = 0.5377659574468086


 50%|█████     | 16/32 [2:08:03<2:06:54, 475.91s/it]

Epoch 15; Loss = 0.9928312301635742; Accuracy = 0.5311170212765958


 53%|█████▎    | 17/32 [2:15:57<1:58:50, 475.36s/it]

Epoch 16; Loss = 1.0354695320129395; Accuracy = 0.5226063829787234


 56%|█████▋    | 18/32 [2:23:48<1:50:35, 473.95s/it]

Epoch 17; Loss = 1.0117847919464111; Accuracy = 0.5412234042553191


 59%|█████▉    | 19/32 [2:31:41<1:42:37, 473.63s/it]

Epoch 18; Loss = 0.9731290340423584; Accuracy = 0.5571808510638298


 62%|██████▎   | 20/32 [2:39:36<1:34:50, 474.25s/it]

Epoch 19; Loss = 1.0569545030593872; Accuracy = 0.5029255319148936


 66%|██████▌   | 21/32 [2:47:36<1:27:15, 475.93s/it]

Epoch 20; Loss = 1.022054672241211; Accuracy = 0.5393617021276595


 69%|██████▉   | 22/32 [2:55:34<1:19:23, 476.31s/it]

Epoch 21; Loss = 1.036792278289795; Accuracy = 0.5305851063829787


 72%|███████▏  | 23/32 [3:03:24<1:11:10, 474.47s/it]

Epoch 22; Loss = 1.056113362312317; Accuracy = 0.5287234042553192


 75%|███████▌  | 24/32 [3:11:15<1:03:07, 473.45s/it]

Epoch 23; Loss = 1.0716748237609863; Accuracy = 0.5234042553191489


 78%|███████▊  | 25/32 [3:19:07<55:10, 472.96s/it]  

Epoch 24; Loss = 0.9712480902671814; Accuracy = 0.5664893617021277


 81%|████████▏ | 26/32 [3:27:00<47:17, 473.00s/it]

Epoch 25; Loss = 1.0298559665679932; Accuracy = 0.5257978723404255


 84%|████████▍ | 27/32 [3:34:50<39:21, 472.22s/it]

Epoch 26; Loss = 1.0214648246765137; Accuracy = 0.5441489361702128


 88%|████████▊ | 28/32 [3:42:44<31:31, 472.75s/it]

Epoch 27; Loss = 1.0362318754196167; Accuracy = 0.5268617021276596


 91%|█████████ | 29/32 [3:50:37<23:38, 472.72s/it]

Epoch 28; Loss = 0.9957727789878845; Accuracy = 0.55


 94%|█████████▍| 30/32 [3:58:32<15:46, 473.49s/it]

Epoch 29; Loss = 0.9414615035057068; Accuracy = 0.5856382978723405


 97%|█████████▋| 31/32 [4:06:32<07:55, 475.37s/it]

Epoch 30; Loss = 1.0375800132751465; Accuracy = 0.5218085106382979


100%|██████████| 32/32 [4:14:27<00:00, 477.12s/it]

Epoch 31; Loss = 1.014672875404358; Accuracy = 0.5377659574468086





In [None]:
 # Show how many distinct small_group ids are present in df.
 print(df.small_group.nunique())

202
