I experimented with Transformer and multi-head attention (MHA) models in PyTorch for Kaggle's [riiid](https://www.kaggle.com/c/riiid-test-answer-prediction) and [indoor-location-navigation](https://www.kaggle.com/c/indoor-location-navigation) competitions. Here are the links to the notebooks:

* https://www.kaggle.com/npa02012/location-floor-mha
* https://www.kaggle.com/npa02012/location-xy-mha
* https://www.kaggle.com/npa02012/riiid-model-mha
* https://www.kaggle.com/npa02012/riiid-model-transformer

In case Kaggle purges these notebooks, I'm copying the content below:  

#### location-floor-mha

In [None]:
# https://github.com/location-competition/indoor-location-competition-20
# https://www.kaggle.com/c/indoor-location-navigation/data
# https://www.kaggle.com/titericz/eda-loading-data-and-visualizing-paths
# https://www.kaggle.com/npa02012/time-to-complete-trace-eda
# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html

# www.reddit.com/r/MachineLearning/comments/4dzxs3/best_way_to_deal_with_time_series_data
# https://arxiv.org/pdf/1907.03907.pdf
# https://github.com/YuliaRubanova/latent_ode

import os
import glob
import numpy as np
import pandas as pd
import torch
import tqdm
import matplotlib
import time
import pickle

# Notebook-wide configuration: sequence/embedding sizes, compute device,
# floor-name -> integer mapping and input file locations.
settings = {
    'beacon_seq_len': 50,
    'beacon_embed_dim': 256,
    'device': torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    # Two naming schemes appear in the data ('F1' and '1F'); both map to the
    # same integer. Basements are negative.
    'floor_mapping': {
        'B1': -1, 'B2': -2, 'B3': -3,
        'F1': 0, 'F2': 1, 'F3': 2, 'F4': 3, 'F5': 4,
        'F6': 5, 'F7': 6, 'F8': 7, 'F9': 8, 'F10': 9,
        '1F': 0, '2F': 1, '3F': 2, '4F': 3, '5F': 4,
        '6F': 5, '7F': 6, '8F': 7, '9F': 8,
    },
    # Mapped floors span -3..9, i.e. 13 distinct classes
    'n_floor': 13,
    'max_beacon_distance': 200,
    'path_train': '../input/indoor-location-navigation/train/*/*/*',
    'path_test': '../input/indoor-location-navigation/test/*',
    'path_sample': '../input/indoor-location-navigation/sample_submission.csv',
}

## ---

# Load the pre-extracted beacon table; the code below expects a dict whose
# 'df' entry is a DataFrame of beacon readings.
# NOTE(review): pickle.load can execute arbitrary code - only use with trusted files.
with open('../input/location-data/beacon_train.pkl', 'rb') as handle:
    train = pickle.load(handle)

# Delete rows with 'unusual' floors (floor names missing from floor_mapping)
i = train['df'][~train['df']['floor'].isin(settings['floor_mapping'].keys())].index
train['df'].drop(i, inplace=True)

# Only keep initial rows of a trace (at most beacon_seq_len rows per trace_id)
train['df'] = train['df'].groupby('trace_id')\
                .head(settings['beacon_seq_len']).reset_index(drop=True)

# Convert distance column to int: clamp to [0, max_beacon_distance], truncate,
# then shift by +1 so that 0 stays free for the padding value used by the dataset
tmp = settings['max_beacon_distance']
train['df']['distance'] = np.where(train['df']["distance"]>tmp, tmp, train['df']["distance"])
train['df']['distance'] = np.where(train['df']["distance"]<0, 0, train['df']["distance"])
train['df']['distance'] = train['df']['distance'].astype(int) + 1

# Map columns to integer codes starting at 1 (0 is left for padding);
# floor names are replaced by their integer from floor_mapping
train['df']['site_id'] = train['df']['site_id'].astype('category').cat.codes + 1
train['df']['UUID'] = train['df']['UUID'].astype('category').cat.codes + 1
train['df']['MinorID'] = train['df']['MinorID'].astype('category').cat.codes + 1
train['df']['MajorID'] = train['df']['MajorID'].astype('category').cat.codes + 1
train['df']['MAC_Address'] = train['df']['MAC_Address'].astype('category').cat.codes + 1
train['df'].replace({'floor' : settings['floor_mapping']}, inplace=True)

# Record settings: embedding table sizes (largest code + 1, counting the 0 slot)
settings['n_uuids'] = train['df']['UUID'].max() + 1
settings['n_minor_ids'] = train['df']['MinorID'].max() + 1
settings['n_major_ids'] = train['df']['MajorID'].max() + 1
settings['n_macs'] = train['df']['MAC_Address'].max() + 1
settings['n_sites'] = train['df']['site_id'].max() + 1

# Convert to dictionary: trace_id -> DataFrame with that trace's rows
train['df'] = {k: table for k, table in train['df'].groupby("trace_id")} # slower, but easier

# Notebook preview of one trace (bare expression, displayed as cell output)
train['df']['5d09b22fcfb49b00085466a0']

# Make validation set: move a seeded random 20% of the traces out of train
np.random.seed(1)
val_idx = np.random.choice(list(train['df'].keys())
                           ,int(.2 * len(train['df'].keys())), replace=False)
valid = {'df' : {}}
for i in val_idx:
    valid['df'][i] = train['df'][i].copy()
    del train['df'][i]
    
class beacon_dataset(torch.utils.data.Dataset):
    """Map-style dataset yielding one fixed-length beacon sequence per trace.

    Args:
        group: mapping ``trace_id -> DataFrame`` holding that trace's beacon
            rows with the columns ``floor``, ``UUID``, ``distance``,
            ``MinorID``, ``MajorID``, ``MAC_Address`` and ``site_id``.
        settings: dict providing ``beacon_seq_len`` and ``n_floor``.

    Each item is a dict with the scalar ``floor`` label and six ``np.int64``
    arrays of length ``beacon_seq_len`` (``uuids``, ``distances``,
    ``minor_ids``, ``major_ids``, ``macs``, ``sites``).
    """

    # (output key, DataFrame column) pairs, in the order they are returned
    _COLUMNS = (
        ('uuids', 'UUID'),
        ('distances', 'distance'),
        ('minor_ids', 'MinorID'),
        ('major_ids', 'MajorID'),
        ('macs', 'MAC_Address'),
        ('sites', 'site_id'),
    )

    def __init__(self, group, settings):
        super(beacon_dataset, self).__init__()
        self.beacon_seq_len = settings['beacon_seq_len']
        self.n_floor = settings['n_floor']
        self.group = group
        self.trace_ids = list(group.keys())

    def __len__(self):
        return len(self.trace_ids)

    def _fixed_length(self, values):
        """Return ``values`` as int64, truncated or right-padded with 0 to ``beacon_seq_len``."""
        # Always cast: category codes may be int8/int16, and samples that need
        # no padding would otherwise keep a different dtype than padded ones.
        values = np.asarray(values, dtype=np.int64)[:self.beacon_seq_len]
        n_pad = self.beacon_seq_len - len(values)
        if n_pad > 0:
            values = np.concatenate((values, np.zeros(n_pad, dtype=np.int64)))
        return values

    def __getitem__(self, index):
        # Get the relevant trace
        sample = self.group[self.trace_ids[index]]

        # Floors are mapped to -3..9; shift by 3 so class labels start at 0
        item = {'floor': sample.iloc[0]['floor'] + 3}
        for key, column in self._COLUMNS:
            item[key] = self._fixed_length(sample[column].values)
        return item
    

# Training data: shuffled batches of 128, incomplete last batch dropped
train_dataset = beacon_dataset(train['df'], settings)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    drop_last=True,
    num_workers=4,
)

# Validation data: small sequential batches, nothing dropped
valid_dataset = beacon_dataset(valid['df'], settings)
valid_dataloader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=10,
    num_workers=4,
)

# Notebook preview of one validation sample
valid_dataset[1]

class floor_model(torch.nn.Module):
    """Single-block multi-head-attention classifier over a beacon sequence.

    Every sequence position is the sum of learned embeddings (minor id, MAC
    address, site, major id, distance and position). One residual
    self-attention layer followed by a position-wise feed-forward network
    reduces each position to a scalar, and a final linear layer maps the
    ``beacon_seq_len`` scalars to ``n_floor`` logits.

    Args:
        settings: dict providing ``beacon_embed_dim``, ``beacon_seq_len``,
            ``device``, ``n_minor_ids``, ``n_major_ids``, ``n_uuids``,
            ``n_macs``, ``n_sites``, ``max_beacon_distance`` and ``n_floor``.
    """

    def __init__(self, settings):
        super(floor_model, self).__init__()
        self.embed_dim = settings['beacon_embed_dim']
        self.seq_len = settings['beacon_seq_len']
        self.device = settings['device']

        self.minor_id_embedding = torch.nn.Embedding(settings['n_minor_ids'],
                                                     self.embed_dim)
        self.major_id_embedding = torch.nn.Embedding(settings['n_major_ids'],
                                                     self.embed_dim)
        # Currently unused in forward(); kept so saved state dicts stay loadable
        self.uuid_embedding = torch.nn.Embedding(settings['n_uuids'],
                                                 self.embed_dim)
        self.mac_embedding = torch.nn.Embedding(settings['n_macs'],
                                                self.embed_dim)
        # Site embedding doesn't make too much sense (same site always)
        self.site_embedding = torch.nn.Embedding(settings['n_sites'],
                                                 self.embed_dim)
        # +2 entries: index 0 is padding, distances occupy 1..max_beacon_distance+1
        self.distance_embedding = torch.nn.Embedding(settings['max_beacon_distance'] + 2,
                                                     self.embed_dim)
        self.pos_embedding = torch.nn.Embedding(self.seq_len, self.embed_dim)
        self.multi_att = torch.nn.MultiheadAttention(embed_dim=self.embed_dim,
                                                     num_heads=4,
                                                     dropout=0.2)

        self.lin_1 = torch.nn.Linear(self.embed_dim, self.embed_dim)
        self.relu = torch.nn.ReLU()
        self.lin_2 = torch.nn.Linear(self.embed_dim, 1)
        self.dropout = torch.nn.Dropout(0.2)

        self.pred = torch.nn.Linear(self.seq_len, settings['n_floor'])

        # Unused flag, retained only for attribute compatibility
        self.tmp = True

    def forward(self, batch):
        """Return floor logits of shape [bs, n_floor].

        Args:
            batch: dict of integer index tensors shaped [bs, beacon_seq_len]
                under the keys ``minor_ids``, ``macs``, ``sites``,
                ``major_ids`` and ``distances`` (``uuids`` is ignored).
        """
        # Sum of the per-feature embeddings (UUID deliberately left out).
        # All index tensors are cast to long, including distances.
        x = self.minor_id_embedding(batch['minor_ids'].long())
        x = x + self.mac_embedding(batch['macs'].long())
        x = x + self.site_embedding(batch['sites'].long())
        x = x + self.major_id_embedding(batch['major_ids'].long())
        x = x + self.distance_embedding(batch['distances'].long())

        # Position embedding; ids are created on the input's own device so the
        # module keeps working wherever it has been moved
        pos_id = torch.arange(x.shape[1], device=x.device)[None, :]
        x = x + self.pos_embedding(pos_id)

        # MultiheadAttention expects [s_len, bs, embed]
        x = x.permute(1, 0, 2)
        attn_output, _ = self.multi_att(x, x, x)
        x = (x + attn_output).permute(1, 0, 2)  # back to [bs, s_len, embed]

        # Position-wise feed forward down to one value per position
        x = self.lin_1(x)
        x = self.relu(x)
        x = self.lin_2(x)
        x = self.dropout(x)

        # [bs, s_len, 1] -> [bs, s_len] -> [bs, n_floor]
        x = x.squeeze(-1)
        return self.pred(x)
        

# `import matplotlib` alone does not load the pyplot submodule used below
import matplotlib.pyplot

# Setup model, optimizer and criterion
model = floor_model(settings)
optimizer = torch.optim.Adam(model.parameters(), lr=.002)
criterion = torch.nn.CrossEntropyLoss()
all_auc = []  # not filled by this cell; kept in case later cells reference it

# Move model and criterion to device
model.to(settings['device'])
criterion.to(settings['device'])
all_loss = []

# Train for 25 epochs, recording the loss of every batch
for _ in range(25):
    tbar = tqdm.tqdm(train_dataloader)
    for batch in tbar:
        batch = {k: v.to(settings['device']) for k, v in batch.items()}
        optimizer.zero_grad()
        pred = model(batch)
        loss = criterion(pred, batch['floor'].long())
        loss.backward()
        optimizer.step()

        # Record metrics
        all_loss.append(loss.item())

# Mean loss over the last 200 batches, then the full loss curve
print(np.array(all_loss[-200:]).mean())
matplotlib.pyplot.plot(all_loss)
matplotlib.pyplot.show()

def get_acc(dataset):
    """Print accuracy and a scaled mean absolute floor error for one split.

    Uses the module-level ``model``, ``settings``, ``valid_dataloader`` and
    ``train_dataloader``.

    Args:
        dataset: ``'valid'`` selects the validation loader; any other value
            selects the training loader.

    Prints the split name, the fraction of exact floor matches, and the mean
    of ``15 * |predicted - true|``.
    """
    dl = valid_dataloader if dataset == 'valid' else train_dataloader

    # Collect per-batch results and concatenate once at the end
    batch_preds = []
    batch_labels = []

    model.eval()
    try:
        # Inference only: no autograd graph needed
        with torch.no_grad():
            for batch in dl:
                batch = {k: v.to(settings['device']) for k, v in batch.items()}

                # Predicted class = index of the largest logit
                logits = model(batch).to('cpu').numpy()
                batch_preds.append(np.argmax(logits, axis=1))

                # Label
                batch_labels.append(batch['floor'].to('cpu').numpy())
    finally:
        # Restore training mode even if evaluation fails part-way
        model.train()

    empty = np.empty(0, dtype=np.int64)
    preds = np.concatenate(batch_preds) if batch_preds else empty
    labels = np.concatenate(batch_labels) if batch_labels else empty

    print(dataset)
    print(np.sum(preds == labels)/preds.shape[0])
    print((15 * abs(preds - labels)).mean())


# Get accuracy
get_acc('valid')
get_acc('train')

#### location-xy-mha

In [None]:
#
# Continuous embedding:
#  https://www.kaggle.com/c/riiid-test-answer-prediction/discussion/210171
#  https://github.com/dkletran/riiid-challenge-4th-place/blob/main/modeling_training/modeling.py
#  https://arxiv.org/pdf/2010.12042.pdf
#

import os
import glob
import numpy as np
import pandas as pd
import torch
import tqdm
import matplotlib
import time
import pickle

# Notebook-wide configuration: sequence/embedding sizes, compute device,
# floor-name -> integer mapping and input file locations.
settings = {
    'beacon_seq_len': 50,
    'beacon_embed_dim': 256,
    'device': torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    # Two naming schemes appear in the data ('F1' and '1F'); both map to the
    # same integer. Basements are negative.
    'floor_mapping': {
        'B1': -1, 'B2': -2, 'B3': -3,
        'F1': 0, 'F2': 1, 'F3': 2, 'F4': 3, 'F5': 4,
        'F6': 5, 'F7': 6, 'F8': 7, 'F9': 8, 'F10': 9,
        '1F': 0, '2F': 1, '3F': 2, '4F': 3, '5F': 4,
        '6F': 5, '7F': 6, '8F': 7, '9F': 8,
    },
    # Mapped floors span -3..9, i.e. 13 distinct classes
    'n_floor': 13,
    'max_beacon_distance': 200,
    'path_train': '../input/indoor-location-navigation/train/*/*/*',
    'path_test': '../input/indoor-location-navigation/test/*',
    'path_sample': '../input/indoor-location-navigation/sample_submission.csv',
}

# Load the waypoint and beacon tables. (The original cell was pasted twice,
# which read both pickles twice; one pass yields the same state.)
# NOTE(review): pickle.load can execute arbitrary code - only use with trusted files.
with open('../input/location-data/wp_train.pkl', 'rb') as handle:
    train = pickle.load(handle)

with open('../input/location-data/beacon_train.pkl', 'rb') as handle:
    tmp = pickle.load(handle)

# Handle this in other script eventually
train['wp'] = train.pop('df')
train['beacon'] = tmp.pop('df')
train['wp']['x'] = train['wp']['x'].astype('float')
train['wp']['y'] = train['wp']['y'].astype('float')

#
# Delete trace_ids not in beacon
# ***** Delete this eventually
#
train['wp'] = train['wp'].loc[train['wp'].trace_id.isin(train['beacon'].trace_id)]

# Record settings for embedding: table sizes = distinct values + 1, leaving
# index 0 for padding. Counted before cleaning/splitting.
settings['n_uuids'] = train['beacon']['UUID'].nunique() + 1
settings['n_minor_ids'] = train['beacon']['MinorID'].nunique() + 1
settings['n_major_ids'] = train['beacon']['MajorID'].nunique() + 1
settings['n_macs'] = train['beacon']['MAC_Address'].nunique() + 1
settings['n_sites'] = train['beacon']['site_id'].nunique() + 1

#
# Make validation set
#

# Get trace_ids to be in validation set (seeded random 20% of the traces)
np.random.seed(1)
tmp = train['wp'].trace_id.unique()
valid_ids = np.random.choice(tmp, int(.2 * tmp.shape[0]), replace=False)

# Make validation set
valid = {}
valid['beacon'] = train['beacon'].loc[train['beacon'].trace_id.isin(valid_ids)]
valid['wp'] = train['wp'].loc[train['wp'].trace_id.isin(valid_ids)]

# Delete validation set from train
train['beacon'] = train['beacon'].loc[~train['beacon'].trace_id.isin(valid_ids)]
train['wp'] = train['wp'].loc[~train['wp'].trace_id.isin(valid_ids)]

def clean_data(data, settings):
    """Filter, encode and index the waypoint/beacon tables of one split.

    Args:
        data: dict with DataFrames under ``'wp'`` (waypoints) and
            ``'beacon'`` (beacon readings). Both need ``floor`` and
            ``trace_id`` columns; ``'beacon'`` also needs ``distance``,
            ``site_id``, ``UUID``, ``MinorID``, ``MajorID`` and
            ``MAC_Address``.
        settings: dict providing ``floor_mapping``, ``beacon_seq_len`` and
            ``max_beacon_distance``.

    Returns:
        The same ``data`` dict, with ``data['beacon']`` replaced by a
        ``trace_id -> DataFrame`` dict and ``data['wp']`` replaced by a
        ``row index -> row dict`` mapping. The input DataFrames themselves
        are not modified.

    NOTE(review): category codes are computed from the frame passed in, so
    calling this separately for train and validation can assign different
    integers to the same raw id. Confirm whether a shared encoding is needed.
    """
    known_floors = list(settings['floor_mapping'])

    # Keep only rows on floors we can map; build new frames instead of
    # dropping in place on frames the caller may have obtained as slices
    wp = data['wp']
    wp = wp[wp['floor'].isin(known_floors)].reset_index(drop=True)
    beacon = data['beacon']
    beacon = beacon[beacon['floor'].isin(known_floors)].reset_index(drop=True)

    # Only keep initial rows of a trace
    beacon = (beacon.groupby('trace_id')
                    .head(settings['beacon_seq_len'])
                    .reset_index(drop=True))

    # Clamp distance to [0, max], truncate to int, shift by +1 (0 = padding)
    beacon['distance'] = (beacon['distance']
                          .clip(lower=0, upper=settings['max_beacon_distance'])
                          .astype(int) + 1)

    # Integer-encode the id columns starting at 1 (0 = padding)
    for column in ('site_id', 'UUID', 'MinorID', 'MajorID', 'MAC_Address'):
        beacon[column] = beacon[column].astype('category').cat.codes + 1

    # Every remaining floor name is a key of floor_mapping (filtered above)
    beacon['floor'] = beacon['floor'].map(settings['floor_mapping'])

    # Convert to dictionaries for per-sample lookup
    data['beacon'] = {trace_id: rows for trace_id, rows in beacon.groupby("trace_id")}
    data['wp'] = wp.to_dict(orient='index')

    return data

# Clean both splits
train = clean_data(train, settings)
valid = clean_data(valid, settings)

# Notebook preview: first training waypoint and the head of one validation trace
print(train['wp'][0])
first_valid_trace = list(valid['beacon'].keys())[0]
valid['beacon'][first_valid_trace].head()

class location_dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each waypoint with its trace's beacon sequence.

    Args:
        data: dict with ``'wp'`` (``index -> row dict`` holding ``trace_id``,
            ``x`` and ``y``) and ``'beacon'`` (``trace_id -> DataFrame`` with
            the columns ``UUID``, ``distance``, ``MinorID``, ``MajorID``,
            ``MAC_Address`` and ``site_id``).
        settings: dict providing ``beacon_seq_len`` and ``n_floor``.

    Each item is a dict with the waypoint's ``x`` and ``y`` and six
    ``np.int64`` arrays of length ``beacon_seq_len`` (``uuids``,
    ``distances``, ``minor_ids``, ``major_ids``, ``macs``, ``sites``).
    """

    # (output key, DataFrame column) pairs, in the order they are returned
    _COLUMNS = (
        ('uuids', 'UUID'),
        ('distances', 'distance'),
        ('minor_ids', 'MinorID'),
        ('major_ids', 'MajorID'),
        ('macs', 'MAC_Address'),
        ('sites', 'site_id'),
    )

    def __init__(self, data, settings):
        super(location_dataset, self).__init__()
        self.beacon_seq_len = settings['beacon_seq_len']
        self.n_floor = settings['n_floor']
        self.data = data

    def __len__(self):
        return len(self.data['wp'])

    def _fixed_length(self, values):
        """Return ``values`` as int64, truncated or right-padded with 0 to ``beacon_seq_len``."""
        # Always cast: category codes may be int8/int16, and samples that need
        # no padding would otherwise keep a different dtype than padded ones.
        values = np.asarray(values, dtype=np.int64)[:self.beacon_seq_len]
        n_pad = self.beacon_seq_len - len(values)
        if n_pad > 0:
            values = np.concatenate((values, np.zeros(n_pad, dtype=np.int64)))
        return values

    def __getitem__(self, index):
        # Waypoint row and all beacon rows recorded for the same trace
        wp = self.data['wp'][index]
        beacon = self.data['beacon'][wp['trace_id']]

        # NOTE(review): the whole trace prefix is used for every waypoint; the
        # waypoint's timestamp is not used to select beacon rows.
        item = {'x': wp['x'], 'y': wp['y']}
        for key, column in self._COLUMNS:
            item[key] = self._fixed_length(beacon[column].values)
        return item
    

# Training data: shuffled batches of 256, incomplete last batch dropped
train_dataset = location_dataset(train, settings)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=256,
    shuffle=True,
    drop_last=True,
    num_workers=4,
)

# Validation data: large sequential batches, nothing dropped
valid_dataset = location_dataset(valid, settings)
valid_dataloader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=1000,
    num_workers=4,
)

# Notebook preview of one validation sample
valid_dataset[1]

class xy_model(torch.nn.Module):
    """Single-block multi-head-attention regressor predicting (x, y).

    Every sequence position is the sum of learned embeddings (minor id, MAC
    address, site, major id, distance and position). One residual
    self-attention layer followed by a position-wise feed-forward network
    reduces each position to a scalar, and a final linear layer maps the
    ``beacon_seq_len`` scalars to two outputs.

    Args:
        settings: dict providing ``beacon_embed_dim``, ``beacon_seq_len``,
            ``device``, ``n_minor_ids``, ``n_major_ids``, ``n_uuids``,
            ``n_macs``, ``n_sites`` and ``max_beacon_distance``.
    """

    def __init__(self, settings):
        super(xy_model, self).__init__()
        self.embed_dim = settings['beacon_embed_dim']
        self.seq_len = settings['beacon_seq_len']
        self.device = settings['device']

        self.minor_id_embedding = torch.nn.Embedding(settings['n_minor_ids'],
                                                     self.embed_dim)
        self.major_id_embedding = torch.nn.Embedding(settings['n_major_ids'],
                                                     self.embed_dim)
        # Currently unused in forward(); kept so saved state dicts stay loadable
        self.uuid_embedding = torch.nn.Embedding(settings['n_uuids'],
                                                 self.embed_dim)
        self.mac_embedding = torch.nn.Embedding(settings['n_macs'],
                                                self.embed_dim)
        # Site embedding doesn't make too much sense (same site always)
        self.site_embedding = torch.nn.Embedding(settings['n_sites'],
                                                 self.embed_dim)
        # +2 entries: index 0 is padding, distances occupy 1..max_beacon_distance+1
        self.distance_embedding = torch.nn.Embedding(settings['max_beacon_distance'] + 2,
                                                     self.embed_dim)
        self.pos_embedding = torch.nn.Embedding(self.seq_len, self.embed_dim)
        self.multi_att = torch.nn.MultiheadAttention(embed_dim=self.embed_dim,
                                                     num_heads=4,
                                                     dropout=0.2)

        self.lin_1 = torch.nn.Linear(self.embed_dim, self.embed_dim)
        self.relu = torch.nn.ReLU()
        self.lin_2 = torch.nn.Linear(self.embed_dim, 1)
        self.dropout = torch.nn.Dropout(0.2)

        self.pred = torch.nn.Linear(self.seq_len, 2)

    def forward(self, batch):
        """Return predictions of shape [bs, 2] (column 0 = x, column 1 = y).

        Args:
            batch: dict of integer index tensors shaped [bs, beacon_seq_len]
                under the keys ``minor_ids``, ``macs``, ``sites``,
                ``major_ids`` and ``distances`` (``uuids`` is ignored).
        """
        # Sum of the per-feature embeddings (UUID deliberately left out).
        # All index tensors are cast to long, including distances.
        x = self.minor_id_embedding(batch['minor_ids'].long())
        x = x + self.mac_embedding(batch['macs'].long())
        x = x + self.site_embedding(batch['sites'].long())
        x = x + self.major_id_embedding(batch['major_ids'].long())
        x = x + self.distance_embedding(batch['distances'].long())

        # Position embedding; ids are created on the input's own device so the
        # module keeps working wherever it has been moved
        pos_id = torch.arange(x.shape[1], device=x.device)[None, :]
        x = x + self.pos_embedding(pos_id)

        # MultiheadAttention expects [s_len, bs, embed]
        x = x.permute(1, 0, 2)
        attn_output, _ = self.multi_att(x, x, x)
        x = (x + attn_output).permute(1, 0, 2)  # back to [bs, s_len, embed]

        # Position-wise feed forward down to one value per position
        x = self.lin_1(x)
        x = self.relu(x)
        x = self.lin_2(x)
        x = self.dropout(x)

        # [bs, s_len, 1] -> [bs, s_len] -> [bs, 2]
        x = x.squeeze(-1)
        return self.pred(x)
        

# `import matplotlib` alone does not load the pyplot submodule used below
import matplotlib.pyplot

# Setup model, optimizer and criterion
model = xy_model(settings)
optimizer = torch.optim.Adam(model.parameters(), lr=.002)
criterion = torch.nn.MSELoss()

# Move model and criterion to device
model.to(settings['device'])
criterion.to(settings['device'])
all_loss = []

# Train for a single epoch, recording the loss of every batch
for _ in range(1):
    tbar = tqdm.tqdm(train_dataloader)
    for batch in tbar:
        batch = {k: v.to(settings['device']) for k, v in batch.items()}
        optimizer.zero_grad()
        pred = model(batch)
        # Target is [bs, 2]: column 0 = x, column 1 = y
        targ = torch.cat((batch['x'][:, None], batch['y'][:, None]), 1).float()
        loss = criterion(pred, targ)
        loss.backward()
        optimizer.step()

        # Record metrics
        all_loss.append(loss.item())

# Mean loss over the last 200 batches, then the full loss curve
print(np.array(all_loss[-200:]).mean())
matplotlib.pyplot.plot(all_loss)
matplotlib.pyplot.show()

def score(dataset, settings):
    """Print the mean Euclidean distance between predicted and true positions.

    Uses the module-level ``model``, ``valid_dataloader`` and
    ``train_dataset``.

    Args:
        dataset: ``'valid'`` scores the validation loader; any other value
            scores the training set through a fresh, unshuffled loader.
        settings: dict providing ``device``.
    """
    if dataset == 'valid':
        dl = valid_dataloader
    else:
        dl = torch.utils.data.DataLoader(train_dataset,
                                         batch_size=1000,
                                         num_workers=4)

    # Collect per-batch [bs, 2] arrays and concatenate once at the end
    batch_preds = []
    batch_targs = []

    model.eval()
    try:
        # Inference only: no autograd graph needed
        with torch.no_grad():
            for batch in dl:
                batch = {k: v.to(settings['device']) for k, v in batch.items()}

                # Predictions: column 0 = x, column 1 = y
                pred = model(batch).to('cpu').numpy()
                batch_preds.append(pred.astype(np.float64))

                # Targets (the original compared predicted y against batch['x'])
                targ = torch.stack((batch['x'], batch['y']), dim=1)
                batch_targs.append(targ.to('cpu').numpy().astype(np.float64))
    finally:
        # Restore training mode even if evaluation fails part-way
        model.train()

    empty = np.empty((0, 2), dtype=np.float64)
    preds = np.concatenate(batch_preds) if batch_preds else empty
    targs = np.concatenate(batch_targs) if batch_targs else empty

    # Mean positional error over all samples
    tmp = np.sqrt(np.square(preds - targs).sum(axis=1)).sum() / preds.shape[0]
    print(tmp)


# Get accuracy
score('valid', settings)
score('train', settings)

#### riiid-model-mha

In [None]:
# https://www.kaggle.com/c/riiid-test-answer-prediction/discussion/210276
# https://medium.com/inside-machine-learning/what-is-a-transformer-d07dd1fbec04

# Multihead vs Transformer?
# This notebook seems to indicate a Transformer consists of the encoder and decoder blocks:
# https://www.kaggle.com/m10515009/saint-is-all-you-need-training-private-0-801

import gc
import pandas as pd
import numpy as np
import sklearn.metrics
import tqdm

import matplotlib.pyplot

import torch

# Notebook-wide configuration
settings = {
    'seq_len': 160,
    'n_content_id': 13525,
    'batch_size': 100,
    'embed_dim': 200,
    'n_train_rows': 5 * 1000000,
    'device': torch.device("cuda" if torch.cuda.is_available() else "cpu"),
}

# Compact dtypes for the columns that are read
dtype = {
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'answered_correctly': 'int8',
}

# Read only the first n_train_rows rows and the five columns needed
train_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    usecols=[1, 2, 3, 4, 7],
    dtype=dtype,
    nrows=settings['n_train_rows'],
)

# Keep only questions
train_df = train_df[train_df.content_type_id == False]

# Arrange by timestamp
train_df = train_df.sort_values(['timestamp'], ascending=True).reset_index(drop = True)


# Group each user: one dict of parallel arrays per user_id
train_group = (
    train_df[['user_id', 'content_id', 'answered_correctly']]
    .groupby('user_id')
    .apply(lambda r: {
        'content_id': r['content_id'].values,
        'answered_correctly': r['answered_correctly'].values,
    })
)

# Free the raw frame before building datasets
del train_df
gc.collect()


# Make validation set: move a random 10% of the users out of train
n_valid = int(.1 * train_group.shape[0])
val_idx = np.random.choice(train_group.index, n_valid, replace=False)
valid_group = train_group[val_idx].copy()
train_group.drop(valid_group.index, inplace=True)

class riiid_dataset(torch.utils.data.Dataset):
    """Map-style dataset yielding one fixed-length interaction history per user.

    Args:
        group: pandas Series indexed by user id whose values are dicts with
            the parallel arrays ``content_id`` and ``answered_correctly``.
        settings: dict providing ``seq_len``.

    Users with fewer than two interactions are excluded. The Series passed in
    is left untouched; the dataset keeps a filtered selection of it.
    """

    def __init__(self, group, settings):
        super(riiid_dataset, self).__init__()
        self.seq_len = settings['seq_len']

        # Take out people with only 1 interaction. One boolean mask replaces
        # the per-user `del`, which rebuilt the Series index on every deletion
        # and mutated the caller's object.
        keep = np.fromiter((len(history['content_id']) >= 2 for history in group),
                           dtype=bool, count=len(group))
        self.group = group[keep]

    def __len__(self):
        return len(self.group)

    @staticmethod
    def _pad(np_array, out_size):
        """Left-pad ``np_array`` with zeros, or cut it, to exactly ``out_size`` items."""
        n_pad = out_size - len(np_array)
        if n_pad > 0:
            return np.concatenate((np.zeros(n_pad, dtype=np.int64), np_array))
        # NOTE(review): over-long histories keep their FIRST out_size
        # interactions - confirm that the oldest ones are the ones wanted.
        return np_array[:out_size]

    def __getitem__(self, index):
        # Get the relevant user row
        sample = self.group.iloc[index]

        # Get contents as np.int64s, fixed to seq_len
        content_id = self._pad(sample['content_id'].astype(np.int64), self.seq_len)
        answered_correctly = self._pad(sample['answered_correctly'].astype(np.int64),
                                       self.seq_len)

        # Previous answer at each position: prepend one zero, drop the last item
        prev_ac = self._pad(answered_correctly, self.seq_len + 1)[:-1]

        return {
            'content_id': content_id,
            'answered_correctly': answered_correctly,
            'prev_ac': prev_ac,
        }
    

# Training data: shuffled batches, incomplete last batch dropped
train_dataset = riiid_dataset(train_group, settings)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=settings['batch_size'],
    shuffle=True,
    drop_last=True,
    num_workers=4,
)

# Validation data: sequential batches, incomplete last batch dropped
valid_dataset = riiid_dataset(valid_group, settings)
valid_dataloader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=settings['batch_size'],
    drop_last=True,
)


class encoder(torch.nn.Module):
    """Causal self-attention encoder over a sequence of content ids.

    Content and position embeddings are summed, passed through one residual
    multi-head self-attention layer with a causal mask, then through a
    position-wise feed-forward network.

    Args:
        settings: dict providing ``embed_dim``, ``n_content_id``,
            ``seq_len`` and ``device``.
    """

    def __init__(self, settings):
        super(encoder, self).__init__()
        self.embed_dim = settings['embed_dim']
        self.n_content_id = settings['n_content_id']
        self.seq_len = settings['seq_len']
        self.device = settings['device']

        self.cid_embedding = torch.nn.Embedding(self.n_content_id, self.embed_dim)
        self.pos_embedding = torch.nn.Embedding(self.seq_len, self.embed_dim)
        self.multi_att = torch.nn.MultiheadAttention(embed_dim=self.embed_dim,
                                                     num_heads=8,
                                                     dropout=0.2)

        self.lin_1 = torch.nn.Linear(self.embed_dim, self.embed_dim)
        self.relu = torch.nn.ReLU()
        self.lin_2 = torch.nn.Linear(self.embed_dim, self.embed_dim)
        self.dropout = torch.nn.Dropout(0.2)

    def forward(self, batch):
        """Encode ``batch['content_id']``.

        Args:
            batch: dict whose ``content_id`` entry is an integer tensor of
                shape [bs, s_len], with s_len no larger than ``seq_len``.

        Returns:
            Tensor of shape [s_len, bs, embed_dim]. The result stays
            sequence-first; it is not permuted back to batch-first.
        """
        # Content embedding
        x = self.cid_embedding(batch['content_id'])

        # Position embedding, created on the input's own device
        pos_id = torch.arange(x.shape[1], device=x.device)[None, :]
        x = x + self.pos_embedding(pos_id)

        # MultiheadAttention expects [s_len, bs, embed]
        x = x.permute(1, 0, 2)

        # Causal mask: True above the diagonal blocks attention to later
        # positions. Sized from the actual input length and built on-device.
        s_len = x.shape[0]
        attn_mask = torch.ones(s_len, s_len, device=x.device).triu(diagonal=1).bool()
        attn_output, _ = self.multi_att(x, x, x, attn_mask=attn_mask)
        x = x + attn_output

        # Feed forward
        x = self.lin_1(x)
        x = self.relu(x)
        x = self.lin_2(x)
        x = self.dropout(x)

        return x
        
class decoder(torch.nn.Module):
    """Decode from previous-answer history plus the encoder output.

    Pipeline: previous-answer embedding + learned position embedding, causal
    self-attention (residual add), causal cross-attention over the encoder
    output (residual add), then Linear -> ReLU -> Linear -> Dropout.

    Args:
        settings: dict providing ``'embed_dim'``, ``'seq_len'`` (size of the
            position-embedding table) and ``'device'``.
    """

    def __init__(self, settings):
        super(decoder, self).__init__()
        self.embed_dim = settings['embed_dim']
        self.seq_len = settings['seq_len']
        self.device = settings['device']

        # NOTE(review): the table has 10 rows; confirm which values
        # batch['prev_ac'] can actually take.
        self.prev_ac_embedding = torch.nn.Embedding(10, self.embed_dim)
        self.pos_embedding = torch.nn.Embedding(self.seq_len, self.embed_dim)
        self.multi_att_1 = torch.nn.MultiheadAttention(embed_dim = self.embed_dim
                                                     ,num_heads = 8
                                                     ,dropout = 0.2)
        self.multi_att_2 = torch.nn.MultiheadAttention(embed_dim = self.embed_dim
                                                      ,num_heads = 8
                                                      ,dropout = 0.2)

        self.lin_1 = torch.nn.Linear(self.embed_dim, self.embed_dim)
        self.relu = torch.nn.ReLU()
        self.lin_2 = torch.nn.Linear(self.embed_dim, self.embed_dim)
        self.dropout = torch.nn.Dropout(0.2)

    @staticmethod
    def _causal_mask(n_query, n_key, device):
        """Boolean mask where True (strictly above the diagonal) blocks attention."""
        return torch.triu(torch.ones(n_query, n_key, device=device), diagonal=1).bool()

    def forward(self, batch, x):
        """Return decoded features shaped ``[bs, s_len, embed]``.

        Args:
            batch: dict whose ``'prev_ac'`` entry is an integer tensor of
                shape ``[bs, s_len]``.
            x: encoder output in ``[s_len, bs, embed]`` layout, used as key
                and value of the cross-attention.
        """
        # Previous answered_correctly embedding
        y = self.prev_ac_embedding(batch['prev_ac'])

        # Position embedding
        pos_id = torch.arange(y.shape[1], device=y.device)[None, :]
        pos_y = self.pos_embedding(pos_id)

        # Add embeddings and permute
        y = y + pos_y
        y = y.permute(1, 0, 2) # y: [bs, s_len, embed] => [s_len, bs, embed]

        # Masks are sized from the actual sequence lengths instead of the
        # configured seq_len, so inputs shorter than seq_len are accepted.
        tgt_len, src_len = y.shape[0], x.shape[0]

        # MultiHead Attention 1: causal self-attention
        self_mask = self._causal_mask(tgt_len, tgt_len, y.device)
        attn_output_1, _ = self.multi_att_1(y, y, y, attn_mask = self_mask)
        y = y + attn_output_1

        # MultiHead Attention 2: causal cross-attention (query, key, value)
        cross_mask = self._causal_mask(tgt_len, src_len, y.device)
        attn_output_2, _ = self.multi_att_2(y, x, x, attn_mask = cross_mask)
        y = y + attn_output_2

        # Permute back to [batch_size, seq_len, embed_dim]
        y = y.permute(1, 0, 2)

        # Feed forward
        y = self.lin_1(y)
        y = self.relu(y)
        y = self.lin_2(y)
        y = self.dropout(y)

        # Return
        return(y)
        
class riiid_model(torch.nn.Module):
    """Encoder/decoder attention model emitting one logit per sequence position.

    Args:
        settings: dict providing ``'embed_dim'``, ``'seq_len'`` and
            ``'device'``; it is also forwarded unchanged to ``encoder`` and
            ``decoder``.
    """

    def __init__(self, settings):
        super(riiid_model, self).__init__()
        self.embed_dim = settings['embed_dim']
        self.seq_len = settings['seq_len']
        # Previously this stored settings['seq_len'] by mistake.
        self.device = settings['device']
        self.encoder = encoder(settings=settings)
        self.decoder = decoder(settings=settings)
        # Projects each position's embedding down to a single logit
        self.emb_to_seq = torch.nn.Linear(self.embed_dim, 1)

    def forward(self, batch):
        """Return logits of shape ``[bs, s_len]`` for the given batch dict."""
        x = self.encoder(batch)
        y = self.decoder(batch, x)
        y = self.emb_to_seq(y)
        # Drop the trailing singleton dimension left by the 1-unit projection
        y = y[:,:,0]
        return(y)

        
# Setup model, optimizer and criterion
model = riiid_model(settings)
optimizer = torch.optim.Adam(model.parameters(), lr=.001)
criterion = torch.nn.BCEWithLogitsLoss()
all_auc = []

# Move model and criterion to device
model.to(settings['device'])
criterion.to(settings['device'])

# Train for 5 epochs, recording a per-batch AUC on the last sequence position
model.train()
for _ in range(5):
    tbar = tqdm.tqdm(train_dataloader)
    for batch in tbar:
        for k in batch.keys():
            batch[k] = batch[k].to(settings['device'])
        optimizer.zero_grad()
        pred = model(batch)
        # NOTE(review): the loss is taken over every position of the sequence;
        # if the dataset pads sequences, padded positions are included - confirm.
        loss = criterion(pred, batch['answered_correctly'].float())
        loss.backward()
        optimizer.step()

        # For now, do AUC on only the last prediction
        t = batch['answered_correctly'][:, -1].detach().to('cpu').numpy()
        p = pred[:, -1].detach().to('cpu').numpy()
        # roc_auc_score raises ValueError when only one class is present;
        # skip such batches instead of aborting the whole training run.
        if np.unique(t).size > 1:
            auc = sklearn.metrics.roc_auc_score(t, p)
            all_auc.append(auc)

print(np.array(all_auc[-200:]).mean())
matplotlib.pyplot.plot(all_auc)
matplotlib.pyplot.show()

# Validation
val_ac = np.array([])
val_pred = np.array([])

# Evaluate deterministically: eval() switches the dropout layers off and
# no_grad() avoids building autograd graphs. The previous train/eval mode is
# restored afterwards so training can be resumed.
was_training = model.training
model.eval()
with torch.no_grad():
    for batch in valid_dataloader:
        for k in batch.keys():
            batch[k] = batch[k].to(settings['device'])
        pred = model(batch)

        # For now, do AUC on only the last prediction
        t = batch['answered_correctly'][:, -1].detach().to('cpu').numpy()
        p = pred[:, -1].detach().to('cpu').numpy()

        # Concatenate
        val_ac = np.concatenate((val_ac, t))
        val_pred = np.concatenate((val_pred, p))
model.train(was_training)

sklearn.metrics.roc_auc_score(val_ac, val_pred)

#### riiid-model-transformer

In [None]:
# https://www.kaggle.com/m10515009/saint-is-all-you-need-training-private-0-801

import gc
import pandas as pd
import numpy as np
import sklearn.metrics
import tqdm

import matplotlib.pyplot

import torch

# Run configuration shared by the dataset, the model and the training loop
settings = {
    'seq_len': 160,
    'n_content_id': 13525,
    'batch_size': 100,
    'embed_dim': 256,
    'n_train_rows': 5 * 1000000,
    'device': torch.device("cuda" if torch.cuda.is_available() else "cpu"),
}

# Column dtypes for the subset of train.csv columns that gets loaded
dtype = {
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'answered_correctly': 'int8',
}

# Load only the first n_train_rows rows and the positional columns 1, 2, 3, 4, 7
train_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    usecols=[1, 2, 3, 4, 7],
    dtype=dtype,
    nrows=settings['n_train_rows'],
)

# Keep only questions (rows whose content_type_id is 0)
train_df = train_df[train_df['content_type_id'] == 0]

# Arrange by timestamp, ascending, with a fresh 0..n-1 index
train_df = train_df.sort_values('timestamp').reset_index(drop=True)


# Group each user: one dict of aligned arrays per user_id
train_group = (
    train_df[['user_id', 'content_id', 'answered_correctly']]
    .groupby('user_id')
    .apply(lambda r: {
        'content_id': r['content_id'].values,
        'answered_correctly': r['answered_correctly'].values,
    })
)

# The raw frame is no longer needed once the per-user arrays exist
del train_df
gc.collect()


# Make validation set: hold out a random 10% of users (unseeded draw)
val_idx = np.random.choice(train_group.index, int(.1 * len(train_group)), replace=False)
valid_group = train_group[val_idx].copy()
train_group.drop(valid_group.index, inplace=True)

class riiid_dataset(torch.utils.data.Dataset):
    """Per-user dataset of fixed-length question/answer sequences.

    Each item is a dict of three ``np.int64`` arrays of length ``seq_len``:

    * ``'content_id'``: the user's content ids,
    * ``'answered_correctly'``: the matching answers,
    * ``'prev_ac'``: ``answered_correctly`` shifted right by one step (a 0 is
      inserted at the front and the last answer is dropped).

    Sequences shorter than ``seq_len`` are left-padded with 0; longer ones
    keep only their first ``seq_len`` entries.

    NOTE(review): the padding value 0 is indistinguishable from a genuine
    ``content_id`` / ``answered_correctly`` of 0 - confirm this is acceptable.

    Args:
        group: pandas Series indexed by user id whose values are dicts with
            ``'content_id'`` and ``'answered_correctly'`` arrays.
        settings: dict providing ``'seq_len'``.
    """

    def __init__(self, group, settings):
        super(riiid_dataset, self).__init__()
        self.seq_len = settings['seq_len']

        # Take out people with only 1 interaction. A single boolean mask
        # builds a new Series, so the caller's ``group`` is left untouched
        # (the earlier per-user ``del`` loop mutated it in place).
        keep = np.array([len(row['content_id']) >= 2 for row in group], dtype=bool)
        self.group = group[keep]

    def __len__(self):
        return(len(self.group))

    def _pad(self, np_array, out_size=None):
        """Left-pad ``np_array`` with zeros, or truncate it, to ``out_size``."""
        if out_size is None:
            out_size = self.seq_len
        n_pad = out_size - len(np_array)
        if n_pad > 0:
            return(np.concatenate((np.zeros(n_pad, dtype=np.int64), np_array)))
        return(np_array[:out_size])

    def __getitem__(self, index):
        # Get the relevant user row (positional lookup)
        sample = self.group.iloc[index]

        # Get contents as np.int64s
        content_id = self._pad(sample['content_id'].astype(np.int64))
        answered_correctly = self._pad(sample['answered_correctly'].astype(np.int64))

        # Shift answers right by one: padding to seq_len + 1 prepends a 0,
        # then the final (current) answer is dropped.
        prev_ac = self._pad(answered_correctly, self.seq_len + 1)[:-1]

        # Return
        return({
            'content_id' : content_id
            ,'answered_correctly' : answered_correctly
            ,'prev_ac' : prev_ac
        })
    

# Training data: reshuffled every epoch, loaded by 4 worker processes;
# the final incomplete batch is dropped.
train_dataset = riiid_dataset(group=train_group, settings=settings)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=settings['batch_size'],
    shuffle=True,
    num_workers=4,
    drop_last=True,
)

# Validation data: fixed order, loaded in the main process;
# the final incomplete batch is dropped here as well.
valid_dataset = riiid_dataset(group=valid_group, settings=settings)
valid_dataloader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=settings['batch_size'],
    drop_last=True,
)

class riiid_encoder(torch.nn.Module):
    """Embed content ids and add a learned position embedding.

    Reads ``'n_content_id'``, ``'seq_len'``, ``'embed_dim'`` and ``'device'``
    from ``settings``. The forward pass returns a sequence-first tensor,
    ``[s_len, bs, embed]``.
    """

    def __init__(self, settings):
        super(riiid_encoder, self).__init__()
        self.device = settings['device']

        dim = settings['embed_dim']
        self.cid_embedding = torch.nn.Embedding(settings['n_content_id'], dim)
        self.pos_embedding = torch.nn.Embedding(settings['seq_len'], dim)

    def forward(self, batch):
        # Look up one vector per content id: [bs, s_len, embed]
        tokens = self.cid_embedding(batch['content_id'])

        # Positions 0..s_len-1 as a [1, s_len] row that broadcasts over the batch
        positions = torch.arange(tokens.shape[1]).unsqueeze(0).to(self.device)
        combined = tokens + self.pos_embedding(positions)

        # Swap batch and sequence axes: [bs, s_len, embed] => [s_len, bs, embed]
        return combined.transpose(0, 1)
        
class riiid_decoder(torch.nn.Module):
    """Embed the previous-answer sequence and add a learned position embedding.

    Reads ``'seq_len'``, ``'embed_dim'`` and ``'device'`` from ``settings``.
    The answer table has two rows, so ``batch['prev_ac']`` must contain only
    0 or 1. The forward pass returns a sequence-first tensor,
    ``[s_len, bs, embed]``.
    """

    def __init__(self, settings):
        super(riiid_decoder, self).__init__()
        self.device = settings['device']

        dim = settings['embed_dim']
        self.prev_ac_embedding = torch.nn.Embedding(2, dim)
        self.pos_embedding = torch.nn.Embedding(settings['seq_len'], dim)

    def forward(self, batch):
        # Look up one vector per previous answer: [bs, s_len, embed]
        answers = self.prev_ac_embedding(batch['prev_ac'])

        # Positions 0..s_len-1 as a [1, s_len] row that broadcasts over the batch
        positions = torch.arange(answers.shape[1]).unsqueeze(0).to(self.device)
        combined = answers + self.pos_embedding(positions)

        # Swap batch and sequence axes: [bs, s_len, embed] => [s_len, bs, embed]
        return combined.transpose(0, 1)
        
class riiid_model(torch.nn.Module):
    """Transformer model emitting one logit per sequence position.

    ``riiid_encoder`` / ``riiid_decoder`` produce the embedded source and
    target sequences, ``torch.nn.Transformer`` (2 encoder + 2 decoder layers,
    8 heads) mixes them under causal masks, and a LayerNorm -> feed-forward
    -> residual LayerNorm head feeds a final 1-unit linear layer.

    Args:
        settings: dict providing ``'seq_len'``, ``'embed_dim'`` and
            ``'device'``; it is also forwarded to the encoder and decoder.
    """

    def __init__(self, settings):
        # Initialize
        super(riiid_model, self).__init__()
        self.seq_len = settings['seq_len']
        self.device = settings['device']

        # Encoder and Decoder
        self.encoder = riiid_encoder(settings)
        self.decoder = riiid_decoder(settings)

        # Transformer
        self.transformer = torch.nn.Transformer(nhead = 8
                                                ,d_model = settings['embed_dim']
                                                ,num_encoder_layers = 2
                                                ,num_decoder_layers = 2
                                                ,dropout = .2
                                               )

        # FFN
        self.layer_norm_1 = torch.nn.LayerNorm(settings['embed_dim'])
        self.lin_1 = torch.nn.Linear(settings['embed_dim'], settings['embed_dim'])
        self.relu = torch.nn.ReLU()
        self.lin_2 = torch.nn.Linear(settings['embed_dim'], settings['embed_dim'])
        self.dropout = torch.nn.Dropout(.2)
        self.layer_norm_2 = torch.nn.LayerNorm(settings['embed_dim'])

        # Prediction
        self.pred = torch.nn.Linear(settings['embed_dim'], 1)

    @staticmethod
    def _causal_mask(n_query, n_key, device):
        """Boolean mask where True (strictly above the diagonal) blocks attention."""
        return torch.triu(torch.ones(n_query, n_key, device=device), diagonal=1).bool()

    def forward(self, batch):
        """Return logits of shape ``[bs, s_len]`` for the given batch dict."""
        # Get encodings/decodings, both [s_len, bs, embed]
        x = self.encoder(batch)
        y = self.decoder(batch)

        # Masks are sized from the actual sequence lengths instead of the
        # configured seq_len, so inputs shorter than seq_len are accepted.
        src_len, tgt_len = x.shape[0], y.shape[0]
        dev = x.device

        # Put them through transformer; every attention is causal
        t = self.transformer(src = x
                             ,tgt = y
                             ,src_mask = self._causal_mask(src_len, src_len, dev)
                             ,tgt_mask = self._causal_mask(tgt_len, tgt_len, dev)
                             ,memory_mask = self._causal_mask(tgt_len, src_len, dev)
                            )
        t = self.layer_norm_1(t)
        t = t.permute(1, 0, 2) # [s_len, bs, embed] => [bs, s_len, embed]

        # FFN
        z = self.lin_1(t)
        z = self.relu(z)
        z = self.lin_2(z)
        z = self.dropout(z)

        # LayerNorm with z + t (residual connection around the FFN)
        z = self.layer_norm_2(z + t)

        # Predict one logit per position and drop the trailing singleton dim
        z = self.pred(z)
        z = z[:,:,0]
        return(z)

        
# Setup model, optimizer and criterion
model = riiid_model(settings)
optimizer = torch.optim.AdamW(model.parameters(), lr=.001)
criterion = torch.nn.BCEWithLogitsLoss()
all_auc = []

# Move model and criterion to device
model.to(settings['device'])
criterion.to(settings['device'])


# Train for 10 epochs, recording a per-batch AUC on the last sequence position
model.train()
for _ in range(10):
    tbar = tqdm.tqdm(train_dataloader)
    for batch in tbar:
        for k in batch.keys():
            batch[k] = batch[k].to(settings['device'])
        optimizer.zero_grad()
        pred = model(batch)
        # NOTE(review): the loss covers every position, including the
        # zero-padded ones produced by the dataset - confirm this is intended.
        loss = criterion(pred, batch['answered_correctly'].float())
        loss.backward()
        optimizer.step()

        # For now, do AUC on only the last prediction
        t = batch['answered_correctly'][:, -1].detach().to('cpu').numpy()
        p = pred[:, -1].detach().to('cpu').numpy()
        # roc_auc_score raises ValueError when only one class is present;
        # skip such batches instead of aborting the whole training run.
        if np.unique(t).size > 1:
            auc = sklearn.metrics.roc_auc_score(t, p)
            all_auc.append(auc)


print(np.array(all_auc[-200:]).mean())
matplotlib.pyplot.plot(all_auc)
matplotlib.pyplot.show()


# Validation
val_ac = np.array([])
val_pred = np.array([])

# Evaluate deterministically: eval() switches the dropout layers off and
# no_grad() avoids building autograd graphs. The previous train/eval mode is
# restored afterwards so training can be resumed.
was_training = model.training
model.eval()
with torch.no_grad():
    for batch in valid_dataloader:
        for k in batch.keys():
            batch[k] = batch[k].to(settings['device'])

        # Get predictions
        pred = model(batch)

        # For now, do AUC on only the last prediction
        t = batch['answered_correctly'][:, -1].detach().to('cpu').numpy()
        p = pred[:, -1].detach().to('cpu').numpy()

        # Concatenate
        val_ac = np.concatenate((val_ac, t))
        val_pred = np.concatenate((val_pred, p))
model.train(was_training)

sklearn.metrics.roc_auc_score(val_ac, val_pred)