In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/githubactivities/userdata.npz
/kaggle/input/githubactivities/tmp.mm
/kaggle/input/githubactivities/scalers.pkl.z
/kaggle/input/githubactivities/ml.npz
/kaggle/input/githubactivities/users.json
/kaggle/input/notebook13088a6bed/model.pth
/kaggle/input/notebook13088a6bed/epoch=12-val_loss=0.231.ckpt
/kaggle/input/notebook13088a6bed/epoch=17-val_loss=0.230.ckpt
/kaggle/input/notebook13088a6bed/epoch=8-val_loss=0.232.ckpt
/kaggle/input/notebook13088a6bed/epoch=13-val_loss=0.231.ckpt
/kaggle/input/notebook13088a6bed/epoch=14-val_loss=0.231.ckpt
/kaggle/input/notebook13088a6bed/epoch=15-val_loss=0.231.ckpt
/kaggle/input/notebook13088a6bed/__results__.html
/kaggle/input/notebook13088a6bed/epoch=16-val_loss=0.230.ckpt
/kaggle/input/notebook13088a6bed/last.ckpt
/kaggle/input/notebook13088a6bed/__notebook__.ipynb
/kaggle/input/notebook13088a6bed/epoch=9-val_loss=0.232.ckpt
/kaggle/input/notebook13088a6bed/__output__.json
/kaggle/input/notebook13088a6bed/epoch=11-val_loss=0.232.ckpt


In [2]:
def has_cuda():
    """Report whether CUDA is usable without importing PyTorch into this process.

    The check runs in a child interpreter so that ``torch`` is never imported
    into the calling process by this function.  The child reports its answer
    through its exit status: 14 means ``torch.cuda.is_available()`` was true,
    7 means it was false.  Any other status (for example when ``torch`` cannot
    be imported in the child) is treated as "no CUDA".

    Returns:
        bool: ``True`` only when the child exited with status 14.
    """
    import subprocess
    import sys

    probe = "import torch; import sys; sys.exit(14 if torch.cuda.is_available() else 7)"
    # Use the interpreter that runs this code (not whatever `python` resolves to
    # on PATH) so the probe sees the same installed packages.  Passing argv as a
    # list avoids the shell, and `returncode` is already the decoded exit status
    # on every platform (os.system's raw value needs `>> 8` only on POSIX).
    interpreter = sys.executable or "python"
    completed = subprocess.run([interpreter, "-c", probe])
    return completed.returncode == 14

In [3]:
import math
import time
import torch
import random
from torch import Tensor
from torch import nn
from torch.optim.lr_scheduler import ReduceLROnPlateau, CyclicLR
from torch.nn import functional as F
from pathlib import Path

# os.cpu_count() returns None when the CPU count cannot be determined; fall back to one thread
# instead of passing None (which set_num_threads rejects).
torch.set_num_threads(os.cpu_count() or 1)
from torch.utils.data import Dataset, DataLoader

In [4]:
class GithubContribDataset(Dataset):
    """Sliding-window dataset over the per-user series stored in ``ml.npz``.

    Every item pairs ``seq_size`` consecutive steps of one user's scaled series
    with the 7 steps that immediately follow it.  Items are enumerated user by
    user: a flat index is split into (user row, window start).

    Nothing is read from disk until :meth:`load` is called; until then the
    dataset reports a length of 0.
    """

    def __init__(self, base_path='.', seq_size=64):
        """
        Args:
            base_path: directory that contains ``ml.npz``.
            seq_size: number of input steps in each window.
        """
        self.base_path = Path(base_path)
        self.seq_size = seq_size
        # Placeholders, filled in by load().
        self._len = 0
        self.data = dict()
        self.users = None
        self.weeks = None
        self.weeks_cluster = None
        self.scaled_weeks = None
        self.weeks_target = None

    def load(self):
        """Read ``ml.npz`` and derive the arrays used by ``__getitem__``."""
        archive = np.load(self.base_path / 'ml.npz')
        self.data = archive
        self.scaled_weeks = archive['scaled_weeks']
        self.users = archive['users'].astype('float32')
        self.weeks = archive['weeks'].astype('float32')
        self.weeks_cluster = archive['weeks_cluster'].astype('int32')

        # Each row contributes (steps - seq_size - 7) window start positions.
        n_rows = len(self.scaled_weeks)
        n_steps = self.scaled_weeks.shape[1]
        self._len = n_rows * (n_steps - self.seq_size - 7)

        # Target carries two channels per step: [raw value, cluster id], both as float32.
        channels = (self.weeks[:, :, np.newaxis], self.weeks_cluster[:, :, np.newaxis])
        self.weeks_target = np.concatenate(channels, axis=2).astype('float32')

    def __len__(self):
        """Number of (user, window start) pairs available after ``load``."""
        return self._len

    def __getitem__(self, key):
        """Return ``(start % 7, user features, input window, 7-step target)`` for flat index ``key``."""
        starts_per_row = self.weeks.shape[1] - self.seq_size - 7
        row, start = divmod(key, starts_per_row)
        stop = start + self.seq_size

        day = torch.tensor([start % 7])
        user = torch.from_numpy(self.users[row])
        history = torch.from_numpy(self.scaled_weeks[row, start:stop])
        target = torch.from_numpy(self.weeks_target[row, stop:stop + 7])
        return (day, user, history, target)

In [5]:
# Build the dataset on top of the attached Kaggle input, read its arrays, and show its size.
DATA_DIR = "/kaggle/input/githubactivities"
ds = GithubContribDataset(base_path=DATA_DIR)
ds.load()
len(ds)

1970181

In [6]:
%%time
# Fetch every sample once (results discarded) so the cell timing reflects __getitem__ cost alone.
for sample_idx in range(len(ds)):
    ds[sample_idx]

CPU times: user 13.9 s, sys: 58.6 ms, total: 14 s
Wall time: 14 s


In [7]:
# randrange excludes the upper bound; randint(0, len(ds)) could return len(ds), one past the last valid index.
ds[random.randrange(len(ds))]

(tensor([3]),
 tensor([-4.7285e-01,  7.3039e-01, -1.6775e+00, -7.1764e-01,  8.6928e-01,
          3.5468e-01,  5.4589e-02, -6.5455e-01, -6.3050e-02,  9.8204e-01,
          7.0088e-01, -1.0370e+00,  1.0314e+00, -7.8577e-01, -7.7829e-01,
         -1.0674e+00, -1.3422e+00, -1.5885e+00, -7.7949e-01, -8.5064e-01,
         -9.2992e-01, -8.9507e-01, -1.0089e+00, -1.1305e+00, -8.9695e-01,
         -1.0170e+00, -1.1388e+00, -8.9337e-01, -1.0119e+00, -1.1318e+00,
         -8.9248e-01, -1.0108e+00, -1.1326e+00, -8.8432e-01, -9.9247e-01,
         -1.1023e+00, -7.8680e-01, -8.5840e-01, -9.3887e-01,  1.4781e-02,
          4.0327e-02, -1.1220e-01,  2.7905e-02,  5.2399e-02,  2.6352e-01,
          3.0714e-02,  8.2176e-02,  2.3271e-01,  2.6841e-02,  8.6138e-02,
          3.5421e-01,  5.2337e-02,  7.9234e-02,  4.3451e-01,  4.2634e-02,
          8.9685e-02,  6.9637e-01, -8.2370e-01, -1.2952e-03,  1.4602e+00]),
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [8]:
import pytorch_lightning as pl

In [9]:
class SwitchableNorm1d(nn.Module):
    """Learned blend of batch normalisation and layer normalisation.

    Simplification over https://arxiv.org/abs/1907.10473
    by using only batchnorm and layernorm.
    Allowing the layer to work with both conv data shape (N, C, H) and linear shape (N, *, F).

    The output is ``g * BatchNorm(x) + (1 - g) * LayerNorm(x) (+ bias)`` where
    ``g = sigmoid(weight)`` is a learned gate.

    Args:
        num_features: feature count given to ``nn.BatchNorm1d``.
        normalized_shape: shape given to ``nn.LayerNorm``; defaults to ``num_features``.
        weight_shape: shape of the gate (and bias) parameters.  When omitted it is
            ``(num_features,)`` if ``normalized_shape`` is an int or a 1-tuple,
            otherwise ``normalized_shape``.
        momentum: momentum of the batch-norm running statistics.
        bias: whether to add a learned bias after blending.
    """
    def __init__(self, num_features, normalized_shape=None, weight_shape=None, momentum=0.1, bias=True):
        super().__init__()
        if normalized_shape is None:
            normalized_shape = num_features
        self.bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.ln = nn.LayerNorm(normalized_shape)
        if weight_shape is None:
            weight_shape = normalized_shape
            if isinstance(normalized_shape, int) or len(normalized_shape) == 1:
                weight_shape = (num_features,)
        self.weight = nn.Parameter(torch.empty(weight_shape))
        torch.nn.init.normal_(self.weight, 0, 0.7)
        if bias:
            self.bias = nn.Parameter(torch.zeros(weight_shape))
        else:
            self.bias = None

    @staticmethod
    def _align(param: Tensor, input: Tensor) -> Tensor:
        """Reshape a per-channel parameter so it broadcasts over a conv-shaped input.

        When ``param`` has fewer dims than ``input`` and its last size differs from
        the input's last size, it is assumed to index the channel axis (dim 1) and is
        viewed as ``(1, *param.shape, 1, ...)``.  Otherwise it is returned unchanged
        and ordinary trailing-axis broadcasting applies.
        """
        if param.dim() < input.dim() and param.size(-1) != input.size(-1):
            # assuming to work with convolution shape (N, C, H, *)
            pad = (1,) * (input.dim() - 1 - param.dim())
            param = param.view(1, *param.shape, *pad)
            assert param.dim() == input.dim()
        return param

    def forward(self, input: Tensor):
        """Return the gated mix of batch-normalised and layer-normalised ``input``."""
        bn = self.bn(input)
        ln = self.ln(input)
        weight = self._align(self.weight.sigmoid(), input)
        res = weight * bn + (1 - weight) * ln
        if self.bias is not None:
            # The bias has the same shape as the gate, so it must be aligned the same
            # way; otherwise a (C,) bias would be broadcast against the last axis.
            res = res + self._align(self.bias, input)
        return res

class GCModel(nn.Module):
    """Forecasting network that mixes a contribution history with per-user features
    through learned complex-valued parameters and FFTs.

    ``forward`` returns a real tensor viewed as ``(batch, 6, 7, 2)``.  The ``CLoss``
    defined in this notebook reads ``[..., 0]`` as a regression value and
    ``[..., 1]`` as a class logit, with dim 1 as the class axis and dim 2 as the
    7 predicted steps.

    Args:
        seq_size: length of the input history; first axis of ``contrib_w``.
        user_features: number of per-user features; also the channel count fed to
            the ``hidden`` convolution stack.
    """
    def __init__(self, seq_size: int, user_features: int):
        super().__init__()
        hidden_size = 256
        
        # Complex multiplicative weights applied to the FFT of the history: (seq_size, user_features, 8).
        self.contrib_w = nn.Parameter(torch.empty((seq_size, user_features, 8), dtype=torch.cfloat))
        torch.nn.init.kaiming_normal_(self.contrib_w)
        
        # Complex additive term with one slice per value of the `day` input (7 slices).
        self.contrib_b = nn.Parameter(torch.empty((7, seq_size, user_features, 8), dtype=torch.cfloat))
        torch.nn.init.xavier_uniform_(self.contrib_b, 0.5)
        
        # Passed through sigmoid in forward and used to blend max-magnitude and mean pooling over the last axis.
        self.contrib_alpha = nn.Parameter(torch.empty((1, 1, user_features), dtype=torch.cfloat))
        torch.nn.init.kaiming_normal_(self.contrib_alpha)
        
        # Passed through tanh in forward and used to blend the conv branch with the spectral skip branch.
        self.hidden_alpha = nn.Parameter(torch.empty((7, user_features), dtype=torch.cfloat))
        torch.nn.init.kaiming_normal_(self.hidden_alpha)
        
        self.user_bn = SwitchableNorm1d(user_features, momentum=0.01)
        
        # Convolution stack over the sequence axis: 1x1 conv, then kernel 7 / stride 7,
        # normalisation, and a final kernel-3 conv back to `user_features` channels.
        self.hidden = nn.Sequential(
            nn.Conv1d(user_features, hidden_size, 1),
            nn.Mish(inplace=True),
            nn.Conv1d(hidden_size, hidden_size, 7, 7),
            SwitchableNorm1d(hidden_size, seq_size // 7, (hidden_size, 1)),
            nn.Mish(inplace=True),
            nn.Conv1d(hidden_size, user_features, 3)
        )
        
        # Head on the flattened features; expects 7 * user_features inputs and emits 7 * 6 * 2 values.
        self.final = nn.Sequential(
            SwitchableNorm1d(7 * user_features, momentum=0.01),
            nn.Linear(7 * user_features, hidden_size),
            nn.Mish(inplace=True),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, 7 * 6 * 2, bias=True)
        )
        
    def forward(self, day: Tensor, users: Tensor, contribs: Tensor) -> Tensor:
        """Run the model on one batch.

        Args:
            day: integer index tensor used to select a slice of ``contrib_b``;
                assumed (batch, 1) with values in 0..6 — TODO confirm against the dataset.
            users: per-user feature tensor; assumed (batch, user_features) — TODO confirm.
            contribs: history tensor; assumed (batch, seq_size) — TODO confirm.

        Returns:
            Real tensor viewed as (batch, 6, 7, 2).

        In training mode the inputs are randomly perturbed (masking and noise) before use.
        """
        # Select, per sample, the contrib_b slice indexed by `day` -> (batch, 1, seq, features, 8).
        contrib_b = self.contrib_b[None, :, :, :, :].expand(day.size(0), -1, -1, -1, -1)
        contrib_b = torch.gather(contrib_b, 1, day[:, :, None, None, None].expand(-1, 1, *contrib_b.size()[2:]))
        
        if self.training:
            # Random element mask; each sample draws its drop threshold from {0, ..., 4} / 100.
            contrib_mask = torch.rand_like(contribs) > torch.randint(0, 5, (contribs.size(0), 1), device=contribs.device, dtype=contribs.dtype) / 100
            # Statistics over the whole batch (scalars) and per row (dim=1).
            contribs_std, contribs_mean = torch.std_mean(contribs)
            contribs_std2, contribs_mean2 = torch.std_mean(contribs, dim=1, keepdim=True)
            
            # Per element: use the global statistic when the draw from {0, 1, 2} is 0, else the per-row one.
            contribs_std = torch.where(
                torch.randint_like(contribs, 0, 3) == 0,
                contribs_std,
                contribs_std2
            )
            contribs_mean = torch.where(
                torch.randint_like(contribs, 0, 3) == 0,
                contribs_mean,
                contribs_mean2
            )
            
            noise = torch.normal(contribs_mean, contribs_std).to(device=contribs.device)
            mult = torch.randint_like(noise, 0, 101).float() / 100
            # Where the draw from {0, ..., 10} equals 10: blend the value with noise by factor `mult`;
            # elsewhere: add mean-centred noise scaled by a factor from {0, ..., 4} / 100.
            contribs = torch.where(
                torch.randint_like(contribs, 0, 11) > 9,
                (1 - mult) * contribs + noise * mult,
                contribs + (noise - contribs_mean) * torch.randint_like(noise, 0, 5).float() / 100
            )
            contribs = contribs * contrib_mask
            # Clamp negatives introduced by the noise back to zero.
            contribs = contribs.relu()
        
        # FFT of the history along dim 1, broadcast against contrib_w and shifted by the selected contrib_b.
        contribs_x = torch.fft.fft(contribs[:, :, None, None], dim=1, norm='ortho') * self.contrib_w + contrib_b.squeeze_(1)
        
        users = self.user_bn(users)
        if self.training:
            # Additive noise whose scale follows the per-feature std across the batch.
            users_std = torch.std(users, dim=0, keepdim=True).expand_as(users)
            noise = torch.normal(0, users_std).to(device=users.device)
            users = users + noise.detach() * torch.randint_like(noise, 0, 5) / 100
        # Multiply by the FFT of the user features in the frequency domain of the last axis, then transform back.
        contribs_x = torch.fft.fft(users, dim=1, norm='ortho')[:, None, :, None].expand_as(contribs_x) * torch.fft.fft(contribs_x, dim=3)
        contribs_x = torch.fft.ifft(contribs_x, dim=3)
        contrib_alpha = torch.sigmoid(self.contrib_alpha)
        # Reduce the last axis with a learned blend of max-magnitude and mean.
        contribs_x = contrib_alpha * torch.amax(torch.abs(contribs_x), dim=3) + (1 - contrib_alpha) * torch.mean(contribs_x, dim=3)
        # Both masks below compare against the magnitudes computed here, before any masking.
        acontribs_x = torch.abs(contribs_x)
        contribs_x = contribs_x * (acontribs_x > torch.quantile(acontribs_x, 0.1, dim=1, keepdim=True)) # mimic fft filters
        contribs_x = contribs_x * (acontribs_x > torch.quantile(acontribs_x, 0.1, dim=2, keepdim=True)) # on both axis
        contribs_x = torch.fft.ifft2(contribs_x, dim=(1, 2), norm='ortho')
        # Conv1d wants channels on dim 1, so swap the last two axes around the conv stack.
        hidden = self.hidden(contribs_x.real.permute(0, 2, 1)).permute(0, 2, 1)
        hidden_alpha = self.hidden_alpha.tanh()
        # Blend the 2-D spectrum of the conv output with the spectrum of the last hidden.size(1) steps of contribs_x.
        hidden = (torch.fft.fft2(hidden, dim=(2, 1)) * hidden_alpha +
                  torch.fft.fft(contribs_x[:, -hidden.size(1):, :], dim=1) * (1 - hidden_alpha))
        ahidden = torch.abs(hidden)
        # Zero entries whose magnitude is not above the 30% quantile along dim 2.
        hidden = hidden * (ahidden > torch.quantile(ahidden, 0.3, dim=2, keepdim=True)) 
        hidden = torch.fft.ifft(hidden, dim=1)
        # Flatten the real part per sample, apply the head, and view as (batch, 6, 7, 2).
        res = self.final(hidden.real.reshape(hidden.size(0), -1)).view(hidden.size(0), 6, 7, 2)
        return res

In [10]:
# Size the network from the dataset: window length and number of per-user features.
model = GCModel(seq_size=ds.seq_size, user_features=ds.users.shape[-1])
display(model)
# Smoke test in eval mode: add a batch axis to one sample's inputs (the target is dropped) and run a forward pass.
smoke_inputs = tuple(x[None, :] for x in ds[800][:-1])
model.eval()(*smoke_inputs)

GCModel(
  (user_bn): SwitchableNorm1d(
    (bn): BatchNorm1d(60, eps=1e-05, momentum=0.01, affine=True, track_running_stats=True)
    (ln): LayerNorm((60,), eps=1e-05, elementwise_affine=True)
  )
  (hidden): Sequential(
    (0): Conv1d(60, 256, kernel_size=(1,), stride=(1,))
    (1): Mish(inplace=True)
    (2): Conv1d(256, 256, kernel_size=(7,), stride=(7,))
    (3): SwitchableNorm1d(
      (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (ln): LayerNorm((9,), eps=1e-05, elementwise_affine=True)
    )
    (4): Mish(inplace=True)
    (5): Conv1d(256, 60, kernel_size=(3,), stride=(1,))
  )
  (final): Sequential(
    (0): SwitchableNorm1d(
      (bn): BatchNorm1d(420, eps=1e-05, momentum=0.01, affine=True, track_running_stats=True)
      (ln): LayerNorm((420,), eps=1e-05, elementwise_affine=True)
    )
    (1): Linear(in_features=420, out_features=256, bias=True)
    (2): Mish(inplace=True)
    (3): Dropout(p=0.1, inplace=False)
    (4): Linea

tensor([[[[-0.0115,  0.0931],
          [-0.0532, -0.0965],
          [-0.0318,  0.0230],
          [ 0.0405, -0.0125],
          [ 0.0841, -0.0809],
          [ 0.0189, -0.1827],
          [ 0.1045,  0.0819]],

         [[-0.1667, -0.0572],
          [ 0.1296,  0.1146],
          [ 0.2280, -0.0935],
          [-0.0907, -0.2002],
          [ 0.1630, -0.0994],
          [-0.0371, -0.1117],
          [ 0.0523,  0.0641]],

         [[ 0.0057, -0.0037],
          [ 0.0065, -0.0245],
          [-0.1038,  0.0017],
          [-0.1934, -0.0527],
          [ 0.1316,  0.2234],
          [ 0.0480, -0.1016],
          [ 0.0940,  0.0520]],

         [[ 0.2587,  0.2097],
          [-0.0597,  0.1296],
          [-0.1755, -0.0848],
          [-0.1267,  0.0121],
          [ 0.1301, -0.1027],
          [-0.2903,  0.0246],
          [-0.0526, -0.0963]],

         [[-0.1741,  0.0632],
          [-0.0169,  0.1672],
          [ 0.1007,  0.0219],
          [ 0.0610, -0.0739],
          [-0.0916,  0.0044],
  

In [11]:
# Class weights: inverse of each class's log(1 + count), rescaled so the smallest weight is exactly 1.
log_counts = np.log1p(ds.data['value_counts'])
weight = np.sum(log_counts) / log_counts
weight = weight / np.min(weight)
weight

array([1.        , 1.27747819, 1.30533738, 1.42618265, 1.47712144,
       1.57018664])

In [12]:
class CLoss(nn.Module):
    """Combined classification + regression loss for the ``(batch, classes, 7, 2)`` model output.

    ``preds[..., 0]`` holds one regression value per class and ``preds[..., 1]`` the
    class logits (class axis is dim 1).  ``y_true`` is ``(batch, 7, 2)`` with
    ``[..., 0]`` the regression target and ``[..., 1]`` the class index stored as a float.

    Per step the loss is the sum of
      * SmoothL1 between the target and the regression value of the true class,
      * SmoothL1 of every class's regression value, weighted by the softmax class
        probability and by the true class's weight relative to the largest weight,
      * class-weighted cross entropy,
    multiplied by ``1 / position`` (positions 1..7) and averaged.

    Args:
        weight: one weight per class (sequence or array).
    """
    def __init__(self, weight):
        super().__init__()
        weight=torch.tensor(weight).float()
        self.register_buffer('rescale_weight', weight)
        self.ce = nn.CrossEntropyLoss(weight, reduction='none')
        self.regr = nn.SmoothL1Loss(reduction='none')
        # 1/1, 1/2, ..., 1/7: later steps contribute less.
        seq_weight = 1 / (torch.arange(7) + 1)[None, :]
        self.register_buffer('seq_weight', seq_weight)

    def forward(self, preds, y_true):
        """Return the scalar loss for ``preds`` (batch, classes, 7, 2) against ``y_true`` (batch, 7, 2)."""
        values = preds[:, :, :, 0]
        logits = preds[:, :, :, 1]
        target_value = y_true[:, :, 0]
        target_class = y_true[:, :, 1].long()

        ce = self.ce(logits, target_class)
        # Regression value predicted for the true class at each step.
        picked = torch.gather(values, 1, target_class[:, None, :]).squeeze(1)
        regr = self.regr(picked, target_value)
        # Expand the target to the prediction's shape explicitly: relying on implicit
        # broadcasting makes SmoothL1Loss warn about mismatched sizes on every call.
        full_target = target_value[:, None, :].expand_as(values)
        regr_full = self.regr(values, full_target) * logits.softmax(1)
        # Weight of the true class at each step (same values as gathering along the class axis).
        rescale_weight = self.rescale_weight[target_class]
        return ((regr +
                 regr_full.sum(1) * rescale_weight / torch.amax(self.rescale_weight) +
                 ce) * self.seq_weight).mean()

In [13]:
# Hold out 20% of the samples for validation.
# A dedicated generator seeded with 42 drives the random split.
seed = torch.Generator().manual_seed(42)

train_set_size = int(len(ds) * 0.8)
valid_set_size = len(ds) - train_set_size

train_set, valid_set = torch.utils.data.random_split(
    ds,
    [train_set_size, valid_set_size],
    generator=seed,
)

In [14]:
class GCLightningModule(pl.LightningModule):
    """Lightning wrapper that trains and validates a model with ``CLoss``.

    Relies on notebook-level globals: ``weight`` (class weights for the loss) and
    ``train_set`` / ``valid_set`` (the datasets served by the dataloaders).

    Args:
        model: network called as ``model(day, users, contribs)``.
    """
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.loss_model = CLoss(weight=weight)
        self.eval_loss_model = CLoss(weight=weight).eval()
        # Keep a value that is already set on the instance; otherwise use the defaults.
        self.batch_size = getattr(self, 'batch_size', 0) or 64
        self.lr = getattr(self, 'lr', 0) or 1e-3
        self.save_hyperparameters(ignore=['model'])

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        day, users, contribs, target = batch
        y = self.model(day, users, contribs)
        loss = self.loss_model(y, target)
        self.log("train_loss", loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        """Compute and log ``val_loss`` for one validation batch."""
        day, users, contribs, target = batch
        y = self.model(day, users, contribs)
        test_loss = self.eval_loss_model(y, target)
        self.log("val_loss", test_loss)
        
    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Return the model output for one batch.

        The wrapped model takes three separate tensors rather than the batch as a
        whole, so the batch is unpacked first; a trailing target, if the loader
        supplies one, is ignored.
        """
        day, users, contribs = batch[:3]
        return self.model(day, users, contribs)
    
    def train_dataloader(self):
        """Shuffled loader over the global ``train_set``."""
        return torch.utils.data.DataLoader(train_set, batch_size=self.batch_size or self.hparams.batch_size, shuffle=True)
    
    def val_dataloader(self):
        """Loader over the global ``valid_set`` with a fixed batch size of 512."""
        return torch.utils.data.DataLoader(valid_set, batch_size=512)

    def configure_optimizers(self):
        """AdamW over all parameters of this module."""
        # NOTE(review): `self.learning_rate` is never assigned in this class; the fallback is
        # only reached when `self.lr` is falsy — confirm whether a tuner is expected to set it.
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr or self.learning_rate)
        return optimizer
    
gc_module = GCLightningModule(model)

In [15]:
# Number of batches when the full dataset is served in shuffled batches of 8.
train_loader = torch.utils.data.DataLoader(ds, batch_size=8, shuffle=True)
len(train_loader)

246273

In [16]:
from pytorch_lightning.callbacks import StochasticWeightAveraging, EarlyStopping, ModelCheckpoint

class MyStochasticWeightAveraging(StochasticWeightAveraging):
    """StochasticWeightAveraging that always has an ``_average_model`` attribute after construction."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Keep whatever the parent constructor set; fall back to None when it set nothing.
        current = getattr(self, '_average_model', None)
        self._average_model = current

In [17]:
# Early stopping on val_loss with a patience of 10.
early_stopping = EarlyStopping(
    monitor="val_loss", mode="min", patience=10, verbose=True, check_finite=True, strict=True,
)
# Checkpoints are written to the working directory, ranked by val_loss (top 10 plus the last one).
checkpointing = ModelCheckpoint(
    ".", monitor="val_loss", verbose=True, filename='{epoch}-{val_loss:.3f}', save_last=True, save_top_k=10,
)
# Disabled callback, kept for reference:
#MyStochasticWeightAveraging(swa_lrs=1e-5, swa_epoch_start=0.5, annealing_epochs=5),

trainer = pl.Trainer(
    limit_train_batches=10000,
    max_epochs=50,
    max_time="00:11:00:00",
    deterministic=False,
    auto_lr_find=False,
    # Gradient accumulation schedule keyed by epoch.
    accumulate_grad_batches={0: 1, 1: 1, 4: 2, 6: 4, 8: 16, 16: 32},
    gradient_clip_val=5.0,
    callbacks=[early_stopping, checkpointing],
    devices="auto",
    accelerator="auto",
)
#trainer.tune(gc_module)
trainer.fit(model=gc_module)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Sanity Checking: 0it [00:00, ?it/s]

  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)


Training: 0it [00:00, ?it/s]

  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)
  allow_unreachable=True, accumulate_grad=True)  # Calls into the C++ engine to run the backward pass


Validation: 0it [00:00, ?it/s]

  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [18]:
# Pick one random sample and compare the prediction with its target.
# randrange excludes the upper bound; randint(0, len(ds)) could return len(ds), which is out of range.
data = ds[random.randrange(len(ds))]
with torch.no_grad():
    # Add a batch axis to each input (the target, last element, is held back) and run on CPU in eval mode.
    result = model.cpu().eval()(*(x[None, :] for x in data[:-1]))
    # Most likely class per step, then the regression value predicted for that class.
    am = result[:,:,:,1].argmax(1, keepdims=True)
    contribs = torch.gather(result[:,:,:,0], 1, am)
    print(am, contribs.numpy(), data[-1].numpy())

tensor([[[0, 0, 0, 0, 0, 0, 0]]]) [[[ 0.010768   -0.0129638  -0.00517563 -0.01096566  0.00877456
   -0.01079096 -0.00379558]]] [[0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]


In [19]:
# Persist only the parameters and buffers (not the module object) to the working directory.
final_state = model.state_dict()
torch.save(final_state, 'model.pth')