In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
! pip install pytorch-lightning -q

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

import pytorch_lightning  as L
from pytorch_lightning.strategies import FSDPStrategy
from pytorch_lightning.demos import Transformer, WikiText2

In [None]:
class LanguageModel(L.LightningModule):
    """LightningModule that trains the demo ``Transformer`` as a language model.

    Args:
        vocab_size: Vocabulary size forwarded to ``Transformer(vocab_size=...)``.
        nlayers: Forwarded to ``Transformer(nlayers=...)``.
        nhid: Forwarded to ``Transformer(nhid=...)``.
        ninp: Forwarded to ``Transformer(ninp=...)``.
        nhead: Forwarded to ``Transformer(nhead=...)``.
        lr: Learning rate handed to ``torch.optim.Adam``.

    The keyword defaults reproduce the configuration that used to be
    hard-coded here, which the original author annotated as "1B parameters".
    Pass smaller values to fit the model on more modest hardware.
    """

    def __init__(self, vocab_size, *, nlayers=32, nhid=4096, ninp=1024, nhead=64, lr=0.1):
        super().__init__()
        # NOTE(review): 0.1 is kept as the default only to preserve the original
        # behaviour; it looks unusually large for Adam -- confirm it is intended.
        self.lr = lr
        self.model = Transformer(  # ~1B parameters with the default sizes
            vocab_size=vocab_size,
            nlayers=nlayers,
            nhid=nhid,
            ninp=ninp,
            nhead=nhead,
        )

    def training_step(self, batch):
        """Compute, log and return the NLL loss for one ``(inputs, target)`` batch."""
        # Named `inputs` (not `input`) so the builtin is not shadowed.
        inputs, target = batch
        output = self.model(inputs, target)
        # The target is flattened to 1-D before being compared with the output.
        # Presumably the model already returns per-token log-probabilities in a
        # matching flattened layout -- confirm against Transformer.forward.
        loss = F.nll_loss(output, target.view(-1))
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all module parameters using ``self.lr``."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [None]:
# Reproducibility: seed through Lightning's helper before building data or model.
SEED = 42
L.seed_everything(SEED)

# Data: the WikiText2 demo dataset behind a DataLoader with two worker
# processes; batch size and shuffling are left at the DataLoader defaults.
dataset = WikiText2()
train_dataloader = DataLoader(dataset=dataset, num_workers=2)

In [None]:
# Display the dataset's `vocab_size` attribute as this cell's output; the same
# value is passed to `LanguageModel(vocab_size=...)` when the model is built.
dataset.vocab_size

In [None]:
# Model: size the network's vocabulary from the dataset.
n_tokens = dataset.vocab_size
model = LanguageModel(vocab_size=n_tokens)

In [None]:
# Trainer: two CUDA devices using the "ddp_notebook" strategy.
# Alternative noted by the author: strategy=FSDPStrategy()
trainer = L.Trainer(
    accelerator="cuda",
    devices=2,
    strategy="ddp_notebook",
)


In [None]:
# Run training on the prepared dataloader.
trainer.fit(model, train_dataloader)

# Report CUDA allocator statistics through the trainer's print helper.
# NOTE(review): with strategy="ddp_notebook" training presumably runs in
# separate worker processes, so this summary may describe only the notebook
# process rather than the training workers -- confirm before relying on it.
cuda_report = torch.cuda.memory_summary()
trainer.print(cuda_report)