In [1]:
import torch
import lightning.pytorch as pl
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm

In [2]:
MODEL="rinna/japanese-gpt2-medium"

### Tokenizer

In [3]:
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=False)
tokenizer.do_lower_case = True  # due to some bug of tokenizer config loading

### Dataset

In [4]:
class Dataset(torch.utils.data.DataLoader):
    def __init__(self, filepath):
        with open(filepath) as f:
            self.data = f.readlines()
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [5]:
raw_dataset = Dataset('data/train.txt')
for i in range(3):
    print(raw_dataset[i])
del raw_dataset

そろそろおやすみかな？　 今日も一日、おつかれさま～。

…お兄ちゃん、いつもこんな時間まで起きてるの？

…まさか寝てないってことはないよね？



### DataModule

In [6]:
class DataModule(pl.LightningDataModule):
    def __init__(self, train, val, test, batch_size=4, num_workers=4):
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers

        self.train = train
        self.val = val
        self.test = test

    def setup(self, stage=None):
        if stage == 'fit' or stage is None:
            self.train_dataset = Dataset(self.train)

        if stage == 'validate' or stage is None:
            self.val_dataset = Dataset(self.val)
        
        if stage == 'test' or stage is None:
            self.test_dataset = Dataset(self.test)

        if stage == 'predict' or stage is None:
            self.predict_dataset = Dataset(self.test)


    def train_dataloader(self):
        return torch.utils.data.DataLoader(
            self.train_dataset, 
            batch_size=self.batch_size, 
            num_workers=self.num_workers,
            collate_fn=self.tokenize,
        )

    def val_dataloader(self):
        return torch.utils.data.DataLoader(
            self.val_dataset, 
            batch_size=self.batch_size, 
            num_workers=self.num_workers,
            collate_fn=self.tokenize,
        )

    def test_dataloader(self):
        return torch.utils.data.DataLoader(
            self.test_dataset, 
            batch_size=self.batch_size, 
            num_workers=self.num_workers,
            collate_fn=self.tokenize,
        )

    def tokenize(self, batch):
        inputs = tokenizer.batch_encode_plus(
            batch, 
            padding=True, 
            return_tensors='pt'
        )
        return inputs, inputs["input_ids"]

    

In [7]:
datamodule = DataModule(
    train='data/train.txt',
    val='data/train.txt',
    test='data/train.txt',
)

datamodule.setup()
for batch in datamodule.train_dataloader():
    x, y = batch
    print(x.keys())
    print(y.shape)
    break

del datamodule

dict_keys(['input_ids', 'attention_mask'])
torch.Size([4, 22])


### Model

In [8]:
class GPT2(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = AutoModelForCausalLM.from_pretrained(MODEL).to(self.device)

    def forward(self, x, y):
        return self.model(**x, labels=y)

    def training_step(self, batch, batch_idx):
        x, y = batch
        outputs = self(x, y)
        loss = outputs.loss
        return {
            'loss': loss,
        }

    def validation_step(self, batch, batch_idx):
        x, y = batch
        outputs = self(x, y)
        loss = outputs.loss
        return {
            'loss': loss,
        }

    def test_step(self, batch, batch_idx):
        x, y = batch
        outputs = self(x, y)
        loss = outputs.loss
        return {
            'loss': loss,
        }

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
        self.log('val_loss', avg_loss)

    def test_epoch_end(self, outputs):
        avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
        self.log('test_loss', avg_loss)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-5)

In [9]:
datamodule = DataModule(
    train='data/train.txt',
    val='data/train.txt',
    test='data/train.txt',
)
datamodule.setup()

model = GPT2().to('cuda')
for batch in datamodule.train_dataloader():
    batch = [x.to('cuda') for x in batch]
    x, y = batch
    
    outputs = model(x, y)
    print(outputs.loss)
    print(outputs.logits.shape)
    break

del model
del datamodule

tensor(9.9134, device='cuda:0', grad_fn=<NllLossBackward0>)
torch.Size([4, 22, 32000])


### Train

In [10]:
class LitProgressBar(pl.callbacks.TQDMProgressBar):
    def init_validation_tqdm(self):
        return tqdm(disable=True)

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GPT2().to(device)

datamodule = DataModule(
    train='data/train.txt',
    val='data/train.txt',
    test='data/train.txt',
)
datamodule.setup()

trainer = pl.Trainer(
    gpus=1,
    accelerator='gpu',
    max_epochs=10,
    # overfit_batches=1,
    logger=pl.loggers.TensorBoardLogger('logs/', name='gpt2'),
    callbacks=[LitProgressBar()],
)

  f"Setting `Trainer(gpus={gpus!r})` is deprecated in v1.7 and will be removed"
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [12]:
trainer.fit(
    model=model,
    datamodule=datamodule,
)

You are using a CUDA device ('NVIDIA GeForce RTX 4090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type            | Params
------------------------------------------
0 | model | GPT2LMHeadModel | 336 M 
------------------------------------------
336 M     Trainable params
0         Non-trainable params
336 M     Total params
1,344.512 Total estimated model params size (MB)
2023-08-23 22:15:59.431512: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild 

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


### Evaluate

In [20]:
model = model.to(device)
model.eval()

input = tokenizer.encode("こんにちは", return_tensors="pt").to(device)
output = model.model.generate(input, max_length=100, do_sample=True, top_p=0.95, top_k=60)
print(tokenizer.decode(output[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


こんにちは</s> [PAD] [PAD] ||||||||||匐お嬢様っ!「むふふ、むふふ...。『こりゃあ...、水月先輩に片思いしてるやんけっ!』</s>
