In [1]:
import torch
import lightning.pytorch as pl
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
from typing import List
from itertools import chain
import numpy as np

In [2]:
MODEL="rinna/japanese-gpt2-medium"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

### Tokenizer

In [3]:
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=False)
tokenizer.do_lower_case = True  # due to some bug of tokenizer config loading

### Dataset

In [4]:
def group_texts(texts: List[str]):
    # 全ての文書を結合し、トークン化
    examples = tokenizer.batch_encode_plus(texts)
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}

    # トークン化された文書をブロックサイズに分割
    block_size = tokenizer.model_max_length
    if block_size > 1024:
        block_size = 1024
    
    total_length = len(concatenated_examples["input_ids"])
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size

    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [5]:
class Dataset(torch.utils.data.DataLoader):
    def __init__(self, filepath):
        with open(filepath) as f:
            self.data = group_texts(f.readlines())
    
    def __len__(self):
        keys = list(self.data.keys())
        return len(self.data[keys[0]])

    def __getitem__(self, idx):
        return {k: torch.tensor(v[idx]) for k, v in self.data.items()}
    

### DataModule

In [6]:
class DataModule(pl.LightningDataModule):
    def __init__(self, train, val, test, batch_size=4, num_workers=4):
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.device = device

        self.train = train
        self.val = val
        self.test = test

    def setup(self, stage=None):
        if stage == 'fit' or stage is None:
            self.train_dataset = Dataset(self.train)

        if stage == 'validate' or stage is None:
            self.val_dataset = Dataset(self.val)
        
        if stage == 'test' or stage is None:
            self.test_dataset = Dataset(self.test)

        if stage == 'predict' or stage is None:
            self.predict_dataset = Dataset(self.test)


    def train_dataloader(self):
        return torch.utils.data.DataLoader(
            self.train_dataset, 
            batch_size=self.batch_size, 
            num_workers=self.num_workers,
            collate_fn=self.collate_fn,
        )

    def val_dataloader(self):
        return torch.utils.data.DataLoader(
            self.val_dataset, 
            batch_size=self.batch_size, 
            num_workers=self.num_workers,
            collate_fn=self.collate_fn,
        )

    def test_dataloader(self):
        return torch.utils.data.DataLoader(
            self.test_dataset, 
            batch_size=self.batch_size, 
            num_workers=self.num_workers,
            collate_fn=self.collate_fn,
        )

    def collate_fn(self, batch):
        keys = batch[0].keys()
        data = {k: torch.stack([b[k] for b in batch]) for k in keys}
        return data["input_ids"], data["labels"]

### Model

In [7]:
class GPT2(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = AutoModelForCausalLM.from_pretrained(MODEL).to(self.device)

    def forward(self, x, y):
        return self.model(x, labels=y)

    def training_step(self, batch, batch_idx):
        self.model.train()
        x, y = batch
        outputs = self(x, y)
        loss = outputs.loss
        return {
            'loss': loss,
        }

    def validation_step(self, batch, batch_idx):
        self.model.eval()
        x, y = batch
        outputs = self(x, y)
        loss = outputs.loss
        return {
            'loss': loss,
        }

    def test_step(self, batch, batch_idx):
        self.model.eval()
        x, y = batch
        outputs = self(x, y)
        loss = outputs.loss
        return {
            'loss': loss,
        }

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
        self.log('val_loss', avg_loss)

    def test_epoch_end(self, outputs):
        avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
        self.log('test_loss', avg_loss)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08)

### Train

In [8]:
class LitProgressBar(pl.callbacks.TQDMProgressBar):
    def init_validation_tqdm(self):
        return tqdm(disable=True)

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GPT2().to(device)

datamodule = DataModule(
    train='data/train.txt',
    val='data/train.txt',
    test='data/train.txt',
    batch_size=1,
)
datamodule.setup()

trainer = pl.Trainer(
    gpus=1,
    accelerator='gpu',
    max_epochs=3,
    # overfit_batches=1,
    logger=pl.loggers.TensorBoardLogger('logs/', name='gpt2'),
    callbacks=[LitProgressBar()],
)

  f"Setting `Trainer(gpus={gpus!r})` is deprecated in v1.7 and will be removed"
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [10]:
trainer.fit(
    model=model,
    datamodule=datamodule,
)

You are using a CUDA device ('NVIDIA GeForce RTX 4090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type            | Params
------------------------------------------
0 | model | GPT2LMHeadModel | 336 M 
------------------------------------------
336 M     Trainable params
0         Non-trainable params
336 M     Total params
1,344.512 Total estimated model params size (MB)
2023-08-24 15:29:15.537455: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild 

Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


### Evaluate

In [13]:
model = model.to(device)
model.eval()

input = tokenizer.encode("おはよう、お兄ちゃん。", return_tensors="pt").to(device)
output = model.model.generate(input, max_length=100, do_sample=True, top_p=0.95, top_k=60, num_return_sequences=8)
for text in tokenizer.batch_decode(output):
    print(text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


おはよう、お兄ちゃん。</s> 昨日、お兄ちゃんと妹と3人で夕飯を食べました。 1番の食べ場所は、やっぱり、やっぱりカレー。 3つの食べ方 カレー スープカレー カレー 玉子焼き スパゲッティー カレーパン カレーに合うおつまみ カレーと... #お兄ちゃんのことなんかぜんぜん好きじゃないんだからねっ!! #10回読んでしまった #
おはよう、お兄ちゃん。</s> #11 <unk> い月だよね、みんな。 <unk> い月だよね、みんな。 「おはようお兄ちゃん、お兄ちゃん。 #11 ねぇ、お兄ちゃん。 」(おはよう、お兄ちゃん。 )は、マミたんの5thシングル。 2013年5月24日にlantisから発売された。 前作「あなたがいるなら」から約2か月ぶりのリリースとなる2013年2作目のシングル。 表題曲「<unk> い
おはよう、お兄ちゃん。</s> ちょっとだけ、お姉ちゃんって呼んでいい? 僕の唇にキスしてくれない? ふちに...僕の精液が...ふちに...</s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>
おはよう、お兄ちゃん。</s> とってもいい天気! 今日も1日雨降らないといいなぁ(^-^; とりあえずお兄ちゃん、仕事する?お昼に会いたいからね～</s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>

In [16]:
gpt2 = AutoModelForCausalLM.from_pretrained(MODEL).to(device)
gpt2.eval()
inputs = tokenizer.encode("おはよう、お兄ちゃん。", return_tensors="pt").to(device)
outputs = gpt2.generate(inputs, max_length=100, do_sample=True, top_p=0.95, top_k=60)
print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


おはよう、お兄ちゃん。</s> 」(おにいちゃん。 )は、原作:有元カゲロウ、漫画:北岡まりあによる日本の漫画。 『ちゃお』(小学館)で2007年6月号から2007年8月号まで連載、単行本は全11巻。 有元カゲロウが『ちゃお』でのデビューから10周年を迎えた2007年11月号に「ちゃお 特別編」として『ちゃお』vol. 2に掲載された。 作者のtwitterでは、同誌


### Playgrounds

In [None]:
raw_dataset = Dataset('data/train.txt')
for i in range(1):
    print(raw_dataset[i])
del raw_dataset

{'input_ids': tensor([    9,  1010,  1406,  ..., 21433,   552,     2]), 'attention_mask': tensor([1, 1, 1,  ..., 1, 1, 1]), 'labels': tensor([    9,  1010,  1406,  ..., 21433,   552,     2])}


In [32]:
datamodule = DataModule(
    train='data/train.txt',
    val='data/train.txt',
    test='data/train.txt',
    batch_size=1,
)

datamodule.setup()
for batch in datamodule.train_dataloader():
    x, y = batch
    texts = tokenizer.batch_decode(x)
    for text in texts:
        print(text)
        print(len(text))
        print("="*80)

del datamodule

そろそろおやすみかな? 今日も一日、おつかれさま～。</s>...お兄ちゃん、いつもこんな時間まで起きてるの?</s>...まさか寝てないってことはないよね?</s> 朝ごはんはきちんと食べようね♪</s> 今日も一日、張り切っていこう!</s> 今起きたところっていう人もいるかな?</s> お昼休みはウキウキウォッチングだね♪</s> この時間帯は一番...眠くなるよね...。</s> 18時台のアニメのビデオ予約大丈夫!?忘れてない!?</s> 今日のお夕飯は何だろう? アレだったら嬉しいな～♪</s> 一般的なテレビも見ずに、積みゲーでも消化してるとこかな?</s> 今日はお兄ちゃんにとって、良い一日だった?</s> お兄ちゃん、おはよう!</s> こんにちは、お兄ちゃん。</s> こんばんは、お兄ちゃん。</s> ふーん...。水月先輩のえっちな画像、たくさん見たんだね...(汗</s> やった!じゃあ、最近覚えたての歌を歌うね。あー、コホン。パンパラパパンパンパンパラパパンパン(前奏)パンパラパパンパンパラッパパンパンパン(前奏)行こうよまぶしい光の世界♪ハートのスイッチオンにして～♪ごらんよ誰かが君を待ってる♪おんなじかたちの夢抱いて～♪あ～し～た～がす～き～な～ひ～と～だ～けが～ち～きゅ～う～をま～わ～す～♪ハッロー!ソフマッ ワー♪言葉はいらないほほえみあえば♪たちまち素敵なともだちさ～♪心と心を響かせあって♪愛を歌おうよウィーアーソフマッワー♪時間の流れをさあ追い抜いて♪迎えに行こうよしあわせを～♪誰にも見えない新しい道♪ひとあしお先に走るんだ～♪あ～つ～い～しせ～ん～の～ひ～と～だ～けが～ち～きゅ～う～をま～わ～す～♪ハッロー!ソフマッ ワー♪コバルト色したおおきな空は♪未来を映せるスクリーン♪心と心の絵の具を混ぜて♪愛を描こうよウィーアーソフマッワー♪おいでよ不思議が呼んでる世界♪ハートのスピードフルにして～♪君ならできるさ大人にな～ら♪時代にできない冒険が～♪じ～ゆ～う～のに～あ～う～ひ～と～だ～けが～ち～きゅ～う～をま～わ～す～♪ハッロー!ソフマッ ワー♪言葉はいらないほほえみあえば♪たちまち素敵なともだちさ～♪心と心を響かせあって♪愛を歌おうよウィーアーソフマッワー♪ウィーアーソフマッワー♪・・・・・・・・。...まさかフルコーラス歌える

In [8]:
datamodule = DataModule(
    train='data/train.txt',
    val='data/train.txt',
    test='data/train.txt',
)
datamodule.setup()

try:
    model = GPT2().to('cuda')
    for batch in datamodule.train_dataloader():
        batch = [x.to('cuda') for x in batch]
        x, y = batch
        
        outputs = model(x, y)
        print(outputs.loss)
        print(outputs.logits.shape)
        print("input", tokenizer.decode(x[0])[-20:])
        print("output", tokenizer.decode(outputs.logits[0].argmax(dim=-1).tolist()))
        break
except RuntimeError as e:
    print(e)

tensor(3.7355, device='cuda:0', grad_fn=<NllLossBackward0>)
torch.Size([3, 1024, 32000])


2023-08-24 12:36:37.413314: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-24 12:36:37.482720: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-24 12:36:37.775123: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-08-24 12:36:37.775151: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not l

input が出てきたら萎えるよね?(ニヤソ</s>
output ろそろ、正月つなさい? とは一日おお疲れかれさまで。 今日今日 やちゃん、おありがとうにまで起きてるの?......おか、坊ないよことはないよね?......ごはんは、食べてるね。......も一日、お切ってがね...夜らは、のけど?...やごは、チウキ気分ング♪ね♪...時間は、お い時間帯ね?。...時はウは再放送をは丈夫かな ずにた?......お昼飯はに?...?ら、しいな♪。......おの、れる、このゲーをしてしよう、かかな?......もお休みちゃんの、特別な日になりかな...やちゃん、今日! 今日今日は!お兄ちゃん。 今日今日ばんは、お兄ちゃん。 今日今日ふん、。 着。、ことはっちな妄想が見見ことね。。笑)今日今日っぱー あ、今日はたのをよ♪ 、、今日ツ。 ツのッパパパンパンパンパンパンパン笑略曲 パラパパンパンパンパンパンパンパンパン後奏)パンけよ!ーしいよがへ の光を!、♪ めん、、に見のるよ なじ空っこ星をいてる♪ 、、、、ぁいんばごくく～♪～いなん～♪～い～～?～ん～～ん～♪～～ん～ん♪♪ ハッ! レバンチョ...イ がいらないよ～えみ♪～いい 、敵な世界だちになれ～ん が心つかせて～ をこおう♪♪アー!フマッ♪♪ よをゆような、かけて! に行こうよよしよう歌♪ にも邪魔から世界へ ときし先に♪よよ♪ ～、～い～い～～♪♪♪～♪～と～ん～けが～ち～きゅ～い～♪ま～わ～す～♪ ロー!ソフマッ ワー♪言葉ブルーに空きな瞳を、 の照らしす鏡の と心をがを具♪のて、心を歌こうよウィーアーソフマッワー♪心でよ誰かのいっぱいる♪♪ハートのスイッチをな～♪あのきよ♪のなればっちゃれ♪ 流ことをを待♪ ぶんん～わ～ん～～♪～ん～の～と～を～けが～ち～きゅ～う～をま～わ～す～♪ ロー!ソフマッ ワー♪ はいらないほほえみあれば♪たちまち素敵なともだちさ～♪心と心を描かせあって♪愛を描おうよウィーアーソフマッワー♪おいアーソフマッワー♪ほ・・・・・・・・ あか、ネームでえるとは思わなかったなかったよね 笑リ)))～を、おおろとして顔は?～?。 ふん、 ああいね、 ばんら～ん～～ そうそうっちっ今日もちゃんの、もんだね～。。 (<unk> ́д<unk> )ノ でもでも..............

In [11]:
tokens = tokenizer.encode("こんにちは", return_tensors="pt").to(device)
tokenizer.decode(tokens[0])

'こんにちは</s>'