<h3>ローカルでモデルをトレーニングする</h3>

<h4>エクスポートしたデータを読込みます</h4>

In [None]:
import numpy as np

# np.load on an .npz archive returns a lazily-read NpzFile that keeps the
# underlying file open. Using it as a context manager closes the handle as
# soon as the arrays have been pulled into memory.
with np.load('docdata1.npz') as npz:
    print(npz.files)
    # Indexing the archive reads the array fully, so x and y stay valid
    # after the archive is closed.
    x = npz['arr_0']
    y = npz['arr_1']

<h4>読込んだ内容を確認します</h4>

In [None]:
# Show the dimensions of both arrays first, then the first record of each.
for arr in (x, y):
    print(arr.shape)
for arr in (x, y):
    print(arr[0])

<h4>モデル学習のためのデータ準備をします</h4>

- torch 関連のパッケージをインポートします

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchmetrics.functional import accuracy

import torch.utils.data
from torch.utils.data import DataLoader

<h4>torch tensor に変換します</h4>

In [None]:
# Convert both arrays to 64-bit integer tensors in one pass. x is later fed
# to nn.Embedding and y to F.cross_entropy as class targets.
x, y = (torch.tensor(arr, dtype=torch.int64) for arr in (x, y))

In [None]:
# Number of samples on the first line, followed by the tensor itself.
print(len(x), x, sep='\n')

In [None]:
# Number of labels on the first line, followed by the tensor itself.
print(len(y), y, sep='\n')

In [None]:
# Confirm that both objects are now tensors with the expected dtype.
for tensor in (x, y):
    print(type(tensor), tensor.dtype)

<h3>単語ID表現の文章の分散表現を試してみます</h3>
<h4>参考資料</h4>

- [EMBEDDING](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html)
<br>
<br>この例では、一つの単語 ID が 10 次元のベクトルに変換されています。

In [None]:
# Demo lookup table: 7295 entries, each a 10-dimensional vector; ID 0 is the
# padding index.
sample_embeddings = nn.Embedding(7295, 10, padding_idx=0)
# Embed the first sample: one vector per word ID.
sample = sample_embeddings(x[0])
print(sample)
print('\n文章のサイズ', sample.shape[0])

<h3>データセットを作成します</h3>

In [None]:
# Wrap the input tensor x and the label tensor y so that indexing the dataset
# yields an (x[i], y[i]) pair.
dataset = torch.utils.data.TensorDataset(x, y)
# Bare expression: displayed as the cell output when run in a notebook.
dataset

In [None]:
# Total number of samples (displayed as the cell output in a notebook).
len(dataset)

<h3>トレーニング、検証、テスト、それぞれのデータセットに分割します</h3>

In [None]:
# 60% for training and 20% for validation (both truncated to whole samples);
# the test split takes whatever remains so the three sizes always add up to
# the dataset length.
total_samples = len(dataset)
num_train, num_validation = (int(total_samples * ratio) for ratio in (0.6, 0.2))
num_test = total_samples - (num_train + num_validation)

In [None]:
# Seed the global RNG so the random partition is reproducible across runs.
torch.manual_seed(0)
split_sizes = [num_train, num_validation, num_test]
train, validation, test = torch.utils.data.random_split(dataset, split_sizes)

In [None]:
# Sizes of the three splits (displayed as a tuple in the notebook output).
len(train), len(validation), len(test)

<h3>Dataloader を作成します</h3>
<h4>参考資料</h4>

- [TORCH.UTILS.DATA](https://pytorch.org/docs/stable/data.html)
- [LIGHTNINGDATAMODULE](https://pytorch-lightning.readthedocs.io/en/stable/extensions/datamodules.html)
- [TORCHTEXT](https://pytorch.org/text/stable/index.html)

In [None]:
batch_size = 128
num_workers = 4

# Options shared by all three loaders.
loader_options = dict(batch_size=batch_size, num_workers=num_workers)

# Only the training data is reshuffled; validation and test keep a fixed order.
train_dataloader = DataLoader(train, shuffle=True, **loader_options)
val_dataloader = DataLoader(validation, shuffle=False, **loader_options)
test_dataloader = DataLoader(test, shuffle=False, **loader_options)

<h3>モデル学習に使うクラスを定義します</h3>
<h4>参考資料</h4>

- [LIGHTNINGMODULE](https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html)
- [LOGGING](https://pytorch-lightning.readthedocs.io/en/stable/extensions/logging.html)
- [TORCHMETRICS](https://torchmetrics.readthedocs.io/en/latest/?_ga=2.242351115.847291179.1621688579-221285708.1621323678)

In [None]:
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import MLFlowLogger

In [None]:
# Show the installed PyTorch Lightning version (displayed as the cell output).
# NOTE(review): the Trainer calls further down are version-sensitive, so record
# this value alongside any results.
pl.__version__

In [None]:
class LitTrainClassifier(pl.LightningModule):
    """Mixin that supplies the training step for a classifier.

    The concrete subclass must implement ``forward`` so that it returns one
    row of class scores (logits) per sample.
    """

    def training_step(self, batch, batch_idx):
        """Compute the cross-entropy loss for one training batch.

        Args:
            batch: An ``(inputs, targets)`` pair as yielded by the dataloader.
            batch_idx: Index of the batch (unused here).

        Returns:
            The loss tensor for this batch.
        """
        inputs, targets = batch
        # Call the module itself rather than ``forward`` directly so that any
        # registered forward hooks are run as well.
        logits = self(inputs)
        loss = F.cross_entropy(logits, targets)
        # Log the epoch-level value only (no per-step entries, no progress bar).
        self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        return loss

In [None]:
class LitValidationClassifier(pl.LightningModule):
    """Mixin that supplies the validation step for a classifier.

    The concrete subclass must implement ``forward`` so that it returns one
    row of class scores (logits) per sample.
    """

    def validation_step(self, batch, batch_idx):
        """Compute loss and accuracy for one validation batch.

        Args:
            batch: An ``(inputs, targets)`` pair as yielded by the dataloader.
            batch_idx: Index of the batch (unused here).

        Returns:
            The cross-entropy loss tensor for this batch.
        """
        inputs, targets = batch
        # Call the module itself rather than ``forward`` directly so that any
        # registered forward hooks are run as well.
        logits = self(inputs)
        # Predicted class = index of the highest score along the class axis.
        predictions = torch.argmax(logits, dim=1)
        loss = F.cross_entropy(logits, targets)
        # NOTE(review): newer torchmetrics releases appear to require a
        # ``task`` argument here - confirm against the installed version.
        acc = accuracy(predictions, targets)
        # Log epoch-level values only (no per-step entries, no progress bar).
        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        self.log('val_acc', acc, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        return loss

In [None]:
class LitTestClassifier(pl.LightningModule):
    """Mixin that supplies the test step for a classifier.

    The concrete subclass must implement ``forward`` so that it returns one
    row of class scores (logits) per sample.
    """

    def test_step(self, batch, batch_idx):
        """Compute loss and accuracy for one test batch.

        Args:
            batch: An ``(inputs, targets)`` pair as yielded by the dataloader.
            batch_idx: Index of the batch (unused here).

        Returns:
            The cross-entropy loss tensor for this batch.
        """
        inputs, targets = batch
        # Call the module itself rather than ``forward`` directly so that any
        # registered forward hooks are run as well.
        logits = self(inputs)
        # Predicted class = index of the highest score along the class axis.
        predictions = torch.argmax(logits, dim=1)
        loss = F.cross_entropy(logits, targets)
        # NOTE(review): newer torchmetrics releases appear to require a
        # ``task`` argument here - confirm against the installed version.
        acc = accuracy(predictions, targets)
        # Log epoch-level values only (no per-step entries, no progress bar).
        self.log('test_loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        self.log('test_acc', acc, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        return loss

In [None]:
class LSTMModel(LitTrainClassifier, LitValidationClassifier, LitTestClassifier):
    """LSTM classifier over sequences of word IDs.

    Pipeline: embedding lookup -> stacked LSTM -> linear layer applied to the
    LSTM output at the last time step. The train/validation/test steps are
    inherited from the three mixin base classes.
    """

    def __init__(self, vocab_size=7295, embedding_dim=200, hidden_dim=100, layer_dim=2, output_dim=9, drop_out=0.3,
                 learning_rate=0.01):
        """Build the layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each word vector (LSTM input size).
            hidden_dim: LSTM hidden state size.
            layer_dim: Number of stacked LSTM layers.
            output_dim: Number of output classes.
            drop_out: Dropout probability passed to the LSTM.
            learning_rate: Learning rate used by the Adam optimizer.
        """
        super().__init__()

        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.learning_rate = learning_rate

        # Keep the creation order lstm -> fc -> embeddings: weights are drawn
        # from the global RNG as each layer is built, so reordering would
        # change the initial weights obtained after torch.manual_seed().
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=layer_dim,
                            dropout=drop_out,
                            batch_first=True)

        self.fc = nn.Linear(hidden_dim, output_dim)
        # Word ID 0 is treated as padding.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

    def forward(self, x):
        """Return class scores for a batch of word-ID sequences.

        Args:
            x: Integer tensor of word IDs; batch-first layout is assumed
                because the LSTM is built with ``batch_first=True``.

        Returns:
            Tensor of unnormalized class scores, one row per sample.
        """
        embedded = self.embeddings(x)
        lstm_out, _ = self.lstm(embedded)
        # Classify from the output at the final time step only.
        # NOTE(review): if sequences are right-padded with ID 0, this step is
        # a padding position - confirm how the exported data is padded.
        return self.fc(lstm_out[:, -1, :])

    def configure_optimizers(self):
        """Return the Adam optimizer over all model parameters."""
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)
    

<h4>モデル学習を開始します</h4>

In [None]:
# Log metrics to a local MLflow file store under ./ml-runs.
mlf_logger = MLFlowLogger(
    experiment_name="default",
    tracking_uri="file:./ml-runs"
)

#torch.backends.cudnn.deterministic = True
#torch.backends.cudnn.benchmark = False
#torch.cuda.empty_cache()
# Seed before building the model so the initial weights are reproducible.
torch.manual_seed(0)

net = LSTMModel()

# Stop early once the monitored 'val_loss' stops improving.
#trainer = Trainer(gpus=1, max_epochs=20, logger=mlf_logger, callbacks=[EarlyStopping(monitor='val_loss')])
trainer = Trainer(max_epochs=20, logger=mlf_logger, callbacks=[EarlyStopping(monitor='val_loss')])

# Pass the dataloaders positionally: the keyword for the training loader was
# renamed between PyTorch Lightning releases (train_dataloader ->
# train_dataloaders), while the positional order (model, train, val) is
# the same in both.
trainer.fit(net, train_dataloader, val_dataloader)

<h4>検証データによるメトリックを確認します</h4>

In [None]:
# Metrics recorded through self.log() during fitting, as exposed by the trainer.
val_metric = trainer.callback_metrics
print(val_metric)

In [None]:
# Print the validation metrics as plain Python numbers.
for key in ('val_loss', 'val_acc'):
    print(f'{key}: ', val_metric[key].item())

<h4>テストデータによる精度を確認します</h4>

In [None]:
# Pass the dataloader positionally: the keyword was renamed between PyTorch
# Lightning releases (test_dataloaders -> dataloaders). The leading None keeps
# the original behavior of not supplying a model explicitly.
test_metric = trainer.test(None, test_dataloader)

In [None]:
# Print validation and test metrics side by side as plain Python numbers.
metrics = trainer.callback_metrics
for key in ('val_loss', 'val_acc', 'test_loss', 'test_acc'):
    print(f'{key}: ', metrics[key].item())

<h4>モデルを保存します</h4>

In [None]:
# os is not imported by any earlier cell, so import it here; without it the
# makedirs call raises NameError.
import os

# Create the output directory if needed, then save only the weights
# (state_dict), not the whole module object.
os.makedirs('./models', exist_ok=True)
torch.save(net.state_dict(), './models/text_classifier_lstm_local.pt')