<a href="https://colab.research.google.com/github/maki8maki/DLBasics2023_colab/blob/master/lecture06_homework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 第6回講義 宿題

## 課題
RNNを用いてIMDbのsentiment analysisを実装してみましょう．

ネットワークの形などに制限はとくになく，今回のLessonで扱った内容以外の工夫も組み込んでもらって構いません．

## 目標値
F値：0.85

## ルール
- 以下のセルで指定されている`x_train`, `t_train`以外の学習データは使わないでください．

## 提出方法
- 2つのファイルを提出していただきます．
  1. テストデータ `x_test` に対する予測ラベルを`submission_pred.csv`として保存し，Omnicampusの宿題から「第6回 回帰結合型ニューラルネットワーク」を選択して提出してください．
  2. それに対応するpythonのコードを`submission_code.py`として保存し，Omnicampusの宿題から「第6回 回帰結合型ニューラルネットワーク (code)」を選択して提出してください．
    - セルに書いたコードを.py形式で保存するためには%%writefileコマンドなどを利用してください．
    - writefileコマンドではファイルの保存のみが行われセル内のpythonコード自体は実行されません．そのため，実際にコードを走らせる際にはwritefileコマンドをコメントアウトしてください．


- コードの内容を変更した場合は，1と2の両方を提出し直してください．

- なお採点は1で行い，2はコードの確認用として利用します．(成績優秀者はコード内容を公開させていただくかもしれません)


## 評価方法

- 予測ラベルの`t_test`に対するF値で評価します．
- 即時採点しLeader Boardを更新します．（採点スケジュールは別アナウンス）
- 締切時の点数を最終的な評価とします．



### ドライブのマウント

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## データの読み込み（このセルは修正しないでください）

In [2]:
!pip install portalocker

import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext import datasets
from torchtext.vocab import vocab
from torchtext.data.utils import get_tokenizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from collections import Counter
import pandas as pd
import string
import re
from typing import List, Union

seed = 1234
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)


# 学習データ
x_train = np.load('drive/MyDrive/Colab Notebooks/DLBasics2023_colab/Lecture06/data/x_train.npy', allow_pickle=True)
t_train = np.load('drive/MyDrive/Colab Notebooks/DLBasics2023_colab/Lecture06/data/t_train.npy', allow_pickle=True)

# 検証データを取る
x_train, x_valid, t_train, t_valid = train_test_split(x_train, t_train, test_size=0.2, random_state=seed)
    
# テストデータ
x_test = np.load('drive/MyDrive/Colab Notebooks/DLBasics2023_colab/Lecture06/data/x_test.npy', allow_pickle=True)


def text_transform(text: List[int], max_length=256):
    # <BOS>はすでに1で入っている．<EOS>は2とする．
    text = text[:max_length - 1] + [2]

    return text, len(text)

def collate_batch(batch):
    label_list, text_list, len_seq_list = [], [], []
    
    for sample in batch:
        if isinstance(sample, tuple):
            label, text = sample

            label_list.append(label)
        else:
            text = sample.copy()
            
        text, len_seq = text_transform(text)
        text_list.append(torch.tensor(text))
        len_seq_list.append(len_seq)
        
    # NOTE: 宿題用データセットでは<PAD>は3です．
    return torch.tensor(label_list), pad_sequence(text_list, padding_value=3).T, torch.tensor(len_seq_list)


word_num = np.concatenate(np.concatenate((x_train, x_test))).max()
print(f"単語種数: {word_num}")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting portalocker
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.7.0
単語種数: 88586


## 実装

In [3]:
batch_size = 128

train_dataloader = DataLoader(
    [(t, x) for t, x in zip(t_train, x_train)],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch,
)
valid_dataloader = DataLoader(
    [(t, x) for t, x in zip(t_valid, x_valid)],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch,
)
test_dataloader = DataLoader(
    x_test,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch,
)

In [23]:
def torch_log(x):
    return torch.log(torch.clamp(x, min=1e-10))

class SequenceTaggingNet(nn.Module):
    def __init__(self, word_num, emb_dim, hid_dim):
        super().__init__()
        self.emb = nn.Embedding(word_num, emb_dim)
        self.bigru = nn.GRU(emb_dim, hid_dim, batch_first=True, bidirectional=True, num_layers=2, dropout=0.25)
        self.linear = nn.Linear(hid_dim*2, 1) # ForwardとBackwardの出力をconcatしたものを渡すので2倍
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.25)
    
    def forward(self, x, len_seq_max=0, len_seq=None, init_state=None):
        h = self.dropout1(self.emb(x)) # (batch_size, seq_length, emb_dim)

        if len_seq_max > 0:
            out, _ = self.bigru(h[:, 0:len_seq_max, :], init_state)
        else:
            out, _ = self.bigru(h, init_state) # (batch_size, seq_length, hid_dim*2)
        
        h = out.transpose(0, 1)

        if len_seq is not None:
            h = h[len_seq - 1, list(range(len(x))), :]
        else:
            h = h[-1]

        y = self.linear(self.dropout2(h))
        
        return y

def calc_loss(y, t):
    return -torch.mean(t * torch_log(y) + (1-t) * torch_log(1-y))

In [24]:
emb_dim = 100
hid_dim = 50
n_epochs = 50
device = 'cuda'

net = SequenceTaggingNet(word_num, emb_dim, hid_dim)
net.to(device)
optimizer = optim.Adam(net.parameters())

min_loss = torch.inf
patience = 20 # 許容できる非改善エポック数
n_worsening = 0 # 連続非改善エポック数

for epoch in range(n_epochs):
    losses_train = []
    losses_valid = []

    net.train()
    n_train = 0
    acc_train = 0
    for label, line, len_seq in train_dataloader:
        net.zero_grad()

        t = label.to(device)
        x = line.to(device) # (batch, time)
        len_seq.to(device)

        h = net(x, torch.max(len_seq), len_seq)
        y = torch.sigmoid(h).squeeze()

        loss = calc_loss(y, t)
        loss.backward()
        optimizer.step()

        losses_train.append(loss.tolist())

        n_train += t.size()[0]

    # Valid
    t_valid = []
    y_pred = []
    net.eval()
    for label, line, len_seq in valid_dataloader:
        t = label.to(device)
        x = line.to(device)
        len_seq.to(device)

        h = net(x, torch.max(len_seq), len_seq)
        y = torch.sigmoid(h).squeeze()

        loss = calc_loss(y, t)
        pred = y.round().squeeze()

        t_valid.extend(t.tolist())
        y_pred.extend(pred.tolist())

        losses_valid.append(loss.tolist())

    if np.mean(losses_valid) < min_loss:
        min_loss = np.mean(losses_valid)
        state_dict = net.state_dict()
        n_worsening = 0
    else:
        n_worsening += 1
    print('EPOCH: {}, Train Loss: {:.3f}, Valid Loss: {:.3f}, Validation F1: {:.3f}'.format(
        epoch,
        np.mean(losses_train),
        np.mean(losses_valid),
        f1_score(t_valid, y_pred, average='macro')
    ))
    if n_worsening >= patience:
        net.load_state_dict(state_dict)
        break

EPOCH: 0, Train Loss: 0.646, Valid Loss: 0.567, Validation F1: 0.707
EPOCH: 1, Train Loss: 0.539, Valid Loss: 0.486, Validation F1: 0.752
EPOCH: 2, Train Loss: 0.423, Valid Loss: 0.376, Validation F1: 0.831
EPOCH: 3, Train Loss: 0.332, Valid Loss: 0.357, Validation F1: 0.846
EPOCH: 4, Train Loss: 0.277, Valid Loss: 0.308, Validation F1: 0.874
EPOCH: 5, Train Loss: 0.240, Valid Loss: 0.318, Validation F1: 0.880
EPOCH: 6, Train Loss: 0.208, Valid Loss: 0.298, Validation F1: 0.886
EPOCH: 7, Train Loss: 0.184, Valid Loss: 0.351, Validation F1: 0.869
EPOCH: 8, Train Loss: 0.164, Valid Loss: 0.321, Validation F1: 0.888
EPOCH: 9, Train Loss: 0.141, Valid Loss: 0.358, Validation F1: 0.879
EPOCH: 10, Train Loss: 0.127, Valid Loss: 0.348, Validation F1: 0.891
EPOCH: 11, Train Loss: 0.111, Valid Loss: 0.324, Validation F1: 0.893
EPOCH: 12, Train Loss: 0.097, Valid Loss: 0.372, Validation F1: 0.888
EPOCH: 13, Train Loss: 0.090, Valid Loss: 0.359, Validation F1: 0.888
EPOCH: 14, Train Loss: 0.078, 

In [25]:
net.eval()

y_pred = []
for _, line, len_seq in test_dataloader:

    x = line.to(device)
    len_seq.to(device)

    h = net(x, torch.max(len_seq), len_seq)
    y = torch.sigmoid(h).squeeze()

    pred = y.round().squeeze()  # 0.5以上の値を持つ要素を正ラベルと予測する

    y_pred.extend(pred.tolist())


submission = pd.Series(y_pred, name='label')
submission.to_csv('drive/MyDrive/Colab Notebooks/DLBasics2023_colab/Lecture06/submission_pred.csv', header=True, index_label='id')