In [1]:
!pip install transformers==4.5.0 fugashi==1.1.0 ipadic==1.0.0 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.5.0
  Downloading transformers-4.5.0-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 5.1 MB/s 
[?25hCollecting fugashi==1.1.0
  Downloading fugashi-1.1.0-cp37-cp37m-manylinux1_x86_64.whl (486 kB)
[K     |████████████████████████████████| 486 kB 44.7 MB/s 
[?25hCollecting ipadic==1.0.0
  Downloading ipadic-1.0.0.tar.gz (13.4 MB)
[K     |████████████████████████████████| 13.4 MB 56.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 50.7 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 15.5 MB/s 
Building wheels for collected packages: ipadic, sacremoses
  Building wheel for ipadi

In [2]:
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, BertJapaneseTokenizer, BertModel
from torch import cuda
import sklearn.metrics as skm
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from transformers import logging


In [3]:
# Mini-batch size for the text DataLoaders built further down.
# NOTE: `batch_size` is re-bound to 1024 in a later cell (vector DataLoaders).
batch_size = 16
# Token length every text is padded / truncated to by the tokenizer.
max_len = 512

In [4]:
# Load the corpus CSV; the `text` and `category` columns are read from it below.
df = pd.read_csv("./drive/MyDrive/Colab_Notebooks/data/livedoor_text.csv")
print(df.shape)
df.head()

(7367, 2)


Unnamed: 0,text,category
0,27日に生放送された日本テレビ「バンクーバー2010」には、女子フィギュアスケートで銀メダル...,7
1,「腐女子」という言葉をご存知でしょうか。\nいわゆる漫画やアニメキャラなどの男性同士の恋愛（...,0
2,展示会イベント恒例のおねいさん写真のコーナーでございます \n\n国内最大級の携帯電話や無線...,6
3,芸能界を引退した島田紳助さんが、今月２８日に公開される映画「犬の首輪とコロッケと」に声だけ出...,2
4,お花に包まれた洋館で、イケメン執事に囲まれながら、ゆったりと過ごす午後のひととき……。女の子...,5


# データセットの作成

In [5]:
class CreateDataset(Dataset):
    """Dataset that tokenizes a single text each time an item is requested.

    Args:
        X: indexable collection of raw texts.
        y: indexable collection of labels; its length defines the dataset size.
        tokenizer: object exposing ``encode_plus``.
        max_len: length every text is padded / truncated to.
    """

    def __init__(self, X, y, tokenizer, max_len):
        self.X, self.y = X, y
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.y)

    def encode(self, tokenizer, text):
        """Tokenize ``text`` with special tokens, padded / truncated to ``max_len``."""
        return tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )

    def __getitem__(self, index):
        """Return a dict with the token ids, attention mask, label and raw text.

        ``ids`` and ``mask`` are one-element lists wrapping a LongTensor, so
        consumers read them as ``item['ids'][0]`` / ``item['mask'][0]``.
        """
        text, label = self.X[index], self.y[index]
        encoded = self.encode(tokenizer=self.tokenizer, text=text)

        return {
            'ids': [torch.LongTensor(encoded['input_ids'])],
            'mask': [torch.LongTensor(encoded['attention_mask'])],
            'label': label,
            'text': text,
        }

In [6]:
# Tokenizer for the same pretrained checkpoint name that the BERT model is loaded from further down.
tokenizer = BertJapaneseTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")

Downloading:   0%|          | 0.00/258k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/110 [00:00<?, ?B/s]

In [7]:
# Raw texts and their category labels as NumPy arrays.
X = df["text"].values
y = df["category"].values

In [8]:
# Fixed seed so the train / eval / test partition is identical on every run.
SPLIT_SEED = 42

# 80% train+eval vs 20% test, then 75% / 25% of the former -> 60 / 20 / 20 overall.
X_train_eval, X_test, y_train_eval, y_test = train_test_split(
    X, y, train_size=0.8, random_state=SPLIT_SEED)

X_train, X_eval, y_train, y_eval = train_test_split(
    X_train_eval, y_train_eval, train_size=0.75, random_state=SPLIT_SEED)

print(len(X_train))
print(len(X_eval))
print(len(X_test))

print(len(y_train))
print(len(y_eval))
print(len(y_test))

4419
1474
1474
4419
1474
1474


In [9]:
# Wrap every split in a tokenizing dataset that shares the tokenizer and max length.
dataset_train, dataset_eval, dataset_test = (
    CreateDataset(texts, labels, tokenizer, max_len=max_len)
    for texts, labels in ((X_train, y_train), (X_eval, y_eval), (X_test, y_test))
)

# One size per line: train, eval, test.
print(len(dataset_train), len(dataset_eval), len(dataset_test), sep="\n")

4419
1474
1474


In [10]:
# dataset_train[0]

# データローダの作成

In [11]:
# Only the training loader is shuffled. The eval / test loaders keep dataset order so
# that the vectors extracted from them stay aligned row-for-row with X_eval / X_test.
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, pin_memory=True)
dataloader_eval = DataLoader(dataset_eval, batch_size=batch_size, shuffle=False, pin_memory=True)
dataloader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False, pin_memory=True)

# Bundle the loaders into a dict
dataloaders_dict = {"train": dataloader_train, "val": dataloader_eval}

In [16]:
# Pull one mini-batch to inspect the collated token ids and labels.
tmp = next(iter(dataloader_train))
print(tmp["ids"])
print(tmp["label"])


[tensor([[    2, 17960,  7803,  ...,    12,  9347,     3],
        [    2,    63,   670,  ...,  1787, 28768,     3],
        [    2,    36,    74,  ...,    14,  3659,     3],
        ...,
        [    2,  1575,   811,  ..., 11137,     9,     3],
        [    2,    91, 25132,  ...,     0,     0,     0],
        [    2,   213,    32,  ...,     0,     0,     0]])]
tensor([5, 4, 2, 1, 1, 4, 7, 8, 0, 6, 2, 4, 0, 0, 7, 7])


# BERTモデル

In [13]:
# Load the pretrained BERT encoder, asking it to also return attentions and all hidden states.
# NOTE: the name `model` is re-bound to the IIC network in the training cell further down.
model = BertModel.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking", output_attentions=True, output_hidden_states=True)
# Switch to evaluation mode; the model is only used for inference here.
model.eval()
print('ネットワーク設定完了')

Downloading:   0%|          | 0.00/479 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

ネットワーク設定完了


In [17]:
# BERTでベクトル化する関数を定義

def vectorize_with_bert(net, dataloader):
    """Encode every text of ``dataloader`` and return the first-token vectors.

    Args:
        net: encoder called as ``net(input_ids, attention_mask=...)`` whose output
            holds the sequence output ``[batch, seq_len, hidden]`` at index 0.
        dataloader: iterable of dict batches with ``"ids"`` (one-element list holding
            the ``[batch, seq_len]`` token-id tensor), ``"label"`` and, optionally,
            ``"mask"`` (same layout as ``"ids"``).

    Returns:
        ``(vectors, labels)``: tensors of shape ``[n_samples, hidden]`` and
        ``[n_samples]`` on the compute device, in dataloader iteration order.

    Raises:
        ValueError: if the dataloader yields no batch.
    """
    # Use the GPU when one is available
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("使用デバイス：", device)
    print('-----start-------')

    # Move the network to the compute device
    net.to(device)

    # Let cuDNN pick the fastest kernels; input shapes are fixed by the padding
    torch.backends.cudnn.benchmark = True

    vector_chunks = []
    label_chunks = []

    # Inference only: no gradients are needed
    with torch.set_grad_enabled(False):
        for batch in dataloader:
            inputs = batch["ids"][0].to(device)  # token ids
            labels = batch["label"].to(device)  # labels

            # Pass the attention mask so padding tokens do not influence the
            # encoding; without it the model attends to every position.
            mask = batch.get("mask")
            if mask is None:
                result = net(inputs)
            else:
                result = net(inputs, attention_mask=mask[0].to(device))

            # Index 0 is the sequence output; keep the vector of the first token.
            # clone() detaches the slice from the full [batch, seq_len, hidden]
            # tensor so that tensor can be freed between batches.
            vector_chunks.append(result[0][:, 0, :].clone())
            label_chunks.append(labels)

    if not vector_chunks:
        raise ValueError("dataloader yielded no batches")

    # Concatenate once instead of growing a tensor inside the loop
    list_text = torch.cat(vector_chunks, dim=0)
    list_label = torch.cat(label_chunks, dim=0)

    return list_text, list_label

In [18]:
# Convert each DataLoader into its vectorized form
# This takes a little while (just under 5 minutes)

list_text_train, list_label_train = vectorize_with_bert(model, dataloader_train)
list_text_eval, list_label_eval = vectorize_with_bert(model, dataloader_eval)
list_text_test, list_label_test = vectorize_with_bert(model, dataloader_test)

使用デバイス： cuda:0
-----start-------
使用デバイス： cuda:0
-----start-------
使用デバイス： cuda:0
-----start-------


In [23]:
# Sanity check: vector sizes for train / eval / test, then the matching label sizes.
print(
    *(
        split_tensor.size()
        for split_tensor in (
            list_text_train, list_text_eval, list_text_test,
            list_label_train, list_label_eval, list_label_test,
        )
    ),
    sep="\n",
)

torch.Size([4419, 768])
torch.Size([1474, 768])
torch.Size([1474, 768])
torch.Size([4419])
torch.Size([1474])
torch.Size([1474])


In [24]:
# torchのリストをDatasetに変換

from torch.utils.data import TensorDataset

# Each item is (label, vector): labels are reshaped to a column and come first,
# which is the order the training loop unpacks them in.
dataset_bert_train = TensorDataset(
    list_label_train.view(-1, 1), list_text_train)
dataset_bert_eval = TensorDataset(list_label_eval.view(-1, 1), list_text_eval)
dataset_bert_test = TensorDataset(list_label_test.view(-1, 1), list_text_test)

In [25]:
# Dataloaderにする
from torch.utils.data import DataLoader

# NOTE: this re-binds the `batch_size` that was set to 16 in the config cell;
# re-running earlier cells after this one will pick up 1024.
batch_size = 1024

dl_bert_train = DataLoader(
    dataset_bert_train, batch_size=batch_size, shuffle=True, drop_last=True)
# drop_last discards the final mini-batch when it is smaller than batch_size

dl_bert_eval = DataLoader(
    dataset_bert_eval, batch_size=batch_size, shuffle=False)
dl_bert_test = DataLoader(
    dataset_bert_test, batch_size=batch_size, shuffle=False)

In [27]:
# Pull one mini-batch of (labels, vectors) and display it.
tmp = next(iter(dl_bert_train))
tmp


[tensor([[2],
         [3],
         [7],
         ...,
         [1],
         [3],
         [6]], device='cuda:0'),
 tensor([[-0.5682,  0.7339, -0.3711,  ..., -0.3918, -0.6459, -0.0686],
         [-0.0223, -0.1312, -0.5425,  ..., -0.0813, -0.2375,  0.2325],
         [-0.7309,  0.7196, -0.7221,  ..., -0.4263, -0.1826,  0.0518],
         ...,
         [ 0.3519, -0.0716, -0.3624,  ..., -0.1390,  0.1413,  0.2433],
         [-0.0604,  0.1525,  0.1461,  ...,  0.0139,  0.0774,  0.1745],
         [-0.3007, -0.0541, -0.1976,  ...,  0.1093, -0.3313,  0.0730]],
        device='cuda:0')]

# IICモデル

In [28]:
import torch.nn as nn
import torch.nn.functional as F

# Multiplier for the over-clustering head, which predicts 9 * OVER_CLUSTRING_RATE clusters.
OVER_CLUSTRING_RATE = 10


class NetIIC(nn.Module):
    """Clustering network with a 9-way head and an over-clustering head.

    Each Conv1d uses a kernel as long as its input (768 -> 400 -> 300 -> 300), so
    every convolution yields one value per output channel. ``forward`` therefore
    expects 768 features per sample.
    """

    def __init__(self):
        super().__init__()

        # No multi-head this time
        self.conv1 = nn.Conv1d(1, 400, kernel_size=768, stride=1, padding=0)
        self.bn1 = nn.BatchNorm1d(400)
        self.conv2 = nn.Conv1d(1, 300, kernel_size=400, stride=1, padding=0)
        self.bn2 = nn.BatchNorm1d(300)
        self.conv3 = nn.Conv1d(1, 300, kernel_size=300, stride=1, padding=0)
        self.bn3 = nn.BatchNorm1d(300)

        self.fc1 = nn.Linear(300, 250)
        self.bnfc1 = nn.BatchNorm1d(250)

        # 9 outputs, hoped to line up with the 9 livedoor news categories
        self.fc2 = nn.Linear(250, 9)

        # Over-clustering: predicting more clusters than expected lets the
        # network pick up finer-grained variations
        self.fc2_overclustering = nn.Linear(250, 9*OVER_CLUSTRING_RATE)

    def forward(self, x):
        """Return ``(y, y_overclustering)``, both softmax-normalised along dim 1."""
        n_samples = x.size(0)

        conv_stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        )
        for conv, bn in conv_stages:
            # Present the features as a single-channel sequence to the next conv
            x = F.relu(bn(conv(x.view(n_samples, 1, -1))))

        x_prefinal = F.relu(self.bnfc1(self.fc1(x.view(n_samples, -1))))

        # Multi-head is not used
        y = F.softmax(self.fc2(x_prefinal), dim=1)
        y_overclustering = F.softmax(self.fc2_overclustering(x_prefinal), dim=1)

        return y, y_overclustering


In [29]:
import torch.nn.init as init


def weight_init(m):
    """Initialise the parameters of a single module (intended for ``Module.apply``).

    Conv1d: standard-normal weights. BatchNorm1d: weights ~ N(1, 0.02), bias 0.
    Linear: He (Kaiming) normal weights. Conv1d / Linear biases, when present,
    are drawn from a standard normal. Any other module is left untouched.
    """
    if isinstance(m, nn.Conv1d):
        fill_weight = init.normal_
    elif isinstance(m, nn.BatchNorm1d):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
        return
    elif isinstance(m, nn.Linear):
        # He initialisation
        fill_weight = init.kaiming_normal_
    else:
        return

    fill_weight(m.weight.data)
    if m.bias is not None:
        init.normal_(m.bias.data)

In [30]:
# Definition of the IIC (Invariant Information Clustering) loss function
# 参考：https://github.com/RuABraun/phone-clustering/blob/master/mnist_basic.py
import sys


def compute_joint(x_out, x_tf_out):
    """Return the symmetrised, normalised ``[k, k]`` joint of two ``[bn, k]`` outputs.

    The per-sample outer products of ``x_out`` and ``x_tf_out`` are summed over
    the batch, averaged with their transpose and divided by their total.
    """
    bn, k = x_out.size()
    tf_bn, tf_k = x_tf_out.size(0), x_tf_out.size(1)
    assert (tf_bn == bn and tf_k == k), '{} {} {} {}'.format(bn, k, tf_bn, tf_k)

    # [bn, k, 1] * [bn, 1, k] -> [bn, k, k], then sum over the batch -> [k, k]
    joint = (x_out.unsqueeze(2) * x_tf_out.unsqueeze(1)).sum(dim=0)
    joint = (joint + joint.t()) / 2.  # symmetrise
    return joint / joint.sum()  # normalise


def IID_loss(x_out, x_tf_out, EPS=sys.float_info.epsilon, alpha=2.0):
    """Weighted IIC loss between the outputs for the original and the perturbed data.

    Computes ``-sum(p_ij * (log p_ij - alpha * log p_j - alpha * log p_i))`` where
    ``p_ij`` is the joint returned by ``compute_joint`` and ``p_i`` / ``p_j`` are
    its marginals.

    Args:
        x_out: ``[batch, k]`` outputs, already softmax-normalised.
        x_tf_out: ``[batch, k]`` outputs for the perturbed data, same shape.
        EPS: lower bound applied to the joint and the marginals to avoid ``log(0)``.
        alpha: weight of the marginal terms (weighting idea taken from
            https://qiita.com/Amanokawa/items/0aa24bc396dd88fb7d2a).

    Returns:
        Scalar tensor holding the loss.
    """
    # has had softmax applied
    _, k = x_out.size()
    p_i_j = compute_joint(x_out, x_tf_out)
    assert (p_i_j.size() == (k, k))

    p_i = p_i_j.sum(dim=1).view(k, 1).expand(k, k)
    p_j = p_i_j.sum(dim=0).view(1, k).expand(k, k)

    # Avoid NaN losses; the effect gets cancelled out by the tiny p_i_j anyway.
    # In-place masking (p[p < EPS] = EPS) fails on PyTorch >= 1.3, hence torch.where:
    # https://discuss.pytorch.org/t/pytorch-1-3-showing-an-error-perhaps-for-loss-computed-from-paired-outputs/68790/3
    # The floor is built once, matching the device AND dtype of the joint.
    floor = torch.tensor([EPS], device=p_i_j.device, dtype=p_i_j.dtype)
    p_i_j = torch.where(p_i_j < EPS, floor, p_i_j)
    p_j = torch.where(p_j < EPS, floor, p_j)
    p_i = torch.where(p_i < EPS, floor, p_i)

    loss = (- p_i_j * (torch.log(p_i_j) - alpha *
                       torch.log(p_j) - alpha*torch.log(p_i))).sum()

    return loss

In [31]:
# Definition of the function that adds noise to the data
# Compute device used by the cells below; falls back to the CPU when CUDA is unavailable.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Per-dimension standard deviation of the training vectors; scales the noise in perturb_data.
tensor_std = list_text_train.std(dim=0).to(device)


def perturb_data(x, std=None, scale=2.0):
    """Return a copy of ``x`` with Gaussian noise added; ``x`` itself is not modified.

    Args:
        x: ``[batch, dim]`` tensor to perturb.
        std: per-dimension standard deviation ``[dim]`` that scales the noise;
            defaults to the module-level ``tensor_std``.
        scale: extra multiplier applied to the noise.

    Returns:
        Tensor shaped like ``x``, on the same device as ``x``.
    """
    if std is None:
        std = tensor_std

    # Keep the noise on the input's device so CPU and GPU batches both work,
    # whichever device the module-level statistics live on.
    std = std.to(x.device)
    noise = torch.randn(len(std)).to(x.device)*std*scale

    # NOTE(review): one noise vector is drawn per call and broadcast to every row,
    # so all samples of a batch receive the same shift - confirm this is intended.
    perturbed = x.clone()
    perturbed += noise.expand(x.shape[0], -1)

    return perturbed

# IICのネットワークを学習させる

In [32]:
# 学習関数の定義


def train(total_epoch, model, train_loader, optimizer, device):
    """Train ``model`` with the IIC loss on original / perturbed pairs.

    Args:
        total_epoch: number of passes over ``train_loader``.
        model: network returning ``(output, output_overclustering)``.
        train_loader: iterable yielding ``(target, data)`` batches; ``target`` is unused.
        optimizer: optimizer built over ``model``'s parameters.
        device: device the batches are moved to before the forward pass.

    Returns:
        ``(model, optimizer)`` - the same objects that were passed in.
    """
    # Put the network in training mode
    model.train()

    # Cosine-annealing learning-rate schedule with warm restarts, advanced once per batch
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=2, T_mult=2, eta_min=0)

    for epoch in range(total_epoch):
        # Losses of the most recent batch; stay None if the loader yields nothing
        loss1 = loss2 = None

        for _, data in train_loader:
            # Move the batch first so the perturbation is built on the compute device
            data = data.to(device)
            data_perturb = perturb_data(data).to(device)  # noisy copy of the batch

            # Reset accumulated gradients
            optimizer.zero_grad()

            # Forward pass on the original and the perturbed data
            output, output_overclustering = model(data)
            output_perturb, output_perturb_overclustering = model(data_perturb)

            # Loss for the main head and for the over-clustering head
            loss1 = IID_loss(output, output_perturb)
            loss2 = IID_loss(output_overclustering,
                             output_perturb_overclustering)
            loss = loss1 + loss2

            # Update the weights, then advance the schedule. Stepping the scheduler
            # before the optimizer would skip the initial learning rate.
            loss.backward()
            optimizer.step()
            scheduler.step()

        # Log the losses of the epoch's last batch
        if epoch % 50 == 0 and loss1 is not None:
            print('Train Epoch {} \tLoss1: {:.6f} \tLoss2: {:.6f} \tLoss_total: {:.6f}'.format(
                epoch, loss1.item(), loss2.item(), loss1.item()+loss2.item()))

    return model, optimizer

In [None]:
# Run the training (just under 5 minutes)
total_epoch = 1000

# Build the IIC network, initialise its weights and move it to the compute device.
# NOTE: this re-binds `model`, which referred to the BERT encoder in earlier cells.
model = NetIIC().apply(weight_init).to(device)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

model_trained, optimizer = train(
    total_epoch=total_epoch,
    model=model,
    train_loader=dl_bert_train,
    optimizer=optimizer,
    device=device,
)

Train Epoch 0 	Loss1: -3.258136 	Loss2: -7.855247 	Loss_total: -11.113383
Train Epoch 50 	Loss1: -6.112966 	Loss2: -10.055174 	Loss_total: -16.168139
Train Epoch 100 	Loss1: -6.483746 	Loss2: -11.232649 	Loss_total: -17.716395
Train Epoch 150 	Loss1: -6.503678 	Loss2: -11.814453 	Loss_total: -18.318131
Train Epoch 200 	Loss1: -6.550383 	Loss2: -12.365322 	Loss_total: -18.915705
Train Epoch 250 	Loss1: -6.554901 	Loss2: -12.446652 	Loss_total: -19.001554
