In [2]:
import math
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from transformers import AdamW, BertJapaneseTokenizer, BertModel
from sklearn.model_selection import train_test_split
from IPython.display import clear_output

%matplotlib inline

# Load data

In [36]:
# Read the raw detail CSV, then derive `data` from it with a fresh 0..n-1 index.
# NOTE(review): this cell reads from './dataset/csv/' while the split files are
# written under '../dataset/csv/' later in the notebook — confirm which root is intended.
detail_csv = './dataset/csv/detail_text.csv'
all_data = pd.read_csv(detail_csv)
data = all_data.reset_index(drop=True)

In [37]:
# Build one "<title>。<text>" string per row of `all_data`.
# Iterating the two columns directly avoids `all_data.iloc[i]`, which
# materialises a temporary row Series for every single row.
title_text = [
    str(title) + '。' + str(text)
    for title, text in zip(all_data['title'], all_data['text'])
]

In [38]:
# Keep only the identifier and label columns and attach the combined text.
# `.assign` returns a new frame, so the new column is never written into a
# selection of the previous `data` (which can raise SettingWithCopyWarning).
data = data[['ncode', 'genre']].assign(title_text=title_text)

In [39]:
def preprocessing(text):
    """Remove spaces and squeeze repeated line breaks in ``text``.

    Half-width and full-width spaces are deleted. Every run of consecutive
    line-feed characters is collapsed to one line feed, and every run of
    consecutive carriage returns is collapsed to one carriage return.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # (pattern, replacement) pairs, applied in order
    substitutions = (
        (' ', ''),
        ('　', ''),
        ('(\n)+', '\n'),
        ('(\r)+', '\r'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [40]:
# Clean every combined title/text string element-wise with `preprocessing`.
data['title_text'] = data['title_text'].map(preprocessing)

In [41]:
# Convert the genre column to pandas' categorical dtype.
data['genre'] = data['genre'].astype('category')

In [42]:
# Store the integer code of each genre category as the label column.
# NOTE(review): pandas assigns code -1 to missing categories — confirm that
# `genre` has no missing values before these codes are used as class indices.
data['genre_category'] = data['genre'].cat.codes

In [47]:
# Split into 80% train and a 20% holdout, then halve the holdout into
# test and validation sets.
# A fixed seed keeps the split (and therefore the CSVs written from it)
# identical across kernel restarts.
SPLIT_SEED = 42
train_data, holdout_data = train_test_split(data, test_size=0.2, random_state=SPLIT_SEED)
test_data, val_data = train_test_split(holdout_data, test_size=0.5, random_state=SPLIT_SEED)

In [51]:
# Write each split to its own CSV file, leaving out the DataFrame index.
split_outputs = [
    (train_data, '../dataset/csv/arcface_train.csv'),
    (test_data, '../dataset/csv/arcface_test.csv'),
    (val_data, '../dataset/csv/arcface_val.csv'),
]
for split_df, csv_path in split_outputs:
    split_df.to_csv(csv_path, index=False)

# Architecture definition

In [2]:
class BERT(nn.Module):
    """BERT encoder followed by a linear projection to an ``h_dim`` vector."""

    def __init__(self, pretrained, h_dim):
        """Load the pretrained encoder and create the projection layer.

        Args:
            pretrained: Model name or path passed to ``BertModel.from_pretrained``.
            h_dim: Output size of the projection layer.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained)
        # Take the encoder width from its config rather than hard-coding 768,
        # so checkpoints with a different hidden size also work.
        self.fc = nn.Linear(self.bert.config.hidden_size, h_dim)

    def forward(self, ids, mask):
        """Encode a batch and project the encoder's second output.

        Args:
            ids: Token id tensor passed to the encoder.
            mask: Attention mask tensor passed as ``attention_mask``.

        Returns:
            The projection of the encoder's second output (the pooled output
            for ``BertModel``) through ``self.fc``.
        """
        # Index by position instead of tuple-unpacking: unpacking a dict-like
        # model output iterates over its *keys* (strings), whereas integer
        # indexing yields the tensor for both tuple and dict-like outputs.
        pooled = self.bert(ids, attention_mask=mask)[1]
        return self.fc(pooled)

In [3]:
class ArcMarginProduct(nn.Module):
    """Additive angular margin (ArcFace-style) classification head.

    Computes the cosine between L2-normalised inputs and L2-normalised class
    weights, applies the angular margin ``m`` to the target-class entry only,
    and scales every logit by ``s``.
    """

    def __init__(self, in_features, out_features, s=30.0, m=0.50, device='cpu'):
        """
        Args:
            in_features: Size of each input embedding.
            out_features: Number of classes.
            s: Scale applied to the output logits.
            m: Angular margin in radians added to the target-class angle.
            device: Kept for backward compatibility and stored on the module;
                ``forward`` allocates on the device of its input instead.
        """
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.device = device
        self.weight = torch.nn.Parameter(torch.empty(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        # Precomputed terms for cos(theta + m) and for the fallback used when
        # theta + m would exceed pi.
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        """Return scaled margin logits of shape ``(batch, out_features)``.

        Args:
            input: Embeddings of shape ``(batch, in_features)``.
            label: Class index per sample; reshaped to ``(batch, 1)`` and cast
                to ``long`` before use.
        """
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # clamp guards against tiny negative values from rounding before sqrt
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        # cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m)
        phi = cosine * self.cos_m - sine * self.sin_m
        # Outside the valid angle range fall back to a linear penalty
        phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # Allocate on the same device/dtype as the logits so the module works
        # wherever it has been moved, regardless of the constructor's `device`.
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        # Apply the scale `s`; without it the logits stay within [-1, 1]
        return output * self.s

In [4]:
class MyDataset(torch.utils.data.Dataset):
    """Dataset yielding tokenised text and its integer genre label."""

    def __init__(self, df, tokenizer, max_length):
        """
        Args:
            df: DataFrame with a ``title_text`` column (input text) and a
                ``genre_category`` column (integer label).
            tokenizer: Object exposing ``encode_plus`` that returns a mapping
                with ``input_ids`` and ``attention_mask``.
            max_length: Sequence length every sample is padded/truncated to.
        """
        self.x = np.array(df['title_text'])
        self.y = np.array(df['genre_category']).astype(np.int64)

        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, label)`` tensors for one sample."""
        text = self.x[index]
        label = self.y[index]
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True`, which warns on every call.
            padding='max_length',
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return torch.LongTensor(ids), torch.LongTensor(mask), torch.tensor(label)

In [5]:
def create_dataloader(df, tokenizer, max_length, batch_size, shuffle=True):
    """Wrap ``df`` in a ``MyDataset`` and return a ``DataLoader`` over it.

    Args:
        df: DataFrame handed to ``MyDataset``.
        tokenizer: Tokenizer handed to ``MyDataset``.
        max_length: Sequence length handed to ``MyDataset``.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the data every epoch. Defaults to
            ``True`` (the previous fixed behaviour); pass ``False`` for
            validation/test loaders where a stable order is wanted.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding batches from the dataset.
    """
    dataset = MyDataset(df, tokenizer, max_length)
    return torch.utils.data.DataLoader(dataset, batch_size, shuffle=shuffle)

# Data preparation

In [6]:
# Batching settings
max_length = 128  # sequence length each sample is padded/truncated to
batch_size = 64

# Pretrained checkpoint name and the tokenizer loaded from it
pretrained = 'cl-tohoku/bert-base-japanese-whole-word-masking'
tokenizer = BertJapaneseTokenizer.from_pretrained(pretrained)

In [7]:
# Reload the training split from disk and build its DataLoader.
train_csv = '../dataset/csv/arcface_train.csv'
train_data = pd.read_csv(train_csv)
train_dataloader = create_dataloader(
    df=train_data,
    tokenizer=tokenizer,
    max_length=max_length,
    batch_size=batch_size,
)

# Training

In [None]:
# Train the BERT encoder and the ArcFace head jointly with cross-entropy loss,
# recording the per-step loss and periodically saving it to './loss.npy'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

num_epochs = 100
h_dim = 64
lr = 1e-5
# Size the head by the largest label code rather than the number of distinct
# labels: if any code is missing from the training split, the distinct count
# is smaller than the largest index and the one-hot scatter goes out of range.
num_classes = int(train_data.genre_category.max()) + 1

bert = BERT(pretrained, h_dim).to(device).train()
# The head's weight is optimised too, so keep it in training mode as well.
metric_fc = ArcMarginProduct(in_features=h_dim, out_features=num_classes, device=device).to(device).train()
criterion = torch.nn.CrossEntropyLoss()
optimizer = AdamW([
    {'params': bert.parameters(), 'lr': lr},
    {'params': metric_fc.parameters(), 'lr': lr},
])

loss_list = []

c = 0  # global step counter across epochs
for epoch in range(num_epochs):
    for ids, mask, label in train_dataloader:
        # Tensor.to() returns a new tensor; the result must be re-bound,
        # otherwise the batch stays on the CPU while the models are on `device`.
        ids = ids.to(device)
        mask = mask.to(device)
        label = label.to(device)
        c += 1
        print(c)
        optimizer.zero_grad()

        feature = bert(ids, mask)
        output = metric_fc(feature, label)
        loss = criterion(output, label)
        loss_list.append(loss.item())

        loss.backward()
        optimizer.step()

        # wait=True defers clearing until the next output arrives, so the
        # step counter stays visible instead of being wiped immediately.
        clear_output(wait=True)

        if c % 100 == 0:
            np.save('./loss.npy', np.array(loss_list))

# Save once more so steps after the last multiple of 100 are not lost.
np.save('./loss.npy', np.array(loss_list))