In [1]:
import math
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from transformers import BertJapaneseTokenizer, BertModel
from sklearn.model_selection import train_test_split
from tabulate import tabulate
from IPython.display import clear_output
%matplotlib inline

In [2]:
class BERT(nn.Module):
    """Pretrained BERT encoder followed by a linear projection head.

    The second element of the encoder's output (the pooled sentence
    representation for ``BertModel``) is projected down to ``h_dim``
    dimensions by a single fully connected layer.
    """

    def __init__(self, pretrained, h_dim):
        """Build the encoder and its projection head.

        Args:
            pretrained: Model name or path passed to
                ``BertModel.from_pretrained``.
            h_dim: Output dimensionality of the projection layer.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained)
        # Size the head from the checkpoint's own config rather than assuming
        # the 768-dim "base" architecture, so other checkpoints also work.
        self.fc = nn.Linear(self.bert.config.hidden_size, h_dim)

    def forward(self, ids, mask):
        """Encode a batch and return its projected features.

        Args:
            ids: Token-id tensor forwarded to the encoder.
            mask: Attention-mask tensor forwarded as ``attention_mask``.

        Returns:
            The output of ``self.fc`` applied to the encoder's second output.
        """
        outputs = self.bert(ids, attention_mask=mask)
        # Index instead of tuple-unpacking: newer transformers versions return
        # a dict-like ModelOutput whose iteration yields key *names*, so
        # `_, output = ...` would bind a string. Integer indexing works for
        # both the legacy tuple and ModelOutput (and for longer tuples).
        pooled = outputs[1]
        return self.fc(pooled)

In [3]:
class DataLoader(object):
    """Index-based accessor that tokenizes one dataframe row at a time.

    Not related to ``torch.utils.data.DataLoader``: this is a plain callable
    that returns a single, already batched (batch size 1) example.

    The first dataframe column is treated as the text, and every remaining
    column as one entry of an integer label vector whose column names are
    kept in ``genre_list``.
    """

    def __init__(self, df, tokenizer, max_length):
        """Store the row data and tokenization settings.

        Args:
            df: DataFrame whose first column is text and whose remaining
                columns are castable to ``int64``.
            tokenizer: Object exposing ``encode_plus``.
            max_length: Length every encoded sequence is padded/truncated to.
        """
        # Convert once; the original converted the frame twice.
        values = np.array(df)
        self.x = values[:, 0]
        self.y = values[:, 1:].astype(np.int64)

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.genre_list = list(df.columns[1:])

    def __call__(self, index):
        """Return the tokenized example stored at ``index``.

        Returns:
            A tuple ``(text, label, ids, mask)`` where ``label`` is the name
            of the column holding the largest label value, and ``ids`` /
            ``mask`` are ``LongTensor``s with a leading batch axis of size 1.
        """
        text = self.x[index]
        label = self.genre_list[np.argmax(self.y[index])]
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            # `pad_to_max_length=True` is deprecated; this is its documented
            # replacement and belongs to the same API as `truncation=True`.
            padding='max_length',
            truncation=True,
        )
        ids = torch.LongTensor(encoded['input_ids']).unsqueeze(0)
        mask = torch.LongTensor(encoded['attention_mask']).unsqueeze(0)

        return text, label, ids, mask

In [4]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding tokenized text and a class index.

    The first dataframe column is treated as the text; the remaining columns
    form an integer label vector that is reduced to the index of its largest
    entry.
    """

    def __init__(self, df, tokenizer, max_length):
        """Store the row data and tokenization settings.

        Args:
            df: DataFrame whose first column is text and whose remaining
                columns are castable to ``int64``.
            tokenizer: Object exposing ``encode_plus``.
            max_length: Length every encoded sequence is padded/truncated to.
        """
        # Convert once; the original converted the frame twice.
        values = np.array(df)
        self.x = values[:, 0]
        self.y = values[:, 1:].astype(np.int64)

        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of rows available."""
        return len(self.x)

    def __getitem__(self, index):
        """Return ``(ids, mask, target)`` for the row at ``index``.

        ``ids`` and ``mask`` are 1-D ``LongTensor``s built from the
        tokenizer's ``input_ids`` and ``attention_mask``; ``target`` is a
        0-dim tensor holding the argmax of the row's label vector.
        """
        text = self.x[index]
        label = self.y[index]
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            # `pad_to_max_length=True` is deprecated; this is its documented
            # replacement and belongs to the same API as `truncation=True`.
            padding='max_length',
            truncation=True,
        )
        ids = torch.LongTensor(encoded['input_ids'])
        mask = torch.LongTensor(encoded['attention_mask'])
        target = torch.argmax(torch.tensor(label))

        return ids, mask, target

In [6]:
# Numeric settings first; `batch_size` is defined here but not used by the
# cells shown in this notebook.
max_length = 512   # sequences are padded/truncated to this many tokens
batch_size = 64
h_dim = 64         # output size of the projection head in `BERT`

# Checkpoint identifier shared by the tokenizer and the encoder.
pretrained = 'cl-tohoku/bert-base-japanese-whole-word-masking'
tokenizer = BertJapaneseTokenizer.from_pretrained(pretrained)

In [7]:
# Instantiate the encoder + projection head using the settings defined above.
bert = BERT(pretrained=pretrained, h_dim=h_dim)

In [8]:
# Read the saved weights onto the CPU, then copy them into the model.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a
# trusted source.
state_dict = torch.load('./model_64hdim/bert5000.pth', map_location=torch.device('cpu'))
bert.load_state_dict(state_dict)

<All keys matched successfully>

In [14]:
# Load the stories, keep one row per unique `story`, keep only rows whose
# `all_point` is at least 5.0, and drop the bookkeeping columns, leaving the
# text column followed by the label columns.
#
# The shuffle is seeded: later cells address rows by position (e.g. a fixed
# base index), so an unseeded shuffle would make those results change on
# every Restart & Run All.
df = (
    pd.read_csv('./train_genre.csv')
    .drop_duplicates('story')
    .loc[lambda frame: frame['all_point'] >= 5.0]
    .drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'ncode', 'all_point'])
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)

In [None]:
dataloader = DataLoader(df, tokenizer, max_length)

feature_list = []
text_list = []
label_list = []

# Switch to inference mode so dropout is disabled; otherwise the same text
# produces a different embedding on every forward pass.
bert.eval()

# Never index past the end of the frame if fewer than 1000 rows survive the
# filtering above.
n_samples = min(1000, len(df))

# No gradients are needed for feature extraction; skipping autograd avoids
# building a graph for every sample.
with torch.no_grad():
    for index in range(n_samples):
        print(index)
        text, label, ids, mask = dataloader(index)
        # Flatten the (1, h_dim) model output to a 1-D NumPy vector.
        output = bert(ids, mask).view(-1).numpy()
        feature_list.append(output)
        text_list.append(text)
        label_list.append(label)

In [99]:
# Rank every extracted feature vector by cosine similarity to one reference
# vector (the reference itself is part of the ranking).
base_idx = 105
base_feature = feature_list[base_idx]
base_norm = np.linalg.norm(base_feature)

cos_similarity_list = [
    np.dot(base_feature, feature) / (base_norm * np.linalg.norm(feature))
    for feature in feature_list
]

# argsort is ascending, so reverse it to get most-similar-first positions.
indices = np.argsort(cos_similarity_list)[::-1]

In [None]:
# Labels of the ten entries ranked most similar to the reference vector.
top_indices = indices[:10]
print(np.array(label_list)[top_indices])