## ここは必ず実行する．

In [None]:
!nvidia-smi

モデルやデータセットを保存しておきたい場合はGoogle Driveで作業をしてください．

In [2]:
drive_mount = False #@param {type:"boolean"}
if drive_mount:
  from google.colab import drive
  drive.mount('/content/drive')
  %cd /content/drive/MyDrive/your_folder

## まだcsvファイルを作っていない場合はここからスタート

In [None]:
# livedoorニュースコーパスをインストール
!wget https://www.rondhuit.com/download/ldcc-20140209.tar.gz
# 解凍
!tar -zxf ldcc-20140209.tar.gz
!rm ldcc-20140209.tar.gz

In [None]:
# 中身を見る
!head text/dokujo-tsushin/dokujo-tsushin-4778030.txt

livedoorニュースコーパスでは，テキストファイルの3行目にタイトルが収録されています．

In [6]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [7]:
data_path = './text/'

In [8]:
data_files = os.listdir(data_path)
# テキストディレクトリ内のデータフォルダのみを抽出する．
data_dirs = [f for f in data_files if os.path.isdir(os.path.join(data_path, f))]
print(data_dirs)

['dokujo-tsushin', 'peachy', 'topic-news', 'it-life-hack', 'movie-enter', 'livedoor-homme', 'sports-watch', 'kaden-channel', 'smax']


In [None]:
# 各フォルダのLICENSE.txtは取り除くようにする．
drop_files = ['LICENSE.txt']

def load_titles(data_dir, label=None):
  titles = []
  text_path = os.path.join(data_path, data_dir)
  text_files = os.listdir(text_path)
  for text_file in text_files:
    if text_file in drop_files:
      continue
    with open(os.path.join(text_path, text_file), 'r', encoding='utf8') as f:
      # ファイルを読み込む
      lines = f.read().strip().split('\n')
      titles.append(lines[2])
  if label is None:
    return titles
  else:
    labels = [label for i in range(len(titles))]
    return titles, labels

# 確認
t, l = load_titles('dokujo-tsushin', 1)
print(t[:5])
print(l[:5])

In [10]:
all_titles = []
all_labels = []
label2genre = {}

for i, data_dir in enumerate(data_dirs):
  titles, labels = load_titles(data_dir, i)
  all_titles.extend(titles)
  all_labels.extend(labels)
  label2genre[i] = data_dir
  
print(len(all_titles))
print(len(all_labels))

7367
7367


In [11]:
import pickle
# label2genreを保存しておく
with open('label2genre.pkl', 'wb') as f:
  pickle.dump(label2genre, f)

In [12]:
# titleとラベルが関連付けられた，データフレームを作成
livedoor_df = pd.DataFrame({
    'title' : all_titles,
    'label' : all_labels
})

# trainとtestに自動で分割しておく
train, test = train_test_split(livedoor_df, 
                             test_size=0.2, random_state=42, 
                             shuffle=True, stratify=livedoor_df['label'])

In [13]:
# 保存する
train.to_csv('./livedoor_train.csv', index=False)
test.to_csv('./livedoor_test.csv', index=False)

## すでにファイルを保存している場合はここからスタート

In [None]:
# transformerとsimpletransformerをインストール(終わったら再起動の必要があります)
!pip install transformers
!pip install simpletransformers

In [5]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, f1_score

from simpletransformers.classification import ClassificationArgs, ClassificationModel

import torch
import tqdm
#import MeCab
import pickle

In [6]:
train = pd.read_csv('./livedoor_train.csv')
test = pd.read_csv('./livedoor_test.csv')

In [None]:
print(train.shape)
display(train.head())
train['label'].value_counts()

In [None]:
print(test.shape)
display(test.head())
test['label'].value_counts()

In [None]:
import torch

cuda_available = torch.cuda.is_available()
# モデルの引数を指定
model_args = ClassificationArgs()
model_args.num_train_epochs = 5
model_args.learning_rate = 1e-4
# batch_sizeはOOMが出たら減らしてください
model_args.train_batch_size = 16
model_args.overwrite_output_dir = True
model_args.save_model_every_epoch = False

model_args

In [None]:
# modelを定義
model = ClassificationModel(
    'bert', 'cl-tohoku/bert-base-japanese-v2',
    num_labels=9,
    args=model_args
)

In [None]:
model.train_model(train, acc=accuracy_score)

In [None]:
result, model_outputs, wrong_predictions = model.eval_model(test)
result

In [None]:
#macro f1scoreを求める

def softmax(x):
    max = np.max(x,axis=1,keepdims=True) #returns max of each row and keeps same dims
    e_x = np.exp(x - max) #subtracts each row with its max value
    sum = np.sum(e_x,axis=1,keepdims=True) #returns sum of each row and keeps same dims
    f_x = e_x / sum 
    return f_x

test_preds = softmax(model_outputs).argmax(1)
f1_score(test['label'], test_preds, average='macro')

In [None]:
# tensorboardで確認
%load_ext tensorboard
%tensorboard --logdir runs

In [None]:
input_title = '' #@param {type:"string"}
with open('label2genre.pkl', 'rb') as f:
  label2genre = pickle.load(f)
predictions, raw_outputs = model.predict(input_title)
print(label2genre[predictions[0]])