# 実行時間

In [1]:
import os, sys, glob
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import japanize_matplotlib

from sklearn.metrics import precision_score, recall_score
from transformers import BertForSequenceClassification, BertTokenizerFast, BertJapaneseTokenizer, Trainer, TrainingArguments
from transformers import pipeline, Pipeline
from datasets import load_dataset, Dataset

pd.options.display.float_format = "{:.4f}".format


In [2]:
def load_assets(model_path:str) -> Tuple[Trainer, Dataset]:
  """モデルとデータセットを読み込む
  """
  # tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v3')
  tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-large-japanese-v2')
  dataset = load_dataset('dataset_loader.py', name='sentiment_dataset')

  def _tokenize(batch):
      return tokenizer(batch['text'], padding='max_length', truncation=True)

  # データセットを準備する
  eval_dataset =  dataset['validation'].map(_tokenize, batched=True)
  eval_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

  # モデルを読み込む
  model = BertForSequenceClassification.from_pretrained(model_path, num_labels=3)
  trainer = Trainer(
      model=model
  )
  return trainer, eval_dataset

def predict(sentiment_analyzer:Pipeline, eval_dataset:Dataset) -> Dict[np.ndarray, np.ndarray]:
# def predict(trainer:Trainer, eval_dataset:Dataset) -> Dict[np.ndarray, np.ndarray]:
  # metricsを計算する
  # print(trainer.evaluate(eval_dataset))
  # 予測する
  predictions = sentiment_analyzer.predict(eval_dataset)
  return {'predict_proba':predictions.predictions, 'labels':np.argmax(predictions.predictions, axis=1)}

def evaluate(y_true, y_pred, method='macro', verbose=True):
  precision_macro = precision_score(y_true, y_pred, average=method)
  recall_macro = recall_score(y_true, y_pred, average=method)
  f1 = 2 * (precision_macro * recall_macro) / (precision_macro + recall_macro)
  if verbose:
    print(f'precision_macro: {precision_macro:.3f}\nrecall_macro: {recall_macro:.3f}\nf1: {f1:.3f}')
    display(pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Predicted'], margins=True))
  return precision_macro, recall_macro, f1

def calculate_f1_score(model_name: str, eval_df: pd.DataFrame):
  prefix = model_name.split('/')[-1]
  prediction_results = predict(*load_assets(model_name))
  eval_df[f'pred_{prefix}'] = prediction_results['labels']
  _, _, f1 = evaluate(eval_df['label'], eval_df[f'pred_{prefix}'])
  return f1

In [3]:
eval_dataset = load_dataset('dataset_loader.py', name='sentiment_dataset')['validation']
print(eval_dataset)
eval_df = eval_dataset.to_pandas()
eval_df.head()

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 325
})


Unnamed: 0,text,label
0,以前より高くなっている 。,0
1,小岩井の生乳100％ヨーグルトを安くしてくださって嬉しいです これからも買い続けるのでお値段...,1
2,ＱＵＩＣpayで支払いで10%OＦＦにするならば、majicaにチャージして支払いをした場合...,1
3,9月の特売品がたくさん有って良かったです!,1
4,是非気軽にいただける飲食店を増やしてほしいです！フードコートでも店舗でも。,1


In [6]:
from transformers import pipeline
# ref:https://huggingface.co/docs/transformers/main_classes/pipelines

model = BertForSequenceClassification.from_pretrained('./results', num_labels=3)
tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-large-japanese-v2')
sentiment_analyzer = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

- GPU: 3104件で1分10秒程度, 173037件で1時間5分程度
- CPU: 3104件で4分, 173037件で3時間45分程度

In [9]:
voc_all_df = pd.read_csv(
    os.path.join('..\\1_insight_summary\\data\\STARS_4月\\data.csv'), encoding='cp932')
target_store_list = ['MEGAドン・キホーテUNY東海通店', 'MEGAドン・キホーテUNY 横浜大口店', 'ドン・キホーテ 川西店', 'ドン・キホーテ 六本木店','小樽店']

_df = voc_all_df.query('store_name in @target_store_list')

In [11]:
_df.shape

(3104, 16)

In [10]:
_df['sentiment_results'] = _df['answer_question'].apply(lambda x: sentiment_analyzer(x, return_all_scores=True))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _df['sentiment_results'] = _df['answer_question'].apply(lambda x: sentiment_analyzer(x, return_all_scores=True))


In [21]:
_df['sentiment_results'].iloc[0]

[[{'label': 'LABEL_0', 'score': 0.0009338804520666599},
  {'label': 'LABEL_1', 'score': 0.9943874478340149},
  {'label': 'LABEL_2', 'score': 0.004678693599998951}]]