# 実験準備

## 環境構築

In [None]:
!pip install transformers

In [None]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97


In [None]:
!pip install fugashi ipadic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fugashi
  Downloading fugashi-1.2.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (615 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m615.9/615.9 KB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ipadic
  Downloading ipadic-1.0.0.tar.gz (13.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ipadic
  Building wheel for ipadic (setup.py) ... [?25l[?25hdone
  Created wheel for ipadic: filename=ipadic-1.0.0-py3-none-any.whl size=13556723 sha256=a0bf776677a4453426fec300e95a394c5f68eb2af5794bdd41d8a0d1e50b37d4
  Stored in directory: /root/.cache/pip/wheels/45/b7/f5/a21e68db846eedcd00d69e37d60bab3f68eb20b1d99cdff652
Successfully built ipadic
Installing collected packa

In [None]:
!pip install bert-score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bert-score
  Downloading bert_score-0.3.12-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 KB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bert-score
Successfully installed bert-score-0.3.12


In [None]:
import pandas as pd
import torch
from transformers import BertJapaneseTokenizer, BertForMaskedLM, pipeline
from transformers import AdamW

## モデルのダウンロード

In [None]:
# Load the pretrained checkpoint named below as a masked-language model,
# together with the tokenizer published under the same name.
# Ref: https://huggingface.co/transformers/training.html#pytorch

model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"
model = BertForMaskedLM.from_pretrained(model_name)
tokenizer = BertJapaneseTokenizer.from_pretrained(model_name) 

Downloading:   0%|          | 0.00/479 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/258k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/110 [00:00<?, ?B/s]

In [None]:
# Select the first CUDA GPU when one is available, otherwise fall back to the CPU.
# NOTE(review): nothing in the visible cells moves the model or any tensor to
# `device` — confirm whether a `.to(device)` call is missing.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

### お試し実行

In [None]:
# Sample riddle sentence containing one [MASK] token, used for the trial run below.
texts='受験とかけまして、理科の実験とときます。その心は、どちらも、「しけんかん」（試験官、試験管）が[MASK]でしょう。'

In [None]:
# Tokenize the sample sentence into PyTorch tensors (padding on, truncated to 64 tokens).
encodings = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=64)
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask'] # indicates which positions hold tokens

# Forward pass on the token ids only; the attention mask is not passed here.
out = model(input_ids)

In [None]:

# Extract the position(s) of the [MASK] token in the input ids.
masked_position = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero() # apparently this is how the mask positions are extracted
masked_pos = [mask.item() for mask in masked_position ]
# First element of the model output with size-1 dimensions squeezed away.
# NOTE(review): despite the name, this is presumably a score per vocabulary
# entry for each position, since the top indices are decoded as token ids
# below — TODO confirm.
last_hidden_state = out[0].squeeze()

list_of_list =[]
topk = 5 # keep the top 5 candidates
for index,mask_index in enumerate(masked_pos):
  mask_hidden_state = last_hidden_state[mask_index] # scores at this mask position
  idx = torch.topk(mask_hidden_state, k=topk, dim=0)[1] # indices of the top-k scores
  words = [tokenizer.decode(i.item()).strip() for i in idx]
  list_of_list.append(words)
  print ("Guesses : ",words)

Guesses :  ['あ る', '必 要', '好 き', 'い い', '中 心']


## データ読み込み

In [None]:
# Mount Google Drive at /content/drive; the data paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the original (unmasked) sentences, one per line, and show the first one.
# The encoding is given explicitly so the Japanese text is decoded the same way
# regardless of the platform's default locale.
folda_pass = '/content/drive/MyDrive/実験/learn_data/'
filename = 'original_double.txt'
with open(folda_pass+filename, encoding='utf-8') as f:
  original = f.readlines()
  print(original[0])

疲れてヘトヘトとかけまして、空手家とときます。その心は、どちらも、「いたわり」（労り、板割り）が必要です。



In [None]:
# Read the masked counterparts of the sentences (same folder), one per line,
# and show the first one. The encoding is given explicitly so the Japanese text
# is decoded the same way regardless of the platform's default locale.
filename = 'Double_MASK.txt'
with open(folda_pass+filename, encoding='utf-8') as f:
  double_mask = f.readlines()
  print(double_mask[0])

疲れてヘトヘトとかけまして、空手家とときます。その心は、どちらも、「いたわり」(労り、板割り)[MASK][MASK]です。



## データ整理

In [None]:
# Print the number of lines read from each file so they can be compared.
print(len(original))
print(len(double_mask))

207
207


In [None]:
# Carve the evaluation ("Acc") and test subsets off the front of the data first;
# each takes one tenth of the lines (integer division) and everything after
# them becomes the training subset (roughly an 8:1:1 split).
ACC_ratio = len(original) // 10   # end index of the evaluation slice
test_ratio = ACC_ratio * 2        # end index of the test slice
# evaluation subset
Acc_docs_double, Acc_labels = double_mask[:ACC_ratio], original[:ACC_ratio]
# test subset
test_docs_double, test_labels = double_mask[ACC_ratio:test_ratio], original[ACC_ratio:test_ratio]
# training subset
train_double, original_texts = double_mask[test_ratio:], original[test_ratio:]

In [None]:
# Print the sizes of the evaluation, test and training slices.
print(len(Acc_docs_double),len(Acc_labels))
print(len(test_labels))
print(len(original_texts))
print(len(train_double))

20 20
20
167
167


In [None]:
# Bind the training slice to the names used by the later cells.
train_docs_double = train_double
train_labels = original_texts


# Report the (inputs, labels) sizes of the training, evaluation and test subsets.
print("訓練データ")
print(len(train_docs_double),len(train_labels))
print("評価データ")
print(len(Acc_docs_double),len(Acc_labels))
print("テストデータ")
print(len(test_docs_double),len(test_labels))

訓練データ
167 167
評価データ
20 20
テストデータ
20 20


## 形態素解析

In [None]:
# Tokenize the masked training sentences; these ids and mask are the model inputs.
encodings = tokenizer(train_docs_double, return_tensors='pt', padding=True, truncation=True, max_length=64)
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']
# Tokenize the unmasked originals; their token ids are used as the labels.
# NOTE(review): inputs and labels are tokenized and padded independently, so this
# assumes both come out with the same sequence length — confirm.
encodings = tokenizer(train_labels, return_tensors='pt', padding=True, truncation=True, max_length=64)
labels = encodings['input_ids']
print(encodings)

{'input_ids': tensor([[    2,  1414, 17703,  ...,     0,     0,     0],
        [    2,  3407,   524,  ...,     0,     0,     0],
        [    2,   147, 29294,  ...,     0,     0,     0],
        ...,
        [    2, 18335, 28467,  ...,     0,     0,     0],
        [    2,  3337,  2833,  ...,     0,     0,     0],
        [    2,  8616,  1801,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [None]:
# Tokenize the masked evaluation sentences (model inputs) ...
encodings = tokenizer(Acc_docs_double, return_tensors='pt', padding=True, truncation=True, max_length=64)
acc_input_ids = encodings['input_ids']
acc_attention_mask = encodings['attention_mask']
# ... and their unmasked originals, whose token ids are used as the labels.
# NOTE(review): assumes inputs and labels end up with the same padded length — confirm.
encodings = tokenizer(Acc_labels, return_tensors='pt', padding=True, truncation=True, max_length=64)
acc_labels = encodings['input_ids']

## 関数定義

In [None]:
"""
リストが一致しているかしているかboolで返す
"""
def MatchList(list1,list2):
  """Return True when both lists hold equal elements in the same order.

  Lists of different lengths never match. Previously a longer ``list1``
  raised IndexError and a ``list1`` that was a strict prefix of ``list2``
  was reported as a match.

  Args:
    list1: first sequence to compare.
    list2: second sequence to compare.

  Returns:
    bool: True if the lengths are equal and every pair of elements is equal.
  """
  if len(list1) != len(list2):
    return False
  return not any(a != b for a, b in zip(list1, list2))

"""
model:model
input:入力データ
tokenizer:トークナイザ
label:ラベル
"""
def predict(model,tokenizer,input,label):
  """Fill the [MASK] tokens of every input sentence and score the predictions.

  Args:
    model: masked-language model. It is called as
      ``model(input_ids, attention_mask=...)`` and element 0 of its output is
      used as the per-position scores over the vocabulary.
    tokenizer: tokenizer matching ``model``; must provide ``mask_token_id``
      and ``decode``.
    input: list of sentences containing ``[MASK]`` tokens.
    label: list of the original sentences, aligned index-by-index with ``input``.

  Returns:
    dict with the keys
      "分数": ``"correct/total"`` as a string,
      "スコア": ``correct / total`` as a float (0.0 when there are no sentences),
      "正解リスト": filled-in sentences whose masks were all predicted correctly,
      "予測": one ``[predicted tokens, answer tokens, filled-in sentence,
        original sentence]`` entry per input sentence.

  Every input sentence is counted in the total. A sentence without any
  ``[MASK]`` token has nothing to predict and is never counted as correct.
  """
  model.eval()
  encodings = tokenizer(input, return_tensors='pt', padding=True, truncation=True, max_length=64)
  input_ids = encodings['input_ids']
  attention_mask = encodings['attention_mask']
  # NOTE(review): labels are tokenized separately, so a mask position found in the
  # input is assumed to address the matching token in the label — confirm the two
  # tokenizations stay aligned for this data.
  label_ids = tokenizer(label, return_tensors='pt', padding=True, truncation=True, max_length=64)['input_ids']
  batch = 16
  count = 0  # sentences whose masks were all predicted correctly
  bunbo = 0  # denominator: number of sentences evaluated
  correctAnswer = []  # filled-in sentences that were fully correct
  All_return_result = []
  # Step through the data in slices of `batch` rows; the last slice may be shorter,
  # and no empty slice is produced when the length is a multiple of `batch`.
  for section in range(0, len(input_ids), batch):
    batch_input_ids = input_ids[section:section+batch]
    batch_attention_mask = attention_mask[section:section+batch]

    with torch.no_grad():
      # The attention mask keeps the padding positions from being attended to.
      outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
    logits = outputs[0]

    # Group mask columns by row. The 2-D comparison (no squeeze) keeps the
    # (row, column) layout even when the batch holds a single sentence.
    masked_pos = [[] for _ in range(len(batch_input_ids))]
    for row, col in (batch_input_ids == tokenizer.mask_token_id).nonzero().tolist():
      masked_pos[row].append(col)

    for index, mask_index in enumerate(masked_pos):
      result = []
      answer = []
      for pos in mask_index:
        predicted_id = int(torch.argmax(logits[index][pos]))  # best-scoring token id
        result.append(tokenizer.decode(predicted_id).strip())
        answer.append(tokenizer.decode(label_ids[index+section][pos].item()).strip())

      # Substitute the predictions into the sentence, one [MASK] at a time.
      result_docs = input[index+section]
      for word in result:
        result_docs = result_docs.replace('[MASK]', word.replace(" ",""), 1)
      result_docs = result_docs.replace("\n","")

      if result and result == answer:
        count += 1
        correctAnswer.append(result_docs)
      All_return_result.append([result, answer, result_docs, label[index+section].replace("\n","")])
      bunbo += 1
  print(str(count)+"/"+str(bunbo))
  score = count/bunbo if bunbo else 0.0
  return {"分数":str(count)+"/"+str(bunbo),"スコア":score,"正解リスト":correctAnswer,"予測":All_return_result}

## ファインチューニング前に一度結果を確認

In [1]:
test_result = predict(model,tokenizer,test_docs_double,test_labels)

NameError: ignored

In [None]:
test_result["分数"]

'1/20'

# 実験

## ファインチューニング

### 定義

In [None]:
import numpy as np
def cos_sim(v1, v2):
    """Return the cosine similarity of ``v1`` and ``v2``.

    Computed as the dot product divided by the product of the two norms.
    """
    inner_product = np.dot(v1, v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return inner_product / norm_product

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
            trace_func (function): trace print function.
                            Default: print
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0            # consecutive calls without improvement
        self.best_score = None      # best (negated) validation loss seen so far
        self.early_stop = False     # set to True once patience is exhausted
        # `np.inf` rather than the `np.Inf` alias, which NumPy 2.0 removed.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        """Record one validation loss.

        Saves a checkpoint when the loss improved; otherwise increments the
        counter and sets ``early_stop`` once it reaches ``patience``.

        Args:
            val_loss: validation loss of the current epoch (lower is better).
            model: model whose ``state_dict()`` is saved on improvement.
        """
        # Negate the loss so that "higher score" means "better".
        score = -val_loss

        if self.best_score is None:
            # First call: take this loss as the baseline and save the model.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            # No improvement relative to the best score plus delta.
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            # Improvement: remember it, save the model and restart the count.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

from torch import nn
# Throwaway placeholder model
class Model(nn.Module):
    """Minimal module holding one 1-in/1-out linear layer; ``forward`` is a stub."""

    def __init__(self):
        super().__init__()
        self.Linear = nn.Linear(1, 1)

    def forward(self, x):
        """Do nothing with ``x`` and return ``None``."""
        return None

### ファインチューニング

In [None]:
# Fine-tune the masked-language model on the training split. Before every epoch
# the test, training and evaluation splits are scored with `predict`; after every
# epoch the validation loss is computed and fed to early stopping.
optimizer = AdamW(model.parameters(), lr=1e-5)
early_stopping = EarlyStopping(patience=3)
epoch = 100
batch = 16
# Ceiling division so the final, shorter batch is trained on as well.
num = (len(input_ids) + batch - 1)//batch
loss_list = []   # training loss of every step
val_loss = []    # validation loss of every epoch
test_acc = []    # `predict` result on the test split, per epoch
val_acc = []     # `predict` result on the evaluation split, per epoch
train_acc = []   # `predict` result on the training split, per epoch
result_list = [[],[],[]]  # [test, train, val] results per epoch
for i in range(epoch):  # repeat for the number of epochs
  print("-----------epoch"+str(i+1)+"----------")
  # Keep each result in a variable first: list.append() returns None, so
  # assigning its return value would store None in result_list.
  test_result = predict(model,tokenizer,test_docs_double,test_labels)
  train_result = predict(model,tokenizer,train_docs_double,train_labels)
  val_result = predict(model,tokenizer,Acc_docs_double,Acc_labels)
  test_acc.append(test_result)
  train_acc.append(train_result)
  val_acc.append(val_result)
  result_list[0].append(test_result)
  result_list[1].append(train_result)
  result_list[2].append(val_result)

  # `predict` leaves the model in eval mode; switch back before training.
  model.train()
  # NOTE(review): `labels` holds a token id at every position, padding included,
  # so the loss is presumably averaged over all positions rather than only the
  # masked ones — confirm this is intended.
  for b in range(num):  # one optimisation step per batch
    c=b+1
    # Clear the gradients of the previous step so they do not accumulate.
    optimizer.zero_grad()
    outputs = model(input_ids[batch*b:batch*c], attention_mask=attention_mask[batch*b:batch*c], labels=labels[batch*b:batch*c])
    loss = outputs.loss
    loss_list.append(loss.item())
    print(loss)
    loss.backward()
    optimizer.step()

  # Validation loss, computed in eval mode and without gradient tracking.
  model.eval()
  with torch.no_grad():
    outputs = model(acc_input_ids,attention_mask=acc_attention_mask,labels=acc_labels)
  loss = outputs.loss
  val_loss.append(loss.item())
  print("valののloss")
  print(loss)

  # Pass the plain float so the early-stopping state does not hold on to tensors.
  early_stopping(val_loss[-1], model)
  if early_stopping.early_stop:
    print("Early Stopping")
    break

    
      


# 結果

### 結果の保存

In [None]:
# Persist the per-step training loss and the per-epoch validation loss,
# one value per line.
import os

filepass='/content/drive/MyDrive/実験/result/パターンA/'
# Create the result folder first: open(..., mode='w') does not create missing directories.
os.makedirs(filepass, exist_ok=True)
for filename, values in (("train_loss.txt", loss_list), ("val_loss.txt", val_loss)):
    with open(filepass+filename, mode='w') as f:
        f.writelines(str(value)+"\n" for value in values)

In [None]:
import os
# Save, for every epoch, the test-split score summary and the per-sentence predictions.
filepass='/content/drive/MyDrive/実験/result/パターンA/テストデータ/epoch'
filename="test_score"
filename2="test_predict"
for i in range(len(test_acc)):
  save_pass = filepass+str(i+1)
  os.makedirs(save_pass, exist_ok=True)
  # Score file: "correct/total,score" followed by the correctly completed sentences.
  with open(save_pass+"/"+filename+str(i+1)+".txt", mode='w') as f:
      f.write(test_acc[i]["分数"]+","+str(test_acc[i]["スコア"])+"\n")
      for j in test_acc[i]["正解リスト"]:
        f.write(j+"\n")

  # Prediction file. The tokens are joined instead of indexed as [0] and [1], so a
  # sentence with fewer than two predicted masks no longer raises IndexError.
  with open(save_pass+"/"+filename2+str(i+1)+".txt", mode='w') as f:
      for j in test_acc[i]["予測"]:
        f.write("予測結果："+",".join(j[0])+"\n")
        f.write("答え　　："+",".join(j[1])+"\n")
        f.write("予測文章："+j[2]+"\n")
        f.write("元の文章："+j[3]+"\n")


In [None]:
# val_acc
# train_acc

# Save, for every epoch, the evaluation-split score summary and the per-sentence predictions.
filepass='/content/drive/MyDrive/実験/result/パターンA/検証データ/epoch'
filename="val_score"
filename2="val_predict"
for i in range(len(val_acc)):
  save_pass = filepass+str(i+1)
  os.makedirs(save_pass, exist_ok=True)
  # Score file: "correct/total,score" followed by the correctly completed sentences.
  with open(save_pass+"/"+filename+str(i+1)+".txt", mode='w') as f:
      f.write(val_acc[i]["分数"]+","+str(val_acc[i]["スコア"])+"\n")
      for j in val_acc[i]["正解リスト"]:
        f.write(j+"\n")

  # Prediction file. The tokens are joined instead of indexed as [0] and [1], so a
  # sentence with fewer than two predicted masks no longer raises IndexError.
  with open(save_pass+"/"+filename2+str(i+1)+".txt", mode='w') as f:
      for j in val_acc[i]["予測"]:
        f.write("予測結果："+",".join(j[0])+"\n")
        f.write("答え　　："+",".join(j[1])+"\n")
        f.write("予測文章："+j[2]+"\n")
        f.write("元の文章："+j[3]+"\n")

In [None]:
# Save, for every epoch, the training-split score summary and the per-sentence predictions.
filepass='/content/drive/MyDrive/実験/result/パターンA/訓練データ/epoch'
filename="train_score"
filename2="train_predict"
for i in range(len(train_acc)):
  save_pass = filepass+str(i+1)
  os.makedirs(save_pass, exist_ok=True)
  # Score file: "correct/total,score" followed by the correctly completed sentences.
  with open(save_pass+"/"+filename+str(i+1)+".txt", mode='w') as f:
      f.write(train_acc[i]["分数"]+","+str(train_acc[i]["スコア"])+"\n")
      for j in train_acc[i]["正解リスト"]:
        f.write(j+"\n")

  # Prediction file. The tokens are joined instead of indexed as [0] and [1], so
  # sentences with fewer than two predicted masks are written out rather than
  # being silently skipped by a bare `except`.
  with open(save_pass+"/"+filename2+str(i+1)+".txt", mode='w') as f:
      for j in train_acc[i]["予測"]:
        f.write("予測結果："+",".join(j[0])+"\n")
        f.write("答え　　："+",".join(j[1])+"\n")
        f.write("予測文章："+j[2]+"\n")
        f.write("元の文章："+j[3]+"\n")

## 結果の確認

In [None]:
# Pull the per-epoch accuracy ("スコア") of each split out of the stored results.
train_acc_score = [epoch_result["スコア"] for epoch_result in train_acc]
val_acc_score = [epoch_result["スコア"] for epoch_result in val_acc]
test_acc_score = [epoch_result["スコア"] for epoch_result in test_acc]

In [None]:
# Plot the losses and the accuracies
import matplotlib as mpl
import matplotlib.pyplot as plt

# Number of recorded training steps per recorded validation loss (steps per epoch).
a=len(loss_list)/len(val_loss)
# x / y: training loss against the step index.
x = [i for i in range(len(loss_list))]
y = loss_list
# x2 / y2: validation loss, each epoch placed at the step count reached after it.
x2 = [(i+1)*a for i in range(len(val_loss))]
y2 = val_loss
# Per-epoch accuracies, plotted on the same x2 positions.
# NOTE(review): the training cell calls predict() at the start of each epoch, so
# these values appear to be pre-epoch measurements drawn at end-of-epoch
# positions — confirm the intended alignment.
y3 = train_acc_score
y4 = test_acc_score
y5 = val_acc_score
# plt.plot(x,y,color="b")
# plt.plot(x2,y2,color="r") 

# Figure 1: training loss and validation loss.
plt.plot(x,y)
plt.plot(x2,y2)
plt.show()



# Figure 2: accuracy of the train / test / val splits.
plt.plot(x2,y3,color="g",label="train")
plt.plot(x2,y4,color="c",label="test")
plt.plot(x2,y5,color="m",label="val")
plt.legend()
plt.show()

# Figure 3: losses on the left axis, accuracies on a twin right axis.
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()

ax1.plot(x,y)
ax1.plot(x2,y2)
ax2.plot(x2,y3,color="g")
ax2.plot(x2,y4,color="c")
ax2.plot(x2,y5,color="m")
# Applies to pyplot's current axes — presumably ax2 (the accuracy axis); TODO confirm.
plt.ylim(0,1)
ax1.set_xlabel('step')
ax1.set_ylabel('loss')
ax2.set_ylabel('acc')
#plt.plot(x,y2,color="k") # plot the points (x, y) joined by a black line
plt.show() # display the plot