# 動作確認：早稲田大学RoBERTa

In [17]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from pyknp import Juman
# Checkpoint name passed to both from_pretrained calls below.
MODEL_NAME = "nlp-waseda/roberta-base-japanese"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)       # load the tokenizer
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)    # load the masked-LM model

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at nlp-waseda/roberta-base-japanese and are newly initialized: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## JUMAN++を使わない場合のトークナイズ


In [None]:
# Tokenize the raw sentence directly, without any JUMAN++ word segmentation.
sentence = "仲よくして下されば嬉しいです。"
tokenize = tokenizer.tokenize(sentence)
tokenize  # last expression: shown as the cell output

In [None]:
# Same check as the previous cell, on a sentence that is reused in the cells below.
sentence = "おはようございます"
tokenize = tokenizer.tokenize(sentence)
tokenize  # last expression: shown as the cell output

In [None]:
# Add a word to the tokenizer and compare the tokenization / encoding output.
sentence = "おはようございます"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)  # reload the tokenizer from MODEL_NAME
tokenizer.add_tokens("ございます")

tokenize = tokenizer.tokenize(sentence)
print("トークナイズ結果：", tokenize)
print("エンコード結果：", tokenizer.encode(sentence))

In [None]:
# Add a different word to a freshly reloaded tokenizer and show the tokenization.
sentence = "おはようございます"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)  # reload the tokenizer from MODEL_NAME
tokenizer.add_tokens("いま")

tokenize = tokenizer.tokenize(sentence)
tokenize

In [None]:
# Tokenize several sentences at once, padded/truncated to a fixed length.
text_list = [
    "私は、その男の写真を三葉、見たことがある。",
    "一葉は、その男の、幼年時代、とでも言うべきであろうか",
    "十歳前後かと推定される頃の写真であって、その子供が大勢の女のひとに取りかこまれ",
    "（それは、その子供の姉たち、妹たち、それから、従姉妹いとこたちかと想像される）",
    "庭園の池のほとりに、荒い縞の袴はかまをはいて立ち",
]

batch = tokenizer(
    text_list,
    padding="max_length",
    truncation=True,
    max_length=128,
    add_special_tokens=True,
    return_tensors="pt",
)
encodings = batch.input_ids  # only the input ids are kept
encodings

In [None]:
# Read the table of splittable characters.
# NOTE(review): each line is assumed to hold a character followed by the two
# parts it splits into (three characters in total) — confirm against the file.
splittable_characters_list = []
split_kanji_text_path = "../content/corpus/split-corpus/joyo-kanji-split.txt"
with open(split_kanji_text_path, "r", encoding="utf-8") as f:
    for line in f:
        entry = line.rstrip("\n")
        # Slice the newline-free text and keep only non-empty pieces: slicing
        # the raw line turns a blank line into a "\n" token and leaks the
        # newline into the parts of a short line.
        for piece in (entry[:1], entry[1:3]):
            if piece:
                splittable_characters_list.append(piece)

# Remove entries that already exist in the tokenizer's vocabulary mapping.
# Membership is tested with `in`; `.get()` would be falsy for token id 0.
# NOTE(review): whether deleting from `tokenizer.vocab` affects tokenization
# depends on the tokenizer class (the attribute may be a copy) — confirm.
vocab = tokenizer.vocab
for sc in splittable_characters_list:
    if sc in vocab:
        del vocab[sc]

# Register every entry in a single call instead of once per token.
tokenizer.add_tokens(splittable_characters_list)

In [None]:
# The tokenizer length is passed as the new embedding size.
model.resize_token_embeddings(len(tokenizer))   # resize to account for the added tokens

### JUMAN++を利用した場合

In [None]:
sentence = "私は犬が好きです。"

# Raw string: "\j" and "\l" are not valid escape sequences, so the plain
# literal only works by accident and triggers an invalid-escape warning.
jumanpp = Juman(command="jumanpp_v2",
                jumanpp=True,   # use JUMAN++
                option=r"--config=C:\jumanpp\libexec\jumandic.conf")
analysis = jumanpp.analysis(sentence)   # analyse the sentence
words = [mrph.midasi for mrph in analysis.mrph_list()]
print("分かち書き結果：", words)
result = " ".join(words)   # space-separated words

# Encode the segmented text — the same string that is tokenized below — so the
# two printed results describe the same input.
encoding = tokenizer(result, return_tensors='pt').input_ids

print("分かち書き後のトークナイズ結果：", tokenizer.tokenize(result))
print("エンコード結果：", encoding)

In [None]:
sentence = "おはようございます"

# Raw string: "\j" and "\l" are not valid escape sequences, so the plain
# literal only works by accident and triggers an invalid-escape warning.
jumanpp = Juman(command="jumanpp_v2",
                jumanpp=True,   # use JUMAN++
                option=r"--config=C:\jumanpp\libexec\jumandic.conf")
analysis = jumanpp.analysis(sentence)   # analyse the sentence
words = [mrph.midasi for mrph in analysis.mrph_list()]
result = " ".join(words)   # space-separated words
print(result)
print(tokenizer.tokenize(result))

# Encode the segmented text and show the resulting input ids.
encoding = tokenizer(result, return_tensors='pt').input_ids
encoding

> トークナイズ結果： ['▁おはよう', 'ございます']  
> エンコード結果： [2, 19283, 32000, 3]

トークナイズ後の結果が異なることがわかる

In [None]:
sentence = "自分を大七刀にしてくれる"

# Raw string: "\j" and "\l" are not valid escape sequences, so the plain
# literal only works by accident and triggers an invalid-escape warning.
jumanpp = Juman(command="jumanpp_v2",
                jumanpp=True,   # use JUMAN++
                option=r"--config=C:\jumanpp\libexec\jumandic.conf")
analysis = jumanpp.analysis(sentence)   # analyse the sentence
words = [mrph.midasi for mrph in analysis.mrph_list()]
print(words)
result = " ".join(words)   # space-separated words
print(result)
print(tokenizer.tokenize(result))

# Encode the segmented text and show the resulting input ids.
encoding = tokenizer(result, return_tensors='pt').input_ids
encoding

In [None]:
# Check how a single kanji character is tokenized.
sentence = "紹"
print(tokenizer.tokenize(sentence))

In [None]:
# Decode a single token id back to text.
tokenizer.decode([4835])

## SentencePieceの仕様上、文字の前にメタシンボル「▁」（U+2581、アンダースコアとは別の文字）が入ることを確かめる

In [None]:
import sentencepiece as spm

# Load the SentencePiece model file directly instead of going through `tokenizer`.
sp = spm.SentencePieceProcessor()
sp.Load("../content/model/nlp-waseda/roberta-base-japanese/spiece.model")

# `result` is not defined in this cell: it is whatever an earlier cell left in the kernel.
sp.EncodeAsPieces(result)

同じにならないのはなぜ？

# 東北大BERT

In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
# Relative path passed to both from_pretrained calls below.
MODEL_NAME = "../content/model/bert-base-japanese-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)       # load the tokenizer
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)    # load the masked-LM model

In [None]:
# Read the table of splittable characters.
# NOTE(review): each line is assumed to hold a character followed by the two
# parts it splits into (three characters in total) — confirm against the file.
splittable_characters_list = []
split_kanji_text_path = "../content/corpus/split-corpus/joyo-kanji-split.txt"
with open(split_kanji_text_path, "r", encoding="utf-8") as f:
    for line in f:
        entry = line.rstrip("\n")
        # Slice the newline-free text and keep only non-empty pieces: slicing
        # the raw line turns a blank line into a "\n" token and leaks the
        # newline into the parts of a short line.
        for piece in (entry[:1], entry[1:3]):
            if piece:
                splittable_characters_list.append(piece)

# Remove entries that already exist in the tokenizer's vocabulary mapping.
# Membership is tested with `in`; `.get()` would be falsy for token id 0.
# NOTE(review): whether deleting from `tokenizer.vocab` affects tokenization
# depends on the tokenizer class (the attribute may be a copy) — confirm.
vocab = tokenizer.vocab
for sc in splittable_characters_list:
    if sc in vocab:
        del vocab[sc]

# Register every entry in a single call instead of once per token.
tokenizer.add_tokens(splittable_characters_list)

# Tokenize a sample sentence with the extended tokenizer and display the tokens.
text = "自分を大七刀にしてくれる"
encoding = tokenizer.tokenize(text)
encoding

In [2]:
# Tokenize a sample sentence and display the tokens.
# NOTE(review): "大七刀" is presumably 大切 with 切 written as its parts 七 and 刀 — confirm.
text = "自分を大七刀にしてくれる"
encoding = tokenizer.tokenize(text)
encoding

['自分', 'を', '大', '七', '刀', 'に', 'し', 'て', 'くれる']

In [20]:
import pandas as pd

sns_path = "../content/data/result/mask-model-tweet.tsv"
save_path = "../content/data/tmp/sample-tweet-test.tsv"

# Column labels are spelled with their surrounding spaces exactly as this cell
# expects them to appear in the TSV header.
unused_columns = [" is_correct ", " correct_score ", " incorrect_score "]
renamed_columns = {
    "input ": "input_ids",
    " output ": "labels",
    " choices ": "option",
    " answer ": "answer",
}

# Read the TSV, drop the score columns, rename the rest, and save the result.
df = (
    pd.read_table(sns_path)
    .drop(unused_columns, axis=1)
    .rename(columns=renamed_columns)
)

df.to_csv(save_path, sep="\t", index=False)

## 動作確認

In [7]:
import ast
# Each file holds one Python literal; parse it with ast.literal_eval.
with open("kakunin_preds.csv", "r") as f:
    l_preds = ast.literal_eval(f.read())

with open("kakunin_labels.csv", "r") as f:
    l_labels = ast.literal_eval(f.read())

In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

# Relative path passed to both from_pretrained calls below.
MODEL_NAME = "../content/model/bert-base-japanese-v2/target_model"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)       # load the tokenizer
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)    # load the masked-LM model

In [None]:
# Print, decoded to text, every prediction that differs from its label.
for idx, pred in enumerate(l_preds):
    label = l_labels[idx]
    if pred != label:
        print("l_preds:", tokenizer.decode(pred))
        print("l_labels:", tokenizer.decode(label))
        print("------")
        

In [3]:
# Decode a hard-coded sequence of token ids back to text.
tokenizer.decode([    2,  1762,   896,  2868, 32598,   883, 10866,  5856, 10584, 33003,
          862, 14608,   875,   933,   892, 11303,  5777,   829,     3, ])

'[CLS] 女 の 気 才寺 ち 考えれ ない 奴 が 恋愛 す ん な バーカ 。 [SEP]'

In [5]:
# Decode a single token id taken from the sequence in the previous cell.
tokenizer.decode([33003])

'奴'

In [12]:
import openpyxl

# Open an existing workbook and select its "Sheet1" worksheet.
filepath = "../reports/Book1.xlsx"
book = openpyxl.load_workbook(filepath)
ws = book["Sheet1"]

In [14]:
# Append one row to the worksheet and save the workbook back to the same path.
# Re-running this cell appends the row again.
ws.append(["aa","bb"])
book.save(filepath)

In [3]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
# Relative path passed to from_pretrained; only the model is loaded in this cell.
MODEL_NAME = "../models/bert-base-japanese-v2/sample_model"
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)    # load the masked-LM model

In [None]:
from pathlib import Path
# Print the first 100 lines of the corpus file, stripped of surrounding whitespace.
text_filepath = Path("../data/raw/article/CC-100_ja.txt")
count = 0
with open(text_filepath, "r", encoding="utf-8") as f:
    for count, line in enumerate(f, start=1):
        print(line.strip())
        if count == 100:
            break

In [57]:
import re
import mojimoji

input_filepath = "../data/raw/article/mainichi_all.txt"
sample_output = "../data/interim/article/normed_mainichi_all.txt"
output = []
N = 128  # threshold on the combined character length of merged sentences

# Split every input line into sentences, normalise each one, merge short
# neighbours, and write one chunk per output line.
with open(input_filepath, "r", encoding="utf-8") as f, \
        open(sample_output, "w", encoding="utf-8") as sf:
    for line in f:
        # Split right after each "。". Only the line break and blank pieces are
        # discarded, so trailing text that does not end in "。" is kept
        # (unconditionally deleting the last piece would drop it).
        split_sentences = [
            piece
            for piece in re.split("(?<=。)", line.rstrip("\n"))
            if piece.strip()
        ]

        tmp = None  # text accumulated so far; None means nothing is pending
        for sentence in split_sentences:
            # Normalise with mojimoji.zen_to_han, leaving kana untouched (kana=False).
            normed_sentence = mojimoji.zen_to_han(sentence, kana=False)

            if tmp is None:
                # Nothing pending: start a new accumulation.
                tmp = normed_sentence
            elif len(tmp) + len(normed_sentence) >= N:
                # Too long to merge: emit both separately and reset.
                output.append(tmp)
                output.append(normed_sentence)
                tmp = None
            else:
                # Short enough: merge into the pending text.
                tmp = tmp + normed_sentence

        # Flush whatever is still pending at the end of the line.
        if tmp is not None:
            output.append(tmp)

    # Write one chunk per line; `output` itself stays a list.
    sf.writelines(chunk + "\n" for chunk in output)

In [4]:
sample_list = [1, 2, 3, 4]


def add1(sample):
    """Return ``sample`` plus one."""
    return sample + 1


# Print each element incremented by one.
for sample in sample_list:
    a = add1(sample)
    print(a)


2
3
4
5


In [7]:
import random
# Seed the generator with a fixed value, then draw one number.
random.seed(1234)
random.random()

0.9664535356921388

# BERT model

In [13]:
from transformers import AutoModelForMaskedLM,AutoTokenizer

# Checkpoint name passed to both from_pretrained calls below.
model_name = "cl-tohoku/bert-base-japanese-v2"
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v2 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading: 100%|██████████| 174/174 [00:00<00:00, 87.0kB/s]
Downloading: 100%|██████████| 230k/230k [00:00<00:00, 467kB/s] 


In [16]:
# [UNK]:1
def read_splitchar(vocab_filepath):
    """Return the first character of every non-blank line of a text file.

    Args:
        vocab_filepath: Path of a UTF-8 text file with one entry per line.

    Returns:
        A list with the first character of each line, in file order. Blank
        lines are skipped, so a stray empty line never contributes "\\n".
    """
    splittable_characters_list = []
    with open(vocab_filepath, "r", encoding="utf-8") as f:
        for line in f:
            entry = line.rstrip("\n")
            if not entry:
                # A blank line has no character to collect.
                continue
            splittable_characters_list.append(entry[0])

    return splittable_characters_list

[2, 1, 3]