<a href="https://colab.research.google.com/github/m37335/kanagawa-exam/blob/master/textbook_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **ファインチューニング用ファイルの作成**
中学校採択教科書を用いて、ファインチューニング用のデータを作る

## **ライブラリのインストールとインポート**

In [None]:
# Install the third-party packages imported by the next cell (Colab shell commands).
# NOTE(review): versions are unpinned; the download log further down shows Stanza 1.2.0 resources — consider pinning.
!pip install transformers
!pip install stanza

In [2]:
# PyTorch and the Hugging Face BERT tokenizer / masked-LM classes
import torch
from transformers import BertTokenizer, BertForMaskedLM
# Stanza NLP toolkit (used in later cells for tokenization and lemmatization)
import stanza
stanza.download('en') # download English model

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 34.4MB/s]                    
2021-05-05 14:19:04 INFO: Downloading default packages for language: en (English)...
Downloading http://nlp.stanford.edu/software/stanza/1.2.0/en/default.zip: 100%|██████████| 411M/411M [01:14<00:00, 5.54MB/s]
2021-05-05 14:20:24 INFO: Finished downloading models and saved to /root/stanza_resources.


## **教科書本文のToken化とLemma化**

In [3]:
import pandas as pd
# Load the textbook sentence table (columns include `id` and `sentence`) from the project's GitHub repository.
allsentenceInTextbook_df = pd.read_csv('https://raw.githubusercontent.com/m37335/kanagawa-exam/master/data/%20NH_textbook-sentence.csv')

In [4]:
# Display the loaded sentence table for a visual check.
allsentenceInTextbook_df

Unnamed: 0,id,textBook,grade,articleId,Page,sentenceId,sentence
0,0,NH,1,1,P4,1,"Good morning, Takashi."
1,1,NH,1,1,P4,2,"Good morning, Jill."
2,2,NH,1,1,P4,3,How are you?
3,3,NH,1,1,P4,4,"I'm fine, thank you."
4,4,NH,1,1,P4,5,And you?
...,...,...,...,...,...,...,...
1551,1551,NH,3,22,P111,38,"He used his photographs to share it with us,"
1552,1552,NH,3,22,P111,39,and to show us the importance of life on the E...
1553,1553,NH,3,22,P111,40,Michio’s own life was changed because of one p...
1554,1554,NH,3,22,P111,41,Perhaps his photographs will also change someo...


In [5]:
# Remove typographic symbols from the sentence text before tokenization.
# Every pattern is a plain string, so `regex=False` is passed explicitly:
# the default of `regex` differs between pandas versions, and characters such
# as '[' are only safe when treated literally.
# Each pair is (text to find, replacement); they are applied in this order.
_LITERAL_REPLACEMENTS = [
  ('”', ''),
  ('“', ''),
  ('"', ''),
  ('[', ''),
  (']', ''),
  ('□', ''),
  ('―', ''),
  ("''", " '"),  # two consecutive apostrophes -> space + single apostrophe
]

cleaned_sentences = allsentenceInTextbook_df.sentence
for old_text, new_text in _LITERAL_REPLACEMENTS:
  cleaned_sentences = cleaned_sentences.str.replace(old_text, new_text, regex=False)
allsentenceInTextbook_df.sentence = cleaned_sentences

### **センテンスのToken化**

In [6]:
# Stanza pipeline used for tokenization (the load log shows tokenize/pos/lemma being loaded for English).
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', use_gpu=True, tokenize_pretokenized=False)
stanza_token = []

for df_sentence in allsentenceInTextbook_df.sentence:
  doc = nlp(df_sentence)
  # Emit exactly ONE token row per DataFrame row, even if Stanza splits the
  # text into several sentences. Later cells join this frame to
  # allsentenceInTextbook_df by position (pd.concat(axis=1)), so an extra row
  # here would shift every following sentence. textbook_mask() flattens the
  # Stanza sentences of one text the same way.
  row_tokens = [word.text for sentence in doc.sentences for word in sentence.words]
  stanza_token.append(["[CLS]"] + row_tokens + ["[SEP]"])

# One row per textbook sentence; columns are token positions (ragged rows are padded).
stanza_token_df = pd.DataFrame(stanza_token)

2021-05-05 14:25:15 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |

2021-05-05 14:25:15 INFO: Use device: gpu
2021-05-05 14:25:15 INFO: Loading: tokenize
2021-05-05 14:25:26 INFO: Loading: pos
2021-05-05 14:25:26 INFO: Loading: lemma
2021-05-05 14:25:26 INFO: Done loading processors!


In [7]:
# Display the token table (columns are token positions within each row).
stanza_token_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
0,[CLS],Good,morning,",",Takashi,.,[SEP],,,,,,,,,,,,,,,,,,,
1,[CLS],Good,morning,",",Jill,.,[SEP],,,,,,,,,,,,,,,,,,,
2,[CLS],How,are,you,?,[SEP],,,,,,,,,,,,,,,,,,,,
3,[CLS],I,'m,fine,",",thank,you,.,[SEP],,,,,,,,,,,,,,,,,
4,[CLS],And,you,?,[SEP],,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1551,[CLS],He,used,his,photographs,to,share,it,with,us,",",[SEP],,,,,,,,,,,,,,
1552,[CLS],and,to,show,us,the,importance,of,life,on,the,Earth,.,[SEP],,,,,,,,,,,,
1553,[CLS],Michio,’s,own,life,was,changed,because,of,one,photograph,.,[SEP],,,,,,,,,,,,,
1554,[CLS],Perhaps,his,photographs,will,also,change,someone,’s,life,.,[SEP],,,,,,,,,,,,,,


In [14]:
# Put the sentence metadata and the token columns side by side (joined on the row index).
allsentenceInTextbook_token_df_concat = pd.concat([allsentenceInTextbook_df, stanza_token_df], axis=1)

In [61]:
# Export the sentence + token table to CSV and download it from Colab.
# The frame built in the previous cell is named
# allsentenceInTextbook_token_df_concat; the former `df_concat` name is not
# defined anywhere in this notebook and fails on a fresh kernel.
from google.colab import files
filename = "df_concat-check.csv"
allsentenceInTextbook_token_df_concat.to_csv(filename, encoding='utf-8-sig')
files.download(filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
# Display the sentence + token table (`df_concat` is not defined in this notebook).
allsentenceInTextbook_token_df_concat

Unnamed: 0,id,textBook,grade,articleId,Page,sentenceId,sentence,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
0,0,NH,1,1,P4,1,"Good morning, Takashi.",[CLS],Good,morning,",",Takashi,.,[SEP],,,,,,,,,,,,,,,,,,,
1,1,NH,1,1,P4,2,"Good morning, Jill.",[CLS],Good,morning,",",Jill,.,[SEP],,,,,,,,,,,,,,,,,,,
2,2,NH,1,1,P4,3,How are you?,[CLS],How,are,you,?,[SEP],,,,,,,,,,,,,,,,,,,,
3,3,NH,1,1,P4,4,"I'm fine, thank you.",[CLS],I,'m,fine,",",thank,you,.,[SEP],,,,,,,,,,,,,,,,,
4,4,NH,1,1,P4,5,And you?,[CLS],And,you,?,[SEP],,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1551,1551,NH,3,22,P111,38,"He used his photographs to share it with us,",[CLS],He,used,his,photographs,to,share,it,with,us,",",[SEP],,,,,,,,,,,,,,
1552,1552,NH,3,22,P111,39,and to show us the importance of life on the E...,[CLS],and,to,show,us,the,importance,of,life,on,the,Earth,.,[SEP],,,,,,,,,,,,
1553,1553,NH,3,22,P111,40,Michio’s own life was changed because of one p...,[CLS],Michio,’s,own,life,was,changed,because,of,one,photograph,.,[SEP],,,,,,,,,,,,,
1554,1554,NH,3,22,P111,41,Perhaps his photographs will also change someo...,[CLS],Perhaps,his,photographs,will,also,change,someone,’s,life,.,[SEP],,,,,,,,,,,,,,


### **単語のlemma化**

In [10]:
# Stanza pipeline used for lemmatization (same configuration as the tokenization cell).
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', use_gpu=True, tokenize_pretokenized=False)
stanza_token_lemma = []

for df_sentence in allsentenceInTextbook_df.sentence:
  doc = nlp(df_sentence)
  # Emit exactly ONE lemma row per DataFrame row, even if Stanza splits the
  # text into several sentences. The row position of this table is later used
  # as the sentence `id`, and the lemma index is reused as the [MASK] position
  # in textbook_mask(), which flattens the Stanza sentences the same way.
  row_lemmas = [word.lemma for sentence in doc.sentences for word in sentence.words]
  stanza_token_lemma.append(["[CLS]"] + row_lemmas + ["[SEP]"])

# One row per textbook sentence; columns are lemma positions (ragged rows are padded).
stanza_token_lemma_df = pd.DataFrame(stanza_token_lemma)

2021-05-05 14:28:06 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |

2021-05-05 14:28:06 INFO: Use device: gpu
2021-05-05 14:28:06 INFO: Loading: tokenize
2021-05-05 14:28:06 INFO: Loading: pos
2021-05-05 14:28:06 INFO: Loading: lemma
2021-05-05 14:28:06 INFO: Done loading processors!


In [11]:
# Display the lemma table (columns are lemma positions within each row).
stanza_token_lemma_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
0,[CLS],good,morning,",",Takashi,.,[SEP],,,,,,,,,,,,,,,,,,,
1,[CLS],good,morning,",",Jill,.,[SEP],,,,,,,,,,,,,,,,,,,
2,[CLS],how,be,you,?,[SEP],,,,,,,,,,,,,,,,,,,,
3,[CLS],I,be,fine,",",thank,you,.,[SEP],,,,,,,,,,,,,,,,,
4,[CLS],and,you,?,[SEP],,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1551,[CLS],he,use,he,photograph,to,share,it,with,we,",",[SEP],,,,,,,,,,,,,,
1552,[CLS],and,to,show,we,the,importance,of,life,on,the,Earth,.,[SEP],,,,,,,,,,,,
1553,[CLS],Michio,'s,own,life,be,change,because,of,one,photograph,.,[SEP],,,,,,,,,,,,,
1554,[CLS],perhaps,he,photograph,will,also,change,someone,'s,life,.,[SEP],,,,,,,,,,,,,,


In [12]:
# Put the sentence metadata and the lemma columns side by side (joined on the row index).
lemma_df_concat = pd.concat([allsentenceInTextbook_df, stanza_token_lemma_df], axis=1)

In [13]:
# Display the sentence + lemma table.
lemma_df_concat

Unnamed: 0,id,textBook,grade,articleId,Page,sentenceId,sentence,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
0,0,NH,1,1,P4,1,"Good morning, Takashi.",[CLS],good,morning,",",Takashi,.,[SEP],,,,,,,,,,,,,,,,,,,
1,1,NH,1,1,P4,2,"Good morning, Jill.",[CLS],good,morning,",",Jill,.,[SEP],,,,,,,,,,,,,,,,,,,
2,2,NH,1,1,P4,3,How are you?,[CLS],how,be,you,?,[SEP],,,,,,,,,,,,,,,,,,,,
3,3,NH,1,1,P4,4,"I'm fine, thank you.",[CLS],I,be,fine,",",thank,you,.,[SEP],,,,,,,,,,,,,,,,,
4,4,NH,1,1,P4,5,And you?,[CLS],and,you,?,[SEP],,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1551,1551,NH,3,22,P111,38,"He used his photographs to share it with us,",[CLS],he,use,he,photograph,to,share,it,with,we,",",[SEP],,,,,,,,,,,,,,
1552,1552,NH,3,22,P111,39,and to show us the importance of life on the E...,[CLS],and,to,show,we,the,importance,of,life,on,the,Earth,.,[SEP],,,,,,,,,,,,
1553,1553,NH,3,22,P111,40,Michio’s own life was changed because of one p...,[CLS],Michio,'s,own,life,be,change,because,of,one,photograph,.,[SEP],,,,,,,,,,,,,
1554,1554,NH,3,22,P111,41,Perhaps his photographs will also change someo...,[CLS],perhaps,he,photograph,will,also,change,someone,'s,life,.,[SEP],,,,,,,,,,,,,,


### **ここまでで作成したもの**
・allsentenceInTextbook_df  
→教科書本文のデータフレーム_元データ  
・stanza_token_df  
→StanzaでToken化した単語のデータフレーム    
・allsentenceInTextbook_token_df_concat  
→教科書本文データとToken化した単語のデータフレーム
  
・lemma_df_concat  
→教科書本文データとlemma化した単語のデータフレーム  
・stanza_token_lemma_df  
→Stanzaでlemma化した単語のデータフレーム  

## **教科書から作成した出題頻度の高い単語リスト**

In [16]:
# Load the frequent-word list (the `word` column is used by later cells) from the project's GitHub repository.
f_wordlist_df = pd.read_csv('https://raw.githubusercontent.com/m37335/kanagawa-exam/master/data/f_wordlist.csv')

In [21]:
# Print the first rows of the word list, then its row count on the next line.
print(f_wordlist_df.head(), len(f_wordlist_df), sep='\n')

    No  word  level  number of character
0  139  area      4                    4
1  209  band      4                    4
2  244  beef      4                    4
3  260  belt      4                    4
4  308  body      4                    4
851


### **出題頻度の高い単語のlemma化**

In [22]:
# Stanza pipeline used to lemmatize the frequent-word list (same configuration as above).
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', use_gpu=True, tokenize_pretokenized=False)
f_wordlist_lemma = []

for df_sentence in f_wordlist_df.word:
  doc = nlp(df_sentence)
  # Emit exactly ONE lemma row per word-list entry, even if Stanza returns
  # zero or several sentences for it. This table is joined to f_wordlist_df
  # by position in a later cell, so the row counts must match.
  entry_lemmas = [word.lemma for sentence in doc.sentences for word in sentence.words]
  f_wordlist_lemma.append(entry_lemmas)

# One row per word-list entry; column 0 holds the first lemma of the entry.
f_wordlist_lemma_df = pd.DataFrame(f_wordlist_lemma)

2021-05-05 14:39:18 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |

2021-05-05 14:39:18 INFO: Use device: gpu
2021-05-05 14:39:18 INFO: Loading: tokenize
2021-05-05 14:39:18 INFO: Loading: pos
2021-05-05 14:39:19 INFO: Loading: lemma
2021-05-05 14:39:19 INFO: Done loading processors!


In [23]:
# Display the lemmatized word list.
f_wordlist_lemma_df

Unnamed: 0,0,1,2,3
0,area,,,
1,band,,,
2,beef,,,
3,belt,,,
4,body,,,
...,...,...,...,...
846,grandmother,,,
847,interesting,,,
848,traditional,,,
849,international,,,


In [24]:
# Export the lemmatized word list to CSV and download it from Colab
from google.colab import files
filename =  "f_wordlist_lemma_df-check.csv"
f_wordlist_lemma_df.to_csv(filename, encoding = 'utf-8-sig') 
files.download(filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [25]:
# Put the word list and its lemma columns side by side (joined on the row index).
f_wordlist_lemma_df_concat = pd.concat([f_wordlist_df, f_wordlist_lemma_df], axis=1)

In [27]:
# Display the word list together with its lemma columns.
f_wordlist_lemma_df_concat

Unnamed: 0,No,word,level,number of character,0,1,2,3
0,139,area,4,4,area,,,
1,209,band,4,4,band,,,
2,244,beef,4,4,beef,,,
3,260,belt,4,4,belt,,,
4,308,body,4,4,body,,,
...,...,...,...,...,...,...,...,...
846,1138,grandmother,6,11,grandmother,,,
847,1374,interesting,6,11,interesting,,,
848,2811,traditional,6,11,traditional,,,
849,1375,international,6,13,international,,,


## **教科書本文のMASK化**

### **教科書本文内の出題頻度が高い単語を検索し、IndexIDを取得**

In [28]:
# Lemma rows of every textbook sentence, and the frequent words with their first lemma.
words_list = stanza_token_lemma_df.values.tolist()
f_word_lemma_list = f_wordlist_lemma_df_concat[0].values.tolist()
f_word_list = f_wordlist_lemma_df_concat.word.values.tolist()

sentence_f_word_list = []

# For every frequent word, scan every sentence and record
# [sentence row position, word, lemma, position of the lemma in that sentence].
# Only the first occurrence within a sentence is recorded (list.index semantics).
for f_word, f_word_lemma in zip(f_word_list, f_word_lemma_list):
  # The loop variable is deliberately NOT called `list`: that name would
  # shadow the builtin for every later cell in the kernel.
  for cnt, sentence_lemmas in enumerate(words_list, start=0):
    try:
      lemma_position = sentence_lemmas.index(f_word_lemma)
    except ValueError:
      # The lemma does not occur in this sentence.
      continue
    sentence_f_word_list.append([cnt, f_word, f_word_lemma, lemma_position])

In [33]:
# Number of matches found
print("要素数は" + str(len(sentence_f_word_list)) + "件です。")
# Show only the first 10 entries; slicing stays safe when fewer than 10 exist
for list_item in sentence_f_word_list[:10]:
  print(list_item)
print("...")
print("...")

要素数は6421件です。
[92, 'band', 'band', 7]
[1255, 'band', 'band', 5]
[1088, 'beef', 'beef', 5]
[111, 'belt', 'belt', 5]
[442, 'body', 'body', 13]
[1273, 'body', 'body', 7]
[1271, 'bomb', 'bomb', 10]
[486, 'case', 'case', 7]
[1303, 'case', 'case', 2]
[1306, 'case', 'case', 2]
...


In [34]:
# Convert the match list to a DataFrame:
# id = sentence row position, mask_id = position of the lemma within the sentence row
sentence_f_word_df = pd.DataFrame(sentence_f_word_list,
                                  columns=['id', 'word', 'lemma', 'mask_id'])

In [35]:
# Display the (sentence id, word, lemma, mask position) table.
sentence_f_word_df

Unnamed: 0,id,word,lemma,mask_id
0,92,band,band,7
1,1255,band,band,5
2,1088,beef,beef,5
3,111,belt,belt,5
4,442,body,body,13
...,...,...,...,...
6416,297,traditional,traditional,7
6417,1073,traditional,traditional,8
6418,1233,traditional,traditional,8
6419,1348,traditional,traditional,3


### **単語のMASK化**

#### **新たにデータフレームを作る**

In [81]:
# Create an empty DataFrame that will hold the sentences containing a frequent word together with the index of that word
mask_model_df = pd.DataFrame(columns=['id', 'textBook', 'grade', 'articleId', 'Page', 'sentenceId', 'sentence', 'mask_id'])

In [83]:
# Sentence ids and mask positions found by the frequent-word search
id_list = sentence_f_word_df.id
maskId_list = sentence_f_word_df.mask_id

# Map each sentence id to its first matching row of the sentence table
# (setdefault keeps the first row if an id occurs more than once).
sentence_row_by_id = {}
for sentence_id, sentence_row in zip(allsentenceInTextbook_df['id'],
                                     allsentenceInTextbook_df.values.tolist()):
  sentence_row_by_id.setdefault(sentence_id, sentence_row)

# One output row per match: the sentence's columns followed by the mask position.
# An id missing from the sentence table raises KeyError.
mask_rows = [sentence_row_by_id[sentence_id] + [maskId]
             for sentence_id, maskId in zip(id_list, maskId_list)]

# Build the frame once instead of calling DataFrame.append per row
# (removed in pandas 2.0 and quadratic); re-running this cell no longer
# duplicates rows.
mask_model_df = pd.DataFrame(mask_rows,
                             columns=['id', 'textBook', 'grade', 'articleId', 'Page', 'sentenceId', 'sentence', 'mask_id'])

In [84]:
# Display the sentences paired with the position to be masked.
mask_model_df

Unnamed: 0,id,textBook,grade,articleId,Page,sentenceId,sentence,mask_id
0,92,NH,1,6,P38,7,I play the guitar in a band.,7
1,1255,NH,3,11,P53,16,We could see a band of bright green lights in ...,5
2,1088,NH,3,3,P16,13,It's made from beef.,5
3,111,NH,1,6,P42,26,I want a black belt.,5
4,442,NH,1,32,P128,12,"Take off your clothes, and Put salt and vinega...",13
...,...,...,...,...,...,...,...,...
6416,297,NH,1,21,P96,10,"They’re watching henmen, a traditional Chinese...",7
6417,1073,NH,3,2,P12,30,I think manga and anime will be traditional Ja...,8
6418,1233,NH,3,10,P50,2,Kyoto is an old city with many traditional bui...,8
6419,1348,NH,3,15,P70,5,Feel the traditional rhythms created by techno...,3


In [None]:
# Export mask_model_df to CSV and download it from Colab
from google.colab import files
filename =  "mask_model_df-check.csv"
mask_model_df.to_csv(filename, encoding = 'utf-8-sig') 
files.download(filename)

### **MASKに変換**

In [101]:
def textbook_mask(sentences, mask_id, pipeline=None):
  """Tokenize a text and replace the token at ``mask_id`` with ``[MASK]``.

  The words of every sentence the pipeline finds in ``sentences`` are
  flattened into one list, which is wrapped in a single ``[CLS]`` ... ``[SEP]``
  pair.

  Args:
    sentences: Text to tokenize.
    mask_id: Zero-based position in the wrapped token list (position 0 is
      ``[CLS]``) of the token to mask.
    pipeline: Optional callable that returns a document exposing
      ``.sentences[*].words[*].text``. Defaults to the notebook-level ``nlp``.

  Returns:
    A tuple ``(tokens, masked_word)``: the token list with ``[MASK]`` at
    ``mask_id``, and the original token that was replaced.

  Raises:
    IndexError: If ``mask_id`` is outside ``0 .. len(tokens) - 1``. Negative
      values are rejected instead of silently counting from the end.
  """
  # Fall back to the global pipeline so existing two-argument calls keep working.
  active_pipeline = nlp if pipeline is None else pipeline
  doc = active_pipeline(sentences)

  words = [word.text for sentence in doc.sentences for word in sentence.words]
  tokens = ['[CLS]'] + words + ['[SEP]']

  if not 0 <= mask_id < len(tokens):
    raise IndexError(
        "mask_id " + str(mask_id) + " is out of range for " + str(len(tokens)) + " tokens")

  masked_word = tokens[mask_id]
  tokens[mask_id] = '[MASK]'
  return tokens, masked_word

In [116]:
masked_word_list = []
masked_token_rows = []

sentence_list = mask_model_df.sentence
maskId_list = mask_model_df.mask_id

# Mask one token per (sentence, mask position) pair.
for sentences, mask_id in zip(sentence_list, maskId_list):
  tokens, masked_word = textbook_mask(sentences, mask_id)
  masked_token_rows.append(tokens)
  masked_word_list.append(masked_word)

# Build the frame once from all token rows instead of calling
# DataFrame.append per row (removed in pandas 2.0 and quadratic).
# Columns are token positions; ragged rows are padded.
textbook_maskedword_df = pd.DataFrame(masked_token_rows)
# Put the original (pre-mask) word in the first column.
textbook_maskedword_df.insert(0, 'masked_Word', masked_word_list)

### **MASK化したデータをデータフレームに変換**

In [117]:
# Display the masked token rows together with the word that was masked.
textbook_maskedword_df

Unnamed: 0,masked_Word,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
0,band,[CLS],I,play,the,guitar,in,a,[MASK],.,[SEP],,,,,,,,,,,,,,,,
1,band,[CLS],We,could,see,a,[MASK],of,bright,green,lights,in,the,sky,.,[SEP],,,,,,,,,,,
2,beef,[CLS],It,'s,made,from,[MASK],.,[SEP],,,,,,,,,,,,,,,,,,
3,belt,[CLS],I,want,a,black,[MASK],.,[SEP],,,,,,,,,,,,,,,,,,
4,bodies,[CLS],Take,off,your,clothes,",",and,Put,salt,and,vinegar,on,your,[MASK],…,[SEP],,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6416,traditional,[CLS],They,’re,watching,henmen,",",a,[MASK],Chinese,art,.,[SEP],,,,,,,,,,,,,,
6417,traditional,[CLS],I,think,manga,and,anime,will,be,[MASK],Japanese,arts,like,ukiyo,-e,someday,.,[SEP],,,,,,,,,
6418,traditional,[CLS],Kyoto,is,an,old,city,with,many,[MASK],buildings,.,[SEP],,,,,,,,,,,,,,
6419,traditional,[CLS],Feel,the,[MASK],rhythms,created,by,technology,.,[SEP],,,,,,,,,,,,,,,,


## **今回作成したファイル**

In [118]:
# Put the sentence metadata and the masked token columns side by side (joined on the row index).
textbook_sentence_maskedword_df = pd.concat([mask_model_df, textbook_maskedword_df], axis=1)

In [119]:
# Display the final table: sentence metadata, mask position, masked word and masked tokens.
textbook_sentence_maskedword_df

Unnamed: 0,id,textBook,grade,articleId,Page,sentenceId,sentence,mask_id,masked_Word,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
0,92,NH,1,6,P38,7,I play the guitar in a band.,7,band,[CLS],I,play,the,guitar,in,a,[MASK],.,[SEP],,,,,,,,,,,,,,,,
1,1255,NH,3,11,P53,16,We could see a band of bright green lights in ...,5,band,[CLS],We,could,see,a,[MASK],of,bright,green,lights,in,the,sky,.,[SEP],,,,,,,,,,,
2,1088,NH,3,3,P16,13,It's made from beef.,5,beef,[CLS],It,'s,made,from,[MASK],.,[SEP],,,,,,,,,,,,,,,,,,
3,111,NH,1,6,P42,26,I want a black belt.,5,belt,[CLS],I,want,a,black,[MASK],.,[SEP],,,,,,,,,,,,,,,,,,
4,442,NH,1,32,P128,12,"Take off your clothes, and Put salt and vinega...",13,bodies,[CLS],Take,off,your,clothes,",",and,Put,salt,and,vinegar,on,your,[MASK],…,[SEP],,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6416,297,NH,1,21,P96,10,"They’re watching henmen, a traditional Chinese...",7,traditional,[CLS],They,’re,watching,henmen,",",a,[MASK],Chinese,art,.,[SEP],,,,,,,,,,,,,,
6417,1073,NH,3,2,P12,30,I think manga and anime will be traditional Ja...,8,traditional,[CLS],I,think,manga,and,anime,will,be,[MASK],Japanese,arts,like,ukiyo,-e,someday,.,[SEP],,,,,,,,,
6418,1233,NH,3,10,P50,2,Kyoto is an old city with many traditional bui...,8,traditional,[CLS],Kyoto,is,an,old,city,with,many,[MASK],buildings,.,[SEP],,,,,,,,,,,,,,
6419,1348,NH,3,15,P70,5,Feel the traditional rhythms created by techno...,3,traditional,[CLS],Feel,the,[MASK],rhythms,created,by,technology,.,[SEP],,,,,,,,,,,,,,,,


In [120]:
# Export the final table to CSV and download it from Colab
from google.colab import files
filename =  "textbook_sentence_maskedword_df-check.csv"
textbook_sentence_maskedword_df.to_csv(filename, encoding = 'utf-8-sig') 
files.download(filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>