## 90. データの準備
<p>機械翻訳のデータセットをダウンロードせよ．訓練データ，開発データ，評価データを整形し，必要に応じてトークン化などの前処理を行うこと．ただし，この段階ではトークンの単位として形態素（日本語）および単語（英語）を採用せよ．</p>

In [1]:
if __name__ == '__main__':
    # IPython shell escapes: fetch the KFTT archive only when nothing matching
    # kftt-data-1.0* is present in the working directory.
    !ls kftt-data-1.0* > /dev/null 2>&1 || wget -q http://www.phontron.com/kftt/download/kftt-data-1.0.tar.gz
    # If the tarball is present, unpack it and delete the archive afterwards.
    !ls kftt-data-1.0.tar.gz > /dev/null 2>&1 && tar -xzf kftt-data-1.0.tar.gz && rm kftt-data-1.0.tar.gz

In [2]:
if __name__ == '__main__':
    # Install or upgrade the ginza package quietly via pip (IPython shell escape).
    # NOTE(review): presumably this is what provides the 'ja_ginza' model loaded later - confirm.
    !pip install -U ginza -q

In [3]:
if __name__ == '__main__':
    from spacy.lang.en import English
    import spacy
    # en_nlp: spaCy English language object, used for the English side.
    en_nlp = English()
    # ja_nlp: the 'ja_ginza' pipeline, used for the Japanese side.
    ja_nlp = spacy.load('ja_ginza')

In [4]:
if __name__ == '__main__':
    import os

    root = "kftt-data-1.0/data/"
    orig = root + "orig/"
    tok2 = root + "tok2/"
    # Tokenization is skipped entirely when the output directory already
    # exists; delete tok2/ to force a re-run.
    if not os.path.exists(tok2):
        os.makedirs(tok2)
        # One pipeline per language, replacing the paired `if lang == ...` tests.
        pipelines = {"ja": ja_nlp, "en": en_nlp}
        for lang, nlp in pipelines.items():
            for mode in ["train","dev","test"]:
                # Explicit UTF-8 on both sides: the corpus contains Japanese
                # text and must not depend on the platform's locale encoding.
                with open(f"{orig}kyoto-{mode}.{lang}", encoding="utf-8") as rf, \
                     open(f"{tok2}kyoto-{mode}.{lang}", "w", encoding="utf-8") as wf:
                    for line in rf:
                        line = line.rstrip()
                        # One sentence per line, tokens joined by single spaces.
                        wf.write(" ".join(str(w) for w in nlp(line)) + "\n")

In [5]:
class DataAnalysis:
    """Token-frequency and sentence-length statistics over tokenized corpus files.

    Files are looked up as ``self.paths[lang][mode]`` and are read as UTF-8
    text with one sentence per line, tokens separated by single spaces.
    The active language and splits are selected with :meth:`set`.
    """

    def __init__(self, lang="en", modes=None, prepath="kftt-data-1.0/data/tok2/"):
        """Build the path table and select the active language and splits.

        Args:
            lang: Language key to analyze ("ja" or "en").
            modes: Split names to read; ``None`` means ``["train", "dev"]``.
            prepath: Prefix (including the trailing separator) placed before
                ``kyoto-{mode}.{lang}`` to form each file path.
        """
        self.paths = {
            lg: {md: f"{prepath}kyoto-{md}.{lg}" for md in ["train", "dev", "test"]}
            for lg in ["ja", "en"]
        }
        self.set(lang, modes)

    def set(self, lang="en", modes=None):
        """Select the language and splits used by later calls; returns ``self``.

        Args:
            lang: Language key into ``self.paths``.
            modes: Split names to read; ``None`` means ``["train", "dev"]``.
        """
        self.lang = lang
        # A fresh list per call: a shared default list would let a mutation of
        # one instance's ``modes`` leak into every later default.
        self.modes = ["train", "dev"] if modes is None else modes
        return self

    def counts(self, tups):
        """Return the sum of the second element of every tuple in ``tups``."""
        return sum(item[1] for item in tups)

    def _token_lines(self):
        """Yield each line of the selected files as a list of tokens."""
        for mode in self.modes:
            # Explicit UTF-8 so reading does not depend on the locale encoding.
            with open(self.paths[self.lang][mode], encoding="utf-8") as f:
                for line in f:
                    # An empty line yields [''] (one empty token), as before.
                    yield line.rstrip().split(' ')

    def word_counts(self):
        """Return ``{token: occurrences}`` over the selected language/splits."""
        dic = {}
        for tokens in self._token_lines():
            for tok in tokens:
                dic[tok] = dic.get(tok, 0) + 1
        return dic

    def length_counts(self):
        """Return ``{tokens_per_line: number_of_lines}`` over the selected files."""
        dic = {}
        for tokens in self._token_lines():
            dic[len(tokens)] = dic.get(len(tokens), 0) + 1
        return dic

    def show_thr_list(self, dic, comment, thrs):
        """Print, per percentage in ``thrs``, how many top entries reach it.

        Entries of ``dic`` are ranked by value (largest first); for each
        threshold the size of the shortest ranked prefix whose share of the
        total reaches that percentage is printed, labelled with ``comment``.
        """
        print(self.lang, self.modes)
        ranked = sorted(dic.items(), key=lambda kv: kv[1], reverse=True)
        for thr in sorted(thrs):
            head, _ = self.split_list_by_thr(ranked, thr / 100)
            print(" {:>3}%: {}:{}".format(thr, comment, len(head)))

    def show_word_coverage(self):
        """Print the vocabulary size needed for 60..100% token coverage."""
        thrs = list(range(60,100,10)) + list(range(91,100,1)) + [100]
        self.show_thr_list(self.word_counts(), "dicsize", thrs)

    def show_sentence_lengths(self):
        """Print how many distinct sentence lengths cover each share of lines.

        NOTE(review): lengths are ranked by how often they occur, so the
        printed "datasize" is a count of distinct length values, not a
        maximum-length cutoff - confirm this is the intended statistic.
        """
        thrs = list(range(70,100,20)) + list(range(91,100,2)) + [i/10 for i in range(991,1000,1)] + [100]
        self.show_thr_list(self.length_counts(), "datasize", thrs)

    def split_list_by_thr(self, lis, thr, sums=None):
        """Split ``lis`` at the shortest prefix whose coverage reaches ``thr``.

        Args:
            lis: Sequence of tuples whose second element is a count.
            thr: Target coverage as a fraction of ``sums`` (0.95 for 95%).
            sums: Total used as the denominator; defaults to the sum of all
                counts in ``lis``.

        Returns:
            ``(left, right)`` where ``left`` is the shortest non-empty prefix
            with ``counts(left) / sums >= thr`` and ``right`` is the rest.
            If the threshold is never reached, or ``sums`` is zero, all of
            ``lis`` is returned as ``left``.
        """
        if len(lis) <= 1:
            return lis, []
        total = self.counts(lis) if sums is None else sums
        if not total:
            # No mass to cover: avoid dividing by zero.
            return lis, []
        running = 0
        for cut, item in enumerate(lis, 1):
            running += item[1]
            # A single division per prefix avoids the rounding drift that
            # repeated subtraction from the threshold accumulates.
            if running / total >= thr:
                return lis[:cut], lis[cut:]
        return lis, []

    def create_dictionary(self, thr, count=True, start=100, others=None):
        """Build a vocabulary from the most frequent tokens covering ``thr``.

        Args:
            thr: Coverage fraction passed to :meth:`split_list_by_thr`.
            count: If true, return ``{token: occurrences}``; otherwise return
                ``{token: id}``.
            start: First id assigned to corpus tokens when ``count`` is false.
            others: Reserved ``{token: id}`` entries; these tokens are removed
                from the corpus ranking before thresholding and, when
                ``count`` is false, copied into the result. ``None`` means no
                reserved entries.
        """
        reserved = {} if others is None else others
        ranked = sorted(self.word_counts().items(), key=lambda kv: kv[1], reverse=True)
        ranked = [kv for kv in ranked if kv[0] not in reserved]
        kept, _ = self.split_list_by_thr(ranked, thr)
        if count:
            return dict(kept)
        vocab = dict(reserved)
        # Ids follow frequency rank, most frequent token first.
        for idx, (word, _) in enumerate(kept, start):
            vocab[word] = idx
        return vocab
    
import os
class DataAnalysisWide(DataAnalysis):
    """DataAnalysis variant whose path table is discovered from a directory.

    Every entry of ``prepath`` is split at its last dot: the part after the
    dot becomes a language key and the part before it becomes a mode key.
    ``self.paths`` then holds one path for every (language, mode) combination.

    This constructor only assigns ``self.paths``; it does not call ``set()``,
    so ``lang`` and ``modes`` are not assigned here.
    """

    def __init__(self, prepath="kftt-data-1.0/data/tok2/"):
        base_dir = prepath.rstrip('/')
        # (stem, separator, extension) for each directory entry.
        pieces = [entry.rpartition(".") for entry in os.listdir(base_dir)]
        langs = set([ext for _, _, ext in pieces])
        modes = set([stem for stem, _, _ in pieces])
        table = {}
        for lang in langs:
            table[lang] = {mode: f"{base_dir}/{mode}.{lang}" for mode in modes}
        self.paths = table

In [6]:
if __name__ == '__main__':
    # Analyzer built with the class defaults (lang "en", modes train+dev, tok2 directory).
    da = DataAnalysis()

In [7]:
if __name__ == '__main__':
    # Print, per coverage percentage, how many of the most frequent Japanese
    # training tokens are needed to reach it.
    da.set("ja",["train"]).show_word_coverage()

ja ['train']
  60%: dicsize:176
  70%: dicsize:792
  80%: dicsize:2849
  90%: dicsize:10275
  91%: dicsize:11913
  92%: dicsize:13911
  93%: dicsize:16394
  94%: dicsize:19553
  95%: dicsize:23667
  96%: dicsize:29224
  97%: dicsize:37302
  98%: dicsize:50167
  99%: dicsize:75420
 100%: dicsize:154658


In [8]:
if __name__ == '__main__':
    # token -> count dictionary for the analyzer's current language/modes,
    # keeping the most frequent tokens up to a 0.95 coverage threshold.
    ja_dic = da.create_dictionary(0.95)
    len(ja_dic)

23667

In [9]:
if __name__ == '__main__':
    # Same coverage table as above, for the English training tokens.
    da.set("en",["train"]).show_word_coverage()

en ['train']
  60%: dicsize:260
  70%: dicsize:794
  80%: dicsize:2365
  90%: dicsize:8305
  91%: dicsize:9686
  92%: dicsize:11402
  93%: dicsize:13570
  94%: dicsize:16360
  95%: dicsize:20076
  96%: dicsize:25327
  97%: dicsize:33352
  98%: dicsize:47353
  99%: dicsize:79852
 100%: dicsize:181128


In [10]:
if __name__ == '__main__':
    # token -> count dictionary for the analyzer's current language/modes
    # at a 0.95 coverage threshold.
    en_dic = da.create_dictionary(0.95)

In [11]:
if __name__ == '__main__':
    # Number of entries in the English dictionary.
    len(en_dic)

20076

In [12]:
if __name__ == '__main__':
    # Reserved special tokens with fixed ids; corpus tokens get ids from 100 upward.
    escapes = {"[PAD]":0, "[MASK]":1, "[BOS]":10, "[EOS]":11, "[UNK]":12}
    # count=False returns token -> id dictionaries that include the reserved entries.
    ja_dic = da.set("ja",["train"]).create_dictionary(0.95, count=False, start=100, others=escapes)
    en_dic = da.set("en",["train"]).create_dictionary(0.95, count=False, start=100, others=escapes)

In [13]:
if __name__ == '__main__':
    # The ten Japanese dictionary entries with the smallest ids.
    sorted(ja_dic.items(), key=lambda x:x[1])[:10]

[('[PAD]', 0),
 ('[MASK]', 1),
 ('[BOS]', 10),
 ('[EOS]', 11),
 ('[UNK]', 12),
 ('の', 100),
 ('、', 101),
 ('に', 102),
 ('。', 103),
 ('は', 104)]

In [14]:
if __name__ == '__main__':
    # The ten English dictionary entries with the smallest ids.
    sorted(en_dic.items(), key=lambda x:x[1])[:10]

[('[PAD]', 0),
 ('[MASK]', 1),
 ('[BOS]', 10),
 ('[EOS]', 11),
 ('[UNK]', 12),
 ('the', 100),
 (',', 101),
 ('of', 102),
 ('.', 103),
 ('and', 104)]

In [15]:
if __name__ == '__main__':
    import os
    import pickle

    # open(..., 'wb') does not create missing parent directories, so make sure
    # the output directory exists before writing.
    os.makedirs('model_logs', exist_ok=True)
    # Persist both token -> id dictionaries.
    with open('model_logs/ja_dic.pickle', 'wb') as f: pickle.dump(ja_dic, f)
    with open('model_logs/en_dic.pickle', 'wb') as f: pickle.dump(en_dic, f)

In [16]:
if __name__ == '__main__':
    import os

    idpath = "kftt-data-1.0/data/ids/"
    # Conversion is skipped entirely when the output directory already exists;
    # delete ids/ to force a re-run.
    if not os.path.exists(idpath):
        os.makedirs(idpath, exist_ok=True)
        # One token -> id dictionary per language.
        vocabs = {"ja": ja_dic, "en": en_dic}
        for lang, v in da.paths.items():
            dic = vocabs[lang]
            bos, eos, unk = dic["[BOS]"], dic["[EOS]"], dic["[UNK]"]
            for mode, path in v.items():
                # Read the tokenized corpus as UTF-8 regardless of the locale.
                with open(path, encoding="utf-8") as rf, \
                     open(f"{idpath}kyoto-{mode}.{lang}", "w", encoding="utf-8") as wf:
                    for line in rf:
                        words = line.rstrip().split(" ")
                        # Out-of-vocabulary tokens map to the [UNK] id; every
                        # sentence is wrapped in [BOS] ... [EOS].
                        ids = [bos] + [dic.get(w, unk) for w in words] + [eos]
                        wf.write(" ".join(map(str, ids)) + "\n")

In [17]:
if __name__ == '__main__':
    # For Japanese training data: per percentage, the number of distinct
    # sentence lengths (ranked by how often they occur) needed to cover that
    # share of sentences.
    da.set("ja",["train"]).show_sentence_lengths()

ja ['train']
  70%: datasize:31
  90%: datasize:48
  91%: datasize:50
  93%: datasize:53
  95%: datasize:58
  97%: datasize:65
  99%: datasize:82
 99.1%: datasize:84
 99.2%: datasize:85
 99.3%: datasize:88
 99.4%: datasize:90
 99.5%: datasize:93
 99.6%: datasize:97
 99.7%: datasize:101
 99.8%: datasize:109
 99.9%: datasize:121
 100%: datasize:209


In [18]:
if __name__ == '__main__':
    # Same sentence-length table for the English training data.
    da.set("en",["train"]).show_sentence_lengths()

en ['train']
  70%: datasize:33
  90%: datasize:52
  91%: datasize:54
  93%: datasize:58
  95%: datasize:64
  97%: datasize:73
  99%: datasize:93
 99.1%: datasize:95
 99.2%: datasize:97
 99.3%: datasize:99
 99.4%: datasize:102
 99.5%: datasize:106
 99.6%: datasize:111
 99.7%: datasize:117
 99.8%: datasize:125
 99.9%: datasize:141
 100%: datasize:249


In [1]:
if __name__ == '__main__':
    # Export this notebook to a Python script (IPython shell escape).
    !jupyter nbconvert --to python Chapter10_90.ipynb
    # Copy the exported script to DataAnalysis.py.
    !cp Chapter10_90.py DataAnalysis.py

[NbConvertApp] Converting notebook Chapter10_90.ipynb to python
[NbConvertApp] Writing 6979 bytes to Chapter10_90.py


### 戻る
[Chapter10.ipynb](./Chapter10.ipynb)