In [3]:
!pip install --upgrade pip
!pip install spacy
!pip install ginza
!pip install regex

Collecting ginza
  Downloading ginza-4.0.5.tar.gz (20 kB)
Collecting ja_ginza<4.1.0,>=4.0.0
  Downloading ja_ginza-4.0.0.tar.gz (51.5 MB)
[K     |████████████████████████████████| 51.5 MB 17.5 MB/s eta 0:00:01
[?25hCollecting SudachiPy>=0.4.9
  Using cached SudachiPy-0.5.1.tar.gz (69 kB)
Collecting SudachiDict-core>=20200330
  Downloading SudachiDict-core-20201223.post1.tar.gz (8.8 kB)
Collecting dartsclone~=0.9.0
  Downloading dartsclone-0.9.0-cp36-cp36m-manylinux1_x86_64.whl (474 kB)
[K     |████████████████████████████████| 474 kB 49.4 MB/s eta 0:00:01
Building wheels for collected packages: ginza, ja-ginza, SudachiDict-core, SudachiPy
  Building wheel for ginza (setup.py) ... [?25ldone
[?25h  Created wheel for ginza: filename=ginza-4.0.5-py3-none-any.whl size=15897 sha256=5b522d442a7f320a5cc379a5f729a0945c377c46300d98b4a2b90fd3f9f7c258
  Stored in directory: /home/ec2-user/.cache/pip/wheels/6d/9c/11/9efe0a85ed9334ca3dbebaebebfec9087220c02688b1308424
  Building wheel for ja-gin

In [8]:
import subprocess
from glob import glob

import pandas as pd
import regex
import spacy
from sklearn.model_selection import train_test_split

# Load the spaCy pipeline registered under the name 'ja_ginza' (installed by the ginza package).
nlp = spacy.load('ja_ginza')

# glob() returns matches in a filesystem-dependent order. Sort them so that the
# concatenated frames, and therefore any seeded split derived from them, are
# built in the same order on every machine.
train_paths = sorted(glob('../data/input/train/*'))
test_paths = sorted(glob('../data/input/test/*'))

In [10]:
import subprocess
from glob import glob

import pandas as pd
import regex
import spacy
from sklearn.model_selection import train_test_split

# Load the spaCy pipeline registered under the name 'ja_ginza' (installed by the ginza package).
nlp = spacy.load('ja_ginza')

# glob() returns matches in a filesystem-dependent order. Sort them so that the
# concatenated frames, and therefore any seeded split derived from them, are
# built in the same order on every machine.
train_paths = sorted(glob('../data/input/train/*'))
test_paths = sorted(glob('../data/input/test/*'))

def load_jsonl_frames(paths):
    """Read every JSON Lines file in ``paths`` and stack them into one DataFrame.

    Args:
        paths: List of file paths; each file holds one JSON record per line.

    Returns:
        The row-wise concatenation of all files. The per-file row indexes are
        kept as-is, so the resulting index is not guaranteed to be unique.

    Raises:
        ValueError: If ``paths`` is empty (for example, the glob pattern matched nothing).
    """
    if not paths:
        # pd.concat([]) would also raise ValueError, but with a message that
        # does not point at the real cause (no input files found).
        raise ValueError('no input files to load: the path list is empty')
    return pd.concat([pd.read_json(path, orient='records', lines=True) for path in paths])


train_df = load_jsonl_frames(train_paths)
test_df = load_jsonl_frames(test_paths)

# The train/validation split is stratified by court category, and is additionally
# chosen so that the most frequent labels (PERSON, ORGFACPOS, LOCATION) occur at
# roughly the same rate on both sides.

FILENAME_SUFFIX = '_hanrei.txt'
# Categories with too few records to stratify on by themselves.
RARE_CATEGORIES = ['労働事件裁判例', '高裁判例']


def extract_file_id(meta):
    """Return the integer id embedded in ``meta['filename']``.

    FILENAME_SUFFIX is removed from the end of the name if present, the first
    character is dropped, and the remainder is parsed as an int.

    ``str.rstrip('_hanrei.txt')`` is deliberately not used: rstrip removes any
    trailing run of the given *characters*, not the suffix as a whole, so it
    could silently eat trailing characters of the id itself.

    Raises:
        ValueError: If what remains after trimming is not an integer.
    """
    filename = meta['filename']
    if filename.endswith(FILENAME_SUFFIX):
        filename = filename[:-len(FILENAME_SUFFIX)]
    return int(filename[1:])


# The frames are modified in place on purpose: `train_df` / `test_df` must keep
# referring to the prepared frames after this loop.
for df in [train_df, test_df]:
    df['file_id'] = df['meta'].map(extract_file_id)
    df['category'] = df['meta'].apply(lambda meta: meta['category'])
    # Stratification key: the court category, with the rare categories merged into one bucket.
    df['stratify'] = df['category'].where(~df['category'].isin(RARE_CATEGORIES), 'その他')
    df.drop(['meta', 'annotation_approver'], axis=1, inplace=True)
    df.sort_values('file_id', inplace=True)
    df.reset_index(drop=True, inplace=True)

In [11]:
# Preview the prepared test frame. This names the frame explicitly instead of relying
# on `df`, the leftover loop variable of the preprocessing cell (which is test_df).
test_df.head(3)

Unnamed: 0,id,text,labels,file_id,category,stratify
0,75,主文 原判決を破棄する。本件を東京地方裁判所に差し戻す。理由 本件控訴の趣意は 主任弁護人高...,,80874,高裁判例,その他
1,382,主 文 原判決を破棄する。 本件を名古屋地方裁判所に差し戻す。 理 由 第1 控訴趣意 検察...,,89243,下級裁裁判例,下級裁裁判例
2,373,令和2年2月20日判決言渡 平成31年(ネ)第10033号 パブリシティ権侵害等差止等・著作...,,89254,知的財産裁判例,知的財産裁判例


In [12]:
def count_tag(labels):
    """Count how many labels there are of each tag.

    Each item of ``labels`` is indexed at position 2 to obtain its tag.
    Returns a dict mapping tag -> number of occurrences, with keys in
    first-seen order.
    """
    tag_counts = {}
    for span in labels:
        tag = span[2]
        if tag in tag_counts:
            tag_counts[tag] += 1
        else:
            tag_counts[tag] = 1
    return tag_counts


# Total number of labelled spans per record.
train_df['total_nlabel'] = train_df['labels'].map(len)

# One count column per tag.
tags = ['PERSON', 'ORGFACPOS', 'LOCATION', 'TIMEX', 'MISC']
tag_counts = (
    # Build the frame once from the list of per-record dicts instead of
    # `.apply(pd.Series)`, which constructs a Series object for every row.
    pd.DataFrame(train_df['labels'].map(count_tag).tolist(), index=train_df.index)
    # reindex (rather than `[tags]`) so a tag that never occurs yields a column
    # instead of a KeyError.
    .reindex(columns=tags)
    # A tag missing from a record means "zero occurrences", not "unknown".
    .fillna(0)
    .astype(int)
)
# Assign column by column so re-running this cell overwrites the columns
# instead of appending duplicates.
for tag in tags:
    train_df[tag] = tag_counts[tag]
del tag_counts

In [13]:
# State for the random-seed search: look for the seed whose split leaves the
# per-record counts of PERSON, ORGFACPOS and LOCATION about equal between train and validation.
# (ratios at the best seed so far, its total deviation, the seed itself)
min_ratios, min_diff, min_seed = [], 10**5, 0

In [14]:
# Try seeds 0-99 and keep the stratified split whose validation set has the most
# similar per-record density of the frequent tags compared with the training set.
BALANCE_TAGS = ['PERSON', 'ORGFACPOS', 'LOCATION']


def split_train_val(seed):
    """Return ``(train_part, val_part)``: a 75/25 split of train_df stratified on its 'stratify' column."""
    return train_test_split(train_df, test_size=0.25, random_state=seed, stratify=train_df['stratify'])


# Reset the search state here as well, so re-running this cell by itself
# starts from scratch instead of comparing against a stale minimum.
min_ratios, min_diff, min_seed = [], 10**5, 0

for seed in range(100):
    train_ch_df, val_df = split_train_val(seed)
    ratios = []
    for tag in BALANCE_TAGS:
        val_ntag_per_record = val_df[tag].sum() / val_df.shape[0]
        train_ntag_per_record = train_ch_df[tag].sum() / train_ch_df.shape[0]
        ratios.append(val_ntag_per_record / train_ntag_per_record)
    # Total deviation of the val/train density ratios from the ideal value 1.
    diff = sum(abs(1 - ratio) for ratio in ratios)
    if diff < min_diff:
        min_ratios = ratios
        min_diff = diff
        min_seed = seed

# The loop leaves train_ch_df / val_df at the split of the LAST seed tried, not
# the best one. Re-create the split with the winning seed so that the following
# cells really work on the balanced split.
train_ch_df, val_df = split_train_val(min_seed)

print(min_ratios, min_diff, min_seed)

[1.1168838552931115, 1.0748905329910916, 0.9511084117825691] 0.24066597650163402 68


In [15]:
def format_iob(text, labels):
    """Tokenize ``text`` with the module-level ``nlp`` pipeline and give every token an IOB2 tag.

    A token belongs to a span when the character offset of its first character
    (``token.idx``) satisfies ``start <= token.idx < end``.

    Args:
        text: The document text.
        labels: Iterable of ``[start, end, tag]`` spans with character offsets
            into ``text`` (``end`` exclusive). The spans may be given in any
            order and the argument is NOT modified.

    Returns:
        A list with one ``[token_text, prefix, tag]`` entry per token, where
        ``prefix`` is 'B' for the first token of a span, 'I' for the following
        tokens of the same span, and 'O' (with ``tag == ''``) outside any span.
    """
    doc = nlp(text)

    # Work on a sorted copy: the scan below walks through the spans left to
    # right, and appending the sentinel to the caller's list would leak it into
    # the DataFrame the labels come from.
    spans = sorted(labels, key=lambda span: (span[0], span[1]))
    # Sentinel that no token can ever reach, so spans[span_idx] is always valid.
    spans.append([float('inf'), float('inf'), ''])

    output = []
    span_idx = 0
    prev_span_idx = None  # index of the span the previous token was tagged with, if any

    for token in doc:
        # Skip every span that ends at or before this token. This must be a
        # loop: several short spans can end between two consecutive token starts.
        while spans[span_idx][1] <= token.idx:
            span_idx += 1
        start, end, tag = spans[span_idx][0], spans[span_idx][1], spans[span_idx][2]

        if start <= token.idx < end:
            # B/I is decided by span identity, not by tag equality, so that two
            # adjacent entities of the same type each start with their own 'B'.
            prefix = 'I' if prev_span_idx == span_idx else 'B'
            output.append([token.text, prefix, tag])
            prev_span_idx = span_idx
        else:
            output.append([token.text, 'O', ''])
            prev_span_idx = None

    return output

In [16]:
%%time

tagged_tokens = []

texts = train_ch_df.text.values
labels_list = train_ch_df.labels.values
file_ids = train_ch_df.file_id.values

for text, labels in zip(texts, labels_list):
    output = format_iob(text, labels)
    output = '\n'.join([f'{l[0]} {l[1]}-{l[2]}' if l[1] != 'O' else f'{l[0]} {l[1]}' for l in output])
    tagged_tokens.append(output)

tagged_tokens = '\n\n'.join(tagged_tokens)

KeyboardInterrupt: 

In [None]:
# Check the output. `output` is the formatted string of the last processed document
# (one "<token> <tag>" line per token); a str has no `.shape`, so show its first lines instead.
output.splitlines()[:10]