In [1]:
import pandas as pd
import json
import os
import re
from tqdm.notebook import tqdm

In [2]:
def conv_str(x):
    if x == '':
        return ''
    elif pd.notnull(x):
        try:
            if '_' in x:
                return x
            else:
                return str(int(float(x)))
        except:
            return x
    else:
        return ''


In [3]:
all_law_list = pd.read_csv('./all_xml/all_law_list.csv')
# 未施行のものはいったん無視する
all_law_list = all_law_list[all_law_list['未施行'].isna()]
# 法令番号をキーにできるようにする
all_law_list.set_index('法令番号',inplace=True)

In [4]:
all_xml = pd.read_csv('all_xml.csv',dtype=str)
all_xml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1267891 entries, 0 to 1267890
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   lawNum     1267891 non-null  object
 1   provision  1267891 non-null  object
 2   article    1267891 non-null  object
 3   paragraph  1267891 non-null  object
 4   item       1267891 non-null  object
 5   text       1267344 non-null  object
dtypes: object(6)
memory usage: 58.0+ MB


In [5]:
reference = pd.read_csv('reference.csv',converters={'ref.lawArticle.article':conv_str, 'ref.lawArticle.paragraph':conv_str, 'ref.lawArticle.item':conv_str,'referred.lawArticle.article':conv_str, 'referred.lawArticle.paragraph':conv_str, 'referred.lawArticle.item':conv_str})
reference.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480071 entries, 0 to 480070
Data columns (total 11 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   ref.lawNum                     480071 non-null  object
 1   ref.lawArticle.provision       480071 non-null  object
 2   ref.lawArticle.article         480071 non-null  object
 3   ref.lawArticle.paragraph       480071 non-null  object
 4   ref.lawArticle.item            480071 non-null  object
 5   ref.text                       480071 non-null  object
 6   referred.lawNum                480071 non-null  object
 7   referred.lawArticle.provision  480071 non-null  object
 8   referred.lawArticle.article    480071 non-null  object
 9   referred.lawArticle.paragraph  480071 non-null  object
 10  referred.lawArticle.item       480071 non-null  object
dtypes: object(11)
memory usage: 40.3+ MB


In [6]:
reference_text = reference.merge(all_xml,
                                 how='left', 
                                 left_on=['referred.lawNum', 'referred.lawArticle.provision', 'referred.lawArticle.article', 'referred.lawArticle.paragraph', 'referred.lawArticle.item'],
                                 right_on=['lawNum', 'provision', 'article', 'paragraph', 'item'])

In [7]:
reference_text.drop(['lawNum', 'provision', 'article', 'paragraph', 'item'],axis=1,inplace=True)
reference_text.rename(columns={'text':'referred.text'},inplace=True)
reference_text = reference_text[reference_text['referred.text'].notna()]
reference_text.drop_duplicates(inplace=True)
reference_text.info()

<class 'pandas.core.frame.DataFrame'>
Index: 389661 entries, 0 to 480070
Data columns (total 12 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   ref.lawNum                     389661 non-null  object
 1   ref.lawArticle.provision       389661 non-null  object
 2   ref.lawArticle.article         389661 non-null  object
 3   ref.lawArticle.paragraph       389661 non-null  object
 4   ref.lawArticle.item            389661 non-null  object
 5   ref.text                       389661 non-null  object
 6   referred.lawNum                389661 non-null  object
 7   referred.lawArticle.provision  389661 non-null  object
 8   referred.lawArticle.article    389661 non-null  object
 9   referred.lawArticle.paragraph  389661 non-null  object
 10  referred.lawArticle.item       389661 non-null  object
 11  referred.text                  389661 non-null  object
dtypes: object(12)
memory usage: 38.6+ MB


In [8]:
# ～を定める政令となっているものは法令上で「政令で定める」となっているため、何かの法令を参照しているはず。該当する法令を抽出する
seirei_law_list = all_law_list[all_law_list['法令名'].str.contains('定める')]
seirei_law = seirei_law_list.index.to_list()

In [9]:
ref_seirei = reference_text[reference_text['ref.lawNum'].isin(seirei_law)]
ref_seirei = ref_seirei.merge(all_law_list['法令名'],left_on='ref.lawNum',right_index=True)
ref_seirei.rename(columns={'法令名':'ref.lawName'},inplace=True)
ref_seirei = ref_seirei.merge(all_law_list['法令名'],left_on='referred.lawNum',right_index=True)
ref_seirei.rename(columns={'法令名':'referred.lawName'},inplace=True)

In [10]:
def all_common_substrings(str1, str2,keyword='令で定める'):
    common_substrings = ['',]
    len1, len2 = len(str1), len(str2)
    match1 = re.finditer(keyword,str1)
    match2 = re.finditer(keyword,str2)

    # 「○令で定める～」にヒットする場所をすべて抽出し、前後の文言が一致しているところを抽出する
    for m1 in match1:
        for m2 in match2:
            pos1 = m1.start()
            pos2 = m2.start()
            pre_position = -1
            # str1とstr2の部分文字列を比較
            for pre in range(pos1+1):
                # 1文字ずつ「令に定める」の前に位置をずらしていき、一致する位置を記憶する
                if str1[pos1-pre:pos1+5] == str2[pos2-pre:pos2+5]:
                    pre_position = pre
                else:
                    break
            if pre_position >= 0:
                for post in range(len1-pos1-5+1):
                    # 1文字ずつ「令に定める」の後に位置をずらしていき、一致する位置を記憶する
                    if str1[pos1-pre_position:pos1+5+post] == str2[pos2-pre_position:pos2+5+post]:
                        # 一致したらリストに追加する
                        common_substrings.append(str1[pos1-pre_position:pos1+5+post])
                    else:
                        break
    # リストの中で最長の単語を抜き出す
    return max(common_substrings,key=len)

In [11]:
# 政令で定める、省令で定める、府令で定める、命令で定める、規則で定めるという記載があるものをマッチング
from anyio import key


reference_text_seirei = reference_text[(reference_text['ref.text'].str.contains('で定める'))&(reference_text['referred.text'].str.contains('で定める'))]
reference_text_seirei['match'] = reference_text_seirei.apply(lambda row:all_common_substrings(row['ref.text'],row['referred.text'],keyword='令で定める'), axis=1)
reference_text_seirei.loc[reference_text_seirei['match']=='','match'] = reference_text_seirei[reference_text_seirei['match']==''].apply(lambda row:all_common_substrings(row['ref.text'],row['referred.text'],keyword='規則で定める'), axis=1)
reference_text_seirei = reference_text_seirei[reference_text_seirei['match']!='']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reference_text_seirei['match'] = reference_text_seirei.apply(lambda row:all_common_substrings(row['ref.text'],row['referred.text'],keyword='令で定める'), axis=1)


In [12]:
len(reference_text_seirei)

44238

In [13]:
# 政省令名が「～を定める○○」というものについては、何かの法令と関連するはずなので紐づける
ref_seirei = ref_seirei[ref_seirei['ref.text'].str.contains('で定める')]
ref_seirei['match'] = ref_seirei.apply(lambda row:all_common_substrings(row['ref.text'],row['referred.text'],keyword='令で定める'), axis=1)
ref_seirei.loc[ref_seirei['match']=='','match'] = ref_seirei[ref_seirei['match']==''].apply(lambda row:all_common_substrings(row['ref.text'],row['referred.text'],keyword='規則で定める'), axis=1)
ref_seirei.loc[ref_seirei['match']=='','match'] = ref_seirei[ref_seirei['match']==''].apply(lambda row:all_common_substrings(row['ref.lawName'],row['referred.text'],keyword='令で定める'), axis=1)
ref_seirei.loc[ref_seirei['match']=='','match'] = ref_seirei[ref_seirei['match']==''].apply(lambda row:all_common_substrings(row['ref.lawName'],row['referred.text'],keyword='規則で定める'), axis=1)
ref_seirei.drop(['ref.lawName','referred.lawName'],axis=1,inplace=True)

In [14]:
len(ref_seirei)

2101

In [15]:
reference_text_seirei = pd.concat([reference_text_seirei, ref_seirei])
reference_text_seirei.drop_duplicates(inplace=True)

In [16]:
len(reference_text_seirei)

44804

In [17]:
l = ['1','2','3']

In [18]:
# ソート用の関数を定義
def convert_to_sortable_number(article):
    if '_' in article:
        # 「_」で分割し、主番号と枝番号を分ける
        nums = article.split('_')
        return float(f"{nums[0]}."+''.join([f'{int(nums[i]):03d}' for i in range(1,len(nums))]))
    else:
        # 「_」がない場合は整数に変換
        return float(article)

In [19]:
# マッチした結果をもとのデータに戻す
reference_text['match'] = reference_text_seirei['match']
reference_text['match'].fillna('★引用個所不明★',inplace=True) # 法令で出てこないであろう★マークで埋める（アプリでは文末に「引用条文」の項目を付加することにする）

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reference_text['match'].fillna('★引用個所不明★',inplace=True) # 法令で出てこないであろう★マークで埋める（アプリでは文末に「引用条文」の項目を付加することにする）


In [20]:
# のちに作成するjsonのデータのためにデータをソートする
sort_row = [
    'ref.lawArticle.article',
    'ref.lawArticle.paragraph',
    'ref.lawArticle.item',
    'referred.lawArticle.article',
    'referred.lawArticle.paragraph',
    'referred.lawArticle.item',
]
# ソートキーを追加
sort_row_key = [
    'referred.lawNum',
    'referred.lawArticle.provision',
    'referred.lawArticle.article_key',
    'referred.lawArticle.paragraph_key',
    'referred.lawArticle.item_key',
    'ref.lawNum',
    'ref.lawArticle.provision',
    'ref.lawArticle.article_key',
    'ref.lawArticle.paragraph_key',
    'ref.lawArticle.item_key',
]
drop_key = [
    'ref.lawArticle.article_key',
    'ref.lawArticle.paragraph_key',
    'ref.lawArticle.item_key',
    'referred.lawArticle.article_key',
    'referred.lawArticle.paragraph_key',
    'referred.lawArticle.item_key',
]
for r in sort_row:
    reference_text[r + '_key'] = reference_text[r].apply(convert_to_sortable_number)
# ソートの実行
reference_text = reference_text.sort_values(by=sort_row_key).reset_index(drop=True)

# ソートキーの削除
reference_text = reference_text.drop(columns=drop_key)

In [21]:
# ネストされたキーを復元する関数
def nest_dict(flat_dict):
    result = {}
    for key, value in flat_dict.items():
        keys = key.split('.')
        d = result
        for k in keys[:-1]:
            d = d.setdefault(k, {})
        d[keys[-1]] = value
    return result

In [22]:
# サブフォルダの存在確認と作成
subfolder = './ref_json'
if not os.path.exists(subfolder):
    os.makedirs(subfolder)

In [23]:
for lawnum in tqdm(reference_text['referred.lawNum'].unique()):
    reference_json = reference_text[reference_text['referred.lawNum']==lawnum].to_json(orient='records')
    reference_dict = json.loads(reference_json)
    # フラット化されたデータを再度ネスト
    nested_data = [nest_dict(d) for d in reference_dict]
    # 再びJSON形式に変換
    with open(f'{subfolder}/{lawnum}.json',mode='w',encoding='utf-8') as f:
        json.dump(nested_data, f, ensure_ascii=False, indent=4)

  0%|          | 0/3779 [00:00<?, ?it/s]