In [9]:
import glob
import json
import os
import time
import urllib.error
import urllib.parse
import urllib.request

import pandas as pd

In [10]:
# Date range sent to the API as `from` / `until` (written as YYYY-MM-DD).
start, end = '2020-01-01', '2020-12-31'

# Meeting names to keep and chambers to query; both lists are passed to
# create_url, and `meetings` is also used to filter the downloaded records.
meetings = [
    '本会議',
    '経済産業委員会',
]
houses = [
    '参議院',
    '衆議院',
]

In [11]:
def create_url(start, end, meetings, houses, start_record=1):
    '''
    Build the request URL for the kokkai.ndl.go.jp meeting API.

    start, end: values sent as the `from` / `until` parameters
    meetings: names joined with a single space into `nameOfMeeting`
    houses: names joined with a single space into `nameOfHouse`
    start_record: value sent as `startRecord` (defaults to 1)

    Returns the full URL as a string; `recordPacking` is always `json`.
    '''
    params = {
        'from': start,
        'until': end,
        'nameOfMeeting': ' '.join(meetings),
        'nameOfHouse': ' '.join(houses),
        'startRecord': start_record,
        'recordPacking': 'json',
    }
    # Escape each value on its own so the `=` and `&` delimiters stay literal;
    # quoting the already-assembled query string would escape them as well.
    # quote_via=quote keeps spaces as %20 (the urlencode default emits `+`).
    query = urllib.parse.urlencode(params, quote_via=urllib.parse.quote)
    return 'https://kokkai.ndl.go.jp/api/meeting?{}'.format(query)

In [12]:
def record_nums(json_data):
    '''
    Extract the paging header fields from a decoded API response.

    Returns a 4-tuple:
    numberOfRecords: total number of matching records
    numberOfReturn: number of records returned in this response
    startRecord: position of the first record in this response
    nextRecordPosition: start position of the next page, or None when the
                        response does not carry one (it is only present
                        when a next page exists)
    '''
    return json_data['numberOfRecords'], \
           json_data['numberOfReturn'], \
           json_data['startRecord'], \
           json_data.get('nextRecordPosition')  # optional key: absent -> None

In [13]:
# Download every result page for the configured query and store each meeting
# record as its own JSON file under ./input.

# open(..., 'w') does not create missing directories, so make sure it exists.
os.makedirs('./input', exist_ok=True)

next_position = 1  # startRecord of the first page
try:
    while next_position is not None:
        # Request one page and decode the JSON body.
        url = create_url(start, end, meetings, houses, next_position)
        res = urllib.request.Request(url)
        with urllib.request.urlopen(res) as r:
            json_data = json.load(r)

        # Paging header of this page; next_position is None on the last page.
        records_num, return_num, start_record, next_position = record_nums(json_data)
        print('records_num:{} startRecord:{}'.format(records_num, start_record))

        # Save the page that was just fetched before deciding whether to go
        # on, so the final page (or a single-page result) is written as well.
        for content in json_data.get('meetingRecord', []):
            # The search can return meetings other than the requested ones;
            # skip those.
            if content['nameOfMeeting'] not in meetings:
                continue

            # Use a different file name pattern for each chamber.
            file_path = './input/{}_sanin_{}.json' if content['nameOfHouse'] == '参議院' \
                   else './input/{}_syuin_{}.json'
            file_path = file_path.format(content['date'], content['nameOfMeeting'])
            # Store the whole meeting record as JSON.
            with open(file_path, 'w') as f:
                json.dump(content, f)

        # Pause between consecutive API requests, whether or not anything
        # on this page was saved.
        if next_position is not None:
            time.sleep(2)

    print('done!')

except urllib.error.HTTPError as err:
    print(err.code)
except urllib.error.URLError as err:
    print(err.reason)

records_num:120 startRecord:1
records_num:120 startRecord:4
records_num:120 startRecord:7
records_num:120 startRecord:10
records_num:120 startRecord:13
records_num:120 startRecord:16
records_num:120 startRecord:19
records_num:120 startRecord:22
records_num:120 startRecord:25
records_num:120 startRecord:28
records_num:120 startRecord:31
records_num:120 startRecord:34
records_num:120 startRecord:37
records_num:120 startRecord:40
records_num:120 startRecord:43
records_num:120 startRecord:46
records_num:120 startRecord:49
records_num:120 startRecord:52
records_num:120 startRecord:55
records_num:120 startRecord:58
records_num:120 startRecord:61
records_num:120 startRecord:64
records_num:120 startRecord:67
records_num:120 startRecord:70
records_num:120 startRecord:73
records_num:120 startRecord:76
records_num:120 startRecord:79
records_num:120 startRecord:82
records_num:120 startRecord:85
records_num:120 startRecord:88
records_num:120 startRecord:91
records_num:120 startRecord:94
records_num

In [14]:
# Collect the saved meeting files. glob returns paths in arbitrary order, so
# sort them to keep the row order of the resulting table reproducible.
file_paths = sorted(glob.glob('./input/*.json'))

In [15]:
# Flatten the saved meeting files into one row per speech:
# [date, house, meeting, speech order, speech text].
kokkai_list = []
for file_path in file_paths:
    with open(file_path, 'r') as f:
        json_data = json.load(f)

    # Meeting-level fields are repeated on every speech row of this file.
    date, house, meeting = json_data['date'], json_data['nameOfHouse'], json_data['nameOfMeeting']
    texts = [
        [date, house, meeting, speech['speechOrder'], speech['speech']]
        for speech in json_data['speechRecord']
    ]
    kokkai_list += texts

In [16]:
# Tabulate the speech rows and write them to CSV without the index column.
kokkai_columns = ['date', 'house', 'meeting', 'speech_order', 'text']
df_kokkai = pd.DataFrame(data=kokkai_list, columns=kokkai_columns)
df_kokkai.to_csv(path_or_buf='./data/kokkai.csv', index=False)