In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import chess.pgn
import re
import sys
import os.path
import pathlib
import logging
from datetime import datetime
import sys, traceback


def get_file_list(local_path):
    """Return the paths of the PGN files found under ``local_path``.

    The directory tree is walked with ``os.walk`` and every file whose name
    matches ``.+pgn$`` (at least one character followed by a trailing
    ``pgn``) is reported, joined to the directory it was actually found in.

    Args:
        local_path: Directory to scan, given as ``str`` or ``pathlib.Path``.

    Returns:
        list[str]: One path per matching file, in ``os.walk`` order. The
        list is empty when the directory does not exist or holds no match.
    """
    pgn_name = re.compile(r'.+pgn$')
    out = []
    for dir_path, _dir_names, file_names in os.walk(str(local_path)):
        for name in file_names:
            if pgn_name.search(name):
                # Join with the directory being visited, not with the root:
                # files found in sub-directories must keep their real path.
                out.append(os.path.join(dir_path, name))
    return out


def get_data(pgn_file):
    """Yield one plain dict per game read from an open PGN file.

    Each dict contains the game's header tags plus a ``"moves"`` key holding
    the moves in SAN, one string per half-move. Only the first variation of
    every node (``variation(0)``) is followed; side lines are ignored.

    Args:
        pgn_file: Open text handle of a PGN file. It is consumed with
            ``chess.pgn.read_game`` until that call returns ``None``.

    Yields:
        dict: header tag -> header value, and ``"moves"`` -> list of str.
    """
    # Raw string: "\{" and "\}" are invalid escapes in a normal literal.
    comment = re.compile(r"\{.*?\}")

    game = chess.pgn.read_game(pgn_file)
    while game is not None:
        # Copy the headers into a plain dict instead of storing the move
        # list inside the game's own headers object.
        record = {key: game.headers.get(key) for key in game.headers.keys()}

        # Keep one running board and push each move onto it, rather than
        # asking every node to rebuild its position through node.board().
        board = game.board()
        moves = []
        node = game
        while node.variations:
            node = node.variation(0)
            # SAN text is not expected to contain "{...}" comments; the
            # substitution is kept from the original code as a safeguard.
            moves.append(comment.sub("", board.san(node.move)))
            board.push(node.move)
        record["moves"] = moves

        # log(data.get('Event'))
        yield record
        # Read the next game only after the current one has been handed
        # over, so a parse error there cannot lose a finished game.
        game = chess.pgn.read_game(pgn_file)


def convert_file(file_path):
    """Convert one PGN file into a JSON file containing a list of games.

    The games are read with ``get_data`` and written as a single JSON array
    to ``<out_dir>/<stem>.json``, where ``out_dir`` is the module-level
    output directory and ``<stem>`` is the input file name without its
    final extension. Progress and errors are reported through the
    module-level ``log`` callable.

    Args:
        file_path: ``pathlib.Path`` of the PGN file to convert.

    Returns:
        None. Any exception raised while reading or writing is logged
        (traceback limited to 10 frames) and not re-raised.
    """
    # .stem drops only the final extension; str.replace(suffix, '') would
    # also remove the same text if it appeared earlier in the name.
    file_name = file_path.stem + '.json'
    log('convert file '+file_path.name)
    out_list = []
    try:
        # Parse everything first so a failing input does not leave behind a
        # freshly truncated, empty output file.
        with open(str(file_path), encoding='ISO-8859-1') as pgn_file:
            for count_d, data in enumerate(get_data(pgn_file), start=0):
                log(file_path.name+' '+str(count_d))
                out_list.append(data)

        log(' save '+file_path.name)
        with open(os.path.join(out_dir, file_name), 'w') as json_file:
            json.dump(out_list, json_file)
        log('done')
    except Exception:
        # Best-effort conversion: report the failure and carry on.
        log(traceback.format_exc(10))
        log('ERROR file '+file_name+' not converted')


def create_join_file(file_list):
    """Write the games of several PGN files into one JSON array on disk.

    The output file is ``join_data.json`` inside the module-level
    ``out_dir``; an existing file is overwritten. Games are serialised one
    at a time, so the complete list is never held in memory.

    Args:
        file_list: Iterable of PGN file paths, processed in order.

    Returns:
        None.
    """
    log(' create_join_file ')
    name = os.path.join(out_dir, 'join_data.json')
    # Tracks whether an element was already written. Deriving the comma
    # from the file/game counters breaks when an earlier file has no games
    # (the array would then start with ',').
    wrote_game = False
    with open(name, 'w') as json_file:
        json_file.write('[')
        for count_f, file in enumerate(file_list, start=0):
            with open(file, encoding='ISO-8859-1') as pgn_file:
                for count_d, data in enumerate(get_data(pgn_file), start=0):
                    log(str(count_f)+' '+str(count_d))
                    if wrote_game:
                        json_file.write(',')
                    json_file.write(json.dumps(data))
                    wrote_game = True
            log(pathlib.Path(file).name)
        json_file.write(']')

In [8]:
# Input and output share one directory: converted files are written to the
# same folder the PGN files are read from.
inp_dir = out_dir = r'F:\db scacchi 5'

# Progress messages are emitted through the root logger's error() method
# (presumably so they show up without configuring logging -- TODO confirm).
log = logging.getLogger().error

In [11]:
# Display the PGN paths that get_file_list() finds for the input directory.
get_file_list(inp_dir)

['F:\\db scacchi 5\\New Database_1000_1979.pgn']

In [10]:
# Convert the first PGN file found in inp_dir into a series of JSON files
# (join_data0.json, join_data1.json, ...) of at most n_split_chunk games
# each, logging the time spent on every chunk.
file_list = get_file_list(inp_dir)
is_join = True  # NOTE(review): not read by any of the visible cells
n_split_chunk = 10000  # maximum number of games per output file

if not file_list:
    # Fail with a readable message instead of an IndexError on file_list[0].
    raise FileNotFoundError('no PGN file found in ' + str(inp_dir))
file = file_list[0]  # only the first file of the list is converted


def _open_chunk(index):
    """Create (or truncate) chunk file ``index`` and start its JSON array."""
    chunk_name = os.path.join(out_dir, 'join_data' + str(index) + '.json')
    chunk_file = open(chunk_name, 'w')
    chunk_file.write('[')
    return chunk_name, chunk_file


count_n_split_chunk = 0
start_time = datetime.now()
name, json_file = _open_chunk(count_n_split_chunk)
try:
    with open(file, encoding='ISO-8859-1') as pgn_file:
        for count_d, data in enumerate(get_data(pgn_file), start=0):
            #log(str(count_f)+' '+str(count_d))
            if count_d != 0 and count_d % n_split_chunk == 0:
                # The current chunk is full: finish it, report its
                # duration and continue in the next chunk file.
                json_file.write(']')
                json_file.close()
                end_time = datetime.now()
                log('time '+str(end_time-start_time))

                count_n_split_chunk = count_n_split_chunk + 1
                start_time = datetime.now()
                name, json_file = _open_chunk(count_n_split_chunk)
            if count_d % n_split_chunk != 0:
                # Every game except the first of a chunk follows a comma.
                json_file.write(',')
            data_str = json.dumps(data)
            json_file.write(data_str)
    #log(pathlib.Path(file).name)
    json_file.write(']')
finally:
    # Runs on success and on failure, so an interrupted conversion does not
    # leave the output handle open; closing twice is harmless.
    json_file.close()
end_time = datetime.now()
log('time '+str(end_time-start_time))




time 0:04:47.948214
time 0:05:00.515481
time 0:04:51.011954
time 0:04:34.145368
time 0:04:10.481625
time 0:03:59.679248
time 0:03:51.273048
time 0:04:35.623203
time 0:05:10.693444
time 0:04:41.777706
time 0:04:49.710414
time 0:04:48.570858
time 0:04:57.713813
time 0:04:55.561790
time 0:05:03.127342
time 0:05:08.816302
time 0:04:50.110428
time 0:05:03.947343
time 0:05:30.225674
time 0:05:06.749757
time 0:05:26.385105
time 0:05:04.857902
time 0:05:07.028311
time 0:04:45.294336
time 0:05:00.376934
time 0:04:52.413672
time 0:04:50.249478
time 0:04:42.219607
time 0:04:46.637386
time 0:04:50.392673
time 0:05:01.708344
time 0:05:01.954711
time 0:04:05.568159
time 0:04:45.064951
time 0:04:48.597515
time 0:04:43.501112
time 0:04:55.741361
time 0:04:44.704917
time 0:04:47.879409
time 0:05:04.196704
time 0:04:54.862689
time 0:02:29.693783


In [6]:

# Manual cleanup: close the JSON output handle held in `json_file`
# (presumably needed after an interrupted conversion run -- TODO confirm).
# Calling close() on an already closed file does nothing.
json_file.close()