In [1]:
import chess
import chess.pgn
import chess.polyglot
import os
import sys
import time
import pickle
import numpy as np
import pandas as pd
from datetime import datetime

# Root folder with the Lichess PGN dumps. Paths below are built by plain string
# concatenation, so the value has to end with a path separator; the raw literal
# ends in two backslash characters because a raw string cannot end in a single one.
data_path = r'E:\Projects\Chess\Lichess_data\\'
# Second data location; not referenced by any of the cells shown here.
data_path2 = r'i:\Lichess\\'

In [2]:
def parse(games, path, name):
    """Export the given games to a PGN file and tabulate their headers.

    Parameters
    ----------
    games : iterable of chess.pgn.Game
        Games to export. Each one must carry the headers ``WhiteElo``,
        ``BlackElo``, ``UTCDate`` (``%Y.%m.%d``), ``ECO``, ``Opening``,
        ``TimeControl``, ``Termination`` and ``Result``.
    path : str
        Output folder, ending with a path separator. The sub-folders ``pgn``
        and ``pickle`` are created inside it when missing.
    name : str
        File name used for the PGN copy; the pickle is stored as
        ``name + '.pickle'``.

    Returns
    -------
    pandas.DataFrame
        One row per game with the columns ``ELO_white``, ``ELO_black``,
        ``Result``, ``ECO``, ``Opening``, ``UTCDate``, ``TimeControl`` and
        ``Termination``. The ELO values are kept exactly as found in the
        headers (strings).

    Raises
    ------
    KeyError
        If a game lacks one of the required headers.
    ValueError
        If ``UTCDate`` does not match ``%Y.%m.%d``.
    """
    # Create the output folders, including any missing parent folders.
    os.makedirs(path+'pgn', exist_ok=True)
    os.makedirs(path+'pickle', exist_ok=True)

    # One list per output column; dict order defines the column order.
    columns = {
        'ELO_white': [],
        'ELO_black': [],
        'Result': [],
        'ECO': [],
        'Opening': [],
        'UTCDate': [],
        'TimeControl': [],
        'Termination': [],
    }
    # White's score. Every result string not listed here (a black win, but
    # also e.g. an unfinished '*') is recorded as 0.
    white_score = {'1/2-1/2': 0.5, '1-0': 1}

    # The ``with`` block guarantees the exported PGN is flushed and closed,
    # even when a game with a missing header aborts the loop.
    with open(path+'pgn\\'+name, "w", encoding="utf-8") as new_pgn:
        exporter = chess.pgn.FileExporter(new_pgn)
        for g in games:
            headers = g.headers
            columns['ELO_white'].append(headers['WhiteElo'])
            columns['ELO_black'].append(headers['BlackElo'])
            columns['UTCDate'].append(datetime.strptime(headers['UTCDate'], '%Y.%m.%d'))
            columns['ECO'].append(headers['ECO'])
            columns['Opening'].append(headers['Opening'])
            columns['TimeControl'].append(headers['TimeControl'])
            columns['Termination'].append(headers['Termination'])
            columns['Result'].append(white_score.get(headers['Result'], 0))
            g.accept(exporter)

    data = pd.DataFrame(columns)
    # Persist the table next to the exported PGN.
    data.to_pickle(path+'pickle\\'+name+'.pickle')
    return data

In [3]:
def load_games(path_to_file, name, n_games=sys.maxsize):
    """Return the header table for one PGN file, building it on first use.

    If ``<path_to_file>stockfish\\pickle\\<name>.pickle`` already exists it is
    loaded and returned. Otherwise the PGN ``path_to_file + name`` is read,
    the games whose main line exists and whose first move has an evaluation
    (``game.variation(0).eval()`` is not None) are kept, and they are handed
    to ``parse``, which writes the PGN/pickle copies and returns the table.

    Parameters
    ----------
    path_to_file : str
        Folder with the source PGN, ending with a path separator.
    name : str
        PGN file name.
    n_games : int, optional
        Maximum number of games to *read* from the file (kept and skipped
        games both count). Defaults to no practical limit.

    Returns
    -------
    pandas.DataFrame
        The cached or freshly parsed table.
    """
    main_dir = 'stockfish\\'
    cache_file = path_to_file+main_dir+'pickle\\'+name+'.pickle'
    # Reuse the saved result when there is one.
    # NOTE: unpickling can execute arbitrary code; only load caches that this
    # notebook produced itself.
    if os.path.exists(cache_file):
        return pd.read_pickle(cache_file)

    # No cache: read the games. The encoding is given explicitly so the result
    # does not depend on the machine's locale (the PGN copies in this notebook
    # are written as UTF-8 as well).
    games = []
    with open(path_to_file+name, encoding="utf-8") as data:
        count = 0
        while count < n_games:
            game = chess.pgn.read_game(data)
            if game is None:
                break
            if game.variations and game.variation(0).eval() is not None:
                games.append(game)
            count += 1

    # Export the kept games and build the table.
    return parse(games, path_to_file+main_dir, name)

In [4]:
# Names of all PGN dumps found directly in the data folder.
lichess_pgns = [
    entry for entry in os.listdir(data_path)
    if entry.endswith('.pgn')
]
lichess_pgns

['lichess_db_standard_rated_2014-08.pgn',
 'lichess_db_standard_rated_2014-09.pgn',
 'lichess_db_standard_rated_2014-10.pgn',
 'lichess_db_standard_rated_2014-11.pgn',
 'lichess_db_standard_rated_2015-01.pgn',
 'lichess_db_standard_rated_2015-02.pgn',
 'lichess_db_standard_rated_2015-03.pgn',
 'lichess_db_standard_rated_2015-04.pgn',
 'lichess_db_standard_rated_2015-05.pgn',
 'lichess_db_standard_rated_2015-06.pgn',
 'lichess_db_standard_rated_2015-07.pgn',
 'lichess_db_standard_rated_2015-08.pgn',
 'lichess_db_standard_rated_2015-09.pgn',
 'lichess_db_standard_rated_2015-10.pgn',
 'lichess_db_standard_rated_2015-11.pgn',
 'lichess_db_standard_rated_2015-12.pgn',
 'lichess_db_standard_rated_2016-01.pgn',
 'lichess_db_standard_rated_2016-02.pgn',
 'lichess_db_standard_rated_2016-03.pgn',
 'lichess_db_standard_rated_2016-04.pgn',
 'lichess_db_standard_rated_2016-06.pgn']

In [5]:
# Parse (or load from cache) every dump and print the seconds each one took.
for pgn_name in lichess_pgns:
    started_at = time.time()
    load_games(data_path, pgn_name)
    elapsed = time.time() - started_at
    print(elapsed)

0.14399981498718262
0.0690000057220459
0.09099984169006348
0.14099979400634766
0.20900225639343262
0.167999267578125
0.17399883270263672
0.1529998779296875
0.19100093841552734
0.24200034141540527
0.3189988136291504
0.3392155170440674
0.36899590492248535
0.4649999141693115
0.3900008201599121
0.41899895668029785
0.6009962558746338
0.5600008964538574
0.6370198726654053
0.5839982032775879
0.5905046463012695


In [6]:
def filter_pickle(read_path, name, write_path=data_path, n_games=sys.maxsize):
    """Keep the games where at least one player is rated above 2400.

    The result is cached in ``<write_path>stockfish\\top_pickle\\<name>``; when
    that file already exists it is loaded and returned without reprocessing.

    Parameters
    ----------
    read_path : str
        Folder (ending with a path separator) holding the pickled table.
    name : str
        Pickle file name; also used for the output file.
    write_path : str, optional
        Base folder for the output; ``stockfish\\top_pickle`` is appended.
    n_games : int, optional
        Unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Rows without missing values and with ``ELO_white > 2400`` or
        ``ELO_black > 2400``. ``TimeControl`` is replaced by one of
        ``'bullet'``, ``'blitz'``, ``'rapid'``, ``'classical'`` and
        ``old_index`` holds each row's position in the input table.
    """
    write_path += 'stockfish\\top_pickle'
    # Create the output folder, including any missing parent folders.
    os.makedirs(write_path, exist_ok=True)
    target = write_path+'\\'+name
    # Reuse the already processed file when there is one.
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # notebook produced itself.
    if os.path.exists(target):
        return pd.read_pickle(target)

    parsed_data = pd.read_pickle(read_path+name)
    # Remember the original positions; they select the games in the PGN file.
    parsed_data['old_index'] = parsed_data.index

    # Turn the placeholders '-' and '?' into real missing values and drop those
    # rows. np.nan is used on purpose: with ``None`` as the replacement value,
    # older pandas versions forward-fill the previous row instead of inserting
    # a missing value, so nothing would be dropped.
    parsed_data['TimeControl'] = parsed_data['TimeControl'].replace('-', np.nan)
    parsed_data = parsed_data.replace('?', np.nan).dropna()

    # Ratings are stored as text; convert them to numbers.
    parsed_data['ELO_white'] = parsed_data['ELO_white'].astype('float')
    parsed_data['ELO_black'] = parsed_data['ELO_black'].astype('float')

    # Keep the games with at least one player rated above 2400.
    strong = (parsed_data['ELO_white'] > 2400) | (parsed_data['ELO_black'] > 2400)
    parsed_data = parsed_data[strong].reset_index(drop=True)

    # Categorise by the base time, i.e. the number before '+' in 'base+increment'.
    base_time = (
        parsed_data['TimeControl'].astype(str)
        .str.split('+').str[0]
        .astype(int)
        .to_numpy()
    )
    parsed_data['TimeControl'] = np.select(
        [base_time < 180, base_time < 600, base_time < 1800],
        ['bullet', 'blitz', 'rapid'],
        default='classical',
    )

    # Save the result.
    parsed_data.to_pickle(target)
    return parsed_data

In [7]:
# Pickled header tables produced by the parsing step.
read_pickle_path = data_path+'stockfish\\pickle\\'
filtered_pickle = [
    entry for entry in os.listdir(read_pickle_path)
    if entry.endswith('.pickle')
]
filtered_pickle

['lichess_db_standard_rated_2013-01.pgn.pickle',
 'lichess_db_standard_rated_2013-02.pgn.pickle',
 'lichess_db_standard_rated_2013-03.pgn.pickle',
 'lichess_db_standard_rated_2013-04.pgn.pickle',
 'lichess_db_standard_rated_2013-05.pgn.pickle',
 'lichess_db_standard_rated_2013-06.pgn.pickle',
 'lichess_db_standard_rated_2013-07.pgn.pickle',
 'lichess_db_standard_rated_2013-08.pgn.pickle',
 'lichess_db_standard_rated_2013-09.pgn.pickle',
 'lichess_db_standard_rated_2013-10.pgn.pickle',
 'lichess_db_standard_rated_2013-11.pgn.pickle',
 'lichess_db_standard_rated_2013-12.pgn.pickle',
 'lichess_db_standard_rated_2014-01.pgn.pickle',
 'lichess_db_standard_rated_2014-02.pgn.pickle',
 'lichess_db_standard_rated_2014-03.pgn.pickle',
 'lichess_db_standard_rated_2014-04.pgn.pickle',
 'lichess_db_standard_rated_2014-05.pgn.pickle',
 'lichess_db_standard_rated_2014-06.pgn.pickle',
 'lichess_db_standard_rated_2014-07.pgn.pickle',
 'lichess_db_standard_rated_2014-08.pgn.pickle',
 'lichess_db_standar

In [8]:
# Filter every table, collecting the results in listing order, and print the
# seconds each file took.
top_pickle = []
for pickle_name in filtered_pickle:
    started_at = time.time()
    filtered_frame = filter_pickle(read_pickle_path, pickle_name)
    top_pickle.append(filtered_frame)
    elapsed = time.time() - started_at
    print(elapsed)

0.023998737335205078
0.010998010635375977
0.005000114440917969
0.004000425338745117
0.00700068473815918
0.008001089096069336
0.0019979476928710938
0.011001348495483398
0.00500035285949707
0.011999845504760742
0.006000041961669922
0.009999990463256836
0.013000011444091797
0.008998870849609375
0.008002519607543945
0.012998580932617188
0.012999534606933594
0.008000612258911133
0.006000041961669922
0.008999824523925781
0.006000041961669922
0.004000186920166016
0.016000032424926758
0.012001514434814453
0.010998249053955078
0.01400136947631836
0.013998746871948242
0.01099848747253418
0.009001493453979492
0.013001441955566406
0.010997295379638672
0.01000070571899414
0.01900029182434082
0.012998104095458984
0.01399993896484375
0.031000375747680664
0.014000654220581055
0.024001359939575195
0.021998882293701172
0.009002923965454102


In [9]:
def filter_pgn2(read_path, name, indexes, write_path=data_path):
    """Copy the selected games of a PGN file into ``stockfish\\top_pgn``.

    Games are numbered from 0 in file order; a game is written to
    ``<write_path>stockfish\\top_pgn\\<name>`` when its number is in
    ``indexes``. Nothing is done when the output file already exists.

    Parameters
    ----------
    read_path : str
        Folder (ending with a path separator) holding the source PGN.
    name : str
        PGN file name; also used for the output file.
    indexes : iterable of int
        Positions of the games to keep.
    write_path : str, optional
        Base folder for the output; ``stockfish\\top_pgn`` is appended.

    Returns
    -------
    None
    """
    write_path += 'stockfish\\top_pgn'
    # Create the output folder, including any missing parent folders.
    os.makedirs(write_path, exist_ok=True)
    target = write_path+'\\'+name
    # Skip files that have already been processed.
    if os.path.exists(target):
        return
    # A set makes the per-game membership test constant-time.
    wanted = set(indexes)
    try:
        # The source is opened first so that a missing input does not leave an
        # empty output behind. Both files are UTF-8: reading with the locale
        # default and re-reading from the start after a decoding error would
        # write the games exported before the error a second time.
        with open(read_path+name, encoding="utf-8") as f, \
                open(target, "w", encoding="utf-8") as new_pgn:
            exporter = chess.pgn.FileExporter(new_pgn)
            i = 0
            game = chess.pgn.read_game(f)
            while game is not None:
                if i in wanted:
                    game.accept(exporter)
                i += 1
                game = chess.pgn.read_game(f)
    except BaseException:
        # A partial output would be mistaken for a finished one on the next
        # run, so remove it before passing the error (or interrupt) on.
        if os.path.exists(target):
            os.remove(target)
        raise
    return

In [10]:
# PGN copies written by the parsing step.
read_pgn_path = data_path+'stockfish\\pgn\\'
filtered_pgns = [
    entry for entry in os.listdir(read_pgn_path)
    if entry.endswith('.pgn')
]
filtered_pgns

['lichess_db_standard_rated_2013-01.pgn',
 'lichess_db_standard_rated_2013-02.pgn',
 'lichess_db_standard_rated_2013-03.pgn',
 'lichess_db_standard_rated_2013-04.pgn',
 'lichess_db_standard_rated_2013-05.pgn',
 'lichess_db_standard_rated_2013-06.pgn',
 'lichess_db_standard_rated_2013-07.pgn',
 'lichess_db_standard_rated_2013-08.pgn',
 'lichess_db_standard_rated_2013-09.pgn',
 'lichess_db_standard_rated_2013-10.pgn',
 'lichess_db_standard_rated_2013-11.pgn',
 'lichess_db_standard_rated_2013-12.pgn',
 'lichess_db_standard_rated_2014-01.pgn',
 'lichess_db_standard_rated_2014-02.pgn',
 'lichess_db_standard_rated_2014-03.pgn',
 'lichess_db_standard_rated_2014-04.pgn',
 'lichess_db_standard_rated_2014-05.pgn',
 'lichess_db_standard_rated_2014-06.pgn',
 'lichess_db_standard_rated_2014-07.pgn',
 'lichess_db_standard_rated_2014-08.pgn',
 'lichess_db_standard_rated_2014-09.pgn',
 'lichess_db_standard_rated_2014-10.pgn',
 'lichess_db_standard_rated_2014-11.pgn',
 'lichess_db_standard_rated_2015-0

In [11]:
# Pair each PGN file with the table filtered from its own pickle by file name
# ('<pgn name>.pickle') rather than by list position: the two lists come from
# separate directory listings, so their order and length need not agree.
# A PGN without a matching table raises KeyError instead of silently using
# the indexes of another file.
top_pickle_by_name = dict(zip(filtered_pickle, top_pickle))
for name in filtered_pgns:
    start = time.time()
    kept_games = top_pickle_by_name[name+'.pickle']
    filter_pgn2(read_pgn_path, name, list(kept_games.old_index.values))
    print(time.time()-start)

0.0010008811950683594
0.0
0.0
0.0
0.0010001659393310547
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0010013580322265625
0.0
0.0
0.0
0.0009989738464355469
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0009999275207519531
0.0
0.0
1456.0792078971863
0.0010020732879638672
0.0
0.0009996891021728516
0.0
0.00099945068359375
0.0


In [12]:
def filter_pickle2(read_path, name, write_path=data_path, n_games=sys.maxsize):
    """Keep the games where both players are rated above 2400.

    The result is cached in ``<write_path>stockfish\\top_both_pickle\\<name>``;
    when that file already exists it is loaded and returned as is. Otherwise
    the table ``read_path + name`` is loaded, ``old_index`` is set to each
    row's position in that table, the rows with ``ELO_white > 2400`` and
    ``ELO_black > 2400`` are kept, and the result is saved and returned.
    ``n_games`` is not used.
    """
    write_path += 'stockfish\\top_both_pickle'
    target = write_path+'\\'+name
    # Create the output folder if it is missing.
    if not os.path.isdir(write_path):
        os.mkdir(write_path)
    # Reuse the already processed file when there is one.
    if os.path.exists(target):
        with open(target, 'rb') as cached:
            return pd.read_pickle(cached)
    # Load the input table.
    with open(read_path+name, 'rb') as source:
        frame = pd.read_pickle(source)
    # Remember the positions; they select the games in the PGN file.
    frame['old_index'] = frame.index
    # Keep only the games where both ratings exceed 2400.
    both_strong = (frame.ELO_white > 2400) & (frame.ELO_black > 2400)
    frame = frame[both_strong].reset_index().drop(columns='index')
    # Save the result.
    frame.to_pickle(target)
    return frame

In [13]:
# Tables written by filter_pickle (at least one player above 2400).
read_pickle_path = data_path+'stockfish\\top_pickle\\'
top_pickles = [
    entry for entry in os.listdir(read_pickle_path)
    if entry.endswith('.pickle')
]
top_pickles

['lichess_db_standard_rated_2013-01.pgn.pickle',
 'lichess_db_standard_rated_2013-02.pgn.pickle',
 'lichess_db_standard_rated_2013-03.pgn.pickle',
 'lichess_db_standard_rated_2013-04.pgn.pickle',
 'lichess_db_standard_rated_2013-05.pgn.pickle',
 'lichess_db_standard_rated_2013-06.pgn.pickle',
 'lichess_db_standard_rated_2013-07.pgn.pickle',
 'lichess_db_standard_rated_2013-08.pgn.pickle',
 'lichess_db_standard_rated_2013-09.pgn.pickle',
 'lichess_db_standard_rated_2013-10.pgn.pickle',
 'lichess_db_standard_rated_2013-11.pgn.pickle',
 'lichess_db_standard_rated_2013-12.pgn.pickle',
 'lichess_db_standard_rated_2014-01.pgn.pickle',
 'lichess_db_standard_rated_2014-02.pgn.pickle',
 'lichess_db_standard_rated_2014-03.pgn.pickle',
 'lichess_db_standard_rated_2014-04.pgn.pickle',
 'lichess_db_standard_rated_2014-05.pgn.pickle',
 'lichess_db_standard_rated_2014-06.pgn.pickle',
 'lichess_db_standard_rated_2014-07.pgn.pickle',
 'lichess_db_standard_rated_2014-08.pgn.pickle',
 'lichess_db_standar

In [14]:
# Apply the stricter filter to every table, collecting the results in listing
# order, and print the seconds each file took.
top_top_pickle = []
for pickle_name in top_pickles:
    started_at = time.time()
    both_strong_frame = filter_pickle2(read_pickle_path, pickle_name)
    top_top_pickle.append(both_strong_frame)
    elapsed = time.time() - started_at
    print(elapsed)

0.017302274703979492
0.03036355972290039
0.010843992233276367
0.006001472473144531
0.00545954704284668
0.004570722579956055
0.010015726089477539
0.006211519241333008
0.006531953811645508
0.0030100345611572266
0.009000539779663086
0.0030012130737304688
0.005974292755126953
0.029036760330200195
0.00917816162109375
0.011031150817871094
0.0062983036041259766
0.002987384796142578
0.01100015640258789
0.001999378204345703
0.008475065231323242
0.009252309799194336
0.009000778198242188
0.01274418830871582
0.010300874710083008
0.006014823913574219
0.012000083923339844
0.012144327163696289
0.004999399185180664
0.009001493453979492
0.013025522232055664
0.006242275238037109
0.017007827758789062
0.011028766632080078
0.018821001052856445
0.02935314178466797
0.01803112030029297
0.017158031463623047
0.01502847671508789
0.013576745986938477


In [15]:
def filter_pgn3(read_path, name, indexes, write_path=data_path):
    """Copy the selected games of a PGN file into ``stockfish\\top_both_pgn``.

    Games are numbered from 0 in file order; a game is written to
    ``<write_path>stockfish\\top_both_pgn\\<name>`` when its number is in
    ``indexes``. Nothing is done when the output file already exists.

    Parameters
    ----------
    read_path : str
        Folder (ending with a path separator) holding the source PGN.
    name : str
        PGN file name; also used for the output file.
    indexes : iterable of int
        Positions of the games to keep.
    write_path : str, optional
        Base folder for the output; ``stockfish\\top_both_pgn`` is appended.

    Returns
    -------
    None
    """
    write_path += 'stockfish\\top_both_pgn'
    # Create the output folder, including any missing parent folders.
    os.makedirs(write_path, exist_ok=True)
    target = write_path+'\\'+name
    # Skip files that have already been processed.
    if os.path.exists(target):
        return
    # A set makes the per-game membership test constant-time.
    wanted = set(indexes)
    try:
        # The source is opened first so that a missing input does not leave an
        # empty output behind. Both files are UTF-8: reading with the locale
        # default and re-reading from the start after a decoding error would
        # write the games exported before the error a second time.
        with open(read_path+name, encoding="utf-8") as f, \
                open(target, "w", encoding="utf-8") as new_pgn:
            exporter = chess.pgn.FileExporter(new_pgn)
            i = 0
            game = chess.pgn.read_game(f)
            while game is not None:
                if i in wanted:
                    game.accept(exporter)
                i += 1
                game = chess.pgn.read_game(f)
    except BaseException:
        # A partial output would be mistaken for a finished one on the next
        # run, so remove it before passing the error (or interrupt) on.
        if os.path.exists(target):
            os.remove(target)
        raise
    return

In [16]:
# PGN files written by filter_pgn2 (at least one player above 2400).
read_pgn_path = data_path+'stockfish\\top_pgn\\'
filtered_top_pgns = [
    entry for entry in os.listdir(read_pgn_path)
    if entry.endswith('.pgn')
]
filtered_top_pgns

['lichess_db_standard_rated_2013-01.pgn',
 'lichess_db_standard_rated_2013-02.pgn',
 'lichess_db_standard_rated_2013-03.pgn',
 'lichess_db_standard_rated_2013-04.pgn',
 'lichess_db_standard_rated_2013-05.pgn',
 'lichess_db_standard_rated_2013-06.pgn',
 'lichess_db_standard_rated_2013-07.pgn',
 'lichess_db_standard_rated_2013-08.pgn',
 'lichess_db_standard_rated_2013-09.pgn',
 'lichess_db_standard_rated_2013-10.pgn',
 'lichess_db_standard_rated_2013-11.pgn',
 'lichess_db_standard_rated_2013-12.pgn',
 'lichess_db_standard_rated_2014-01.pgn',
 'lichess_db_standard_rated_2014-02.pgn',
 'lichess_db_standard_rated_2014-03.pgn',
 'lichess_db_standard_rated_2014-04.pgn',
 'lichess_db_standard_rated_2014-05.pgn',
 'lichess_db_standard_rated_2014-06.pgn',
 'lichess_db_standard_rated_2014-07.pgn',
 'lichess_db_standard_rated_2014-08.pgn',
 'lichess_db_standard_rated_2014-09.pgn',
 'lichess_db_standard_rated_2014-10.pgn',
 'lichess_db_standard_rated_2014-11.pgn',
 'lichess_db_standard_rated_2015-0

In [17]:
# Pair each PGN file with the table filtered from its own pickle by file name
# ('<pgn name>.pickle') rather than by list position: the two lists come from
# separate directory listings, so their order and length need not agree.
# A PGN without a matching table raises KeyError instead of silently using
# the indexes of another file.
top_top_pickle_by_name = dict(zip(top_pickles, top_top_pickle))
for name in filtered_top_pgns:
    start = time.time()
    kept_games = top_top_pickle_by_name[name+'.pickle']
    filter_pgn3(read_pgn_path, name, list(kept_games.old_index.values))
    print(time.time()-start)

0.0
0.0
0.0
0.0009963512420654297
0.0
0.0
0.0
0.0009999275207519531
0.0
0.0
0.0
0.0010001659393310547
0.0
0.0
0.0
0.0010004043579101562
0.0
0.0
0.0009987354278564453
0.0
0.0
0.0
0.0010008811950683594
0.0
0.0
0.0
0.001001119613647461
0.0
0.0
0.0
0.000997781753540039
0.0
0.0
21.234129667282104
0.0010030269622802734
0.0
0.0009982585906982422
0.0
0.0
0.0029985904693603516


In [20]:
# Sanity check: the saved ".pickle" tables should hold the same data
# as the corresponding ".pgn" files. Shown here: the first rows of the
# last filtered table, to compare with the headers printed from the PGN.
top_top_pickle[-1].head()

Unnamed: 0,ELO_white,ELO_black,Result,ECO,Opening,UTCDate,TimeControl,Termination,old_index
0,2422.0,2610.0,0.0,C00,"French Defense: Horwitz Attack, Papa-Ticulat G...",2016-05-31,blitz,Normal,2
1,2616.0,2416.0,1.0,E20,Nimzo-Indian Defense: Kmoch Variation,2016-05-31,blitz,Normal,3
2,2411.0,2622.0,0.0,C03,"French Defense: Tarrasch Variation, Guimard De...",2016-05-31,blitz,Normal,4
3,2481.0,2432.0,1.0,D07,"Queen's Gambit Refused: Chigorin Defense, Main...",2016-05-31,blitz,Normal,5
4,2401.0,2633.0,0.0,D04,Queen's Pawn Game: Colle System,2016-05-31,blitz,Time forfeit,7


In [19]:
# Print the ratings, ECO code and time control of the first five games in the
# filtered PGN, to compare them with the first rows of the pickled table.
pgn_check_path = data_path+r'stockfish\top_both_pgn\lichess_db_standard_rated_2016-06.pgn'
with open(pgn_check_path) as f:
    game = chess.pgn.read_game(f)
    i = 0
    while i < 5 and game is not None:
        i += 1
        headers = game.headers
        print(
            headers['WhiteElo'],
            headers['BlackElo'],
            headers['ECO'],
            headers['TimeControl'],
        )
        game = chess.pgn.read_game(f)

2422 2610 C00 180+0
2616 2416 E20 180+0
2411 2622 C03 180+0
2481 2432 D07 180+0
2401 2633 D04 180+0
