In [1]:
import os
import time
import numpy as np
import pandas as pd
from elo import *

# Root directory that every other path in this notebook is built from by
# string concatenation. A raw string cannot end in a single backslash, so the
# literal ends with two backslash characters.
data_path = r'E:\Projects\Chess\Lichess_data\stockfish\\'

In [2]:
# Collect the names of the PGN files found in the 'top_pgn' directory.
# os.listdir() returns entries in arbitrary order, so the list is sorted
# explicitly: the first/last preview below and the processing order of the
# later cells then no longer depend on the file system.
read_pgn_path = data_path+'top_pgn\\'
top_pgns = sorted(
    f_name for f_name in os.listdir(read_pgn_path) if f_name.endswith('.pgn')
)
top_pgns[0], top_pgns[-1]

('lichess_db_standard_rated_2013-01.pgn',
 'lichess_db_standard_rated_2016-06.pgn')

In [3]:
# Collect the names of the pickle files found in the 'top_pickle' directory.
# os.listdir() returns entries in arbitrary order; sorting makes the preview
# below and the order in which the files are concatenated later deterministic.
read_pickle_path = data_path+'top_pickle\\'
top_pickle = sorted(
    f_name for f_name in os.listdir(read_pickle_path) if f_name.endswith('.pickle')
)
top_pickle[0], top_pickle[-1]

('lichess_db_standard_rated_2013-01.pgn.pickle',
 'lichess_db_standard_rated_2016-06.pgn.pickle')

### Game features

In [4]:
# For every PGN file in `top_pgns`, compute per-game features and features
# derived from the Stockfish scores, and store both as pickle and CSV.
# Files whose Stockfish-feature pickle already exists are skipped.
pgn_dir = data_path+'top_pgn\\'
game_features_dir = data_path+'game_features\\'
stockfish_features_dir = data_path+'stockfish_features\\'

# makedirs(exist_ok=True) replaces the isdir()/mkdir() pair: it has no
# check-then-create race and also creates missing parent directories.
os.makedirs(game_features_dir, exist_ok=True)
os.makedirs(stockfish_features_dir, exist_ok=True)

for p in top_pgns:
    base_name = p.split('.')[0]
    filename = game_features_dir+base_name
    stockfish_filename = stockfish_features_dir+base_name
    # The Stockfish pickle is written after the game-feature files, so its
    # presence is used as the "already processed" marker.
    if os.path.exists(stockfish_filename+'.pickle'):
        print(p +' is already processed...')
        continue

    # Record the start time so the elapsed seconds can be printed below
    start = time.time()
    # Read all games. The directory is spelled out here instead of using the
    # global `read_pgn_path`, which a later cell reassigns to another folder.
    games = get_games(pgn_dir+p)
    # Compute the key parameters for the games
    game_features = games_features(games)
    print(time.time()-start)
    # Save the results
    game_features.to_pickle(filename+'.pickle')
    game_features.to_csv(filename+'.csv', index=False)

    # Compute the features based on the Stockfish data; an empty game table
    # yields an empty feature table instead of calling score_features().
    if game_features.shape[0]==0:
        sc_features = pd.DataFrame()
    else:
        sc_features = score_features(game_features.stockfish_scores.values)
    print(time.time()-start)
    # Save the result
    sc_features.to_pickle(stockfish_filename+'.pickle')
    sc_features.to_csv(stockfish_filename+'.csv', index=False)

lichess_db_standard_rated_2013-01.pgn is already processed...
lichess_db_standard_rated_2013-02.pgn is already processed...
lichess_db_standard_rated_2013-03.pgn is already processed...
lichess_db_standard_rated_2013-04.pgn is already processed...
lichess_db_standard_rated_2013-05.pgn is already processed...
lichess_db_standard_rated_2013-06.pgn is already processed...
lichess_db_standard_rated_2013-07.pgn is already processed...
lichess_db_standard_rated_2013-08.pgn is already processed...
lichess_db_standard_rated_2013-09.pgn is already processed...
lichess_db_standard_rated_2013-10.pgn is already processed...
lichess_db_standard_rated_2013-11.pgn is already processed...
lichess_db_standard_rated_2013-12.pgn is already processed...
lichess_db_standard_rated_2014-01.pgn is already processed...
lichess_db_standard_rated_2014-02.pgn is already processed...
lichess_db_standard_rated_2014-03.pgn is already processed...
lichess_db_standard_rated_2014-04.pgn is already processed...
lichess_

### Объединение данных

In [32]:
# Build the combined feature matrix X (game features, Stockfish features and
# one-hot encoded game metadata), or load it from the cache file if present.
# Note: when the cache is used, the intermediate lists and `data` are NOT
# (re)created by this cell.
all_features_path = data_path+'all_features\\all_features.pickle'
if os.path.exists(all_features_path):
    # The cache is a pickle written by this notebook; pickles from untrusted
    # sources must never be loaded.
    X = pd.read_pickle(all_features_path)
else:
    sc_features_list = []
    game_features_list = []
    data_list = []
    for p in top_pickle:
        filename = data_path+'game_features\\'+p.split('.')[0]
        stockfish_filename = data_path+'stockfish_features\\'+p.split('.')[0]
        # Read the precomputed game and Stockfish features plus the raw game table
        gm_df = pd.read_pickle(filename+'.pickle')
        sc_df = pd.read_pickle(stockfish_filename+'.pickle')
        data_df = pd.read_pickle(data_path+'top_pickle\\'+p)
        # Skip empty dataframes
        if gm_df.shape[0]==0:
            continue
        # The three tables sometimes end up with different numbers of rows
        # after the transformations, so only files where they agree are used.
        if gm_df.shape[0] == sc_df.shape[0] == data_df.shape[0]:
            game_features_list.append(gm_df)
            sc_features_list.append(sc_df)
            data_list.append(data_df)
        else:
            # Report the dropped file instead of skipping it silently
            print(p+' skipped, row counts differ:', gm_df.shape[0], sc_df.shape[0], data_df.shape[0])

    sc_features = pd.concat(sc_features_list, ignore_index=True)
    game_features = pd.concat(game_features_list, ignore_index=True)
    game_features = game_features.drop('stockfish_scores', axis=1)
    data = pd.concat(data_list, ignore_index=True)
    data = data.drop('old_index', axis=1)
    # One-hot encode the categorical game metadata (ECO code, time control,
    # termination type, opening name)
    ecos = pd.get_dummies(data.ECO.values)
    TimeControl = pd.get_dummies(data.TimeControl.values)
    Termination = pd.get_dummies(data.Termination.values)
    openings = pd.get_dummies(data.Opening.values)
    # One-hot encode the game result
    game_results = pd.get_dummies(data.Result)
    # ignore_index=True with axis=1 replaces the column labels with 0..N-1
    X = pd.concat([game_features, sc_features, game_results, ecos, TimeControl, Termination, openings], axis=1, ignore_index=True)
    # Drop incomplete rows. The surviving row labels must be read BEFORE the
    # index is reset: after a reset X.index is just 0..len(X)-1, which would
    # select the first len(X) rows of `data` instead of the rows that were kept.
    X = X.dropna()
    data = data.iloc[X.index].reset_index(drop=True)
    X = X.reset_index(drop=True)
    data.to_pickle(data_path+'data.pickle')
    # Make sure the cache directory exists before writing into it
    os.makedirs(os.path.dirname(all_features_path), exist_ok=True)
    X.to_pickle(all_features_path)
    print(game_features.shape, sc_features.shape, game_results.shape, ecos.shape, TimeControl.shape, Termination.shape, openings.shape)

(108184, 30) (108184, 32) (108184, 3) (108184, 483) (108184, 4) (108184, 2) (108184, 2362)


In [33]:
# Show the combined feature matrix as the cell output (rendered truncated).
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2906,2907,2908,2909,2910,2911,2912,2913,2914,2915
0,5.0,0.0,17.0,0.0,0.0,1.0,1.0,7.0,1.0,2.0,...,0,0,0,0,0,0,0,0,0,0
1,11.0,0.0,12.0,0.0,0.0,1.0,2.0,6.0,1.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,7.0,30.0,0.0,1.0,0.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,10.0,0.0,1.0,1.0,0.0,4.0,0.0,2.0,...,0,0,0,0,0,0,0,0,0,0
4,8.0,0.0,15.0,0.0,1.0,1.0,1.0,4.0,1.0,2.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108145,0.0,0.0,10.0,0.0,0.0,1.0,1.0,3.0,1.0,2.0,...,0,0,0,0,0,0,0,0,0,0
108146,7.0,0.0,15.0,16.0,0.0,1.0,0.0,2.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
108147,5.0,0.0,12.0,0.0,0.0,1.0,1.0,6.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
108148,0.0,0.0,21.0,0.0,1.0,1.0,0.0,5.0,1.0,2.0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Sanity check: print the position of every collected chunk whose Stockfish
# feature table and raw game table have a different number of rows.
for idx, sc_chunk in enumerate(sc_features_list):
    rows_in_data = data_list[idx].shape[0]
    if sc_chunk.shape[0] != rows_in_data:
        print(idx)

<font size=6>Повторим все вышеперечисленные операции для данных, где оба игрока имеют рейтинг 2400+</font>

In [8]:
# Collect the names of the PGN files found in the 'top_both_pgn' directory.
# os.listdir() returns entries in arbitrary order, so the list is sorted to
# make the first/last preview below and the processing order deterministic.
# Note: this cell reassigns the global `read_pgn_path`.
read_pgn_path = data_path+'top_both_pgn\\'
top_both_pgns = sorted(
    f_name for f_name in os.listdir(read_pgn_path) if f_name.endswith('.pgn')
)
top_both_pgns[0], top_both_pgns[-1]

('lichess_db_standard_rated_2013-01.pgn',
 'lichess_db_standard_rated_2016-06.pgn')

In [9]:
# Collect the names of the pickle files found in the 'top_both_pickle'
# directory. os.listdir() returns entries in arbitrary order; sorting makes
# the preview below and any later concatenation order deterministic.
# Note: this cell reassigns the global `read_pickle_path`.
read_pickle_path = data_path+'top_both_pickle\\'
top_both_pickle = sorted(
    f_name for f_name in os.listdir(read_pickle_path) if f_name.endswith('.pickle')
)
top_both_pickle[0], top_both_pickle[-1]

('lichess_db_standard_rated_2013-01.pgn.pickle',
 'lichess_db_standard_rated_2016-06.pgn.pickle')

In [10]:
# Same feature extraction as for the 'top_pgn' files, applied to the PGN
# files in 'top_both_pgn'; results go to the 'top_game_features' and
# 'top_stockfish_features' directories. Files whose Stockfish-feature pickle
# already exists are skipped.
pgn_dir = data_path+'top_both_pgn\\'
game_features_dir = data_path+'top_game_features\\'
stockfish_features_dir = data_path+'top_stockfish_features\\'

# makedirs(exist_ok=True) replaces the isdir()/mkdir() pair: it has no
# check-then-create race and also creates missing parent directories.
os.makedirs(game_features_dir, exist_ok=True)
os.makedirs(stockfish_features_dir, exist_ok=True)

# Iterate over the files listed from 'top_both_pgn' (`top_both_pgns`), not the
# `top_pgns` list of the other directory: a name present only in 'top_pgn'
# would otherwise be looked up in the wrong folder.
for p in top_both_pgns:
    base_name = p.split('.')[0]
    filename = game_features_dir+base_name
    stockfish_filename = stockfish_features_dir+base_name
    # The Stockfish pickle is written after the game-feature files, so its
    # presence is used as the "already processed" marker.
    if os.path.exists(stockfish_filename+'.pickle'):
        print(p +' is already processed...')
        continue

    # Record the start time so the elapsed seconds can be printed below
    start = time.time()
    # Read all games; the directory is spelled out rather than taken from the
    # global `read_pgn_path`, whose value depends on which cell ran last.
    games = get_games(pgn_dir+p)
    # Compute the key parameters for the games
    game_features = games_features(games)
    print(time.time()-start)
    # Save the results
    game_features.to_pickle(filename+'.pickle')
    game_features.to_csv(filename+'.csv', index=False)

    # Compute the features based on the Stockfish data; an empty game table
    # yields an empty feature table instead of calling score_features().
    if game_features.shape[0]==0:
        sc_features = pd.DataFrame()
    else:
        sc_features = score_features(game_features.stockfish_scores.values)
    print(time.time()-start)
    # Save the result
    sc_features.to_pickle(stockfish_filename+'.pickle')
    sc_features.to_csv(stockfish_filename+'.csv', index=False)

lichess_db_standard_rated_2013-01.pgn is already processed...
lichess_db_standard_rated_2013-02.pgn is already processed...
lichess_db_standard_rated_2013-03.pgn is already processed...
lichess_db_standard_rated_2013-04.pgn is already processed...
lichess_db_standard_rated_2013-05.pgn is already processed...
lichess_db_standard_rated_2013-06.pgn is already processed...
lichess_db_standard_rated_2013-07.pgn is already processed...
lichess_db_standard_rated_2013-08.pgn is already processed...
lichess_db_standard_rated_2013-09.pgn is already processed...
lichess_db_standard_rated_2013-10.pgn is already processed...
lichess_db_standard_rated_2013-11.pgn is already processed...
lichess_db_standard_rated_2013-12.pgn is already processed...
lichess_db_standard_rated_2014-01.pgn is already processed...
lichess_db_standard_rated_2014-02.pgn is already processed...
lichess_db_standard_rated_2014-03.pgn is already processed...
lichess_db_standard_rated_2014-04.pgn is already processed...
lichess_

In [28]:
# Build the combined feature matrix X for the 'top_both' data set (game
# features, Stockfish features and one-hot encoded game metadata), or load it
# from the cache file if present.
# Note: when the cache is used, the intermediate lists and `data` are NOT
# (re)created by this cell.
all_features_path = data_path+'all_features\\top_all_features.pickle'
if os.path.exists(all_features_path):
    # The cache is a pickle written by this notebook; pickles from untrusted
    # sources must never be loaded.
    X = pd.read_pickle(all_features_path)
else:
    sc_features_list = []
    game_features_list = []
    data_list = []
    # Iterate over the files listed from 'top_both_pickle' (`top_both_pickle`),
    # which is the directory the raw tables are read from below, instead of
    # the `top_pickle` list that belongs to the other data set.
    for p in top_both_pickle:
        filename = data_path+'top_game_features\\'+p.split('.')[0]
        stockfish_filename = data_path+'top_stockfish_features\\'+p.split('.')[0]
        # Read the precomputed game and Stockfish features plus the raw game table
        gm_df = pd.read_pickle(filename+'.pickle')
        sc_df = pd.read_pickle(stockfish_filename+'.pickle')
        data_df = pd.read_pickle(data_path+'top_both_pickle\\'+p)
        # Skip empty dataframes
        if gm_df.shape[0]==0:
            continue
        # The three tables sometimes end up with different numbers of rows
        # after the transformations, so only files where they agree are used.
        if gm_df.shape[0] == sc_df.shape[0] == data_df.shape[0]:
            game_features_list.append(gm_df)
            sc_features_list.append(sc_df)
            data_list.append(data_df)
        else:
            # Report the dropped file instead of skipping it silently
            print(p+' skipped, row counts differ:', gm_df.shape[0], sc_df.shape[0], data_df.shape[0])

    sc_features = pd.concat(sc_features_list, ignore_index=True)
    game_features = pd.concat(game_features_list, ignore_index=True)
    game_features = game_features.drop('stockfish_scores', axis=1)
    data = pd.concat(data_list, ignore_index=True)
    data = data.drop('old_index', axis=1)
    # One-hot encode the categorical game metadata (ECO code, time control,
    # termination type, opening name)
    ecos = pd.get_dummies(data.ECO.values)
    TimeControl = pd.get_dummies(data.TimeControl.values)
    Termination = pd.get_dummies(data.Termination.values)
    openings = pd.get_dummies(data.Opening.values)
    # One-hot encode the game result
    game_results = pd.get_dummies(data.Result)
    # ignore_index=True with axis=1 replaces the column labels with 0..N-1
    X = pd.concat([game_features, sc_features, game_results, ecos, TimeControl, Termination, openings], axis=1, ignore_index=True)
    # Drop incomplete rows. The surviving row labels must be read BEFORE the
    # index is reset: after a reset X.index is just 0..len(X)-1, which would
    # select the first len(X) rows of `data` instead of the rows that were kept.
    X = X.dropna()
    data = data.iloc[X.index].reset_index(drop=True)
    X = X.reset_index(drop=True)
    data.to_pickle(data_path+'top_data.pickle')
    # Make sure the cache directory exists before writing into it
    os.makedirs(os.path.dirname(all_features_path), exist_ok=True)
    X.to_pickle(all_features_path)
    print(game_features.shape, sc_features.shape, game_results.shape, ecos.shape, TimeControl.shape, Termination.shape, openings.shape)

(21636, 30) (21636, 32) (21636, 3) (21636, 437) (21636, 3) (21636, 2) (21636, 1696)


In [29]:
# Show the combined feature matrix as the cell output (rendered truncated).
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2193,2194,2195,2196,2197,2198,2199,2200,2201,2202
0,7.0,0.0,0.0,14.0,36.0,1.0,1.0,0.0,4.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,5.0,0.0,0.0,20.0,30.0,0.0,1.0,0.0,2.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,5.0,0.0,0.0,18.0,0.0,0.0,1.0,0.0,5.0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,5.0,0.0,0.0,18.0,58.0,0.0,1.0,0.0,6.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,6.0,0.0,0.0,12.0,19.0,0.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21537,14.0,0.0,0.0,13.0,0.0,2.0,1.0,2.0,7.0,1.0,...,0,0,0,0,0,0,0,0,0,0
21538,6.0,0.0,0.0,14.0,0.0,1.0,1.0,1.0,6.0,1.0,...,0,0,0,0,0,0,0,0,0,0
21539,9.0,0.0,0.0,12.0,0.0,1.0,1.0,2.0,8.0,1.0,...,0,0,0,0,0,0,0,0,0,0
21540,13.0,0.0,0.0,19.0,0.0,1.0,1.0,0.0,3.0,1.0,...,0,0,0,0,0,0,0,0,0,0
