# Full setup

### packages

In [None]:
!apt-get -qq install zstd

Selecting previously unselected package zstd.
(Reading database ... 128288 files and directories currently installed.)
Preparing to unpack .../zstd_1.4.4+dfsg-3ubuntu0.1_amd64.deb ...
Unpacking zstd (1.4.4+dfsg-3ubuntu0.1) ...
Setting up zstd (1.4.4+dfsg-3ubuntu0.1) ...
Processing triggers for man-db (2.9.1-1) ...


In [None]:
!pip install -q chess

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/149.1 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m143.4/149.1 KB[0m [31m4.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.1/149.1 KB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import chess
import chess.engine
import chess.pgn

import pandas as pd
import numpy as np

from tqdm import tqdm, trange

### lichess database: https://database.lichess.org/

In [None]:
!wget --no-verbose https://database.lichess.org/standard/lichess_db_standard_rated_2013-01.pgn.zst

2023-03-31 08:11:34 URL:https://database.lichess.org/standard/lichess_db_standard_rated_2013-01.pgn.zst [17761302/17761302] -> "lichess_db_standard_rated_2013-01.pgn.zst" [1]


In [None]:
!unzstd lichess_db_standard_rated_2013-01.pgn.zst -o lichess_db.pgn

lichess_db_standard_rated_2013-01.pgn.zst: 92811021 bytes 


### Stockfish: https://stockfishchess.org/download/linux/

In [None]:
!lscpu | grep 'Model name'

Model name:                      Intel(R) Xeon(R) CPU @ 2.20GHz


In [None]:
!wget --no-verbose https://stockfishchess.org/files/stockfish_15.1_linux_x64_avx2.zip

2023-03-31 08:11:37 URL:https://stockfishchess.org/files/stockfish_15.1_linux_x64_avx2.zip [27166402/27166402] -> "stockfish_15.1_linux_x64_avx2.zip" [1]


In [None]:
!unzip stockfish_15.1_linux_x64_avx2.zip stockfish_15.1_linux_x64_avx2/stockfish-ubuntu-20.04-x86-64-avx2

Archive:  stockfish_15.1_linux_x64_avx2.zip
  inflating: stockfish_15.1_linux_x64_avx2/stockfish-ubuntu-20.04-x86-64-avx2  


# Collect database

In [None]:
# Piece types in the order their 64-square blocks appear in an encoded row.
PIECES = [
    chess.PAWN,
    chess.KNIGHT,
    chess.BISHOP,
    chess.ROOK,
    chess.QUEEN,
    chess.KING,
]
# Colours in the order their halves appear in an encoded row.
PLAYERS = [chess.WHITE, chess.BLACK]

# Column-name casing per colour: upper case for White, lower case for Black.
fix_names_func = {chess.WHITE: str.upper, chess.BLACK: str.lower}

In [None]:
def encode_board(board):
    """Flatten ``board`` into a single list of feature values.

    For every colour in ``PLAYERS`` the result contains, in order:
    one ``board.pieces(piece, player).tolist()`` block per entry of
    ``PIECES``, followed by the king-side and queen-side castling-rights
    flags of that colour.
    """
    features = []
    for player in PLAYERS:
        for piece in PIECES:
            features += board.pieces(piece, player).tolist()
        features += [
            board.has_kingside_castling_rights(player),
            board.has_queenside_castling_rights(player),
        ]
    return features

def get_columns_names():
    """Build the feature column names, in the same order as ``encode_board``.

    Square columns are named ``<square><piece symbol>`` (for example ``a1P``);
    the two castling columns of each colour are named ``K``/``Q`` for White
    and ``k``/``q`` for Black. Casing is taken from ``fix_names_func``.
    """
    names = []
    for player in PLAYERS:
        recase = fix_names_func[player]
        for piece in PIECES:
            symbol = recase(chess.piece_symbol(piece))
            names += [f"{square}{symbol}" for square in chess.SQUARE_NAMES]
        names += [recase("K"), recase("Q")]
    return names

In [None]:
# python-chess reads PGN in text mode and leaves the encoding to the caller.
# State UTF-8 explicitly so non-ASCII header values (e.g. player names) do not
# depend on the platform's default locale.
pgn = open("lichess_db.pgn", encoding="utf-8")
engine = chess.engine.SimpleEngine.popen_uci("stockfish_15.1_linux_x64_avx2/stockfish-ubuntu-20.04-x86-64-avx2")

# Encoded rows (features + score), split by the side to move in the position.
white = []
black = []

# Value passed as ``mate_score`` when a mate evaluation is converted to an integer.
MATE_SCORE = 1000000
# First analysis pass is limited by time (``chess.engine.Limit(time=...)``).
MAX_TIME = 1.0
# If the timed pass ends below this depth, the position is re-analysed by depth.
MIN_DEPTH = 18
TIME_LIMIT = chess.engine.Limit(time=MAX_TIME)
DEPTH_LIMIT = chess.engine.Limit(depth=MIN_DEPTH)

# Games to skip at the start of the PGN, and games to analyse after that.
# Both values are also used to name the CSV files written from these rows.
DROP = 100
TAKE = 100

# Advance the PGN handle past the games that are being skipped.
for _ in range(DROP):
    game = chess.pgn.read_game(pgn)

def board_score(board):
    """Return the engine's integer evaluation of ``board`` for the side to move.

    The position is first analysed under ``TIME_LIMIT``. If that pass reports
    a depth below ``MIN_DEPTH`` and the score is not a mate score, the
    position is analysed again under ``DEPTH_LIMIT``. Mate scores are
    converted to integers using ``MATE_SCORE``.
    """
    analysis = engine.analyse(board, TIME_LIMIT)
    too_shallow = analysis['depth'] < MIN_DEPTH
    if too_shallow and not analysis['score'].is_mate():
        analysis = engine.analyse(board, DEPTH_LIMIT)
    relative_score = analysis['score'].relative
    return relative_score.score(mate_score=MATE_SCORE)

# Analyse up to TAKE games; every position reached after a move becomes one row.
for i in trange(TAKE):
    game = chess.pgn.read_game(pgn)
    if game is None:
        # read_game returned nothing: no more games in the file.
        break
    board = game.board()
    for move in game.mainline_moves():
        board.push(move)
        score = board_score(board)
        row = [*encode_board(board), score]
        # Rows are bucketed by the side to move in the resulting position.
        bucket = white if board.turn == chess.WHITE else black
        bucket.append(row)

100%|██████████| 100/100 [1:49:49<00:00, 65.90s/it]


In [None]:
pgn.close()
engine.close()

In [None]:
# Feature columns followed by the engine score, matching the row layout.
columns = [*get_columns_names(), 'score']
white_df = pd.DataFrame(white, columns=columns)
black_df = pd.DataFrame(black, columns=columns)
len(white_df), len(black_df)

(3114, 3178)

In [None]:
white_df

Unnamed: 0,a1P,b1P,c1P,d1P,e1P,f1P,g1P,h1P,a2P,b2P,...,b8k,c8k,d8k,e8k,f8k,g8k,h8k,k,q,score
0,False,False,False,False,False,False,False,False,True,True,...,False,False,False,True,False,False,False,True,True,46
1,False,False,False,False,False,False,False,False,True,True,...,False,False,False,True,False,False,False,True,True,83
2,False,False,False,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,True,True,77
3,False,False,False,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,True,True,152
4,False,False,False,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,True,True,120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3109,False,False,False,False,False,False,False,False,True,True,...,False,False,False,True,False,False,False,True,True,134
3110,False,False,False,False,False,False,False,False,True,True,...,False,False,False,True,False,False,False,True,True,320
3111,False,False,False,False,False,False,False,False,True,True,...,False,False,False,True,False,False,False,True,True,999998
3112,False,False,False,False,False,False,False,False,True,True,...,False,False,False,True,False,False,False,True,True,174


In [None]:
white_df.to_csv(f'white_{DROP+1}-{DROP+TAKE}.csv', index=False, header=True)

In [None]:
black_df.to_csv(f'black_{DROP+1}-{DROP+TAKE}.csv', index=False, header=True)

# Quick setup

In [None]:
!pip install -q chess

In [None]:
import chess
import chess.engine
import chess.pgn

In [None]:
# Piece types in the order their 64-square column blocks appear.
PIECES = [
    chess.PAWN,
    chess.KNIGHT,
    chess.BISHOP,
    chess.ROOK,
    chess.QUEEN,
    chess.KING,
]
# Colours in the order their column halves appear.
PLAYERS = [chess.WHITE, chess.BLACK]

# Column-name casing per colour: upper case for White, lower case for Black.
fix_names_func = {chess.WHITE: str.upper, chess.BLACK: str.lower}

In [None]:
def get_columns_names():
    """Build the names of the feature columns (every column except ``score``).

    Square columns are named ``<square><piece symbol>`` (for example ``a1P``);
    the two castling columns of each colour are named ``K``/``Q`` for White
    and ``k``/``q`` for Black. Casing is taken from ``fix_names_func``.
    """
    names = []
    for player in PLAYERS:
        recase = fix_names_func[player]
        for piece in PIECES:
            symbol = recase(chess.piece_symbol(piece))
            names += [f"{square}{symbol}" for square in chess.SQUARE_NAMES]
        names += [recase("K"), recase("Q")]
    return names

# Read database

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Number of CSV chunks to load for each colour.
CHUNKS = 2
# Games per chunk. The files are named "<colour>_<first game>-<last game>.csv",
# so this must equal the TAKE value used when the chunks were written.
CHUNK_SIZE = 100

white_frames = []
black_frames = []

for i in range(CHUNKS):
    first_game = i * CHUNK_SIZE + 1
    last_game = (i + 1) * CHUNK_SIZE
    white_frames.append(pd.read_csv(f'white_{first_game}-{last_game}.csv'))
    black_frames.append(pd.read_csv(f'black_{first_game}-{last_game}.csv'))

# Each chunk was saved without its index, so every file starts again at 0.
# ignore_index=True gives the combined frame unique row labels instead of
# repeating 0..n-1 once per chunk.
white_all_df = pd.concat(white_frames, ignore_index=True).dropna()
black_all_df = pd.concat(black_frames, ignore_index=True).dropna()

white_all_df.shape[0], black_all_df.shape[0]

(6182, 6303)

In [None]:
# Two rows describe the same position when all feature columns match;
# drop_duplicates keeps the first such row (and therefore its score).
columns = get_columns_names()

white_df = white_all_df.drop_duplicates(subset=columns)
black_df = black_all_df.drop_duplicates(subset=columns)

len(white_df), len(black_df)

(5840, 5865)

In [None]:
# Keep only positions whose score is at least SCORE_THRESHOLD away from zero,
# in either direction.
SCORE_THRESHOLD = 100
white_dataset = white_df[white_df['score'].abs() >= SCORE_THRESHOLD]
black_dataset = black_df[black_df['score'].abs() >= SCORE_THRESHOLD]

len(white_dataset), len(black_dataset)

(3864, 3785)

# Classification

In [None]:
from sklearn.model_selection import train_test_split

### logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

def test_lr(dataset):
    """Train a logistic regression on ``dataset`` and print its held-out score.

    Every column except ``score`` is a feature; the target is whether
    ``score`` is at least ``SCORE_THRESHOLD``. 33% of the rows are held out
    with a fixed ``random_state`` and the model's ``.score`` on them is printed.
    """
    features = dataset.drop(columns=['score'])
    target = dataset['score'] >= SCORE_THRESHOLD
    X_trn, X_tst, y_trn, y_tst = train_test_split(
        features, target, test_size=0.33, random_state=1
    )
    model = LogisticRegression(random_state=1, max_iter=250)
    model.fit(X_trn, y_trn)
    print("score:", model.score(X_tst, y_tst))


In [None]:
test_lr(white_dataset)
test_lr(black_dataset)

score: 0.8683385579937304
score: 0.8648


### SVC

In [None]:
from sklearn.svm import SVC

def test_svc_poly(dataset):
    """Train a degree-2 polynomial-kernel SVC on ``dataset`` and print its held-out score.

    Every column except ``score`` is a feature; the target is whether
    ``score`` is at least ``SCORE_THRESHOLD``. 33% of the rows are held out
    with a fixed ``random_state`` and the model's ``.score`` on them is printed.
    """
    features = dataset.drop(columns=['score'])
    target = dataset['score'] >= SCORE_THRESHOLD
    X_trn, X_tst, y_trn, y_tst = train_test_split(
        features, target, test_size=0.33, random_state=1
    )
    model = SVC(kernel='poly', degree=2)
    model.fit(X_trn, y_trn)
    print("score:", model.score(X_tst, y_tst))

def test_svc_rbf(dataset):
    """Train an RBF-kernel SVC (``gamma='scale'``, ``C=30``) and print its held-out score.

    Every column except ``score`` is a feature; the target is whether
    ``score`` is at least ``SCORE_THRESHOLD``. 33% of the rows are held out
    with a fixed ``random_state`` and the model's ``.score`` on them is printed.
    """
    features = dataset.drop(columns=['score'])
    target = dataset['score'] >= SCORE_THRESHOLD
    X_trn, X_tst, y_trn, y_tst = train_test_split(
        features, target, test_size=0.33, random_state=1
    )
    model = SVC(kernel='rbf', gamma='scale', C=30)
    model.fit(X_trn, y_trn)
    print("score:", model.score(X_tst, y_tst))

In [None]:
test_svc_poly(white_dataset)
test_svc_poly(black_dataset)

score: 0.9255485893416928
score: 0.9232


In [None]:
test_svc_rbf(white_dataset)
test_svc_rbf(black_dataset)

score: 0.9600313479623824
score: 0.9592


### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

def test_rf(dataset):
    """Train a 200-tree random forest on ``dataset`` and print its held-out score.

    Every column except ``score`` is a feature; the target is whether
    ``score`` is at least ``SCORE_THRESHOLD``. 33% of the rows are held out
    with a fixed ``random_state`` and the model's ``.score`` on them is printed.
    """
    features = dataset.drop(columns=['score'])
    target = dataset['score'] >= SCORE_THRESHOLD
    X_trn, X_tst, y_trn, y_tst = train_test_split(
        features, target, test_size=0.33, random_state=1
    )
    model = RandomForestClassifier(random_state=1, n_estimators=200)
    model.fit(X_trn, y_trn)
    print("score:", model.score(X_tst, y_tst))

In [None]:
test_rf(white_dataset)
test_rf(black_dataset)

score: 0.9474921630094044
score: 0.9576


### xgboost

In [None]:
from xgboost import XGBClassifier

def test_xgb(dataset):
    """Train a default ``XGBClassifier`` on ``dataset`` and print its held-out score.

    Every column except ``score`` is a feature; the target is whether
    ``score`` is at least ``SCORE_THRESHOLD``. 33% of the rows are held out
    with a fixed ``random_state`` and the model's ``.score`` on them is printed.
    """
    features = dataset.drop(columns=['score'])
    target = dataset['score'] >= SCORE_THRESHOLD
    X_trn, X_tst, y_trn, y_tst = train_test_split(
        features, target, test_size=0.33, random_state=1
    )
    model = XGBClassifier()
    model.fit(X_trn, y_trn)
    print("score:", model.score(X_tst, y_tst))

In [None]:
test_xgb(white_dataset)
test_xgb(black_dataset)

score: 0.9263322884012539
score: 0.9432


# Statistics

In [None]:
white_all_df.shape

(6182, 773)

In [None]:
def group(df):
    """Return the min/max score of each position whose scores disagree.

    Rows of ``df`` are grouped by all feature columns, so each group is one
    distinct position. The result has columns ``min`` and ``max`` and a
    0-based integer index over the groups; groups whose minimum equals
    their maximum are left out.
    """
    feature_columns = get_columns_names()
    score_range = df.groupby(feature_columns).agg({'score': ['min', 'max']})
    # Columns come back as ('score', 'min') / ('score', 'max'); keep only the statistic name.
    score_range.columns = [stat for _, stat in score_range.columns]
    # Replace the wide feature-valued group index with a plain running number.
    score_range.index = np.arange(len(score_range))
    disagrees = score_range['min'] != score_range['max']
    return score_range[disagrees]

In [None]:
# Positions with White to move that received more than one distinct score,
# and the spread between the highest and lowest score of each.
grouped = group(white_all_df)
diff = (grouped['max'] - grouped['min']).to_frame(name='diff')
grouped

Unnamed: 0,min,max
310,1398,1491
342,368,418
497,-22,0
499,-38,0
820,350,363
...,...,...
5825,20,52
5826,51,72
5827,23,56
5829,14,24


In [None]:
diff.sort_values('diff', axis=0)

Unnamed: 0,diff
5422,1
5810,1
3990,1
5413,1
5777,1
...,...
5825,32
5827,33
499,38
342,50
