In [17]:
import pandas as pd
import statistics as stats

# sklearn utility
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# sklearn classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import export_text

# sklearn grid search
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

# sklearn metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import f1_score

In [18]:
df = pd.read_csv("base_chess_data.csv")
df = df.drop(["Unnamed: 0"], axis=1)
df.head()

Unnamed: 0,WhiteElo,BlackElo,WhiteWins,Moves,StockfishScores
0,2354,2411,0,Nf3 Nf6 c4 c5 b3 g6 Bb2 Bg7 e3 O-O Be2 b6 O-O ...,18 17 12 8 -5 12 3 -2 22 21 20 13 8 21 11 3 -6...
1,2523,2460,0,e4 e5 Nf3 Nf6 d4 Nxe4 Nxe5 d6 Nf3 d5 Bd3 Nd6 O-O,26 44 26 18 14 34 36 31 37 35 42 52 55
2,1915,1999,-1,e4 d5 exd5 Nf6 d4 Nxd5 Nf3 g6 Be2 Bg7 c4 Nb6 N...,26 51 68 57 65 77 48 93 61 63 63 58 53 46 69 2...
3,2446,2191,1,c4 Nf6 Nc3 d6 d4 e5 Nf3 Nbd7 Bg5 Be7 e3 c6 Qc2...,2 21 5 53 35 45 37 54 10 22 8 48 30 17 13 35 -...
4,2168,2075,1,e4 c5 Nf3 d6 b4 Nf6 bxc5 Nxe4 cxd6 Qb6 d4 Bg4 ...,26 64 35 53 18 20 18 20 10 49 60 95 91 82 83 9...


In [31]:
# add number of moves to each game

game_num_moves = []

for index, row in df.iterrows():
    num_moves_by_both = len(row['Moves'].split(" "))
    num_white_moves = (num_moves_by_both // 2) + (num_moves_by_both % 2 == 1)
    game_num_moves.append(num_white_moves)


df['NumWhiteMoves'] = game_num_moves
df.head()

Unnamed: 0,WhiteElo,BlackElo,WhiteWins,Moves,StockfishScores,StockfishDeltas,NumWhiteMoves
0,2354,2411,0,Nf3 Nf6 c4 c5 b3 g6 Bb2 Bg7 e3 O-O Be2 b6 O-O ...,18 17 12 8 -5 12 3 -2 22 21 20 13 8 21 11 3 -6...,18 -1 -5 -4 -13 17 -9 -5 24 -1 -1 -7 -5 13 -10...,19
1,2523,2460,0,e4 e5 Nf3 Nf6 d4 Nxe4 Nxe5 d6 Nf3 d5 Bd3 Nd6 O-O,26 44 26 18 14 34 36 31 37 35 42 52 55,26 18 -18 -8 -4 20 2 -5 6 -2 7 10 3,7
2,1915,1999,-1,e4 d5 exd5 Nf6 d4 Nxd5 Nf3 g6 Be2 Bg7 c4 Nb6 N...,26 51 68 57 65 77 48 93 61 63 63 58 53 46 69 2...,26 25 17 -11 8 12 -29 45 -32 2 0 -5 -5 -7 23 -...,53
3,2446,2191,1,c4 Nf6 Nc3 d6 d4 e5 Nf3 Nbd7 Bg5 Be7 e3 c6 Qc2...,2 21 5 53 35 45 37 54 10 22 8 48 30 17 13 35 -...,2 19 -16 48 -18 10 -8 17 -44 12 -14 40 -18 -13...,39
4,2168,2075,1,e4 c5 Nf3 d6 b4 Nf6 bxc5 Nxe4 cxd6 Qb6 d4 Bg4 ...,26 64 35 53 18 20 18 20 10 49 60 95 91 82 83 9...,26 38 -29 18 -35 2 -2 2 -10 39 11 35 -4 -9 1 1...,25


In [27]:
# There are many games where the only move made was the first one, and then the match was cancelled.
# We still want to remove those.

list_to_remove = []
for index, row in df.iterrows():
    if(len(row['Moves'].split(" ")) <= 2):
        list_to_remove.append(index)

df = df.drop(list_to_remove)



In [28]:
# add column for all deltas

# For moves with NA, we assume they have 0 change in the current board strength.

deltas = []

for index, row in df.iterrows():
    
    boardStrengths = row["StockfishScores"].split(" ")
    
    delta_string = ""
    current_board_strength = 0

    for score in boardStrengths:
        if(score == "NA"):
            delta_string += "0 "
            continue
        
        try:
            numeric_score = int(score)
        except:
            print(row["Moves"])
        delta = numeric_score - current_board_strength
        delta_string += str(delta) + " "

        current_board_strength = numeric_score

    delta_string = delta_string[:-1] # remove trailing whitespace
    
    deltas.append(delta_string)


df['StockfishDeltas'] = deltas

df.head()

In [30]:
df.head()

Unnamed: 0,WhiteElo,BlackElo,WhiteWins,Moves,StockfishScores,StockfishDeltas
0,2354,2411,0,Nf3 Nf6 c4 c5 b3 g6 Bb2 Bg7 e3 O-O Be2 b6 O-O ...,18 17 12 8 -5 12 3 -2 22 21 20 13 8 21 11 3 -6...,18 -1 -5 -4 -13 17 -9 -5 24 -1 -1 -7 -5 13 -10...
1,2523,2460,0,e4 e5 Nf3 Nf6 d4 Nxe4 Nxe5 d6 Nf3 d5 Bd3 Nd6 O-O,26 44 26 18 14 34 36 31 37 35 42 52 55,26 18 -18 -8 -4 20 2 -5 6 -2 7 10 3
2,1915,1999,-1,e4 d5 exd5 Nf6 d4 Nxd5 Nf3 g6 Be2 Bg7 c4 Nb6 N...,26 51 68 57 65 77 48 93 61 63 63 58 53 46 69 2...,26 25 17 -11 8 12 -29 45 -32 2 0 -5 -5 -7 23 -...
3,2446,2191,1,c4 Nf6 Nc3 d6 d4 e5 Nf3 Nbd7 Bg5 Be7 e3 c6 Qc2...,2 21 5 53 35 45 37 54 10 22 8 48 30 17 13 35 -...,2 19 -16 48 -18 10 -8 17 -44 12 -14 40 -18 -13...
4,2168,2075,1,e4 c5 Nf3 d6 b4 Nf6 bxc5 Nxe4 cxd6 Qb6 d4 Bg4 ...,26 64 35 53 18 20 18 20 10 49 60 95 91 82 83 9...,26 38 -29 18 -35 2 -2 2 -10 39 11 35 -4 -9 1 1...


We want to stratify our train/test/validation sets based on the length of the games as well as whether white wins. So, we will create bins for the 'short,' 'medium', and 'long' games. The code will create 3 bins, and can be changed to create more bins.

In [37]:
df['numMovesBin'] = pd.qcut(df['NumWhiteMoves'], q=3, labels=[1,2,3])
df.head()
df['numMovesBin'].value_counts()

1    8487
2    8382
3    8103
Name: numMovesBin, dtype: int64

In [38]:
# stratify the data only based on whether white wins
x_train, x_test, y_train, y_test = train_test_split(df, df['WhiteWins'], test_size=0.2, random_state=45, stratify=df[['WhiteWins', 'numMovesBin']])
x_train = x_train.drop('WhiteWins', axis=1)
x_test = x_test.drop('WhiteWins', axis=1)
x_train.head()

Unnamed: 0,WhiteElo,BlackElo,Moves,StockfishScores,StockfishDeltas,NumWhiteMoves,numMovesBin
18383,2401,2001,Nf3 Nf6 c4 c5 Nc3 d6 g3 g6 Bg2 Bg7 d4 cxd4 Nxd...,18 17 12 8 22 46 0 7 21 24 32 34 39 41 39 33 3...,18 -1 -5 -4 14 24 -46 7 14 3 8 2 5 2 -2 -6 6 -...,24,1
11894,1939,2473,d4 g6 Nf3 Bg7 c4 Nf6 Nc3 d5 cxd5 Nxd5 e4 Nxc3 ...,19 77 51 74 53 46 33 65 50 51 52 54 36 47 41 2...,19 58 -26 23 -21 -7 -13 32 -15 1 1 2 -18 11 -6...,60,3
3851,2415,2303,e4 c5 Nf3 d6 c3 Nf6 h3 Nc6 Bd3 d5 e5 Nd7 Bb5 e...,26 64 35 53 18 13 2 -23 -17 -15 -11 -30 -23 -2...,26 38 -29 18 -35 -5 -11 -25 6 2 4 -19 7 3 1 -1...,70,3
24090,2329,2249,e4 e5 d4 exd4 c3 d5 exd5 Qxd5 cxd4 Qe4+ Be3 Nf...,26 45 8 21 -12 -21 -7 9 2 50 41 66 51 37 72 65...,26 19 -37 13 -33 -9 14 16 -7 48 -9 25 -15 -14 ...,62,3
15670,1885,1930,e4 c5 Nf3 e6 d4 cxd4 Nxd4 Nf6 Nc3 d6 Bg5 Be7 Q...,26 64 35 42 38 38 22 33 17 31 20 22 13 4 2 11 ...,26 38 -29 7 -4 0 -16 11 -16 14 -11 2 -9 -9 -2 ...,18,1


In [50]:
type(y_train)
y_train

18383    1
11894   -1
3851     1
24090    0
15670   -1
        ..
20592    0
20293    1
19410    1
20007    1
17932    1
Name: WhiteWins, Length: 19977, dtype: int64

In [55]:
# create a validation set

x_primtrain, x_valid, y_primtrain, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=45, stratify=y_train)
x_primtrain.shape[0], x_valid.shape[0], x_train.shape[0]

(15981, 3996, 19977)

## New Dataframe: Row For Each Move

Now that we have training/test/validation sets per game, we will create new dataframes based on the row for each move. For example, a game that lasts 30 games will have 30 rows, each of which will have information of its game up to the Nth move. We will add a column for them to also include their current move.

In [70]:
from buildChessDFPerMove import buildChessDataframePerMove

x_trainPerMove, y_trainPerMove = buildChessDataframePerMove(x_primtrain, y_primtrain)
#x_testPerMove, y_testPerMove = buildChessDataframePerMove(x_test, y_test)
#x_validPerMove, y_validPerMove = buildChessDataframePerMove(x_valid, y_valid)

A


TypeError: string indices must be integers

In [40]:
# TODO this only works when we have non-string, so we gotta cut that from our data
# Scale our data for training/validation
def scale():
        primitive_scaler = StandardScaler()
        x_primtrain = pd.DataFrame(primitive_scaler.fit_transform(x_primtrain), columns = x_train.columns)
        x_valid = pd.DataFrame(primitive_scaler.transform(x_valid), columns = x_test.columns)
        x_test = pd.DataFrame(primitive_scaler.transform(x_test), columns = x_test.columns)

        x_train.head()

ValueError: could not convert string to float: 'e4 e5 Nf3 Nc6 Bc4 Bc5 c3 Nf6 d3 a6 Bb3 Ba7 O-O h6 h3 d6 Re1 O-O Nbd2 Qe7 Nf1 Be6 Bc2 Nh7 d4 Bd7 Ne3 Ng5 Nxg5 Qxg5 Nd5 Qd8 Qh5 exd4 Bxh6 d3 Bg5 f6 Bb3 Bxf2+ Kh1 Be6 Nxf6+ Rxf6 Bxe6+ Rxe6 Bxd8 Bxe1 Qd5 Nxd8 Rxe1 c6 Qxd3 Nf7 Re3 Rae8 Qd4 c5 Qd5 R8e7 Kh2 Re5 Qc4 b5 Qe2 d5 Qg4 dxe4 Qc8+ Kh7 Qxa6 c4 b3 R7e6 Qb7 Nd6 Qc7 Nf5 Re1 cxb3 axb3 e3 g4 Nh4 Kg3 Ng6 h4 e2 h5 Re3+ Kf2 Ne5 Kxe3 Nxg4+ Kd2'