In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/chess/games.csv


In [2]:
import plotly.express as px

from sklearn.model_selection import train_test_split as tts

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
data = pd.read_csv('../input/chess/games.csv')

In [4]:
data

Unnamed: 0,id,rated,created_at,last_move_at,turns,victory_status,winner,increment_code,white_id,white_rating,black_id,black_rating,moves,opening_eco,opening_name,opening_ply
0,TZJHLljE,False,1.504210e+12,1.504210e+12,13,outoftime,white,15+2,bourgris,1500,a-00,1191,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...,D10,Slav Defense: Exchange Variation,5
1,l1NXvwaE,True,1.504130e+12,1.504130e+12,16,resign,black,5+10,a-00,1322,skinnerua,1261,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...,B00,Nimzowitsch Defense: Kennedy Variation,4
2,mIICvQHh,True,1.504130e+12,1.504130e+12,61,mate,white,5+10,ischia,1496,a-00,1500,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...,C20,King's Pawn Game: Leonardis Variation,3
3,kWKvrqYL,True,1.504110e+12,1.504110e+12,61,mate,white,20+0,daniamurashov,1439,adivanov2009,1454,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...,D02,Queen's Pawn Game: Zukertort Variation,3
4,9tXo1AUZ,True,1.504030e+12,1.504030e+12,95,mate,white,30+3,nik221107,1523,adivanov2009,1469,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...,C41,Philidor Defense,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20053,EfqH7VVH,True,1.499791e+12,1.499791e+12,24,resign,white,10+10,belcolt,1691,jamboger,1220,d4 f5 e3 e6 Nf3 Nf6 Nc3 b6 Be2 Bb7 O-O Be7 Ne5...,A80,Dutch Defense,2
20054,WSJDhbPl,True,1.499698e+12,1.499699e+12,82,mate,black,10+0,jamboger,1233,farrukhasomiddinov,1196,d4 d6 Bf4 e5 Bg3 Nf6 e3 exd4 exd4 d5 c3 Bd6 Bd...,A41,Queen's Pawn,2
20055,yrAas0Kj,True,1.499698e+12,1.499698e+12,35,mate,white,10+0,jamboger,1219,schaaksmurf3,1286,d4 d5 Bf4 Nc6 e3 Nf6 c3 e6 Nf3 Be7 Bd3 O-O Nbd...,D00,Queen's Pawn Game: Mason Attack,3
20056,b0v4tRyF,True,1.499696e+12,1.499697e+12,109,resign,white,10+0,marcodisogno,1360,jamboger,1227,e4 d6 d4 Nf6 e5 dxe5 dxe5 Qxd1+ Kxd1 Nd5 c4 Nb...,B07,Pirc Defense,4


# Preprocessing

In [5]:
data['winner'].unique()

array(['white', 'black', 'draw'], dtype=object)

In [6]:
moves = np.array(
    data.query("winner != 'draw'")['moves']
)

In [7]:
labels = data.query("winner != 'draw'")['winner'].apply(lambda x:1 if x=='white' else 0)

In [8]:
all_moves = set()
for move_list in moves:
    for move in move_list.split(" "):
        if move not in all_moves:
            all_moves.add(move)
            
max_vocab = len(all_moves)

In [9]:
max_vocab

4373

### Length of the longest sequence moves

In [10]:
max_len = 0
for move_list in moves:
    total = 0
    for move in move_list.split(" "):
        total+=1
    if total > max_len:
        max_len = total
        

In [11]:
max_len

349

## Create input vectors

In [12]:
tokenizer = Tokenizer(num_words = max_vocab)
tokenizer.fit_on_texts(moves)

sequences = tokenizer.texts_to_sequences(moves)

word_index = tokenizer.word_index


In [13]:
model_inputs  = pad_sequences(sequences,maxlen=max_len)

In [14]:
model_inputs.shape

(19108, 349)

In [15]:
labels.shape

(19108,)

# Training

In [16]:
train_inputs,test_inputs,train_labels,test_labels = tts(model_inputs,labels,train_size=0.7,random_state=24)

In [17]:
embedding_dim = 256
inputs = tf.keras.Input(shape = max_len)
embedding = tf.keras.layers.Embedding(
    input_dim = max_vocab,
    output_dim = embedding_dim,
    input_length = max_len
)(inputs)

gru = tf.keras.layers.GRU(units = embedding_dim)(embedding)

outputs = tf.keras.layers.Dense(1,activation = 'sigmoid')(gru)

model = tf.keras.Model(inputs = inputs,outputs = outputs)

In [18]:
model.compile(
    optimizer='adam',
    loss = 'binary_crossentropy',
    metrics = [
        'accuracy',
        tf.keras.metrics.AUC(name='auc')
    ]
)



In [19]:
batch_size = 32
epochs = 5

In [20]:
history = model.fit(
    train_inputs,
    train_labels,
    validation_split=0.2,
    batch_size = batch_size,
    epochs = epochs,
    callbacks = [tf.keras.callbacks.ReduceLROnPlateau()]
    
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [21]:
fig = px.line(
    history.history,
    y = ['loss','val_loss'],
    labels = {
        "x" : 'Epochs',
        "y" : 'Loss'
    },
    title = "Loss Over Time"
    

)
fig.show()

In [22]:
fig = px.line(
    history.history,
    y = ['auc','val_auc'],
    labels = {
        "x" : 'Epochs',
        "y" : 'Loss'
    },
    title = "AUC Over Time"
    

)
fig.show()

In [23]:
model.evaluate(test_inputs,test_labels)



[0.2964078485965729, 0.8812140226364136, 0.9520260691642761]