In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%pylab inline

import gc

pd.set_option('display.max_columns', 200)

Populating the interactive namespace from numpy and matplotlib


In [3]:
BASE_PATH = "../Bases/"

In [4]:
data = pd.read_csv(BASE_PATH + 'games.csv', delimiter=',')

In [5]:
data.shape

(20058, 16)

In [7]:
data.head(3)

Unnamed: 0,id,rated,created_at,last_move_at,turns,victory_status,winner,increment_code,white_id,white_rating,black_id,black_rating,moves,opening_eco,opening_name,opening_ply
0,TZJHLljE,False,1504210000000.0,1504210000000.0,13,outoftime,white,15+2,bourgris,1500,a-00,1191,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...,D10,Slav Defense: Exchange Variation,5
1,l1NXvwaE,True,1504130000000.0,1504130000000.0,16,resign,black,5+10,a-00,1322,skinnerua,1261,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...,B00,Nimzowitsch Defense: Kennedy Variation,4
2,mIICvQHh,True,1504130000000.0,1504130000000.0,61,mate,white,5+10,ischia,1496,a-00,1500,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...,C20,King's Pawn Game: Leonardis Variation,3


In [17]:
data.dtypes

id                 object
rated                bool
created_at        float64
last_move_at      float64
turns               int64
victory_status     object
winner             object
increment_code     object
white_id           object
white_rating        int64
black_id           object
black_rating        int64
moves              object
opening_eco        object
opening_name       object
opening_ply         int64
dtype: object

In [18]:
X = data.drop(columns=["winner"])
y = data["winner"]

In [20]:
del data
gc.collect()

99

Opening names

In [23]:
opening_names = X[["opening_eco", "opening_name"]].drop_duplicates()

Save matches ids

In [25]:
ids = X["id"].copy()

## Feature Engineering

Here we:
- Do the imputations (not needed here, as we have no null values in the columns)
- Create new features based on the dataset we have

In [135]:
class FeatureEngineering:
    '''
    basic callable methods
    '''
    def __init__(self):
        self.imput_transform = None
        self.create_transform = None
    
    def fit_transform(self, X, y):
        # Fit the ColumnTransformers
        X = self.drop_useless_features(X)
        #X, y = self.imput_nulls(X, y)
        
        # Transform the features
        X, y = self.transform(X, y)
        return X, y
    
    def transform(self, X, y=None):
        X = self.create_features(X)
        return X, y
    
    '''
    auxiliary methods
    '''
    def drop_useless_features(X):
        cols_to_drop = [
            'id',
            'white_id',
            'black_id',
            'opening_name'
        ]
        return X.drop(columns=cols_to_drop)
    
    def imput_nulls(X, y):
        numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
        categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
        
        t_imput = [
            ('num', SimpleImputer(strategy='median'), numerical_ix),
            ('cat', SimpleImputer(strategy='most_frequent'), categorical_ix)
        ]
        self.imput_transform = ColumnTransformer(transformers=t_imput)
        return self.imput_transform.fit_transform(X, y)
    
    def create_features(X, y):
        t_create = [
            ('cat', OneHotEncoder(use_cat_names=True), ['victory_status', 'opening_eco']),
            #('num', MinMaxScaler(), numerical_ix)
        ]
        self.create_transform = ColumnTransformer(transformers=t_create)
        X, y = self.create_transform.fit_transform(X, y)
        
        X['created_at'] = pd.to_datetime(X['created_at'], unit='ms')
        X['last_move_at'] = pd.to_datetime(X['last_move_at'], unit='ms')
        X['duration_in_seconds'] = (X['last_move_at'] - X['created_at']) / 1000
        X['increment_code_minutes'] = X['increment_code'].str.split('+').map(lambda time_control: time_control[0], na_action=None).astype(int)
        X['increment_code_seconds'] = X['increment_code'].str.split('+').map(lambda time_control: time_control[1], na_action=None).astype(int)
        X['rating_difference'] = X['white_rating'] - X['black_rating']
        X = self.create_moves_features(period=3, end=15, X)
        X.drop(columns=['increment_code'], axis=1, inplace=True)
        return X
    
    def create_moves_features(period, end, X):
        pieces = ["R", "N", "B", "Q", "K"]
        board_columns = ["a", "b", "c", "d", "e", "f", "g", "h"]

        moves = match.str.split(" ").to_list()[0]
        moves_by_color = {
            "white": moves[::2],
            "black": moves[1::2]
        }
        
        for move_count in range(1, end, period):
            moves_by_color_until_move_count = {
                "white": white_moves[0:move_count],
                "black": black_moves[0:move_count]
            }
            
            for color in ["white", "black"]:
                X['#P_moves_{}_until_move{}'.format(color, move_count)] =  len([move for move in moves_by_color_until_move_count[color] if move.startswith(tuple(board_columns))])
                X['#center_P_moves_{}_until_move{}'.format(color, move)] = len([move for move in moves_by_color_until_move_count[color] if move.startswith(tuple(board_columns[3:5]))]) # 'd', 'e'
                X['#R_moves_{}_until_move{}'.format(color, move)] = len([move for move in moves_by_color_until_move_count[color] if move.startswith('R')])
                X['#N_moves_{}_until_move{}'.format(color, move)] = len([move for move in moves_by_color_until_move_count[color] if move.startswith('N')])
                X['#B_moves_{}_until_move{}'.format(color, move)] = len([move for move in moves_by_color_until_move_count[color] if move.startswith('B')])
                X['#Q_moves_{}_until_move{}'.format(color, move)] = len([move for move in moves_by_color_until_move_count[color] if move.startswith('Q')])
                X['#K_moves_{}_until_move{}'.format(color, move)] = len([move for move in moves_by_color_until_move_count[color] if (move.startswith('K') | move.startswith('O'))]) # "O" => castle
                X['castled_kingside_{}_until_move{}'.format(color, move)] = (len([move for move in moves_by_color_until_move_count[color] if move.startswith('O-O')]) == 1)
                X['castled_queenside_{}_until_move{}'.format(color, move)] = (len([move for move in moves_by_color_until_move_count[color] if move.startswith('O-O-O')]) == 1)
                X['castled_{}_until_move{}'.format(color, move)] = (X['castled_queenside_{}_until_move{}'.format(color, move)] | X['castled_kingside_{}_until_move{}'.format(color, move)])
                X['#takes_{}_until_move{}'.format(color, move)] = len([move for move in moves_by_color_until_move_count[color] if 'x' in move])
                X['#checks_{}_until_move{}'.format(color, move)] = len([move for move in moves_by_color_until_move_count[color] if '+' in move])
                final_moves_columns = pd.Series(moves_by_color_until_move_count[color]).str.split('#').apply(lambda x: x[0]).str.split('+').apply(lambda x: x[0]).apply(lambda x: x[-2]).to_list()
                X['#king_sided_moves_{}_until_move{}'.format(color, move)] = len([col for col in final_moves_columns if col in board_columns[4:]]) + (X['castled_kingside_{}_until_move{}'.format(color, move)] == 1).astype(np.int64)
                X['#queen_sided_moves_{}_until_move{}'.format(color, move)] = len([col for col in final_moves_columns if col in board_columns[:4]]) + (X['castled_queenside_{}_until_move{}'.format(color, move)] == 1).astype(np.int64)
                X['#centered_moves_{}_until_move{}'.format(color, move)] = len([col for col in final_moves_columns if col in board_columns[2:6]])
                final_moves_rows = pd.Series(moves_by_color_until_move_count[color]).str.split('#').apply(lambda x: x[0]).str.split('+').apply(lambda x: x[0]).apply(lambda x: x[-1]).astype(np.int64).to_list()
                if color == "white":
                    X['#upper_sided_moves_{}_until_move{}'.format(color, move)] = len([row for row in final_moves_rows if row in range(5,9)])
                    X['#down_sided_moves_{}_until_move{}'.format(color, move)] = len([row for row in final_moves_rows if row in range(1,5)]) + (X['castled_{}_until_move{}'.format(color, move)] == 1).astype(np.int64)
                else:
                    X['#upper_sided_moves_{}_until_move{}'.format(color, move)] = len([row for row in final_moves_rows if row in range(1,5)])
                    X['#down_sided_moves_{}_until_move{}'.format(color, move)] = len([row for row in final_moves_rows if row in range(5,9)]) + (X['castled_{}_until_move{}'.format(color, move)] == 1).astype(np.int64)
                X['#middle_moves_{}_until_move{}'.format(color, move)] = len([row for row in final_moves_rows if row in range(3,7)]) 
        return X
    
'''
- #P_moves{_white_3}
- #center_P_moves
- #R_moves
- #N_moves
- #B_moves
- #Q_moves
- #K_moves (castle included)
- queenside_castled
- kingside_castled
- castled
- #takes
- #checks
- #queen_sided_moves (moves between 'a' and 'd' files (included))
- #king_sided_moves (moves between 'e' and 'h' files (included))
- #centered_moves (moves between 'c' and 'f' files (included)) 
'''
        
# When modeling:
### in practice, we won't have the fields 'opening_ply', 'last_move_at' and 'turns' completely in a real match. We should have values for these variables in intermediate time of the match
### we should drop 'created_at', once we have created 'games_delay_in_sec'
### we should drop 'black_rating', once we have created 'rating_difference'
### OneHotEncoder should drop_invariant=True
### attention: 'victory_status' == 'outoftime' => may be a draw or a victory

SyntaxError: invalid syntax (<ipython-input-135-d2ac4faa683d>, line 101)

In [134]:
x = pd.Series(['Nfxf6+', 'Nf6+']).str.split('#').apply(lambda x: x[0]).str.split('+').apply(lambda x: x[0]).apply(lambda x: x[-1]).astype(np.int64).to_list()
print(x)
len([col for col in x if col in ['e', 'f', 'g']])

[6, 6]


0

In [139]:
row = 6
for row in range(5,9):
    print(row)

5
6
7
8


In [138]:
print(range(5,9))

range(5, 9)


In [130]:
board_columns = ["a", "b", "c", "d", "e", "f", "g", "h"]
tuple(board_columns[2:6])

('c', 'd', 'e', 'f')

In [90]:
'Nf6+'.split('#')[0].split('+')[0][-2]

'f'

In [31]:
match = X.sample(1, random_state=42)["moves"]

In [66]:
moves = match.str.split(" ").to_list()[0]
white_moves = turns[::2]
black_moves = turns[1::2]

In [75]:
white_moves

['d4',
 'Bf4',
 'e3',
 'Nc3',
 'Bc4',
 'dxe5',
 'Qxd8+',
 'O-O-O+',
 'Bg5',
 'Nd5+',
 'Nxf6',
 'Bxf6',
 'Bxg8',
 'Nf3',
 'Bh4',
 'a3',
 'h3',
 'axb4',
 'Bg3',
 'Rd2',
 'Rhd1',
 'exf4',
 'Bh4',
 'Ng5+',
 'Ne4+',
 'Nxd6',
 'Rxd6',
 'Rxc6',
 'Kxb2',
 'Kc3',
 'Kb4']

In [80]:
'x' in white_moves[5]

True

In [61]:
for move_count in range(1, 15, 3):
    print(move_count)

1
4
7
10
13


In [62]:
for color in ["white", "black"]:
    print(color)

white
black


In [59]:
pieces = ["R", "N", "B", "Q", "K"]
columns = ["a", "b", "c", "d", "e", "f", "g", "h"]

moves = match.str.split(" ").to_list()[0]
white_moves = turns[::2]
black_moves = turns[1::2]

# move 3
white_move3 = white_moves[0:3]
print(white_move3)
P_moves_white_move3 = [move for move in white_move3 if move.startswith(tuple(columns))]
qtt_P_moves_white_move3 = len(P_moves_white_move3)

'''
por cor, por periodo
- #P_moves{_white_3}
- #center_P_moves
- #R_moves
- #N_moves
- #B_moves
- #Q_moves
- #K_moves (castle included)
- queenside_castled
- kingside_castled
- castled
- #takes
- #checks
- #queen_sided_moves (moves between 'a' and 'd' files (included))
- #king_sided_moves (moves between 'e' and 'h' files (included))
- #centered_moves (moves between 'c' and 'f' files (included))

Suggestions: Change notation to PxC ==> take into account the material
'''

['d4', 'Bf4', 'e3']
['d4', 'e3']


"\npor cor, por periodo\n- #P_moves{_white_3}\n- #center_P_moves\n- #R_moves\n- #N_moves\n- #B_moves\n- #Q_moves\n- #K_moves (castle included)\n- queenside_castled\n- kingside_castled\n- castled\n- #takes\n- #checks\n- #queen_sided_moves (moves between 'a' and 'd' files (included))\n- #king_sided_moves (moves between 'e' and 'h' files (included))\n- #centered_moves (moves between 'c' and 'f' files (included))\n\nSuggestions: Change notation to PxC ==> take into account the material\n"

In [None]:
'''        
rated                bool
created_at        float64
last_move_at      float64
turns               int64
victory_status     object
increment_code     object
white_rating        int64
black_rating        int64
moves              object
opening_eco        object
opening_ply         int64

winner             object
'''

In [14]:
fe = FeatureEngineering()