## Import Packages and Data Set

In [None]:
import chess.pgn
import numpy as np
import pandas as pd
import pickle

In [None]:
# Open the PGN file as text; bytes that are not valid UTF-8 are skipped
pgn_filename = "lichess_db_standard_rated_2014-08.pgn"
games = open(pgn_filename, encoding="utf8", errors='ignore')

<br>

## Data Cleaning and Filtering

In [None]:
# Read every game from the PGN stream into a dict keyed "Game1", "Game2", ...
# Each value holds the game's PGN headers plus the move text and a move count.
result = {}
i = 0
try:
    while True:
        i += 1
        game = chess.pgn.read_game(games)
        # read_game() returns None once the stream is exhausted
        if game is None:
            break

        headers = dict(game.headers)
        headers["moves"] = game.board().variation_san(game.mainline_moves())
        # Counts the periods in the SAN text as a proxy for the number of
        # full moves (presumably one "N." marker per move pair -- confirm)
        headers["move_num"] = headers["moves"].count('.')

        result["Game{}".format(i)] = headers
finally:
    # The stream is fully consumed (or parsing failed), so release the handle
    games.close()

In [None]:
# Build a data frame with one row per game
# (the dict is keyed by game, so games start out as columns and are
# transposed into rows)
results = pd.DataFrame(result)
df = results.T
df.info()

In [None]:
# Derive the winner from the PGN Result header:
# '1-0' -> white, '0-1' -> black, anything else -> draw
outcome = df['Result']
df['winner'] = np.select(
    [outcome == '1-0', outcome == '0-1'],
    ['white', 'black'],
    default='draw',
)

In [None]:
# Keep only decisive games (drop every row whose winner is 'draw')
is_decisive = df['winner'] != 'draw'
chessdf = df[is_decisive]
chessdf.head()

In [None]:
# Persist the decisive-games frame for Aug 2014 to disk
with open('2014_08_chess_data.pickle', 'wb') as pickle_file:
    pickle.dump(chessdf, pickle_file)

In [None]:
# Drop games where either player's Elo rating is the placeholder '?'
white_rated = chessdf['WhiteElo'] != '?'
has_white_elo = chessdf[white_rated]
black_rated = has_white_elo['BlackElo'] != '?'
has_black_elo = has_white_elo[black_rated]

In [None]:
# Keep only the columns used from here on
needed_columns = ['BlackElo', 'WhiteElo', 'moves', 'move_num', 'winner']
chess_games = has_black_elo[needed_columns]
chess_games.head()

In [None]:
# Create int variables for Elo ratings.
# assign() returns a new frame, so the columns are not written onto a
# frame derived from has_black_elo (which triggers pandas'
# SettingWithCopyWarning).
chess_games = chess_games.assign(
    Black_Elo_Num=chess_games['BlackElo'].astype(int),
    White_Elo_Num=chess_games['WhiteElo'].astype(int),
)
chess_games.info()

In [None]:
# Find scores in top 0.1%
# Restrict to the integer Elo columns: the frame also holds string columns
# (moves, winner, raw Elo text), which quantile() cannot process when
# numeric_only is False (the default since pandas 2.0).
chess_games[['Black_Elo_Num', 'White_Elo_Num']].quantile(.999, axis = 0)
# Rounded up to 2400

In [None]:
# Keep only games in which both players are rated 2400 or higher
white_is_2400_plus = chess_games['White_Elo_Num'] >= 2400
black_is_2400_plus = chess_games['Black_Elo_Num'] >= 2400
over_2400 = chess_games[white_is_2400_plus & black_is_2400_plus]
over_2400.info()

In [None]:
# Serialize the 2400+ subset with pickle
with open('2014_08_over_2400_chess_data.pickle', 'wb') as pickle_file:
    pickle.dump(over_2400, pickle_file)

In [None]:
# Export the 2400+ subset to a CSV file
over_2400.to_csv(path_or_buf='chess_0814_over_2400.csv')