# Data Collection and Preprocessing

### Obtaining PGN of past games from players who are rated at expert level or higher using Chess.com API

In [76]:
import json
import pandas as pd

### Using the Chess.com API endpoint to get the list of usernames of players who have an Elo rating of expert level or higher (Titled Players).

- Grandmasters (GM): Usually 2500 or higher
- International Masters (IM): Usually between 2400 and 2500
- FIDE Master (FM): Usually between 2300 and 2400
- FIDE Candidate Master (CM) / National Master: Usually between 2200 and 2300
- Expert / National Candidate Master: Between 2000 and 2200


In [77]:

from chessdotcom import get_titled_players

# Chess.com title codes whose member lists are downloaded
titles = ["GM", "IM", "FM", "CM"]

# title code -> value of the `.json` attribute of the API response for that title
titled_players_data = {}

for title in titles:
    # `response` stays bound to the most recent payload so it can be shown below
    titled_players_data[title] = response = get_titled_players(title).json

# Persist every payload in a single JSON document
with open("multiple_titled_players.json", "w") as file:
    file.write(json.dumps(titled_players_data, indent=4))

# Show the payload fetched last (the final entry of `titles`)
print(response)


{'players': ['19andi73', '1b31-0', '1c4_1-0', '1gelm9ister', '2007checkmate', '20ofjuly', '21osakat', '2sks', '2typicalchessplayer', '5sight', 'a3aki', 'aanshnerurkar', 'aaponter', 'abbesali', 'abc_xyz4744', 'abelmat', 'abigailcabezas', 'abinesh12345', 'abrab_64', 'abuhanan', 'acertijo08', 'achalachess', 'acollins05', 'actuary44', 'adchek', 'adnan_habib', 'adreyd', 'adrian-thorsen', 'adrian_g19', 'advancedrook', 'advicecabinet', 'aguscm2018', 'agustin_meza', 'ahmedkandiliii', 'ahmedotaleb', 'ajedrez-facil', 'ajedrez1101', 'ajedrezinteligente', 'akanga001', 'akatsukiloghorizon', 'akhairat', 'akhandbharatkijay', 'akinov-akinseye', 'aklan10n', 'albert_kloc', 'alblooshi_hamad', 'albussevrespotter', 'aldebaran52', 'aldobaral56', 'alejanchess73', 'alejandrobalde', 'alejandrocapanegra', 'alejandropacheco29', 'aleksandar-topalov', 'alexandercs432', 'alexandrehouhou', 'alexjr20', 'alexppixe', 'alex_jose_iraeta', 'alex_yang', 'alfiefischer77', 'algatar2020', 'alisuma', 'alivodicdario', 'almaguer

### Creating Data Frame

In [78]:
# Build one record per (title, payload key). Each API payload is a dict of the
# form {"players": [...]}, so the "Username" cell of a row holds the WHOLE list
# of usernames for that title, not a single name. The frame therefore has one
# row per title; the analysis cell relies on this (it takes len() of each list).
data_usernames = []
for title, payload in titled_players_data.items():
    for payload_key, usernames in payload.items():
        data_usernames.append({"Title": title, "Username": usernames})

df = pd.DataFrame(data_usernames)

# to_string() applies no column-width limit by default, which would print every
# username of every title on one line. Cap the width so the preview is readable.
print(df.to_string(index=False, max_colwidth=80))

Title                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

### Data Frame Analysis

In [79]:
print('General Information:')
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that printed a stray "None" after the report).
df.info(verbose=True)
print()

# Number of usernames for each title: every "Username" cell holds a list of
# usernames, so its length is the number of players in that row.
df['Username_Count'] = df['Username'].apply(len)
# Group by Title and sum the counts
title_counts = df.groupby('Title')['Username_Count'].sum()
print(title_counts)
print()

# Total number of usernames across all titles
total_usernames = title_counts.sum()
print(f'Total number of usernames: {total_usernames}')


General Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Title     4 non-null      object
 1   Username  4 non-null      object
dtypes: object(2)
memory usage: 192.0+ bytes
None

Title
CM    1620
FM    3700
GM    1577
IM    2193
Name: Username_Count, dtype: int64

Total number of usernames: 9090


### Return object containing a list of live and daily Chess games that a player has finished. (Titled GM Players)

In [124]:
from chessdotcom import get_player_games_by_month

# username -> value of the `.json` attribute of that player's monthly games response
player_games_by_month = {}

# NOTE: the "GM" entry is itself a dict, so `title` here is a payload key and
# `players` is the value stored under it (iterated as usernames below).
for title, players in titled_players_data["GM"].items():
    for player in players:
        # Games the player finished in August 2024
        player_games_by_month[player] = response = get_player_games_by_month(
            player, year=2024, month=8
        ).json

# Save everything that was collected as one JSON document
with open("player_games_by_month_GM.json", "w") as file:
    file.write(json.dumps(player_games_by_month, indent=4))

# Show the payload fetched last
print(response)

     

{'games': [{'url': 'https://www.chess.com/game/live/116984372449', 'pgn': '[Event "Live Chess"]\n[Site "Chess.com"]\n[Date "2024.08.09"]\n[Round "-"]\n[White "lucliasco"]\n[Black "zvonokchess1996"]\n[Result "1-0"]\n[CurrentPosition "r3k2r/pp1b1p2/3Pp2p/4Nnp1/4N1P1/P7/1PP2P1P/2KR3R b kq -"]\n[Timezone "UTC"]\n[ECO "A46"]\n[ECOUrl "https://www.chess.com/openings/Indian-Game-Spielmann-Indian-Variation"]\n[UTCDate "2024.08.09"]\n[UTCTime "23:13:09"]\n[WhiteElo "2851"]\n[BlackElo "3004"]\n[TimeControl "60"]\n[Termination "lucliasco won by resignation"]\n[StartTime "23:13:09"]\n[EndDate "2024.08.09"]\n[EndTime "23:14:19"]\n[Link "https://www.chess.com/game/live/116984372449"]\n\n1. d4 {[%clk 0:00:59.9]} 1... Nf6 {[%clk 0:00:58.5]} 2. Nf3 {[%clk 0:00:59.8]} 2... c5 {[%clk 0:00:58]} 3. Bg5 {[%clk 0:00:59.4]} 3... Ne4 {[%clk 0:00:56.4]} 4. Bf4 {[%clk 0:00:57.4]} 4... cxd4 {[%clk 0:00:55.6]} 5. Qxd4 {[%clk 0:00:56.4]} 5... Nf6 {[%clk 0:00:54.7]} 6. Bg5 {[%clk 0:00:56]} 6... Nc6 {[%clk 0:00:53.8]

### JSON Cleaning: 
- Removing all unnecessary fields from data collected through player games by month
- Using a regex filter to clean up 'pgn' field to only show the moves played.
- Only take game information for Blitz time class
- Remove players if they have no game information

In [133]:
import re

def clean_pgn(pgn):
    """Return only the movetext of a PGN string, dropping the tag section.

    The movetext is taken to start at the first "1." that opens a line. Tag
    lines start with "[", so a "1." inside a tag value (for example the date
    "2021.01.11") is not mistaken for the first move.

    Args:
        pgn: PGN text, with or without the leading tag section.

    Returns:
        The substring from the first move number to the end of the text. If no
        "1." occurs at all, the input is returned unchanged.
    """
    # Preferred: a "1." at the start of a line (optionally indented).
    match = re.search(r'^[ \t]*(1\..*)', pgn, re.DOTALL | re.MULTILINE)
    if match:
        return match.group(1)

    # Fallback for text whose first move is not at a line start: keep the
    # earlier behaviour of cutting at the first "1." found anywhere.
    match = re.search(r'1\..*', pgn, re.DOTALL)
    return match.group(0) if match else pgn

# Load the raw per-player monthly game payloads saved by the collection step
with open('player_games_by_month_GM.json', 'r') as file:
    data = json.load(file)

# username -> {"games": [...]} holding only that player's blitz games
cleaned_data = {}

for player, details in data.items():
    # A payload without a "games" key, or with an empty list, contributes nothing
    games = details["games"] if "games" in details and details["games"] else []

    blitz_games = [
        {
            # Keep only the fields needed downstream; reduce the PGN to its moves
            "pgn": clean_pgn(game.get("pgn", "")),
            "initial_setup": game.get("initial_setup"),
            "fen": game.get("fen"),
            "white": game.get("white"),
            "black": game.get("black"),
        }
        for game in games
        # .get() so that a record lacking "time_class" is skipped rather than
        # aborting the whole run with a KeyError
        if game.get("time_class") == "blitz"
    ]

    # Players with no blitz games are left out entirely
    if blitz_games:
        cleaned_data[player] = {"games": blitz_games}

with open('cleaned_player_games_by_month.json', 'w') as file:
    json.dump(cleaned_data, file, indent=4)