# Data Collection and Preprocessing

### Obtaining PGN of past games from players who are rated at expert level or higher using Chess.com API

In [14]:
import json
import pandas as pd

### Using Chess.com API endpoint to get list of usernames of players who are have an Elo rating of an expert level or higher (Titled Players).

- Grandmasters (GM): Usually 2500 or higher
- International Masters (IM): Usually between 2400 and 2500
- FIDE Master (FM): Usually betwenn 2300 and 2400
- FIDE Candidate Master (CM)/ National Master: Ususally between 2200 and 2300
- Expert / National Candidate Master: Between 2000 and 2200


In [19]:

from chessdotcom import get_titled_players

# List of titled player types
titles = ["GM", "IM", "FM", "CM"]

# Dictionary to store the results
titled_players_data = {}

# Loop through each title and get the corresponding players
for title in titles:
    response = get_titled_players(title).json
    titled_players_data[title] = response

# Save the combined data to a JSON file
with open("multiple_titled_players.json", "w") as file:
    json.dump(titled_players_data, file, indent=4)

print(response)


{'players': ['19andi73', '1b31-0', '1c4_1-0', '1gelm9ister', '2007checkmate', '20ofjuly', '21osakat', '2sks', '2typicalchessplayer', '5sight', 'a3aki', 'aanshnerurkar', 'aaponter', 'abbesali', 'abc_xyz4744', 'abelmat', 'abigailcabezas', 'abinesh12345', 'abrab_64', 'abuhanan', 'acertijo08', 'achalachess', 'acollins05', 'actuary44', 'adchek', 'adnan_habib', 'adreyd', 'adrian-thorsen', 'adrian_g19', 'advancedrook', 'advicecabinet', 'aguscm2018', 'agustin_meza', 'ahmedkandiliii', 'ahmedotaleb', 'ajedrez-facil', 'ajedrez1101', 'ajedrezinteligente', 'akanga001', 'akatsukiloghorizon', 'akhairat', 'akhandbharatkijay', 'akinov-akinseye', 'aklan10n', 'albert_kloc', 'alblooshi_hamad', 'albussevrespotter', 'aldebaran52', 'aldobaral56', 'alejanchess73', 'alejandrobalde', 'alejandrocapanegra', 'alejandropacheco29', 'aleksandar-topalov', 'alexandercs432', 'alexandrehouhou', 'alexjr20', 'alexppixe', 'alex_jose_iraeta', 'alex_yang', 'alfiefischer77', 'algatar2020', 'alisuma', 'alivodicdario', 'almaguer

### Creating Data Frame

In [59]:
data_usernames = []
for title, players in titled_players_data.items():
    for player in players:
        data_usernames.append({"Title": title, "Username": players[player]})

df = pd.DataFrame(data_usernames)
print(df.to_string(index=False))

Title                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

### Data Frame Analysis

In [66]:
print('General Information:')
print(df.info(verbose=True, ))
print()

# Number of username for each title
df['Username_Count'] = df['Username'].apply(len)
# Group by Title and sum the counts
title_counts = df.groupby('Title')['Username_Count'].sum()
print(title_counts)


General Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Title           4 non-null      object
 1   Username        4 non-null      object
 2   Username_Count  4 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 224.0+ bytes
None

Title
CM    1620
FM    3700
GM    1577
IM    2193
Name: Username_Count, dtype: int64
