In [123]:
import pandas as pd 
import numpy as np
import xml.etree.ElementTree as ET
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager
import re

In [151]:

CHROMEDRIVER_PATH = '/usr/local/bin/chromedriver'
INPUT_FILE = 'players_list_xml_foa.xml'
OUTPUT_FILE = 'updated_titled_otb_df.csv'

In [202]:
def init_driver():
    global driver
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    # service = Service(ChromeDriverManager().install())
    
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)

init_driver()

In [None]:
file_path = INPUT_FILE

tree = ET.parse(file_path)
root = tree.getroot()

data = []
for player in root.findall('player'):
    player_data = {
        "fideid": player.find("fideid").text,
        "name": player.find("name").text,
        "country": player.find("country").text,
        "sex": player.find("sex").text,
        "title": player.find("title").text,
        "w_title": player.find("w_title").text,
        "o_title": player.find("o_title").text,
        "foa_title": player.find("foa_title").text,
        "standart_rating": player.find("rating").text,
        "rapid_rating": player.find("rapid_rating").text,
        "blitz_rating": player.find("blitz_rating").text,
        "birthday": player.find("birthday").text,
        "flag": player.find("flag").text
    }
    data.append(player_data)

df = pd.DataFrame(data)

In [None]:
df.head()

In [None]:
df.fillna({"title": "N/A", "w_title": "N/A", "o_title": "N/A", "foa_title": "N/A", "flag": "N/A"}, inplace=True)

In [None]:
df.to_csv("fide_players.csv", index=False)

In [None]:
df = pd.read_csv("fide_players.csv", keep_default_na=False, index_col=False)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
titled_otb_df = df[
    (df['title'].ne("N/A")) |
    (df['w_title'].ne("N/A"))
]

In [None]:
titled_otb_df.shape

In [None]:
titled_otb_df.head()

In [None]:
titled_otb_df.to_csv("titled_fide_players_unscraped.csv", index=False)

In [23]:
titled_otb_df = pd.read_csv("titled_fide_players_unscraped.csv", keep_default_na=False, index_col=False)

In [24]:
titled_otb_df = titled_otb_df.copy()

unique_titles = titled_otb_df.loc[titled_otb_df['title'] != "N/A", 'title'].dropna().unique().tolist()
unique_w_titles = titled_otb_df.loc[titled_otb_df['w_title'] != "N/A", 'w_title'].dropna().unique().tolist()

all_unique_titles = unique_titles + unique_w_titles

titled_otb_df['is_scraped'] = False

for title in all_unique_titles:
    titled_otb_df.loc[:, title] = titled_otb_df['title'].eq(title) | titled_otb_df['w_title'].eq(title)
    titled_otb_df.loc[:, f"{title}_year"] = 0  

In [196]:
def extract_short_title(title):
    match = re.search(r'\((.*?)\)', title)
    return match.group(1) if match else title.strip()

def scrape_player(fide_id, scrape_game_results=False):
    titles = {}
    game_results = {
        "white_results": {"win": 0, "draw": 0, "loss": 0},
        "black_results": {"win": 0, "draw": 0, "loss": 0},
    }

    try:
        profile_url = f"https://ratings.fide.com/profile/{fide_id}"
        driver.get(profile_url)
        titles_table = driver.find_element(By.XPATH, "//th[text()='Titles']/ancestor::table")
        titles_rows = titles_table.find_elements(By.XPATH, ".//tr")[1:]
        for row in titles_rows:
            columns = row.find_elements(By.TAG_NAME, "td")
            if len(columns) == 2:
                full_title = columns[0].text.strip()
                year = columns[1].text.strip()
                # print(full_title, year)
                short_title = extract_short_title(full_title)
                titles[short_title] = year
    except Exception:
        pass  

    if scrape_game_results:
        retry_attempts = 3
        for attempt in range(retry_attempts):
            try:
                stats_url = f"https://ratings.fide.com/profile/{fide_id}/statistics"
                driver.get(stats_url)
                driver.implicitly_wait(100)
                
                charts_data = driver.execute_script(
                    """
                    var charts = Chart.instances;
                    return [0, 1].map(index => {
                        var chart = charts[index];
                        return {
                            labels: chart.data.labels,
                            datasets: chart.data.datasets.map(dataset => dataset.data)
                        };
                    });
                    """
                )

                white_stats = charts_data[0]
                black_stats = charts_data[1]

                if all(value == 0 for value in white_stats["datasets"][0]) and \
                   all(value == 0 for value in black_stats["datasets"][0]):
                    if attempt < retry_attempts - 1:
                        print(f"Retrying ({attempt + 1}) for FIDE ID: {fide_id}")
                        continue  

                for label, value in zip(white_stats["labels"], white_stats["datasets"][0]):
                    if "Win" in label:
                        game_results["white_results"]["win"] = int(value)
                    elif "Draw" in label:
                        game_results["white_results"]["draw"] = int(value)
                    elif "Loss" in label:
                        game_results["white_results"]["loss"] = int(value)

                for label, value in zip(black_stats["labels"], black_stats["datasets"][0]):
                    if "Win" in label:
                        game_results["black_results"]["win"] = int(value)
                    elif "Draw" in label:
                        game_results["black_results"]["draw"] = int(value)
                    elif "Loss" in label:
                        game_results["black_results"]["loss"] = int(value)

                break 
            except Exception:
                if attempt == retry_attempts - 1:
                    pass  

    return {"fide_id": fide_id, "titles": titles, "game_results": game_results}

def scrape_players_batch(fide_ids, scrape_game_results=True):
    scraped_data = []
    for fide_id in tqdm(fide_ids, desc="Scraping Players [Batch]"):
        player_data = scrape_player(fide_id, scrape_game_results)
        scraped_data.append(player_data)
    return scraped_data

def process_scraped_data(df, scraped_data):
    for record in scraped_data:
        fide_id = record["fide_id"]
        titles = record["titles"]
        print(titles)
        game_results = record["game_results"]

        for title, year in titles.items():
            title_col = f"{title}_year"
            try:
                year = int(year) if year.strip().isdigit() else 0
                print("year = ", year)
            except ValueError:
                year = 0

            if title_col not in df.columns:
                df[title_col] = pd.NA
            df.loc[df["fideid"] == fide_id, title_col] = year
            df.loc[df["fideid"] == fide_id, title] = True
            print(df.loc[df["fideid"] == fide_id, title_col])

        df.loc[df["fideid"] == fide_id, "white_win"] = game_results["white_results"]["win"]
        df.loc[df["fideid"] == fide_id, "white_draw"] = game_results["white_results"]["draw"]
        df.loc[df["fideid"] == fide_id, "white_loss"] = game_results["white_results"]["loss"]
        df.loc[df["fideid"] == fide_id, "black_win"] = game_results["black_results"]["win"]
        df.loc[df["fideid"] == fide_id, "black_draw"] = game_results["black_results"]["draw"]
        df.loc[df["fideid"] == fide_id, "black_loss"] = game_results["black_results"]["loss"]

        df.loc[df["fideid"] == fide_id, "is_scraped"] = True



def start_scraping_process(df, start_row=0, scrape_game_results=True, reset=False, scrape_inconsistent_only=False, fide_ids=None):
    if fide_ids:
        rows_to_scrape = df[df["fideid"].isin(fide_ids)]
    elif reset:
        df["is_scraped"] = False  
        rows_to_scrape = df
    elif scrape_inconsistent_only:
        rows_to_scrape = df[(df["is_inconsistent"] == True)]
    else:
        rows_to_scrape = df[(df.index >= start_row) & (df["is_scraped"] == False)]

    fide_ids = rows_to_scrape["fideid"].tolist()
    total_records = len(fide_ids)

    print(f"Total records to scrape: {total_records}")

    if total_records == 0:
        print("No records to scrape.")
        return

    try:
        with tqdm(total=total_records, desc="Scraping Players [Batch]") as pbar:
            batch_size = 50
            for i in range(0, len(fide_ids), batch_size):
                batch_fide_ids = fide_ids[i:i + batch_size]
                scraped_data = scrape_players_batch(batch_fide_ids, scrape_game_results)
                process_scraped_data(df, scraped_data)
                pbar.update(len(batch_fide_ids))
    finally:
        driver.quit()


In [203]:
start_scraping_process(titled_otb_df, scrape_game_results=False, reset=False, scrape_inconsistent_only=True, fide_ids=[1407589, 25678191])

Total records to scrape: 2


Scraping Players [Batch]: 100%|██████████| 2/2 [00:02<00:00,  1.34s/it]
Scraping Players [Batch]: 100%|██████████| 2/2 [00:02<00:00,  1.34s/it]

{'IM': '2016', 'FM': '2009'}
year =  2016
1    2016
Name: IM_year, dtype: object
year =  2009
1    2009
Name: FM_year, dtype: object
{'IM': '2023', 'CM': '2020'}
year =  2023
2    2023
Name: IM_year, dtype: object
year =  2020
2    2020
Name: CM_year, dtype: object





In [97]:
def find_inconsistent_titles(df, flag=True):
    titles = ["IM", "WFM", "WCM", "GM", "WIM", "FM", "CM", "WGM", "WH"]
    
    for title in titles:
        year_column = f"{title}_year"
        if year_column in df.columns:
            df[year_column] = pd.to_numeric(df[year_column], errors="coerce").fillna(0).astype(int)

    if flag:
        df["is_inconsistent"] = False  

    inconsistent_records = []

    for title in titles:
        year_column = f"{title}_year"

        if year_column in df.columns:
            mask_true_year_zero = (df[title]) & (df[year_column] == 0)
            if flag:
                df.loc[mask_true_year_zero, "is_inconsistent"] = True
            else:
                inconsistent_records.append(df[mask_true_year_zero])

            mask_false_valid_year = (df[title] == False) & (df[year_column] > 0)
            if flag:
                df.loc[mask_false_valid_year, "is_inconsistent"] = True
            else:
                inconsistent_records.append(df[mask_false_valid_year])

    if flag:
        return df 
    else:
        return pd.concat(inconsistent_records).drop_duplicates().reset_index(drop=True) if inconsistent_records else pd.DataFrame()


In [105]:
titled_otb_df = find_inconsistent_titles(titled_otb_df,flag=True)

In [40]:
titled_otb_df.shape

(22557, 38)

In [199]:
scrape_player("1407589")

{'fide_id': '1407589',
 'titles': {},
 'game_results': {'white_results': {'win': 0, 'draw': 0, 'loss': 0},
  'black_results': {'win': 0, 'draw': 0, 'loss': 0}}}

In [106]:
titled_otb_df[titled_otb_df["is_inconsistent"]==True]

Unnamed: 0,fideid,name,country,sex,title,w_title,o_title,foa_title,standart_rating,rapid_rating,blitz_rating,birthday,flag,is_scraped,IM,IM_year,WFM,WFM_year,WCM,WCM_year,GM,GM_year,WIM,WIM_year,FM,FM_year,CM,CM_year,WGM,WGM_year,WH,WH_year,white_win,white_draw,white_loss,black_win,black_draw,black_loss,is_inconsistent
1,1407589,"Aabling-Thomsen, Jakob",DEN,M,IM,,,,2327,0,0,1985,,True,True,2016,False,0,False,0,False,0,False,0,False,2009,False,0,False,0,False,0,0.0,0.0,0.0,0.0,0.0,0.0,True
2,25678191,Aaditya Dhingra,IND,M,IM,,,,2421,2208,2268,2006,,True,True,2023,False,0,False,0,False,0,False,0,False,0,False,2020,False,0,False,0,0.0,0.0,0.0,0.0,0.0,0.0,True
5,1401815,"Aagaard, Jacob",DEN,M,GM,,,,2435,2502,2509,1973,,True,False,1997,False,0,False,0,True,2007,False,0,False,0,False,0,False,0,False,0,0.0,0.0,0.0,0.0,0.0,0.0,True
7,25644394,Aakash G,IND,M,FM,,,,2196,2139,1964,2010,,True,False,0,False,0,False,0,False,0,False,0,True,2024,False,2024,False,0,False,0,0.0,0.0,0.0,0.0,0.0,0.0,True
12,45059756,Aarav Dengla,IND,M,IM,,,,2366,2136,2290,2009,,True,True,2024,False,0,False,0,False,0,False,0,False,2022,False,0,False,0,False,0,0.0,0.0,0.0,0.0,0.0,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22552,24150789,"Zykina, Nadezhda",RUS,F,WFM,WFM,,,2148,2050,2055,1956,wi,True,False,0,True,0,False,0,False,0,False,0,False,0,False,0,False,0,False,0,0.0,0.0,0.0,0.0,0.0,0.0,True
22553,1102338,"Zyla, Janusz",POL,M,FM,,,,2181,2123,2182,1956,i,True,False,0,False,0,False,0,False,0,False,0,True,0,False,0,False,0,False,0,0.0,0.0,0.0,0.0,0.0,0.0,True
22554,1189654,"Zylka, Stanislaw",POL,M,IM,,,,2357,2281,2291,1999,,True,True,0,False,0,False,0,False,0,False,0,False,0,False,0,False,0,False,0,0.0,0.0,0.0,0.0,0.0,0.0,True
22555,4700295,"Zymberi, Astrit",KOS,M,FM,,FI,,2059,2117,0,1974,,True,False,0,False,0,False,0,False,0,False,0,True,0,False,0,False,0,False,0,0.0,0.0,0.0,0.0,0.0,0.0,True


In [None]:
titled_otb_df[titled_otb_df["is_scraped"] == True].shape

In [None]:
titled_otb_df[titled_otb_df["is_scraped"] == False].shape

In [9]:
titled_otb_df_final = titled_otb_df.drop(columns=["white_win",	"white_draw",	"white_loss",	"black_win",	"black_draw",	"black_loss", "is_scraped"]).copy()

In [10]:
pd.set_option('display.max_columns', None)
titled_otb_df_final.head()

Unnamed: 0,fideid,name,country,sex,title,w_title,o_title,foa_title,standart_rating,rapid_rating,blitz_rating,birthday,flag,IM,IM_year,WFM,WFM_year,WCM,WCM_year,GM,GM_year,WIM,WIM_year,FM,FM_year,CM,CM_year,WGM,WGM_year,WH,WH_year
0,1701991,"Aaberg, Anton",SWE,M,IM,,,,2322,2331,0,1972,,True,2013,False,0,False,0,False,0,False,0,False,0,False,0,False,0,False,0
1,1407589,"Aabling-Thomsen, Jakob",DEN,M,IM,,,,2327,0,0,1985,,True,2016,False,0,False,0,False,0,False,0,False,2009,False,0,False,0,False,0
2,25678191,Aaditya Dhingra,IND,M,IM,,,,2421,2208,2268,2006,,True,2023,False,0,False,0,False,0,False,0,False,0,False,2020,False,0,False,0
3,25778293,Aadya Gupta,IND,F,WFM,WFM,,,1968,1748,1888,2010,w,False,0,True,2024,False,0,False,0,False,0,False,0,False,0,False,0,False,0
4,25991426,Aadya Ranganath,IND,F,WCM,WCM,,,1900,1704,1747,2013,w,False,0,False,0,True,2025,False,0,False,0,False,0,False,0,False,0,False,0


In [11]:
titled_otb_df_final.to_csv(OUTPUT_FILE, index=False)