In [1]:
import json
import sys
import xml.etree.ElementTree as ET

import pandas as pd
from tqdm import tqdm

PATH_TO_UTILS = "../../"
sys.path.append(PATH_TO_UTILS)

from utils import check_mf_formatting

In [5]:
# Load the Wikidata query result (a JSON list with one dict per chess player).
# The context manager closes the file even if json.load raises, and the
# explicit encoding keeps non-ASCII player names from depending on the
# platform's default codec.
with open("wikidata_data_dobs.json", "r", encoding="utf-8") as f:
    wikidata_players_list = json.load(f)

In [6]:
# Inspect the first Wikidata record to see which keys each entry carries.
wikidata_players_list[0]

{'item': 'http://www.wikidata.org/entity/Q66850',
 'itemLabel': 'David Baramidze',
 'fideID': '4667719',
 'dateOfBirth': '1988-09-27T00:00:00Z',
 'dobStatementGUID': 'q66850-0808D72D-7D0F-4F17-814F-F4D77C633ACE'}

In [7]:
def xml_to_dict(xml_file):
    """Parse a players-list XML file into a dictionary keyed by FIDE id.

    The file is read incrementally with ``ET.iterparse`` so it never has to be
    loaded as a whole tree.

    Args:
        xml_file: Path or file object accepted by ``ET.iterparse``.

    Returns:
        dict: Maps each player's ``fideid`` text to a dict of
        ``{child tag: stripped text}`` for that ``<player>`` element. A child
        with no text is stored as ``None``. Players without a non-empty
        ``fideid`` are skipped.
    """
    players_data = {}

    # None while the parser is outside any <player> element. This keeps
    # elements such as the document root from being written into the dict of
    # the most recently stored player.
    current_player = None

    # Parse the XML file incrementally and show a progress bar while doing so.
    context = ET.iterparse(xml_file, events=("start", "end"))
    progress = tqdm(context, desc="Parsing XML", unit="elements", leave=False)

    for event, elem in progress:
        if elem.tag == "player":
            if event == "start":
                # Start of a player element: begin a fresh record.
                current_player = {}
            else:
                # End of a player element: store the record if it has an id.
                player_id = current_player.get("fideid")
                if player_id:
                    players_data[player_id] = current_player
                current_player = None
                # Drop the finished player's children so the partially built
                # tree does not keep every parsed element alive.
                elem.clear()

        elif event == "end" and current_player is not None:
            # End of an element inside a player: record its text.
            current_player[elem.tag] = (
                elem.text.strip() if elem.text is not None else None
            )
            elem.clear()

    return players_data


# Parse the FIDE players list; fide_dict maps FIDE id -> player record dict.
xml_file = "players_list_xml_foa.xml"
fide_dict = xml_to_dict(xml_file)

                                                           

In [8]:
# Number of distinct FIDE ids parsed from the XML file.
len(fide_dict)

1390912

In [9]:
# Spot-check one FIDE record, looked up by its FIDE id (ids are strings).
fide_dict["1503014"]

{'fideid': '1503014',
 'name': 'Carlsen, Magnus',
 'country': 'NOR',
 'sex': 'M',
 'title': 'GM',
 'w_title': None,
 'o_title': None,
 'foa_title': None,
 'rating': '2830',
 'games': '0',
 'k': '10',
 'rapid_rating': '2823',
 'rapid_games': '0',
 'rapid_k': '10',
 'blitz_rating': '2886',
 'blitz_games': '0',
 'blitz_k': '10',
 'birthday': '1990',
 'flag': None}

In [10]:
# Inspect another Wikidata record (index 90) as a second sample.
wikidata_players_list[90]

{'item': 'http://www.wikidata.org/entity/Q98881',
 'itemLabel': 'Reinhart Fuchs',
 'fideID': '4611357',
 'dateOfBirth': '1934-09-28T00:00:00Z',
 'dobStatementGUID': 'Q98881-540E8F90-B4F0-4222-AD2E-958E7394F8B7'}

In [11]:
# Compare the birth year stored on Wikidata with the birth year in the FIDE
# list and collect every disagreement as one row of `df`.
#
# Side outputs:
#   invalid_fide.txt    - Wikidata entries whose FIDE id is absent from fide_dict
#   incomplete_data.txt - entries whose FIDE record has no birthday
#
# Each Wikidata entry is expected to carry the keys "item", "fideID",
# "dateOfBirth" and "dobStatementGUID"; a missing key raises KeyError instead
# of being silently dropped.
lst = []
with open("invalid_fide.txt", "w") as f, open("incomplete_data.txt", "w") as f1:
    for e in wikidata_players_list:
        fide_id = e["fideID"]

        fide_record = fide_dict.get(fide_id)
        if fide_record is None:
            f.write("FIDE id not found %s\n" % fide_id)
            f.write("WIKIDATA: %s\n" % e["item"])
            continue

        fide_birth_year = fide_record.get("birthday")
        if not fide_birth_year:
            # Nothing to compare against: FIDE has no birthday for this player.
            f1.write("Birthday not found on FIDE for FIDE id %s\n" % fide_id)
            f1.write("WIKIDATA: %s\n" % e["item"])
            continue

        # FIDE only gives a year, so compare it with the year part (first four
        # characters) of the Wikidata timestamp.
        if e["dateOfBirth"][:4] == fide_birth_year:
            continue

        lst.append(
            {
                # The item id is the last path segment of the entity URL.
                "item_id": e["item"].rsplit("/", 1)[-1],
                "property_id": "P569",
                "statement_guid": e["dobStatementGUID"],
                "wikidata_value": e["dateOfBirth"],
                "meta_wikidata_value": "Q1985727",
                "external_value": fide_birth_year,
                "external_url": "https://ratings.fide.com/profile/" + fide_id,
                "type": "statement",
            }
        )

# Number of mismatches found.
count = len(lst)

df = pd.DataFrame(
    lst,
    columns=[
        "item_id",
        "statement_guid",
        "property_id",
        "wikidata_value",
        "meta_wikidata_value",
        "external_value",
        "external_url",
        "type",
    ],
)

In [12]:
# Display the table of birth-year mismatches.
df

Unnamed: 0,item_id,statement_guid,property_id,wikidata_value,meta_wikidata_value,external_value,external_url,type
0,Q77168,Q77168-37C631A8-7BE2-414A-A806-585C1BEE1EFB,P569,1971-05-01T00:00:00Z,Q1985727,1979,https://ratings.fide.com/profile/1004816,statement
1,Q103301,Q103301-1342BD85-9F9B-4795-B4D3-D5E9BEFE1B23,P569,1943-04-17T00:00:00Z,Q1985727,1945,https://ratings.fide.com/profile/4600185,statement
2,Q278844,Q278844-13624996-30D9-43C7-80D7-98294FB512D3,P569,1949-08-24T00:00:00Z,Q1985727,1946,https://ratings.fide.com/profile/900095,statement
3,Q326562,Q326562-16F44782-9AD3-4774-8E60-97C8B6F84F7F,P569,1942-01-01T00:00:00Z,Q1985727,1943,https://ratings.fide.com/profile/800139,statement
4,Q446773,Q446773-34830D20-DCDE-45D6-B865-38E55604179B,P569,1976-12-05T00:00:00Z,Q1985727,2008,https://ratings.fide.com/profile/15201759,statement
...,...,...,...,...,...,...,...,...
115,Q101530051,Q101530051-1232CC02-8C2E-4C6E-9F30-C2F52A9C84C4,P569,1941-01-01T00:00:00Z,Q1985727,1942,https://ratings.fide.com/profile/34161322,statement
116,Q115464159,Q115464159-2e278e84-4b29-5a27-679a-c76a6686e465,P569,1982-12-04T00:00:00Z,Q1985727,1994,https://ratings.fide.com/profile/3926370,statement
117,Q117225388,Q117225388-444e7389-43ef-3610-43f8-9ec6d50b14b5,P569,2001-01-01T00:00:00Z,Q1985727,2000,https://ratings.fide.com/profile/21873763,statement
118,Q122417673,Q122417673-9F8C8499-4F19-4246-B297-D5B7EF5D55BB,P569,1997-03-28T00:00:00Z,Q1985727,2000,https://ratings.fide.com/profile/13509403,statement


In [16]:
# Run the project's check_mf_formatting helper (imported from utils) on df
# before exporting. NOTE(review): the exact checks live in utils and are not
# visible in this notebook.
check_mf_formatting(df)

All checks have passed! The data is ready to be uploaded to Mismatch Finder.


In [17]:
# Write the mismatches to CSV, leaving out the DataFrame's index column.
df.to_csv("validated_players_data.csv", index=False)