In [3]:
import pandas as pd
import trueskillthroughtime as ttt
import unidecode
import numpy as np
import os
import json

# ifsc.results.info data to trueskill
Takes the data directory and builds TrueSkill rankings from it. The events that feed the rankings can be narrowed down with the `selected_events` filter.

In [2]:
#callable functions
# TODO add support for speed and lead as well as grp A B qualis
def get_boulder(round):
    """Flatten one event's boulder ranking into a DataFrame with one row per athlete.

    Parameters
    ----------
    round : iterable of dict
        One entry per athlete. Every entry must carry a ``'rounds'`` list, and
        every round a ``'round_name'`` plus either ``'ascents'`` or
        ``'speed_elimination_stages' -> 'ascents'``. (The parameter name
        shadows the builtin ``round``; it is kept so existing callers work.)

    Returns
    -------
    pandas.DataFrame
        The identity columns ``athlete_id``, ``rank``, ``name`` and ``country``
        (those present on the athlete), followed by one
        ``{round}_boulder_{n}`` column per ascent, where ``{round}`` is
        ``qualification``, ``semifinal`` or ``final``. Each such cell holds a
        dict restricted to the ascent keys listed in ``ascent_keys`` below.

    Raises
    ------
    KeyError
        If an athlete has no ``'rounds'``, a round has no ``'round_name'``, or
        a recognised round has no ascents under either expected key.

    Notes
    -----
    Only rounds named exactly ``'Qualification'``, ``'Final'`` or
    ``'Semi-Final'`` are read; any other round name is skipped. If an athlete
    has two rounds with the same name, the later one overwrites the earlier.
    """
    # Ordered tuples instead of a set: iterating a set of strings depends on the
    # per-process string hash, which made the column order change between runs.
    identity_keys = ('athlete_id', 'rank', 'name', 'country')
    ascent_keys = ('id', 'top', 'top_tries', 'zone', 'zone_tries', 'starting_group')
    # maybe fuzzy match qualification a/b options, must test with 2018 years older...
    recognised_rounds = ('Qualification', 'Final', 'Semi-Final')

    rows = []
    for player in round:
        row = {key: player[key] for key in identity_keys if key in player}
        for player_round in player['rounds']:
            round_name = player_round['round_name']
            if round_name not in recognised_rounds:
                continue
            # Normalize the round name, e.g. 'Semi-Final' -> 'semifinal'.
            round_type = round_name.replace('-', '').lower()
            if 'speed_elimination_stages' in player_round:
                # grp a/b seems to remove speed elim stage
                ascents = player_round['speed_elimination_stages']['ascents']
            else:
                ascents = player_round['ascents']
            # Boulders are numbered from 1 in the order the ascents are listed.
            for number, boulder in enumerate(ascents, start=1):
                row[f"{round_type}_boulder_{number}"] = {
                    key: boulder[key] for key in ascent_keys if key in boulder
                }
        rows.append(row)
    return pd.DataFrame(rows)

def get_lead(leadRound):
    """Flatten one event's lead ranking into a DataFrame with one row per athlete.

    Parameters
    ----------
    leadRound : iterable of dict
        One entry per athlete. Every entry must carry a ``'rounds'`` list, and
        every round a ``'round_name'`` and an ``'ascents'`` list.

    Returns
    -------
    pandas.DataFrame
        The identity columns ``athlete_id``, ``rank``, ``name`` and ``country``
        (those present on the athlete), followed by one ``{round}_lead_{n}``
        column per ascent, where ``{round}`` is ``qualification``,
        ``semifinal`` or ``final``. Each such cell holds a dict restricted to
        the ascent keys listed in ``ascent_keys`` below.

    Raises
    ------
    KeyError
        If an athlete has no ``'rounds'``, a round has no ``'round_name'``, or
        a recognised round has no ``'ascents'``.

    Notes
    -----
    Only rounds named exactly ``'Qualification'``, ``'Final'`` or
    ``'Semi-Final'`` are read; any other round name is skipped. If an athlete
    has two rounds with the same name, the later one overwrites the earlier.
    """
    # Ordered tuples instead of a set: iterating a set of strings depends on the
    # per-process string hash, which made the column order change between runs.
    identity_keys = ('athlete_id', 'rank', 'name', 'country')
    ascent_keys = ('route_id', 'score', 'top', 'plus', 'rank', 'corrective_rank')
    # maybe fuzzy match qualification a/b options, must test with 2018 years older...
    recognised_rounds = ('Qualification', 'Final', 'Semi-Final')

    rows = []
    for player in leadRound:
        row = {key: player[key] for key in identity_keys if key in player}
        for player_round in player['rounds']:
            round_name = player_round['round_name']
            if round_name not in recognised_rounds:
                continue
            # Normalize the round name, e.g. 'Semi-Final' -> 'semifinal'.
            round_type = round_name.replace('-', '').lower()
            # Routes are numbered from 1 in the order the ascents are listed.
            for number, route in enumerate(player_round['ascents'], start=1):
                row[f"{round_type}_lead_{number}"] = {
                    key: route[key] for key in ascent_keys if key in route
                }
        rows.append(row)
    return pd.DataFrame(rows)


def _expand_boulder_round(frame, source_prefix, short_prefix):
    """Expand every ``{source_prefix}{i}`` dict column of ``frame`` into flat per-boulder columns (in place)."""
    # Prefix match on purpose: 'semifinal_boulder_1' contains 'final_boulder_',
    # so a substring test would count semi-final columns as finals.
    n_boulders = sum(str(column).startswith(source_prefix) for column in frame.columns)
    for i in range(1, n_boulders + 1):
        attempts = frame[f'{source_prefix}{i}']
        top = f'{short_prefix}_b_{i}_top'
        top_tries = f'{short_prefix}_b_{i}_top_tries'
        zone = f'{short_prefix}_b_{i}_zone'
        zone_tries = f'{short_prefix}_b_{i}_zone_tries'

        # Cells that are not dicts (athlete did not climb this boulder) become NaN.
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        frame[top] = attempts.apply(lambda x: 1 if isinstance(x, dict) and x['top'] else np.nan)
        frame[top_tries] = attempts.apply(lambda x: x['top_tries'] if isinstance(x, dict) else np.nan)
        frame[zone] = attempts.apply(lambda x: 1 if isinstance(x, dict) and x['zone'] else np.nan)
        frame[zone_tries] = attempts.apply(lambda x: x['zone_tries'] if isinstance(x, dict) else np.nan)

        # Per-boulder ranking: each row becomes a (top, top_tries, zone, zone_tries)
        # tuple and the tuples are dense-ranked in ascending order.
        # NOTE(review): NaN never compares equal to anything, so rows holding NaN
        # may not be ordered meaningfully against each other - confirm before
        # relying on the rank of athletes without a top or zone.
        frame[f'{short_prefix}_b_{i}_rank'] = (
            frame[[top, top_tries, zone, zone_tries]]
            .apply(tuple, axis=1)
            .rank(method='dense', ascending=True)
        )


def create_composition(frame):
    """Add flat top/zone/tries/rank columns for every boulder column of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        A frame holding ``qualification_boulder_{i}``, ``final_boulder_{i}``
        and/or ``semifinal_boulder_{i}`` columns, numbered contiguously from 1,
        whose cells are dicts with the keys ``top``, ``top_tries``, ``zone``
        and ``zone_tries`` (or a non-dict value when the athlete has no result).

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, modified in place. For each boulder ``i`` of
        each round it gains ``{p}_b_{i}_top``, ``{p}_b_{i}_top_tries``,
        ``{p}_b_{i}_zone``, ``{p}_b_{i}_zone_tries`` and ``{p}_b_{i}_rank``,
        where ``{p}`` is ``q`` (qualification), ``f`` (final) or ``s``
        (semi-final). A frame without any boulder columns, including an empty
        one, is returned unchanged.

    Raises
    ------
    KeyError
        If a boulder dict lacks one of the four expected keys, or the boulder
        columns of a round are not numbered contiguously from 1.
    """
    _expand_boulder_round(frame, 'qualification_boulder_', 'q')
    # Finals used to be counted by looking for 'finals' in the column names,
    # which never matches 'final_boulder_{i}', so they were silently skipped.
    _expand_boulder_round(frame, 'final_boulder_', 'f')
    _expand_boulder_round(frame, 'semifinal_boulder_', 's')
    return frame

# create_composition(get_boulder(round))

In [4]:
#main process
# Loads the scraped JSON folders, selects the events to rate, expands each
# event's ranking into a per-athlete frame, and fits a TrueSkillThroughTime
# history over the resulting finishing orders. The last expression of the cell
# is the athlete leaderboard sorted by mu.

data_folder = '../data'

output_folder = os.path.join(data_folder, 'output')
output_events_folder = os.path.join(data_folder, 'outputEvents')
full_results_folder = os.path.join(data_folder, 'outputFullResults')
athlete_folder = os.path.join(data_folder, 'athlete')


def _load_json_folder(folder, keep_filename=False):
    """Parse every ``*.json`` file in ``folder`` and return the objects as a list.

    Files are read in sorted name order so the result does not depend on the
    arbitrary order of ``os.listdir``. With ``keep_filename=True`` each parsed
    object (expected to be a dict) gets a ``'filename'`` entry.
    """
    records = []
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith('.json'):
            continue
        # Explicit UTF-8: the platform default encoding can differ per machine.
        with open(os.path.join(folder, filename), 'r', encoding='utf-8') as file:
            record = json.load(file)
        if keep_filename:
            record['filename'] = filename
        records.append(record)
    return records


output_data = _load_json_folder(output_folder)
output_events_data = _load_json_folder(output_events_folder)
# The event id and cid are later parsed out of the file name, so keep it.
full_results_data = _load_json_folder(full_results_folder, keep_filename=True)
athlete_data = _load_json_folder(athlete_folder)

allEvents = pd.DataFrame(full_results_data)
# Rows without a ranking are dropped after the join below; calling
# dropna(inplace=True) on a single selected column never changed allEvents.
allEvents['date'] = pd.to_datetime(allEvents['ranking_as_of'], errors='coerce')

athletes = pd.DataFrame(athlete_data)
athletes = athletes[['id', 'firstname', 'lastname', 'birthday', 'gender', 'country', 'all_results']]


def split_filename(name):
    """Return the integer found in the second ``_``-separated part of ``name``."""
    return int(name.split('_')[1])


allEvents['id'] = allEvents['filename'].apply(split_filename)
# selects the third part split by _ and removes the .json (5 char)
allEvents['cid'] = allEvents['filename'].apply(lambda x: x.split('_')[2][:-5])
allEvents = allEvents.set_index('id')
events = pd.DataFrame(output_events_data).set_index('id')

joined_df = allEvents.join(events).dropna(subset=['ranking'])

# Other dcat values that appeared in the original filter:
# 'LEAD Men', 'LEAD Women', 'BOULDER Women'.
selected_categories = ['BOULDER Men']
# NOTE(review): the date bounds are compared as strings - assumes starts_at
# holds ISO-style 'YYYY-MM-DD...' text so that lexical order is chronological.
selected_events = joined_df[
    (joined_df['league_id'] == 1)
    & joined_df['dcat'].isin(selected_categories)
    & (joined_df['starts_at'] > '2007-01-01')  # 2007 is the first year of the IFSC - pre 2007 per boulder rankings dont exist
    & (joined_df['starts_at'] < '2024-01-01')
][['event', 'starts_at', 'dcat', 'cid', 'ranking', 'location', 'league_id', 'league_season_id', 'season_id']].sort_values('starts_at')

processed_events = []
errored_events = []
errored_event_reasons = []  # (event id, repr of the exception), parallel to errored_events
for event in selected_events.itertuples(index=True):
    try:
        composition_frame = create_composition(get_boulder(event.ranking))
    except Exception as exc:
        # Best effort: one malformed event must not stop the run, but keep the
        # reason so the failures can be inspected. `except Exception` (not a
        # bare except) lets KeyboardInterrupt still abort the loop.
        errored_events.append(event)
        errored_event_reasons.append((event.Index, repr(exc)))
        continue
    # dict of eventID and then the comp df
    processed_events.append({
        'event_id': event.Index,
        'cid': event.cid,
        'event_name': event.event,
        'starts_at': event.starts_at,
        'dataframe': composition_frame,
    })

print(f"{len(processed_events)} processed events and {len(errored_events)} errored events")

temp = pd.DataFrame(processed_events)
# One game per event; every athlete is a one-person team, listed in frame order.
# NOTE(review): no explicit results are passed to ttt, so this presumably relies
# on each ranking already being ordered best-first - verify against the data.
test_comp_final_results = [
    entry['dataframe'].athlete_id.apply(lambda x: [x]).to_list()
    for entry in processed_events
]

h = ttt.History(composition=test_comp_final_results)
h.convergence()
lc = pd.DataFrame.from_dict(h.learning_curves(), orient='index')
# Each learning-curve cell is indexed with [1] to get the estimate; take the
# last non-null one per athlete as the current skill.
lc['current'] = lc.apply(lambda x: x[x.last_valid_index()][1], axis=1)
lc['sigma'] = lc['current'].apply(lambda x: x.sigma)
lc['mu'] = lc['current'].apply(lambda x: x.mu)
lc.join(athletes.set_index('id'))[['firstname','lastname','current', 'sigma', 'mu']].sort_values(by=['mu'],ascending=False)#.to_csv('output_men.csv')

108 processed events and 8 errored events
Iteration =  0 , step =  (10.433630266086908, 2.9130396399537704)
Iteration =  1 , step =  (0.44897565254086125, 0.1605280789624428)
Iteration =  2 , step =  (0.08555472489639238, 0.015946887523178077)
Iteration =  3 , step =  (0.04530834956818275, 0.003365253523368583)
Iteration =  4 , step =  (0.03077419429991568, 0.0024312397205230596)
Iteration =  5 , step =  (0.024012605467288894, 0.001983219924457824)
Iteration =  6 , step =  (0.0205665140574105, 0.0020004376200692953)
Iteration =  7 , step =  (0.018723464149536073, 0.002163540817472054)
Iteration =  8 , step =  (0.017839117904512314, 0.002219761295821243)
Iteration =  9 , step =  (0.017291433416398494, 0.002224799079634465)
Iteration =  10 , step =  (0.016967730966948036, 0.002205511944788352)
Iteration =  11 , step =  (0.016646544553608145, 0.002174861088980684)
Iteration =  12 , step =  (0.016324967713015948, 0.0021391017813083835)
Iteration =  13 , step =  (0.01600648221205203, 0.0021

Unnamed: 0,firstname,lastname,current,sigma,mu
1364,Adam,ONDRA,"N(mu=4.226, sigma=0.221)",0.220824,4.226338
13040,Sorato,ANRAKU,"N(mu=4.159, sigma=0.408)",0.407977,4.158589
1204,Kilian,FISCHHUBER,"N(mu=4.123, sigma=0.194)",0.193825,4.123029
2276,Tomoa,NARASAKI,"N(mu=4.040, sigma=0.188)",0.187828,4.040242
11675,Mejdi,SCHALCK,"N(mu=3.929, sigma=0.294)",0.294103,3.929157
...,...,...,...,...,...
4444,Abdulla,ALDOSERI,"N(mu=-8.187, sigma=3.003)",3.002784,-8.187233
5860,Mario,LACKNER,"N(mu=-8.634, sigma=3.291)",3.290856,-8.634462
2650,Mandeep,KODAN,"N(mu=-9.873, sigma=3.093)",3.093044,-9.872740
7410,Artem,AVDEENKO,"N(mu=-10.145, sigma=3.206)",3.205880,-10.145233


In [1]:
#save to csv?
# Athlete names joined onto the skill estimates, highest mu first.
# The CSV export itself is left commented out.
(
    lc.join(athletes.set_index('id'))
    .loc[:, ['firstname', 'lastname', 'current', 'sigma', 'mu']]
    .sort_values(by=['mu'], ascending=False)
)  # .to_csv('output_men.csv')

In [None]:
# Select boulder and lead events, then extract the round data into objects tagged with event info such as id, cid, name and date. Essential.

