In [299]:
import dill
import re
import glob

import pandas as pd
import trueskill as ts
import numpy as np

ts.setup(backend='mpmath')

from preprocess import clean

In [300]:
df_meta = pd.read_pickle('C:/data/results/df.pkl')
df_meta.sample(5)

Unnamed: 0_level_0,name,date,loc,json_url,weather,strava_url,coord
race_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1089,Northern California/Nevada Time Trial Champion...,2011-06-12 00:00:00,,downloadrace.php?raceID=1089&json=1,,http://app.strava.com/segments/,
3176,Tour of Galena - Criterium,2012-06-09 00:00:00,"Galena, IL",downloadrace.php?raceID=3176&json=1,"Clear, 82 degrees, wind 14 mph",http://app.strava.com/segments/,
3150,Tour of Galena - TT,2013-06-08 00:00:00,"Galena, IL",downloadrace.php?raceID=3150&json=1,"Partly Cloudy, 69 degrees, wind 6 mph",http://app.strava.com/segments/,
4240,The Bunny Hop Criterium,2014-05-04 00:00:00,"Suitland, MD",downloadrace.php?raceID=4240&json=1,"Partly Cloudy, 71 degrees, wind 13 mph",http://app.strava.com/segments/,
4397,Killington Stage Race Circuit Race - Stage 1,2014-05-24 00:00:00,"Killington, VT",downloadrace.php?raceID=4397&json=1,"Partly Cloudy, 60 degrees, wind 2 mph",http://app.strava.com/segments/,


In [301]:
json = dill.load(open('C:\\data\\results\\races\\1000.pkd', 'rb'))

df_race = pd.read_json(json)
df_race = clean(df_race).assign(race_id=1000, rating_prev=ts.Rating(), rating=ts.Rating())

df_all_races = pd.DataFrame(columns = df_race.columns)

In [302]:
df_all_races

Unnamed: 0,Place,RaceTime,Name,Age,Category,RacerID,TeamID,TeamName,RaceName,RaceCategoryName,IsDQ,race_id,rating_prev,rating


In [303]:
# Dataframe to store most recent info for each racer
df_racers = pd.DataFrame(columns=['Name', 'Age', 'Category', 'RacerID', 'TeamID', 'TeamName', 'rating'])
df_racers = df_racers.set_index('RacerID')

In [304]:
def get_prev_rating(row):
    racer_id = row['RacerID']
    if racer_id not in df_racers.index: # Racer we haven't seen before
        df_racers.loc[racer_id] = row
        df_racers.loc[racer_id, 'rating'] = [ts.Rating()]
        
    row['rating_prev'] = df_racers.loc[racer_id, 'rating']
    return row
            
    
def get_ratings(df):
    df = df[~df['Place'].isna()]
    placing = df['Place']
    if not (placing.sort_values().values == placing.values).all():
        raise Exception('Placing not in order!')
        
    new_ratings = ts.rate([(df_racers.loc[racer_id, 'rating'],) for racer_id in df['RacerID']])
    df = df.assign(rating=[r[0] for r in new_ratings])
    return df

    
for j, (index, row) in enumerate(df_meta.iterrows()):
    json = dill.load(open(f'C:\\data\\results\\races\\{index}.pkd', 'rb'))

    df_race = pd.read_json(json)
    if df_race.empty:
        continue
        
    # "previous rating" column - initialize with default rating and update if already rated
    df_race = clean(df_race).assign(race_id=int(index), rating_prev=ts.Rating(), rating=ts.Rating())
    df_race = df_race.apply(get_prev_rating, axis=1)
    
    # Group by race category and get new ratings
    g = df_race.groupby('RaceCategoryName')
    for name, group in g:
        new_ratings = get_ratings(group)
        df_race.update(new_ratings)
        df_racers.update(new_ratings)
    
    df_all_races = pd.concat([df_all_races, df_race])
        
    if j == 10:
        break

In [334]:
df_racers.sample(5)

Unnamed: 0_level_0,Name,Age,Category,TeamID,TeamName,rating
RacerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
878,Jonathan Bielik,,,1305.0,Benidorm,"trueskill.Rating(mu=25.000, sigma=8.333)"
1246,Andrew Randell,,,1436.0,Symmetrics,"trueskill.Rating(mu=25.000, sigma=8.333)"
1238,Michelle Bishop,29.0,3.0,,,"trueskill.Rating(mu=25.000, sigma=8.333)"
1598,Leo Deveilian,,,975.0,CCB/VW,"trueskill.Rating(mu=25.000, sigma=8.333)"
1656,Rami El rayess,30.0,4.0,1601.0,Slouch/NorEast,"trueskill.Rating(mu=25.000, sigma=8.333)"


In [285]:
df_all_races.head()

Unnamed: 0,Place,RaceTime,Name,Age,Category,RacerID,TeamID,TeamName,RaceName,RaceCategoryName,IsDQ,race_id,rating_prev,rating
0,1.0,2:28:53,Matt Cuttler,29.0,3.0,699,12737.0,CRCA/Affinity Cycles,NCC 10th Tour of the Hilltowns,Cat 4,False,2,"trueskill.Rating(mu=25.000, sigma=8.333)","trueskill.Rating(mu=25.000, sigma=8.333)"
1,2.0,st,Eric Weinrich,42.0,3.0,700,1249.0,portland velo-cycle-mania,NCC 10th Tour of the Hilltowns,Cat 4,False,2,"trueskill.Rating(mu=25.000, sigma=8.333)","trueskill.Rating(mu=25.000, sigma=8.333)"
2,3.0,st,Michael Boardman,34.0,3.0,701,,,NCC 10th Tour of the Hilltowns,Cat 4,False,2,"trueskill.Rating(mu=25.000, sigma=8.333)","trueskill.Rating(mu=25.000, sigma=8.333)"
3,4.0,st,John Nobile,45.0,4.0,702,,,NCC 10th Tour of the Hilltowns,Cat 4,False,2,"trueskill.Rating(mu=25.000, sigma=8.333)","trueskill.Rating(mu=25.000, sigma=8.333)"
4,5.0,st,Ryan Short,20.0,4.0,703,1222.0,TEAM PLACID PLANET,NCC 10th Tour of the Hilltowns,Cat 4,False,2,"trueskill.Rating(mu=25.000, sigma=8.333)","trueskill.Rating(mu=25.000, sigma=8.333)"
