In [1]:
from bs4 import BeautifulSoup
from bs4.element import Tag
import json
import numpy as np
import pandas as pd
import pickle
import requests
from statsmodels.formula.api import ols

In [4]:
with open('seasons.pickle') as f:
    seasons = pickle.load(f)
for year, season in seasons.iteritems():
    print year,
    season.load_scores()

2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018


In [5]:
with open('pd_data/name_fixes_all.json') as f:
    name_fixes = json.load(f)

In [None]:
owg = seasons['2018'].events[-1]

In [None]:
results18 = {}
for discipline in ('men', 'ladies', 'pairs', 'dance'):
    results18[discipline] = pd.read_csv('pd_data/results18_' + discipline + '.csv')
    print results18[discipline].columns
    results18[discipline].drop('Unnamed: 0', inplace=True, axis=1)
    if discipline != 'pairs':
        results18[discipline].drop(u'Unnamed: 0.1', inplace=True, axis=1)

In [47]:
for index, discipline_name in enumerate(('men', 'ladies', 'pairs', 'dance')):
    discipline = owg.disciplines[index]
    df = pd.read_csv(discipline.results_csv)
    if df.dtypes['Short Rank'] != np.dtype('int64'):
        df = df[df['Short Rank'] != 'WD']
    num_short = len(df)
    num_free = np.max([int(rank) for rank in df['Free Rank'] if rank not in ('DNQ', 'WD')])
    df['Num Short Scorecards'] = pd.Series([num_short for _ in xrange(num_short)])
    df['Num Free Scorecards'] = pd.Series([num_free for _ in xrange(num_short)])
#     df['Season'] = pd.Series([season.champ_year for _ in xrange(num_short)])

    df['Short Start'] = df.apply(lambda row: discipline.segments[0].scorecards[int(row['Short Rank']) - 1].starting_number, axis=1)
    df['Free Start'] = df.apply(lambda row:
                         None if str(row['Free Rank']).isalpha() else
                         discipline.segments[1].scorecards[int(row['Free Rank']) - 1].starting_number, 
                         axis=1)

    df.Name = df.apply(lambda row: name_fixes.get(row.Name, row.Name), axis=1)
    
    results18[discipline_name] = pd.concat([results18[discipline_name], df])

In [53]:
for discipline, df in results18.iteritems():
    df.Nation = df.apply(lambda row: 'RUS' if row.Nation == 'OAR' else row.Nation, axis=1)
    df.to_csv('pd_data/results18all_' + discipline + '.csv')

In [None]:
results18['ladies'].Nation = results18['ladies'].apply(lambda row: 'RUS' if row.Nation == 'OAR' else )

In [12]:
def get_entries(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    entries = []
    for tr in soup.find_all('tr')[1:]:
        for child in tr.children:
            if isinstance(child, Tag) and child.attrs.get('class') == ['first'] and child.text.isdigit():
                skater = tr.find('a').text
                for td in tr.find_all('td'):
                    if len(td.text) == 3 and td.text.isalpha():
                        country = td.text
                        entries.append((skater, country))
    return entries

In [13]:
entries = {}

In [14]:
for discipline, url in (('men', 'http://www.isuresults.com/events/cat03108710.htm'),
                        ('ladies', 'http://www.isuresults.com/events/cat03108711.htm'),
                        ('pairs', 'http://www.isuresults.com/events/cat03108712.htm'),
                        ('dance', 'http://www.isuresults.com/events/cat03108713.htm')):
    entries[discipline] = get_entries(url)

In [15]:
for discipline, entrylist in entries.iteritems():
    new_entries = []
    for skater, country in entrylist:
        new_entries.append((name_fixes.get(skater.strip(), skater.strip()), country))
    entries[discipline] = new_entries

In [16]:
entries_country = entries
entries = {discipline: map(lambda entry: entry[0], entry_list) for discipline, entry_list in entries.iteritems()}

In [17]:
component_to_type = {
    'Skating Skills': 'ss',
    'Transitions': 'tr',
    'Performance': 'pe',
    'Composition': 'co',
    'Interpretation': 'in'
}

In [131]:
for discipline in ('men', 'ladies', 'pairs', 'dance'):
    elements = pd.read_csv('pd_data/elements_' + discipline + '.csv')
    elements18 = pd.read_csv('pd_data/elements_' + discipline + '18all.csv')
    elements = pd.concat([elements, elements18])
    short_list = []
    free_list = []
    comp_list = []
    for skater in entries[discipline]:
        for component, comp_type in component_to_type.iteritems():
            comp_list.append({'component': component, 'comp_type': comp_type, 'skater': skater})
        last_event = elements[elements.skater == skater].event.unique()
        if len(last_event) == 0:
            print skater, 'event'
            continue
        last_event = last_event[-1]
        last_elements = elements[elements.skater == skater]
        last_elements = last_elements[last_elements.event == last_event]
        short_elements = last_elements[last_elements.segment.str.contains('short')]
        short_list.append(short_elements)
        if len(last_elements.segment.unique()) != 2:
            last_free_elements = elements[elements.skater == skater]
            last_free = last_free_elements[last_free_elements.segment.str.contains('free')].segment.unique()
            if len(last_free) == 0:
                print skater, 'free'
                continue
            last_free_elements = last_free_elements[last_free_elements.segment == last_free[-1]]
        else:
            last_free_elements = last_elements[last_elements.segment.str.contains('free')]
        free_list.append(last_free_elements)
    pd.concat(short_list).to_csv('pd_data/elements_worlds_short_' + discipline + '.csv')
    pd.concat(free_list).to_csv('pd_data/elements_worlds_free_' + discipline + '.csv')
    pd.DataFrame(comp_list).to_csv('pd_data/components_worlds_short_' + discipline + '.csv')
    pd.DataFrame(comp_list).to_csv('pd_data/components_worlds_free_' + discipline + '.csv')
    print


Alisa STOMAKHINA event
Elisabetta LECCARDI event
Stanislava KONSTANTINOVA event
Antonina DUBININA free

Elizaveta KASHITSYNA / Mark MAGYAR event

Teodora MARKOVA / Simon DAZE free
Adel TANKOVA / Ronald ZILBERBERG free
Allison REED / Saulius AMBRULEVICIUS event



In [153]:
for discipline in ('men', 'ladies', 'pairs', 'dance'):
    results = pd.read_csv('pd_data/results_nowd_nofra15_' + discipline + '.csv')
    results18 = pd.read_csv('pd_data/results18all_' + discipline + '.csv')
    results = pd.concat([results, results18])
    results['Short Score'] = results['Short Score'].astype('float')
    results['Points'] = results['Points'].astype('float')
    
    short_best = {skater: np.max(results[results.Name == skater]['Short Score']) for skater in results.Name.unique()}
    short_best['Alisa STOMAKHINA'] = 40.
    short_best['Elisabetta LECCARDI'] = 50.
    short_best['Stanislava KONSTANTINOVA'] = 65.
    short_best['Elizaveta KASHITSYNA / Mark MAGYAR'] = 45.
    short_best['Allison REED / Saulius AMBRULEVICIUS'] = 58.
    
    has_free = results[results['Free Rank'] != 'DNQ']
    has_free = has_free[has_free['Free Rank'] != 'WD']
    has_free['Free Score'] = has_free['Free Score'].astype('float')
    free_best = {skater: np.max(has_free[has_free.Name == skater]['Free Score']) for skater in has_free.Name.unique()}
    total_best = {skater: np.max(results[results.Name == skater].Points) for skater in results.Name.unique()}
    free_best['Stanislava KONSTANTINOVA'] = 130.
    total_best['Stanislava KONSTANTINOVA'] = 199.68
    
    short_predictions = pd.Series()
    for skater in entries[discipline]:
        if skater not in short_best:
            print skater, 'short'
            continue
        short_predictions[skater] = short_best[skater]        
    short_predictions = pd.Series(short_predictions).sort_values(ascending=False)
    num_qualify = 16 if discipline == 'pairs' else 24
    num_qualify = 20 if discipline == 'dance' else num_qualify
    free_qualify = short_predictions[:num_qualify]
    free_qualify_total = free_qualify.copy()
    dnq = short_predictions[num_qualify:]
    for skater in free_qualify.index:
        if skater not in total_best:
            print 'total', skater
            continue
        free_qualify_total[skater] = total_best[skater]
        if skater not in free_best:
            print 'free', skater
            continue
        free_qualify[skater] += free_best[skater]
    free_predictions = pd.DataFrame(pd.concat([free_qualify, dnq]).sort_values(ascending=False))
    free_total_predictions = pd.DataFrame(pd.concat([free_qualify_total, dnq]).sort_values(ascending=False))
    free_predictions['rank'] = [i + 1 for i in xrange(len(free_predictions))]
    free_total_predictions['rank'] = [i + 1 for i in xrange(len(free_total_predictions))]
    free_predictions.to_csv('fits/worlds_predictions_both_pbs_' + discipline + '.csv')
    free_total_predictions.to_csv('fits/worlds_predictions_total_pb_' + discipline + '.csv')

In [110]:
num_skaters = 20.

In [112]:
dance_free_start = {name_fixes.get(k, k): v/num_skaters for k, v in dance_free_start.iteritems()}

In [113]:
dance18 = pd.read_csv('pd_data/results18all_dance.csv')
history = pd.read_csv('pd_data/results_nowd_nofra15_dance.csv')
history = history[history.Season >= 2011]

In [114]:
individual_bests_short = {skater: np.max(map(float, history[history.Name == skater]['Short Score']))
                          for skater in history.Name.get_values()}
len(individual_bests_short)

153

In [115]:
have_frees = history[history['Free Rank'] != 'DNQ']
have_frees = have_frees[have_frees['Free Rank'] != 'WD']
individual_bests_free = {skater: np.max(map(float, have_frees[have_frees.Name == skater]['Free Score']))
                         for skater in have_frees.Name.get_values()}
len(individual_bests_free)

125

In [116]:
med_short = np.median(individual_bests_short.values())
med_free = np.median(individual_bests_free.values())
med_short, med_free

(52.049999999999997, 81.939999999999998)

In [117]:
dance18['normalized_short_start'] = dance18.apply(lambda row: float(row['Short Start']) / row['Num Short Scorecards'], axis=1)
dance18['normalized_free_start'] = dance18.apply(lambda row: None if pd.isna(row['Free Start']) else float(row['Free Start']) / row['Num Free Scorecards'], axis=1)
dance18['reputation_short'] = dance18.apply(lambda row: individual_bests_short.get(row.Name, med_short), axis=1)
dance18['reputation_free'] = dance18.apply(lambda row: individual_bests_free.get(row.Name, med_free), axis=1)

In [120]:
dance18.rename({'Short Score': 'short_score', 'Free Score': 'free_score'}, inplace=True, axis=1)

In [121]:
dance18 = dance18[pd.notnull(dance18.normalized_free_start)]

In [122]:
dance18.free_score = dance18.free_score.astype('float')
dance18.reputation_free = dance18.reputation_free.astype('float')

In [123]:
free_dance_model = ols('free_score ~ normalized_free_start + reputation_free',
                        data=dance18).fit()

In [124]:
free_dance_model.summary()

0,1,2,3
Dep. Variable:,free_score,R-squared:,0.845
Model:,OLS,Adj. R-squared:,0.842
Method:,Least Squares,F-statistic:,315.7
Date:,"Fri, 23 Mar 2018",Prob (F-statistic):,1.19e-47
Time:,21:59:46,Log-Likelihood:,-363.08
No. Observations:,119,AIC:,732.2
Df Residuals:,116,BIC:,740.5
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,30.6321,3.857,7.941,0.000,22.992,38.272
normalized_free_start,15.9626,2.242,7.120,0.000,11.522,20.403
reputation_free,0.6085,0.047,12.906,0.000,0.515,0.702

0,1,2,3
Omnibus:,8.143,Durbin-Watson:,1.826
Prob(Omnibus):,0.017,Jarque-Bera (JB):,9.873
Skew:,-0.401,Prob(JB):,0.00718
Kurtosis:,4.161,Cond. No.,831.0


In [125]:
# Update PBs.
for index, row in dance18.iterrows():
    if row.Name not in individual_bests_short or row.short_score > individual_bests_short[row.Name]:
        individual_bests_short[row.Name] = row.short_score
    if not pd.isna(row['Free Start']):
        if row.Name not in individual_bests_free or float(row.free_score) > individual_bests_free[row.Name]:
            individual_bests_free[row.Name] = float(row.free_score)

In [131]:
dance_predict_free_start = pd.DataFrame(
    [{'skater': key, 'start_order': value, 'reputation': individual_bests_free.get(key, med_free)}
     for key, value in dance_free_start.iteritems()])

In [132]:
dance_predict_free_start['prediction'] = dance_predict_free_start.apply(
    lambda row: free_dance_model.params.Intercept
                + free_dance_model.params.normalized_free_start * row.start_order
                + free_dance_model.params.reputation_free * row.reputation,
    axis=1
)

In [133]:
dance_predict_free_start = dance_predict_free_start.sort_values(by='prediction', ascending=False)

In [134]:
dance_predict_free_start['prediction_rank'] = [i+1 for i in xrange(len(dance_predict_free_start))]

In [135]:
dance_predict_free_start['real_start_order'] = dance_predict_free_start.apply(
    lambda row: int(round(row.start_order * num_skaters)), axis=1
)

In [136]:
dance_predict_free_start

Unnamed: 0,reputation,skater,start_order,prediction,prediction_rank,real_start_order
15,123.35,Gabriella PAPADAKIS / Guillaume CIZERON,0.9,120.055479,1,18
18,113.35,Madison HUBBELL / Zachary DONOHUE,1.0,115.566852,2,20
1,112.54,Kaitlyn WEAVER / Andrew POJE,0.95,114.275848,3,19
16,113.31,Madison CHOCK / Evan BATES,0.85,113.148127,4,17
12,112.07,Anna CAPPELLINI / Luca LANOTTE,0.8,111.595472,5,16
6,110.45,Piper GILLES / Paul POIRIER,0.7,109.013463,6,14
3,109.48,Alexandra STEPANOVA / Ivan BUKIN,0.65,107.625101,7,13
2,103.1,Tiffani ZAGORSKI / Jonathan GUERREIRO,0.75,105.339202,8,15
19,106.17,Charlene GUIGNARD / Marco FABBRI,0.6,104.812875,9,12
4,105.21,Kaitlin HAWAYEK / Jean-Luc BAKER,0.4,101.036211,10,8


In [137]:
dance_short_results = pd.read_csv('pd_data/worlds/dance_short_results.csv')

In [138]:
dance_predict_free_start['short_result'] = dance_predict_free_start.apply(
    lambda row: dance_short_results[dance_short_results.skater == row.skater].points.values[0], axis=1
)

In [139]:
dance_predict_free_start['prediction_total'] = dance_predict_free_start.apply(
    lambda row: row.short_result + row.prediction, axis=1
)

In [140]:
dance_predict_free_start = dance_predict_free_start.sort_values(by='prediction_total', ascending=False)

In [141]:
dance_short_results.rename({'segment_rank': 'prediction_rank'}, inplace=True, axis=1)

In [142]:
dnq = dance_short_results[dance_short_results.prediction_rank >= num_skaters + 1]

In [143]:
dnq['short_result'] = dnq.apply(lambda row: row.points, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [144]:
dnq.rename({'points': 'prediction_total'}, axis=1, inplace=True)

In [145]:
dance_predict_free_start = pd.concat([dance_predict_free_start, dnq])

In [146]:
dance_predict_free_start['prediction_final_rank'] = [i+1 for i in xrange(len(dance_predict_free_start))]

In [147]:
dance_predict_free_start

Unnamed: 0,prediction,prediction_rank,prediction_total,real_start_order,reputation,short_result,skater,start_order,prediction_final_rank
15,120.055479,1,203.785479,18.0,123.35,83.73,Gabriella PAPADAKIS / Guillaume CIZERON,0.9,1
18,115.566852,2,195.986852,20.0,113.35,80.42,Madison HUBBELL / Zachary DONOHUE,1.0,2
1,114.275848,3,192.585848,19.0,112.54,78.31,Kaitlyn WEAVER / Andrew POJE,0.95,3
12,111.595472,5,189.055472,16.0,112.07,77.46,Anna CAPPELLINI / Luca LANOTTE,0.8,4
16,113.148127,4,188.808127,17.0,113.31,75.66,Madison CHOCK / Evan BATES,0.85,5
6,109.013463,6,183.523463,14.0,110.45,74.51,Piper GILLES / Paul POIRIER,0.7,6
3,107.625101,7,182.125101,13.0,109.48,74.5,Alexandra STEPANOVA / Ivan BUKIN,0.65,7
2,105.339202,8,177.789202,15.0,103.1,72.45,Tiffani ZAGORSKI / Jonathan GUERREIRO,0.75,8
19,104.812875,9,175.962875,12.0,106.17,71.15,Charlene GUIGNARD / Marco FABBRI,0.6,9
11,99.402404,11,165.052404,11.0,98.59,65.65,Kana MURAMOTO / Chris REED,0.55,10


In [148]:
dance_predict_free_start.to_csv('fits/worlds_predictions/ols_dance_free.csv')

In [4]:
pairs_short_start = {'Elizaveta KASHITSYNA / Mark MAGYAR': 1, 'Paige CONNERS / Evgeni KRASNOPOLSKI': 2, 'Deanna STELLATO / Nathan BARTHOLOMAY': 3, 'Annika HOCKE / Ruben BLOMMAERT': 4, 'Anna DUSKOVA / Martin BIDAR': 5, 'Ioulia CHTCHETININA / Mikhail AKULOV': 6, 'Tae Ok RYOM / Ju Sik KIM': 7, 'Zoe JONES / Christopher BOYADJI': 8, 'Lola ESBRAT / Andrei NOVOSELOV': 9, 'Camille RUEST / Andrew WOLFE': 10, 'Miu SUZAKI / Ryuichi KIHARA': 11, 'Lana PETRANOVIC / Antonio SOUZA-KORDEIRU': 12, 'Laura BARQUERO / Aritz MAESTU': 13, 'Kyueun KIM / Alex Kang Chan KAM': 14, 'Kristina ASTAKHOVA / Alexei ROGONOV': 15, 'Cheng PENG / Yang JIN': 16, 'Ekaterina ALEXANDROVSKAYA / Harley WINDSOR': 17, 'Alexa SCIMECA KNIERIM / Chris KNIERIM': 18, 'Kirsten MOORE-TOWERS / Michael MARINARO': 19, 'Miriam ZIEGLER / Severin KIEFER': 20, 'Valentina MARCHEI / Ondrej HOTAREK': 21, 'Julianne SEGUIN / Charlie BILODEAU': 22, 'Xiaoyu YU / Hao ZHANG': 23, 'Nicole DELLA MONICA / Matteo GUARISE': 24, 'Evgenia TARASOVA / Vladimir MOROZOV': 25, 'Aljona SAVCHENKO / Bruno MASSOT': 26, 'Natalia ZABIIAKO / Alexander ENBERT': 27, 'Vanessa JAMES / Morgan CIPRES': 28,}

In [8]:
ladies_short_start = {'Dasa GRM': 1, 'Hanul KIM': 2, 'Xiangning LI': 3, 'Natasha MCKAY': 4, 'Elisabetta LECCARDI': 5, 'Viveca LINDFORS': 6, 'Angelina KUCHVALSKA': 7, 'Alisa STOMAKHINA': 8, 'Larkyn AUSTMAN': 9, 'Isadora WILLIAMS': 10, 'Gerli LIINAMAE': 11, 'Anne Line GJERSEM': 12, 'Antonina DUBININA': 13, 'Amy LIN': 14, 'Elzbieta KROPA': 15, 'Kailani CRAINE': 16, 'Eliska BREZINOVA': 17, 'Alexia PAGANINI': 18, 'Anita OSTLUND': 19, 'Bradie TENNELL': 20, 'Laurine LECAVELIER': 21, 'Nicole SCHOTT': 22, 'Loena HENDRICKX': 23, 'Stanislava KONSTANTINOVA': 24, 'Ivett TOTH': 25, 'Gabrielle DALEMAN': 26, 'Nicole RAJICOVA': 27, 'Dabin CHOI': 28, 'Elizabet TURSYNBAEVA': 29, 'Mirai NAGASU': 30, 'Mariah BELL': 31, 'Wakaba HIGUCHI': 32, 'Carolina KOSTNER': 33, 'Alina ZAGITOVA': 34, 'Maria SOTSKOVA': 35, 'Kaetlyn OSMOND': 36, 'Satoko MIYAHARA': 37,}

In [7]:
pairs_free_start = {'Ekaterina ALEXANDROVSKAYA / Harley WINDSOR': 1, 'Miriam ZIEGLER / Severin KIEFER': 2, 'Annika HOCKE / Ruben BLOMMAERT': 3, 'Anna DUSKOVA / Martin BIDAR': 4, 'Tae Ok RYOM / Ju Sik KIM': 5, 'Alexa SCIMECA KNIERIM / Chris KNIERIM': 6, 'Xiaoyu YU / Hao ZHANG': 7, 'Kirsten MOORE-TOWERS / Michael MARINARO': 8, 'Kristina ASTAKHOVA / Alexei ROGONOV': 9, 'Valentina MARCHEI / Ondrej HOTAREK': 10, 'Cheng PENG / Yang JIN': 11, 'Nicole DELLA MONICA / Matteo GUARISE': 12, 'Natalia ZABIIAKO / Alexander ENBERT': 13, 'Vanessa JAMES / Morgan CIPRES': 14, 'Aljona SAVCHENKO / Bruno MASSOT': 15, 'Evgenia TARASOVA / Vladimir MOROZOV': 16}

In [6]:
ladies_free_start = {'Alexia PAGANINI': 1, 'Elisabetta LECCARDI': 2, 'Kailani CRAINE': 3, 'Ivett TOTH': 4, 'Dabin CHOI': 5, 'Dasa GRM': 6, 'Mariah BELL': 7, 'Laurine LECAVELIER': 8, 'Eliska BREZINOVA': 9, 'Hanul KIM': 10, 'Stanislava KONSTANTINOVA': 11, 'Viveca LINDFORS': 12, 'Loena HENDRICKX': 13, 'Elizabet TURSYNBAEVA': 14, 'Nicole SCHOTT': 15, 'Bradie TENNELL': 16, 'Mirai NAGASU': 17, 'Wakaba HIGUCHI': 18, 'Kaetlyn OSMOND': 19, 'Maria SOTSKOVA': 20, 'Gabrielle DALEMAN': 21, 'Satoko MIYAHARA': 22, 'Alina ZAGITOVA': 23, 'Carolina KOSTNER': 24,}

In [5]:
men_short_start = {'Slavik HAYRAPETYAN': 1, 'Brendan KERRY': 2, 'Javier RAYA': 3, 'Burak DEMIRBOGA': 4, 'Chih-I TSAO': 5, 'Phillip HARRIS': 6, 'Romain PONSART': 7, 'Valtter VIRTANEN': 8, 'Julian Zhi Jie YEE': 9, 'Jinseo KIM': 10, 'Igor REZNICHENKO': 11, 'Kazuki TOMONO': 12, 'Abzal RAKIMGALIEV': 13, 'Nicholas VRDOLJAK': 14, 'Donovan CARRILLO': 15, 'Larry LOUPOLOVER': 16, 'Paul FENTZ': 17, 'Stephane WALKER': 18, 'Ivan PAVLOV': 19, 'Keegan MESSING': 20, 'Michal BREZINA': 21, 'Morisi KVITELASHVILI': 22, 'Matteo RIZZO': 23, 'Alexander MAJOROV': 24, 'Nam NGUYEN': 25, 'Daniel SAMOHIN': 26, 'Vincent ZHOU': 27, 'Max AARON': 28, 'Deniss VASILJEVS': 29, 'Keiji TANAKA': 30, 'Misha GE': 31, 'Mikhail KOLYADA': 32, 'Alexei BYCHENKO': 33, 'Nathan CHEN': 34, 'Dmitri ALIEV': 35, 'Boyang JIN': 36, 'Shoma UNO': 37,}

In [9]:
dance_short_start = {'Teodora MARKOVA / Simon DAZE': 1, 'Kavita LORENZ / Joti POLIZOAKIS': 2, 'Cortney MANSOUROVA / Michal CESKA': 3, 'Chantelle KERRY / Andrew DODDS': 4, 'Viktoria KAVALIOVA / Yurii BIELIAIEV': 5, 'Cecilia TORN / Jussiville PARTANEN': 6, 'Anna YANOVSKAYA / Adam LUKACS': 7, 'Carolane SOUCISSE / Shane FIRUS': 8, 'Allison REED / Saulius AMBRULEVICIUS': 9, 'Lucie MYSLIVECKOVA / Lukas CSOLLEY': 10, 'Adel TANKOVA / Ronald ZILBERBERG': 11, 'Yura MIN / Alexander GAMELIN': 12, 'Tina GARABEDIAN / Simon PROULX SENECAL': 13, 'Olivia SMART / Adrian DIAZ': 14, 'Lilah FEAR / Lewis GIBSON': 15, 'Alexandra NAZAROVA / Maxim NIKITIN': 16, 'Natalia KALISZEK / Maksym SPODYRIEV': 17, 'Marie-Jade LAURIAULT / Romain LE GAC': 18, 'Shiyue WANG / Xinyu LIU': 19, 'Alisa AGAFONOVA / Alper UCAR': 20, 'Tiffani ZAGORSKI / Jonathan GUERREIRO': 21, 'Piper GILLES / Paul POIRIER': 22, 'Alexandra STEPANOVA / Ivan BUKIN': 23, 'Kaitlin HAWAYEK / Jean-Luc BAKER': 24, 'Charlene GUIGNARD / Marco FABBRI': 25, 'Kana MURAMOTO / Chris REED': 26, 'Madison CHOCK / Evan BATES': 27, 'Madison HUBBELL / Zachary DONOHUE': 28, 'Anna CAPPELLINI / Luca LANOTTE': 29, 'Kaitlyn WEAVER / Andrew POJE': 30, 'Gabriella PAPADAKIS / Guillaume CIZERON': 31}

In [76]:
men_free_start = {'Julian Zhi Jie YEE': 1, 'Daniel SAMOHIN': 2, 'Slavik HAYRAPETYAN': 3, 'Brendan KERRY': 4, 'Phillip HARRIS': 5, 'Donovan CARRILLO': 6, 'Max AARON': 7, 'Romain PONSART': 8, 'Dmitri ALIEV': 9, 'Michal BREZINA': 10, 'Keiji TANAKA': 11, 'Matteo RIZZO': 12, 'Alexander MAJOROV': 13, 'Kazuki TOMONO': 14, 'Paul FENTZ': 15, 'Deniss VASILJEVS': 16, 'Misha GE': 17, 'Alexei BYCHENKO': 18, 'Keegan MESSING': 19, 'Shoma UNO': 20, 'Boyang JIN': 21, 'Mikhail KOLYADA': 22, 'Vincent ZHOU': 23, 'Nathan CHEN': 24, }

In [111]:
dance_free_start = {'Shiyue WANG / Xinyu LIU': 1, 'Alisa AGAFONOVA / Alper UCAR': 2, 'Alexandra NAZAROVA / Maxim NIKITIN': 3, 'Kavita LORENZ / Joti POLIZOAKIS': 4, 'Allison REED / Saulius AMBRULEVICIUS': 5, 'Carolane SOUCISSE / Shane FIRUS': 6, 'Marie-Jade LAURIAULT / Romain LE GAC': 7, 'Kaitlin HAWAYEK / Jean-Luc BAKER': 8, 'Natalia KALISZEK / Maksym SPODYRIEV': 9, 'Olivia SMART / Adrian DIAZ': 10, 'Kana MURAMOTO / Chris REED': 11, 'Charlene GUIGNARD / Marco FABBRI': 12, 'Alexandra STEPANOVA / Ivan BUKIN': 13, 'Piper GILLES / Paul POIRIER': 14, 'Tiffani ZAGORSKI / Jonathan GUERREIRO': 15, 'Anna CAPPELLINI / Luca LANOTTE': 16, 'Madison CHOCK / Evan BATES': 17, 'Gabriella PAPADAKIS / Guillaume CIZERON': 18, 'Kaitlyn WEAVER / Andrew POJE': 19, 'Madison HUBBELL / Zachary DONOHUE': 20, }

In [227]:
ladies_short_results = pd.read_csv('fits/worlds_pairs_short_results.csv')
ladies_short_predictions = pd.read_csv('fits/worlds_predictions_ols_pairs_short.csv')

In [217]:
ladies_short_predictions.columns

Index([u'Unnamed: 0', u'reputation', u'skater', u'start_order', u'prediction',
       u'prediction_rank', u'real_start_order'],
      dtype='object')

In [228]:
wrong = []
score_loss = 0.
rank_loss_top10 = 0
rank_loss = 0
for skater in ladies_short_predictions.skater.unique():
    result = ladies_short_results[ladies_short_results.skater == skater]
    prediction = ladies_short_predictions[ladies_short_predictions.skater == skater]
    result_rank = result.segment_rank.values[0]
    predicted_rank = prediction.prediction_rank.values[0]
    result_score = result.points.values[0]
    predicted_score = prediction.prediction.values[0]
    rank_loss += abs(result_rank - predicted_rank)
    score_loss += (result_score - predicted_score)**2
    if result_rank != predicted_rank:
        wrong.append((skater, predicted_rank, result_rank))
    if result_rank <= 10:
        rank_loss_top10 += abs(result_rank - predicted_rank)

In [229]:
score_loss, rank_loss, rank_loss_top10

(897.02168065138324, 66, 16)

In [223]:
wrong

[('Alina ZAGITOVA', 1, 2),
 ('Kaetlyn OSMOND', 2, 4),
 ('Carolina KOSTNER', 4, 1),
 ('Wakaba HIGUCHI', 6, 8),
 ('Mirai NAGASU', 7, 9),
 ('Gabrielle DALEMAN', 8, 6),
 ('Dabin CHOI', 9, 21),
 ('Elizabet TURSYNBAEVA', 10, 11),
 ('Mariah BELL', 11, 17),
 ('Laurine LECAVELIER', 12, 15),
 ('Bradie TENNELL', 13, 7),
 ('Nicole RAJICOVA', 14, 27),
 ('Ivett TOTH', 15, 24),
 ('Nicole SCHOTT', 16, 12),
 ('Loena HENDRICKX', 17, 10),
 ('Anita OSTLUND', 18, 29),
 ('Kailani CRAINE', 19, 20),
 ('Alexia PAGANINI', 20, 19),
 ('Amy LIN', 21, 28),
 ('Angelina KUCHVALSKA', 22, 37),
 ('Eliska BREZINOVA', 23, 18),
 ('Hanul KIM', 24, 14),
 ('Isadora WILLIAMS', 25, 35),
 ('Stanislava KONSTANTINOVA', 26, 16),
 ('Xiangning LI', 27, 26),
 ('Viveca LINDFORS', 28, 13),
 ('Anne Line GJERSEM', 29, 33),
 ('Larkyn AUSTMAN', 30, 25),
 ('Gerli LIINAMAE', 33, 34),
 ('Alisa STOMAKHINA', 34, 30),
 ('Antonina DUBININA', 35, 36),
 ('Elisabetta LECCARDI', 36, 23),
 ('Dasa GRM', 37, 22)]

In [3]:
DISCPLINES = ('men', 'ladies', 'pairs', 'dance')

In [83]:
results = {}
hm3 = {}
lr_short = {}
lr_free = {}
for discipline in DISCPLINES:
    results[discipline] = pd.read_csv('pd_data/worlds/' + discipline + '_results.csv', index_col='skater')
    lr_short[discipline] = pd.read_csv('fits/worlds_predictions/ols_' + discipline + '_short.csv', index_col='skater')
    lr_free[discipline] = pd.read_csv('fits/worlds_predictions/ols_' + discipline + '_free.csv', index_col='skater')
    lr_free[discipline].prediction = lr_free[discipline].apply(
        lambda row: 0. if pd.isnull(row.prediction) else row.prediction, axis=1
    )
    lr_short[discipline].prediction += lr_free[discipline].prediction
    lr_short[discipline] = lr_short[discipline].sort_values(by='prediction', ascending=False)
    lr_short[discipline]['predicted_final_rank'] = [i+1 for i in xrange(len(lr_short[discipline]))]
    hm3[discipline] = pd.read_csv('fits/worlds_predictions/breakdown_prediction_' + discipline +'.csv', index_col='skater')
    hm3[discipline].rename({'rank':'placement'}, inplace=True, axis=1)

In [84]:
for discipline in DISCPLINES:
    print discipline
    ols_score_loss = 0.
    ols_rank_loss = 0.
    hm3_score_loss = 0.
    hm3_rank_loss = 0.
    num_entries = len(results[discipline])
    for skater in results[discipline].index:
        actual_points = results[discipline].loc[skater].points
        actual_rank   = results[discipline].loc[skater].placement
        ols_points    = lr_short[discipline].loc[skater].prediction
        ols_rank      = lr_short[discipline].loc[skater].predicted_final_rank
        hm3_points    = hm3[discipline].loc[skater].prediction
        hm3_rank      = hm3[discipline].loc[skater].placement
        ols_score_loss += (actual_points - ols_points)**2
        ols_rank_loss  += abs(actual_rank - ols_rank)
        hm3_score_loss += (actual_points - hm3_points)**2
        hm3_rank_loss  += abs(actual_rank - hm3_rank)
    print ' ols'
    print '  ', ols_score_loss, np.sqrt(ols_score_loss / num_entries), ols_rank_loss, ols_rank_loss / num_entries
    print ' hm2'
    print '  ', hm3_score_loss, np.sqrt(hm3_score_loss / num_entries), hm3_rank_loss, hm3_rank_loss / num_entries
#         print skater, actual_points, actual_rank, 'ols', actual_points - ols_points, actual_rank - ols_rank, 'hm3', actual_points - hm3_points, actual_rank - hm3_rank

men
 ols
   8419.8958662 15.085249522 104.0 2.81081081081
 hm2
   188489.28528 71.3744002256 194.0 5.24324324324
ladies
 ols
   16857.3357395 21.3448745284 108.0 2.91891891892
 hm2
   92783.7262822 50.0766240665 168.0 4.54054054054
pairs
 ols
   2006.60582518 8.46548839444 62.0 2.21428571429
 hm2
   67821.5483081 49.2158323496 80.0 2.85714285714
dance
 ols
   739.478693021 4.88407118988 38.0 1.22580645161
 hm2
   34691.9527795 33.4528810561 48.0 1.54838709677


In [97]:
lr_free['dance']

Unnamed: 0_level_0,Unnamed: 0,prediction,prediction_rank,prediction_total,real_start_order,reputation,short_result,start_order,prediction_final_rank
skater,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Gabriella PAPADAKIS / Guillaume CIZERON,15,120.055479,1,203.785479,18.0,123.35,83.73,0.9,1
Madison HUBBELL / Zachary DONOHUE,18,115.566852,2,195.986852,20.0,113.35,80.42,1.0,2
Kaitlyn WEAVER / Andrew POJE,1,114.275848,3,192.585848,19.0,112.54,78.31,0.95,3
Anna CAPPELLINI / Luca LANOTTE,12,111.595472,5,189.055472,16.0,112.07,77.46,0.8,4
Madison CHOCK / Evan BATES,16,113.148127,4,188.808127,17.0,113.31,75.66,0.85,5
Piper GILLES / Paul POIRIER,6,109.013463,6,183.523463,14.0,110.45,74.51,0.7,6
Alexandra STEPANOVA / Ivan BUKIN,3,107.625101,7,182.125101,13.0,109.48,74.5,0.65,7
Tiffani ZAGORSKI / Jonathan GUERREIRO,2,105.339202,8,177.789202,15.0,103.1,72.45,0.75,8
Charlene GUIGNARD / Marco FABBRI,19,104.812875,9,175.962875,12.0,106.17,71.15,0.6,9
Kana MURAMOTO / Chris REED,11,99.402404,11,165.052404,11.0,98.59,65.65,0.55,10


In [63]:
for discipline in DISCPLINES:
    print len(results[discipline])

37
37
28
31


In [59]:
lr_short['dance']

Unnamed: 0_level_0,Unnamed: 0,reputation,start_order,prediction,real_start_order,predicted_final_rank
skater,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Gabriella PAPADAKIS / Guillaume CIZERON,19,82.07,1.0,200.54502,31,1
Kaitlyn WEAVER / Andrew POJE,0,77.47,0.967742,191.37362,30,2
Madison HUBBELL / Zachary DONOHUE,28,77.75,0.903226,192.41809,28,3
Anna CAPPELLINI / Luca LANOTTE,24,76.57,0.935484,187.852837,29,4
Madison CHOCK / Evan BATES,25,76.25,0.870968,188.745223,27,5
Alexandra STEPANOVA / Ivan BUKIN,21,75.38,0.741935,181.743061,23,6
Piper GILLES / Paul POIRIER,22,72.83,0.709677,181.153248,22,7
Charlene GUIGNARD / Marco FABBRI,29,71.58,0.806452,176.750129,25,8
Kaitlin HAWAYEK / Jean-Luc BAKER,7,69.08,0.774194,171.029767,24,9
Tiffani ZAGORSKI / Jonathan GUERREIRO,15,67.62,0.677419,173.666588,21,10
