In [81]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle

In [82]:
# Load the pickled season object used by the comparison cells below.
# Pickle data is binary, so the file is opened in 'rb' mode (text mode breaks
# on Python 3 and on platforms that translate line endings).
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('season2018.pickle', 'rb') as f:
    season2018 = pickle.load(f)

In [83]:
# Model-fit outputs read from the fits/ directory.
# `elements_pred` has one row per scored element with `pooled_prediction`,
# `unpooled_prediction` and `partial_pool_prediction` columns (see its head()
# output further down); `components_pred` is read the same way and is accessed
# below through columns such as `points` and `partial_pool_predictions`.
elements_pred = pd.read_csv('fits/men_multi_elts_1.csv')
components_pred = pd.read_csv('fits/men_multi_comp_1.csv')

In [89]:
# Number of component rows whose actual `points` exceed the partial-pooling prediction.
len(components_pred[components_pred.points > components_pred.partial_pool_predictions])

1226

In [90]:
# Total number of rows in the component predictions.
len(components_pred)

1320

In [33]:
# Preview the first rows of the element-level predictions.
elements_pred.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,bonus,date,element,elt_type,event,goe,info,number,points,segment,segment_rank,skater,start_order,skater_code,partial_pool_prediction,pooled_prediction,unpooled_prediction
0,0,0,0,False,2017-10-22,4Lz+3T,4j,gprus2017,1.57,,1,19.47,gprus2017 men_short,1,Nathan CHEN,10,205,12.647146,9.578746,13.102899
1,1,1,1,False,2017-10-22,FSSp4,sp,gprus2017,0.86,,2,3.86,gprus2017 men_short,1,Nathan CHEN,10,205,3.773436,3.062927,3.78773
2,2,2,2,False,2017-10-22,CCSp3,sp,gprus2017,0.21,,3,3.01,gprus2017 men_short,1,Nathan CHEN,10,205,3.734792,3.062927,3.78773
3,3,3,3,True,2017-10-22,4F,4j,gprus2017,-1.03,,4,12.5,gprus2017 men_short,1,Nathan CHEN,10,205,12.459942,9.578746,13.102899
4,4,4,4,True,2017-10-22,3A,3j,gprus2017,-0.86,,5,8.49,gprus2017 men_short,1,Nathan CHEN,10,205,8.157336,6.923761,8.681645


In [84]:
# 2018 data: one results CSV per discipline (men, ladies, pairs, dance).
# Only `men` is used by the comparison code in the cells visible below.
men = pd.read_csv('pd_data/results18_men.csv')
ladies = pd.read_csv('pd_data/results18_ladies.csv')
pairs = pd.read_csv('pd_data/results18_pairs.csv')
dance = pd.read_csv('pd_data/results18_dance.csv')

In [92]:
# Load the same pickle file a second time into `season18` and call its
# `load_scores()` method.
# NOTE(review): what `load_scores()` populates is not visible here -- confirm.
# Pickle data is binary, so the file is opened in 'rb' mode.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('season2018.pickle', 'rb') as f:
    season18 = pickle.load(f)
season18.load_scores()

In [103]:
# Results table used below to derive each skater's best short and free scores.
history = pd.read_csv('pd_data/results_nowd_nofra15_men.csv')

In [95]:
def get_comparison(estimate_type):
    """Build per-event (predictions, outcomes) pairs for one estimate type.

    Reads the notebook-level `season2018`, `elements_pred`, `components_pred`
    and `men` objects.

    For each event in `season2018.events` the predicted total per skater is:
      * the sum of `elements_pred[estimate_type]` over all of the event's rows,
      * plus the sum of `components_pred[estimate_type + 's']` for the
        '<event> men_short' segment,
      * plus, for skaters that have rows in the '<event> men_free' segment,
        twice the summed component column for that segment.

    Parameters
    ----------
    estimate_type : str
        Column name in `elements_pred`; `components_pred` is expected to hold
        the same name with a trailing 's'.

    Returns
    -------
    list of (dict, dict)
        One `(predictions, outcomes)` tuple per event, in the order of
        `season2018.events`. `predictions` maps skater -> predicted total,
        `outcomes` maps `men.Name` -> `men.Points` for that event.
    """
    component_column = estimate_type + 's'
    comparison = []
    for event in season2018.events:
        event_name = event.name
        elts = elements_pred[elements_pred.event == event_name].groupby('skater')[estimate_type].sum()
        comp_short = components_pred[components_pred.segment == event_name + ' men_short'].groupby('skater')[component_column].sum()
        # Free-segment components are weighted by 2.
        # NOTE(review): presumably the scoring factor for the free program -- confirm.
        comp_free = 2 * components_pred[components_pred.segment == event_name + ' men_free'].groupby('skater')[component_column].sum()

        # Index-aligned addition: a skater missing from either series gets NaN.
        total_score = elts + comp_short
        predictions = {}
        for skater in total_score.index:
            # `.loc` replaces the deprecated `.ix` label lookup.
            predictions[skater] = total_score.loc[skater]
            if skater in comp_free.index:
                predictions[skater] += comp_free.loc[skater]

        results = men[men.Event == event_name]
        outcomes = dict(zip(results['Name'], results['Points']))
        comparison.append((predictions, outcomes))
    return comparison

In [96]:
def get_ranks(result_dict):
    """Rank the keys of a score dict, highest score first.

    Parameters
    ----------
    result_dict : dict
        Maps skater -> numeric score.

    Returns
    -------
    dict
        Maps skater -> 1-based rank, where rank 1 is the highest score.
        Equal scores keep the relative order in which the dict yields them
        (the sort is stable), so ties still receive distinct ranks.
    """
    # `items()` plus an index-based key works on both Python 2 and 3
    # (`iteritems` and tuple-unpacking lambdas are Python-2-only).
    ordered = sorted(result_dict.items(), key=lambda item: -1 * item[1])
    return dict((skater, position + 1) for position, (skater, _) in enumerate(ordered))

In [97]:
def calculate_loss(comparison):
    """Accumulate score and ranking errors over a list of events.

    Parameters
    ----------
    comparison : iterable of (dict, dict)
        One `(predictions, outcomes)` pair per event, each mapping
        skater -> score. Every skater in `predictions` must also be a key of
        `outcomes`, otherwise a KeyError is raised.

    Returns
    -------
    score_difference : float
        Sum over all events and predicted skaters of
        `(prediction - outcome) ** 2`.
    rank_mistake : int
        Sum of the absolute differences between predicted and actual rank.
    rank_miss : dict
        Maps skater -> list of `(predicted_rank, actual_rank)` tuples, one
        per event in which the two ranks differ.

    Notes
    -----
    Predicted ranks are computed among the predicted skaters only, while
    actual ranks are computed over every key of `outcomes`.
    """
    score_difference = 0.
    rank_mistake = 0
    rank_miss = {}
    for i, (predictions, outcomes) in enumerate(comparison):
        for skater in predictions:
            squared_error = (predictions[skater] - outcomes[skater]) ** 2
            # Report only the skater whose own error is null; checking the
            # running total instead would flag every later skater as well.
            if pd.isnull(squared_error):
                print('%s %s' % (i, skater))
            # A null error still propagates into the total, as before.
            score_difference += squared_error
        predicted_ranks = get_ranks(predictions)
        actual_ranks = get_ranks(outcomes)
        for skater in predictions:
            predicted = predicted_ranks[skater]
            actual = actual_ranks[skater]
            diff = abs(predicted - actual)
            if diff:
                rank_miss.setdefault(skater, []).append((predicted, actual))
                rank_mistake += diff
    return score_difference, rank_mistake, rank_miss

In [98]:
# Evaluate each estimate type with the same loss and collect the three results
# (squared score error, total rank error, per-skater rank misses) in dicts
# keyed by estimate type. Later cells add further keys to these dicts.
score_differences = {}
rank_mistakes = {}
rank_misses = {}
for estimate_type in ('pooled_prediction', 'unpooled_prediction', 'partial_pool_prediction'):
    comparison = get_comparison(estimate_type)
    # calculate_loss returns (score_difference, rank_mistake, rank_miss), in that order.
    score_difference, rank_mistake, rank_miss = calculate_loss(comparison)
    score_differences[estimate_type] = score_difference
    rank_mistakes[estimate_type] = rank_mistake
    rank_misses[estimate_type] = rank_miss

In [99]:
# Display the total rank error and the summed squared score error per estimate type.
rank_mistakes, score_differences

({'partial_pool_prediction': 302,
  'pooled_prediction': 376,
  'unpooled_prediction': 380},
 {'partial_pool_prediction': 91053.292305282477,
  'pooled_prediction': 204946.19175774002,
  'unpooled_prediction': 140457656.8769297})

In [100]:
# NOTE(review): the path ends in .csv but the file is read with pickle and
# unpacked as a 2-tuple -- confirm the file really is a pickle.
# Pickle data is binary, so the file is opened in 'rb' mode.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('fits/men_reputation_start_pred.csv', 'rb') as f:
    short_pred, free_pred = pickle.load(f)

In [101]:
# Distinct event names in the men's results, in order of first appearance.
men.Event.unique()

array(['gprus2017', 'gpcan2017', 'gpchn2017', 'gpjpn2017', 'gpfra2017',
       'gpusa2017', 'gpf1718', 'ec2018', 'fc2018'], dtype=object)

In [104]:
# Best (maximum) 'Short Score' per skater in `history`, as a dict keyed by Name.
# One groupby pass replaces re-filtering the whole frame once per row, and avoids
# the deprecated `get_values()` and the Python-2-only `np.max(map(...))`.
# `np.max` on the raw values keeps NaN propagation: a skater with any NaN
# score still gets NaN rather than having it skipped.
individual_bests_short = (
    history.groupby('Name')['Short Score']
    .apply(lambda scores: np.max(scores.astype(float).values))
    .to_dict()
)
len(individual_bests_short)

300

In [105]:
# Drop rows whose 'Free Rank' is 'DNQ' or 'WD', then take each skater's best
# (maximum) 'Free Score' as a dict keyed by Name.
have_frees = history[~history['Free Rank'].isin(['DNQ', 'WD'])]
# One groupby pass replaces re-filtering the frame once per row, and avoids
# the deprecated `get_values()` and the Python-2-only `np.max(map(...))`.
# `np.max` on the raw values keeps NaN propagation, as before.
individual_bests_free = (
    have_frees.groupby('Name')['Free Score']
    .apply(lambda scores: np.max(scores.astype(float).values))
    .to_dict()
)
len(individual_bests_free)

238

In [106]:
# Median of the per-skater best scores; used as the fallback "best" for
# skaters that do not appear in the history table.
# `list(...)` is needed because np.median cannot consume a Python 3 dict view.
med_short = np.median(list(individual_bests_short.values()))
med_free = np.median(list(individual_bests_free.values()))
med_short, med_free

(61.090000000000003, 124.88)

In [107]:
# Attach each skater's best historical short/free score to `men`, falling
# back to the median best for names that are absent from the lookup dicts.
men['Short Best'] = men.Name.map(lambda name: individual_bests_short.get(name, med_short))
men['Free Best'] = men.Name.map(lambda name: individual_bests_free.get(name, med_free))

In [108]:
# Preview `men` with the newly added 'Short Best' and 'Free Best' columns.
men.head()

Unnamed: 0.1,Unnamed: 0,Rank,Event,Date,Name,Nation,Points,Short Rank,Short Score,Free Rank,Free Score,Short Start,Free Start,Num Short Scorecards,Num Free Scorecards,Short Best,Free Best
0,0,1,gprus2017,2017-10-22,Nathan CHEN,USA,293.79,1,100.54,2,193.25,10,12.0,12,12,103.12,204.34
1,1,2,gprus2017,2017-10-22,Yuzuru HANYU,JPN,290.77,2,94.85,1,195.92,12,11.0,12,12,110.95,223.2
2,2,3,gprus2017,2017-10-22,Mikhail KOLYADA,RUS,271.06,4,85.79,3,185.27,11,8.0,12,12,93.28,178.31
3,3,4,gprus2017,2017-10-22,Misha GE,UZB,255.33,5,85.02,4,170.31,6,7.0,12,12,82.25,163.54
4,4,5,gprus2017,2017-10-22,Moris KVITELASHVILI,GEO,250.26,8,80.67,5,169.59,2,5.0,12,12,76.85,162.9


In [109]:
def predict_short(row):
    """Linear prediction of a skater's short-program score.

    Computes `20.8236 + 0.6155 * best + 12.3222 * start / n`, where `best` is
    the skater's entry in `individual_bests_short` (or `med_short` when the
    name is not in that dict), `start` is `row['Short Start']` and `n` is
    `row['Num Short Scorecards']`.

    The coefficients are hard-coded.
    NOTE(review): presumably taken from an OLS fit (results are stored in
    `pred_ols_short`) -- confirm the source.

    Parameters
    ----------
    row : row of `men` (anything exposing `.Name` and the two columns above).

    Returns
    -------
    float
    """
    best = individual_bests_short.get(row.Name, med_short)
    base = 20.8236 + 0.6155 * best
    start_term = 12.3222 * float(row['Short Start']) / row['Num Short Scorecards']
    return base + start_term

def predict_free(row):
    """Linear prediction of a skater's free-program score.

    Returns 0. when `row['Free Start']` is null. Otherwise computes
    `72.3915 + 0.3983 * best + 35.2871 * start / n`, where `best` is the
    skater's entry in `individual_bests_free` (or `med_free` when the name is
    not in that dict), `start` is `row['Free Start']` and `n` is
    `row['Num Free Scorecards']`.

    The coefficients are hard-coded.
    NOTE(review): presumably taken from an OLS fit (results are stored in
    `pred_ols_free`) -- confirm the source.

    Parameters
    ----------
    row : row of `men` (anything exposing `.Name` and the two columns above).

    Returns
    -------
    float
    """
    start = row['Free Start']
    if pd.isnull(start):
        return 0.
    best = individual_bests_free.get(row.Name, med_free)
    base = 72.3915 + 0.3983 * best
    start_term = 35.2871 * float(start) / row['Num Free Scorecards']
    return base + start_term

In [110]:
# Row-wise short-program prediction for every entry in `men`.
men['pred_ols_short'] = men.apply(predict_short, axis=1)

In [111]:
# Row-wise free-program prediction; predict_free returns 0. where 'Free Start' is null.
men['pred_ols_free'] = men.apply(predict_free, axis=1)

In [112]:
# Per-event (predictions, outcomes) pairs for the OLS model, in the same shape
# that calculate_loss consumes: predicted total = short + free prediction,
# outcome = actual Points, both keyed by skater name.
ols_comparison = []
for event in season2018.events:
    competition = men[men.Event == event.name]
    predicted_totals = competition.pred_ols_short + competition.pred_ols_free
    predictions = dict(zip(competition.Name, predicted_totals))
    outcomes = dict(zip(competition.Name, competition.Points))
    ols_comparison.append((predictions, outcomes))

In [118]:
# calculate_loss returns (score_difference, rank_mistake, rank_miss) in that
# order. Unpack in the same order so the integer rank-error total is stored in
# `rank_mistakes` and the per-skater detail dict in `rank_misses`, matching
# how the other estimate types are recorded.
score_difference_ols, rank_mistake_ols, rank_misses_ols = calculate_loss(ols_comparison)
score_differences['ols'] = score_difference_ols
rank_misses['ols'] = rank_misses_ols
rank_mistakes['ols'] = rank_mistake_ols

In [115]:
# Summed squared score error per method, now including 'ols'.
score_differences

{'ols': 50237.52214114873,
 'partial_pool_prediction': 91053.292305282477,
 'pooled_prediction': 204946.19175774002,
 'unpooled_prediction': 140457656.8769297}

In [45]:
# Total absolute rank error per method.
rank_mistakes

{'ols': 308,
 'partial_pool_prediction': 302,
 'pooled_prediction': 376,
 'unpooled_prediction': 380}

In [169]:
def get_comparison_combined(df):
    """Build per-event total and per-program comparisons from one prediction table.

    Reads the notebook-level `season2018` and `men` objects.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns `segment`, `is_elt`, `skater` and `prediction`.
        Rows with `is_elt == 1` are treated as elements, rows with
        `is_elt == 0` as components.

    Returns
    -------
    comparison : list of (dict, dict)
        One `(predictions, outcomes)` pair per event in `season2018.events`;
        `predictions` maps skater -> short (+ free, when available) predicted
        score, `outcomes` maps `men.Name` -> `men.Points`.
    comparison_programs : list of (dict, dict, dict, dict)
        One `(short_pred, short_outcome, free_pred, free_outcome)` tuple per
        event. `free_outcome` omits skaters whose 'Free Score' is null.

    Notes
    -----
    The skater loop runs over the union of short and free skaters, so a
    skater with free rows but no short rows raises KeyError on the short
    lookup.
    """
    comparison = []
    comparison_programs = []
    for event in season2018.events:
        event_name = event.name
        short = df[df.segment == event_name + ' men_short']
        free = df[df.segment == event_name + ' men_free']

        # Per-skater sums of predicted elements and components.
        short_elts = short[short.is_elt == 1].groupby('skater')['prediction'].sum()
        free_elts = free[free.is_elt == 1].groupby('skater')['prediction'].sum()
        comp_short = short[short.is_elt == 0].groupby('skater')['prediction'].sum()
        # Free-segment components are weighted by 2.
        # NOTE(review): presumably the scoring factor for the free program -- confirm.
        comp_free = 2 * free[free.is_elt == 0].groupby('skater')['prediction'].sum()

        # Index-aligned additions: skaters missing from one side become NaN.
        short_score = short_elts + comp_short
        free_score = free_elts + comp_free
        total_score = short_score + free_score

        predictions = {}
        short_pred = {}
        free_pred = {}
        for skater in total_score.index:
            # `.loc` replaces the deprecated `.ix` label lookup.
            short_pred[skater] = short_score.loc[skater]
            predictions[skater] = short_pred[skater]
            if skater in free_score.index:
                free_pred[skater] = free_score.loc[skater]
                predictions[skater] += free_pred[skater]

        results = men[men.Event == event_name]
        outcomes = dict(zip(results['Name'], results['Points']))
        short_outcome = dict(zip(results['Name'], results['Short Score']))
        free_outcome = dict(
            (name, score)
            for name, score in zip(results['Name'], results['Free Score'])
            if not pd.isnull(score)
        )

        comparison.append((predictions, outcomes))
        comparison_programs.append((short_pred, short_outcome, free_pred, free_outcome))
    return comparison, comparison_programs

In [161]:
# Score the `predictions2` table with the same loss as the other methods.
# calculate_loss returns (score_difference, rank_mistake, rank_miss), matching
# the order of the three targets on the left.
# NOTE(review): `predictions2` is not defined in any cell visible here --
# confirm it is loaded before this cell on a fresh kernel run.
score_differences['prediction2'], rank_mistakes['prediction2'], rank_misses['prediction2'] = calculate_loss(get_comparison_combined(predictions2)[0])

In [170]:
# Inspect the per-program (short_pred, short_outcome, free_pred, free_outcome) tuples per event.
get_comparison_combined(predictions2)[1]

[({'Andrei LAZUKIN': 63.671315225140006,
   'Daniel SAMOHIN': 70.449969969609995,
   'Denis TEN': 80.448933857850008,
   'Deniss VASILJEVS': 74.894249014769997,
   'Dmitri ALIEV': 62.77371994157,
   'Grant HOCHSTEIN': 74.054346166889999,
   'Mikhail KOLYADA': 83.997503447059984,
   'Misha GE': 74.459112829250003,
   'Moris KVITELASHVILI': 75.45644624082,
   'Nam NGUYEN': 78.579096597190002,
   'Nathan CHEN': 91.440426214209992,
   'Yuzuru HANYU': 94.74895394987},
  {'Andrei LAZUKIN': 78.54,
   'Daniel SAMOHIN': 62.02,
   'Denis TEN': 69.0,
   'Deniss VASILJEVS': 82.44,
   'Dmitri ALIEV': 88.77,
   'Grant HOCHSTEIN': 67.56,
   'Mikhail KOLYADA': 85.79,
   'Misha GE': 85.02,
   'Moris KVITELASHVILI': 80.67,
   'Nam NGUYEN': 80.74,
   'Nathan CHEN': 100.54,
   'Yuzuru HANYU': 94.85},
  {'Andrei LAZUKIN': 128.59292752707,
   'Daniel SAMOHIN': 135.62300495433101,
   'Denis TEN': 150.81385262985799,
   'Deniss VASILJEVS': 150.07281262237001,
   'Dmitri ALIEV': 134.32823151555002,
   'Grant H

In [173]:
# Component rows only (is_elt == 0): how many have actual points above the
# prediction, out of how many in total.
comps = predictions2[predictions2.is_elt == 0]
len(comps[comps.points > comps.prediction]), len(comps)

(981, 1320)

In [178]:
# Map each event name to (short-program predictions, short-program outcomes).
# The per-program comparison is computed once and reused instead of being
# recomputed twice per event, and the event count is no longer hard-coded.
# NOTE(review): events are paired by position, so this assumes
# `men.Event.unique()` follows the same order as `season2018.events` -- confirm.
# `zip` stops at the shorter of the two sequences.
_program_comparisons = get_comparison_combined(predictions2)[1]
fancy_short_pred = {
    event_name: (programs[0], programs[1])
    for event_name, programs in zip(men.Event.unique(), _program_comparisons)
}

In [181]:
# Independent copy of the columns to compare, so adding columns does not touch `men`.
compare = men[['pred_ols_short', 'Name', 'Event']].copy()

In [183]:
# Look up the short-program prediction for each (Event, Name) pair;
# raises KeyError if the event or skater is missing from `fancy_short_pred`.
compare['pred2_short'] = compare.apply(lambda row: fancy_short_pred[row.Event][0][row.Name], axis=1)

In [184]:
# Look up the actual short-program score for each (Event, Name) pair.
compare['real_short'] = compare.apply(lambda row: fancy_short_pred[row.Event][1][row.Name], axis=1)

In [187]:
# Sum of squared differences between `pred_ols_short` and `pred2_short`.
sum(map(lambda x: x * x, compare['pred_ols_short'] - compare['pred2_short']))

5694.2237378707268

In [188]:
# Display the side-by-side comparison table.
compare

Unnamed: 0,pred_ols_short,Name,Event,pred2_short,real_short
0,94.562460,Nathan CHEN,gprus2017,91.440426,100.54
1,101.435525,Yuzuru HANYU,gprus2017,94.748954,94.85
2,89.532790,Mikhail KOLYADA,gprus2017,83.997503,85.79
3,77.609575,Misha GE,gprus2017,74.459113,85.02
4,70.178475,Moris KVITELASHVILI,gprus2017,75.456446,80.67
5,66.639295,Dmitri ALIEV,gprus2017,62.773720,88.77
6,73.937110,Nam NGUYEN,gprus2017,78.579097,80.74
7,78.316365,Deniss VASILJEVS,gprus2017,74.894249,82.44
8,83.983105,Denis TEN,gprus2017,80.448934,69.00
9,59.451345,Andrei LAZUKIN,gprus2017,63.671315,78.54


In [191]:
# Inspect every row of `predictions2` for a single skater.
predictions2[predictions2.skater == 'Shoma UNO']

Unnamed: 0.1,Unnamed: 0,element,elt_type,event,is_elt,points,prediction,segment,skater,skater_code
240,240,4F,4j,gpcan2017,1,14.59,11.294323,gpcan2017 men_short,Shoma UNO,253
241,241,FCSp4,sp,gpcan2017,1,4.20,4.069687,gpcan2017 men_short,Shoma UNO,253
242,242,StSq3,st,gpcan2017,1,4.44,4.940551,gpcan2017 men_short,Shoma UNO,253
243,243,4T+2T,4j,gpcan2017,1,13.76,10.970014,gpcan2017 men_short,Shoma UNO,253
244,244,3A,3j,gpcan2017,1,11.78,9.069428,gpcan2017 men_short,Shoma UNO,253
245,245,CSSp4,sp,gpcan2017,1,3.71,4.010456,gpcan2017 men_short,Shoma UNO,253
246,246,CCoSp4,sp,gpcan2017,1,4.64,4.061950,gpcan2017 men_short,Shoma UNO,253
324,324,4Lo,4j,gpcan2017,1,14.14,11.108533,gpcan2017 men_free,Shoma UNO,253
325,325,3Lo,3j,gpcan2017,1,4.60,9.120333,gpcan2017 men_free,Shoma UNO,253
326,326,3A,3j,gpcan2017,1,10.50,9.073478,gpcan2017 men_free,Shoma UNO,253
