In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle

In [2]:
# Load the pickled 2018 season object.
# Pickle data is a byte stream, so the file is opened in binary mode ('rb');
# text mode can corrupt the stream on some platforms and fails on Python 3.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('season2018.pickle', 'rb') as f:
    season2018 = pickle.load(f)

In [32]:
# Read the two fitted-model prediction tables: one with per-element rows,
# one with per-component rows.
elements_pred, components_pred = (
    pd.read_csv('fits/men_multi_elts_1.csv'),
    pd.read_csv('fits/men_multi_comp_1.csv'),
)

In [33]:
# Preview the first rows of elements_pred to check its columns and values.
elements_pred.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,bonus,date,element,elt_type,event,goe,info,number,points,segment,segment_rank,skater,start_order,skater_code,partial_pool_prediction,pooled_prediction,unpooled_prediction
0,0,0,0,False,2017-10-22,4Lz+3T,4j,gprus2017,1.57,,1,19.47,gprus2017 men_short,1,Nathan CHEN,10,205,12.647146,9.578746,13.102899
1,1,1,1,False,2017-10-22,FSSp4,sp,gprus2017,0.86,,2,3.86,gprus2017 men_short,1,Nathan CHEN,10,205,3.773436,3.062927,3.78773
2,2,2,2,False,2017-10-22,CCSp3,sp,gprus2017,0.21,,3,3.01,gprus2017 men_short,1,Nathan CHEN,10,205,3.734792,3.062927,3.78773
3,3,3,3,True,2017-10-22,4F,4j,gprus2017,-1.03,,4,12.5,gprus2017 men_short,1,Nathan CHEN,10,205,12.459942,9.578746,13.102899
4,4,4,4,True,2017-10-22,3A,3j,gprus2017,-0.86,,5,8.49,gprus2017 men_short,1,Nathan CHEN,10,205,8.157336,6.923761,8.681645


In [7]:
# 2018 results, one table per discipline (men, ladies, pairs, dance).
men, ladies, pairs, dance = (
    pd.read_csv('pd_data/results18_men.csv'),
    pd.read_csv('pd_data/results18_ladies.csv'),
    pd.read_csv('pd_data/results18_pairs.csv'),
    pd.read_csv('pd_data/results18_dance.csv'),
)

In [8]:
# Load the same season pickle under the name `season18` and call its
# load_scores() method (defined on the pickled object's class, not shown here).
# The file is opened in binary mode ('rb') because pickle data is a byte stream.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('season2018.pickle', 'rb') as f:
    season18 = pickle.load(f)
season18.load_scores()

In [9]:
# Historical men's results; used further down to compute each skater's best
# short and free scores.
history = pd.read_csv('pd_data/results_nowd_nofra15_men.csv')

In [36]:
def get_comparison(estimate_type):
    """Pair predicted and actual total scores for every event of the season.

    Reads the notebook-level tables ``elements_pred``, ``components_pred`` and
    ``men`` and the ``season2018`` object.

    Parameters
    ----------
    estimate_type : str
        Name of the prediction column in ``elements_pred``. The matching
        column in ``components_pred`` is this name with a trailing ``'s'``.

    Returns
    -------
    list of (dict, dict)
        One ``(predictions, outcomes)`` tuple per event in
        ``season2018.events``. ``predictions`` maps skater -> summed element
        predictions + short-program component predictions + twice the
        free-skate component predictions (when the skater has free-skate
        rows). ``outcomes`` maps skater name -> ``Points`` from ``men``.

    Notes
    -----
    The element and short-component sums are added as pandas Series, which
    aligns them on the union of their indexes; a skater missing from either
    one therefore gets a NaN prediction.
    """
    component_column = estimate_type + 's'
    comparison = []
    for event in season2018.events:
        event_name = event.name

        elts = (elements_pred[elements_pred.event == event_name]
                .groupby('skater')[estimate_type].sum())
        comp_short = (components_pred[components_pred.segment == event_name + ' men_short']
                      .groupby('skater')[component_column].sum())
        # Free-skate component predictions are counted twice.
        comp_free = 2 * (components_pred[components_pred.segment == event_name + ' men_free']
                         .groupby('skater')[component_column].sum())

        total_score = elts + comp_short
        predictions = {}
        for skater in total_score.index:
            # .loc is the label-based replacement for the deprecated .ix.
            prediction = total_score.loc[skater]
            if skater in comp_free.index:
                prediction += comp_free.loc[skater]
            predictions[skater] = prediction

        event_results = men[men.Event == event_name]
        outcomes = dict(zip(event_results['Name'], event_results['Points']))

        comparison.append((predictions, outcomes))
    return comparison

In [11]:
def get_ranks(result_dict):
    """Rank the keys of ``result_dict`` by descending value.

    Parameters
    ----------
    result_dict : dict
        Maps a key (here: skater name) to a numeric score.

    Returns
    -------
    dict
        Maps each key to its 1-based rank; the highest value gets rank 1.
        Equal values receive distinct consecutive ranks in the order the
        stable sort leaves them (the dict's iteration order).
    """
    # A plain one-argument lambda and .items() work on Python 2.7 and 3 alike;
    # tuple-parameter lambdas and .iteritems() exist only on Python 2.
    ordered = sorted(result_dict.items(), key=lambda item: -1 * item[1])
    return {skater: position
            for position, (skater, _) in enumerate(ordered, 1)}

In [40]:
# Score every multilevel-model estimate against the actual results.
# For each estimate type this fills:
#   score_differences -- sum of squared (predicted - actual Points) errors
#   rank_mistakes     -- sum of |predicted rank - actual rank| over all skaters
#   rank_misses       -- skater -> list of (predicted_rank, actual_rank) pairs,
#                        one per event where the two ranks differ
# NOTE(review): ranks are computed separately within `predictions` and within
# `outcomes`, so a skater present in only one of them shifts the ranks of the
# others -- confirm this is intended.
score_differences = {}
rank_mistakes = {}
rank_misses = {}
for estimate_type in ('pooled_prediction', 'unpooled_prediction', 'partial_pool_prediction'):
    comparison = get_comparison(estimate_type)
    score_difference = 0.
    rank_mistake = 0
    rank_miss = {}
    for i, (predictions, outcomes) in enumerate(comparison):
        for skater in predictions:
            squared_error = (predictions[skater] - outcomes[skater])**2
            score_difference += squared_error
            # Report only the skater whose own term is NaN. Testing the
            # running sum instead would also print every skater that follows
            # the first NaN, hiding the actual culprit.
            if pd.isnull(squared_error):
                print('%s %s' % (i, skater))
        predicted_ranks = get_ranks(predictions)
        actual_ranks = get_ranks(outcomes)
        for skater in predictions:
            diff = abs(predicted_ranks[skater] - actual_ranks[skater])
            if diff:
                rank_miss.setdefault(skater, []).append(
                    (predicted_ranks[skater], actual_ranks[skater]))
                rank_mistake += diff
    score_differences[estimate_type] = score_difference
    rank_mistakes[estimate_type] = rank_mistake
    rank_misses[estimate_type] = rank_miss

In [42]:
# Show the total rank mistakes and the summed squared score errors for each
# estimate type.
rank_mistakes, score_differences

({'partial_pool_prediction': 302,
  'pooled_prediction': 376,
  'unpooled_prediction': 380},
 {'partial_pool_prediction': 91053.292305282477,
  'pooled_prediction': 204946.19175774002,
  'unpooled_prediction': 140457656.8769297})

In [15]:
# Load a pickled (short_pred, free_pred) pair. Despite the .csv extension the
# file is read with pickle, so it is opened in binary mode ('rb').
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('fits/men_reputation_start_pred.csv', 'rb') as f:
    short_pred, free_pred = pickle.load(f)

In [16]:
# List the distinct event codes present in the 2018 men's results.
men.Event.unique()

array(['gprus2017', 'gpcan2017', 'gpchn2017', 'gpjpn2017', 'gpfra2017',
       'gpusa2017', 'gpf1718', 'ec2018', 'fc2018'], dtype=object)

In [17]:
# Best (maximum) historical short score per skater, as a dict name -> score.
# One groupby pass replaces re-filtering `history` once per row, and avoids
# both Series.get_values() (removed from pandas) and np.max(map(...)), which
# breaks on Python 3 where map returns an iterator.
# np.max (rather than Series.max) keeps NaN propagation as before.
individual_bests_short = {
    skater: np.max(scores.astype(float).values)
    for skater, scores in history.groupby('Name')['Short Score']
}
len(individual_bests_short)

300

In [18]:
# Best (maximum) historical free score per skater, as a dict name -> score.
# Rows whose 'Free Rank' is 'DNQ' or 'WD' are dropped first (presumably
# did-not-qualify / withdrew, i.e. no free score to parse -- TODO confirm).
# One groupby pass replaces re-filtering the frame once per row, and avoids
# both Series.get_values() (removed from pandas) and np.max(map(...)), which
# breaks on Python 3 where map returns an iterator.
have_frees = history[~history['Free Rank'].isin(['DNQ', 'WD'])]
individual_bests_free = {
    skater: np.max(scores.astype(float).values)
    for skater, scores in have_frees.groupby('Name')['Free Score']
}
len(individual_bests_free)

238

In [19]:
# Median of the per-skater best scores; used as the fallback "best" for
# skaters that have no entry in the history dicts.
# list(...) makes the call work on Python 3 too, where dict.values() is a
# view that np.median cannot consume directly.
med_short = np.median(list(individual_bests_short.values()))
med_free = np.median(list(individual_bests_free.values()))
med_short, med_free

(61.090000000000003, 124.88)

In [20]:
# Attach each skater's best historical short / free score to the 2018 results,
# substituting the median best when the skater is absent from the history dict.
men['Short Best'] = men.apply(
    lambda row: individual_bests_short.get(row.Name, med_short), axis=1)
men['Free Best'] = men.apply(
    lambda row: individual_bests_free.get(row.Name, med_free), axis=1)

In [21]:
# Inspect the first rows of `men`, now including the 'Short Best' and
# 'Free Best' columns.
men.head()

Unnamed: 0.1,Unnamed: 0,Rank,Event,Date,Name,Nation,Points,Short Rank,Short Score,Free Rank,Free Score,Short Start,Free Start,Num Short Scorecards,Num Free Scorecards,Short Best,Free Best
0,0,1,gprus2017,2017-10-22,Nathan CHEN,USA,293.79,1,100.54,2,193.25,10,12.0,12,12,103.12,204.34
1,1,2,gprus2017,2017-10-22,Yuzuru HANYU,JPN,290.77,2,94.85,1,195.92,12,11.0,12,12,110.95,223.2
2,2,3,gprus2017,2017-10-22,Mikhail KOLYADA,RUS,271.06,4,85.79,3,185.27,11,8.0,12,12,93.28,178.31
3,3,4,gprus2017,2017-10-22,Misha GE,UZB,255.33,5,85.02,4,170.31,6,7.0,12,12,82.25,163.54
4,4,5,gprus2017,2017-10-22,Moris KVITELASHVILI,GEO,250.26,8,80.67,5,169.59,2,5.0,12,12,76.85,162.9


In [22]:
def predict_short(row, bests=None, default_best=None):
    """Predict a short-program score from a results row with a linear model.

    prediction = 20.8236 + 0.6155 * best
                 + 12.3222 * row['Short Start'] / row['Num Short Scorecards']

    The coefficients are hard-coded; the fit that produced them is not part of
    this notebook section (presumably an OLS fit, given the ``pred_ols_*``
    column names -- TODO confirm).

    Parameters
    ----------
    row : pandas.Series
        Must provide ``Name``, ``'Short Start'`` and ``'Num Short Scorecards'``.
    bests : dict, optional
        Skater name -> best short score. Defaults to the notebook-level
        ``individual_bests_short``.
    default_best : float, optional
        Value used when the skater is not in ``bests``. Defaults to the
        notebook-level ``med_short``.

    Returns
    -------
    float
        The predicted short-program score.
    """
    # Fall back to the notebook globals only when the caller gave nothing, so
    # `men.apply(predict_short, axis=1)` keeps working unchanged.
    if bests is None:
        bests = individual_bests_short
    if default_best is None:
        default_best = med_short

    intercept, best_coef, start_coef = 20.8236, 0.6155, 12.3222
    best = bests[row.Name] if row.Name in bests else default_best
    prediction = intercept + best_coef * best
    return prediction + start_coef * float(row['Short Start']) / row['Num Short Scorecards']

def predict_free(row, bests=None, default_best=None):
    """Predict a free-skate score from a results row with a linear model.

    prediction = 72.3915 + 0.3983 * best
                 + 35.2871 * row['Free Start'] / row['Num Free Scorecards']

    The coefficients are hard-coded; the fit that produced them is not part of
    this notebook section (presumably an OLS fit, given the ``pred_ols_*``
    column names -- TODO confirm).

    Parameters
    ----------
    row : pandas.Series
        Must provide ``Name``, ``'Free Start'`` and ``'Num Free Scorecards'``.
    bests : dict, optional
        Skater name -> best free score. Defaults to the notebook-level
        ``individual_bests_free``.
    default_best : float, optional
        Value used when the skater is not in ``bests``. Defaults to the
        notebook-level ``med_free``.

    Returns
    -------
    float
        The predicted free-skate score, or ``0.`` when ``'Free Start'`` is
        null (the row has no free-skate start position).
    """
    if pd.isnull(row['Free Start']):
        return 0.

    # Fall back to the notebook globals only when the caller gave nothing, so
    # `men.apply(predict_free, axis=1)` keeps working unchanged.
    if bests is None:
        bests = individual_bests_free
    if default_best is None:
        default_best = med_free

    intercept, best_coef, start_coef = 72.3915, 0.3983, 35.2871
    best = bests[row.Name] if row.Name in bests else default_best
    prediction = intercept + best_coef * best
    return prediction + start_coef * float(row['Free Start']) / row['Num Free Scorecards']

In [23]:
# Apply predict_short to every row of `men` and store the result as a column.
men['pred_ols_short'] = men.apply(predict_short, axis=1)

In [24]:
# Apply predict_free to every row of `men` and store the result as a column.
men['pred_ols_free'] = men.apply(predict_free, axis=1)

In [25]:
# Build one (predictions, outcomes) pair per event for the linear-model
# baseline: predicted total = short prediction + free prediction, outcome =
# the Points actually recorded, both keyed by skater name.
ols_comparison = []
for event in season2018.events:
    competition = men[men.Event == event.name]
    predictions, outcomes = {}, {}
    # A single pass over the rows fills both dicts.
    for index, row in competition.iterrows():
        predictions[row.Name] = row.pred_ols_short + row.pred_ols_free
        outcomes[row.Name] = row.Points
    ols_comparison.append((predictions, outcomes))

In [26]:
# Score the linear-model baseline against the actual results:
#   score_difference_ols -- sum of squared (predicted - actual Points) errors
#   rank_mistake_ols     -- sum of |predicted rank - actual rank| over all skaters
#   rank_misses_ols      -- skater -> list of (predicted_rank, actual_rank)
#                           pairs, one per event where the two ranks differ
score_difference_ols = 0.
rank_mistake_ols = 0
rank_misses_ols = {}
for i, (predictions, outcomes) in enumerate(ols_comparison):
    for skater in predictions:
        squared_error = (predictions[skater] - outcomes[skater])**2
        score_difference_ols += squared_error
        # Report only the skater whose own term is NaN. Testing the running
        # sum instead would also print every skater that follows the first
        # NaN, hiding the actual culprit.
        if pd.isnull(squared_error):
            print('%s %s' % (skater, i))
    predicted_ranks = get_ranks(predictions)
    actual_ranks = get_ranks(outcomes)
    for skater in predictions:
        diff = abs(predicted_ranks[skater] - actual_ranks[skater])
        if diff:
            rank_misses_ols.setdefault(skater, []).append(
                (predicted_ranks[skater], actual_ranks[skater]))
            rank_mistake_ols += diff

In [43]:
# File the baseline's metrics under the 'ols' key next to the
# multilevel-model estimates so all four can be compared side by side.
score_differences.update({'ols': score_difference_ols})
rank_misses.update({'ols': rank_misses_ols})
rank_mistakes.update({'ols': rank_mistake_ols})

In [44]:
# Summed squared score error per method (lower is better).
score_differences

{'ols': 50237.52214114873,
 'partial_pool_prediction': 91053.292305282477,
 'pooled_prediction': 204946.19175774002,
 'unpooled_prediction': 140457656.8769297}

In [45]:
# Total absolute rank error per method (lower is better).
rank_mistakes

{'ols': 308,
 'partial_pool_prediction': 302,
 'pooled_prediction': 376,
 'unpooled_prediction': 380}