In [53]:
import json
import numpy as np
import pandas as pd
import pickle
from scipy.stats import binom, norm

In [2]:
with open('seasons.pickle') as f:
    seasons = pickle.load(f)
for year, season in seasons.iteritems():
    print year,
    season.load_scores()

2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018


In [7]:
with open('pd_data/judge_nations.json') as f:
    nations = json.load(f)

In [4]:
with open('pd_data/name_fixes_all.json') as f:
    all_name_fixes = pickle.load(f)
def remove_mr_ms(judge):
    judge = judge.replace(r'\xa0', '')
    judge = judge.replace('Mr. ', '')
    judge = judge.replace('Mr ', '')
    judge = judge.replace('Ms. ', '')
    judge = judge.replace('Ms ', '')
    judge = judge.replace('Mrs. ', '')
    return judge

In [16]:
def calculate_trimmed_mean(scores):
    num_scores = len(scores)
    return (sum(scores) - min(scores) - max(scores)) / (num_scores - 2.)

In [17]:
elt_df_list = []
comp_df_list = []
for season in (seasons['2017'], seasons['2018']):
    for event in season.events:
        for discipline in event.disciplines:
            for segment in discipline.segments:
                for scorecard in segment.scorecards:
                    skater = all_name_fixes.get(scorecard.skater.name, scorecard.skater.name)
                    judges = [all_name_fixes.get(remove_mr_ms(judge.name), remove_mr_ms(judge.name))
                              for judge in segment.panel.judges if judge.name != '-']
                    for element in scorecard.elements:
                        med_goe = np.median(element.parsed_goes)
                        trimmed_mean = calculate_trimmed_mean(element.parsed_goes)
                        for i, judge in enumerate(judges):
                            judge_num = i+1
                            elt_df_list.append({
                                'skater': skater,
                                'country': scorecard.skater.country,
                                'segment_name': event.name + '_' + segment.name,
                                'season': season.champ_year,
                                'discipline': discipline.discipline.name,
                                'segment_rank': scorecard.rank,
                                'start_order': scorecard.starting_number,
                                'elt_number': element.number,
                                'elt_name': element.name,
                                'elt_info': element.info,
                                'elt_bv': element.base_value,
                                'elt_bonus': element.bonus,
                                'elt_score': element.goe,
                                'elt_points': element.points,
                                'judge': judge,
                                'judge_num': judge_num,
                                'judge_score': element.parsed_goes[i],
                                'med_score': med_goe,
                                'trimmed_mean': trimmed_mean,
                                'is_comp': 0
                            })
                    for comp in scorecard.components:
                        med_score = np.median(comp.parsed_scores)
                        trimmed_mean = calculate_trimmed_mean(comp.parsed_scores)
                        for i, judge in enumerate(judges):
                            judge_num = i+1
                            comp_df_list.append({
                                'skater': skater,
                                'country': scorecard.skater.country,
                                'segment_name': event.name + '_' + segment.name,
                                'season': season.champ_year,
                                'discipline': discipline.discipline.name,
                                'segment_rank': scorecard.rank,
                                'start_order': scorecard.starting_number,
                                'elt_name': all_name_fixes.get(comp.name, comp.name),
                                'elt_score': comp.points,
                                'judge': judge,
                                'judge_num': judge_num,
                                'judge_score': comp.parsed_scores[i],
                                'med_score': med_score,
                                'trimmed_mean': trimmed_mean,
                                'is_comp': 1
                            })

In [18]:
elts_df = pd.DataFrame(elt_df_list)
comp_df = pd.DataFrame(comp_df_list)

In [21]:
scores = pd.concat([elts_df, comp_df])

In [23]:
scores['judge_country'] = scores.apply(lambda row: nations[row.judge], axis=1)

In [32]:
scores['same_country'] = scores.apply(lambda row: 1 if row.country == row.judge_country else 0, axis=1)

In [33]:
scores['is_larger'] = scores.apply(lambda row: 1 if row.judge_score > row.med_score else 0, axis=1)

In [58]:
scores['dist_med'] = scores.apply(lambda row: row.judge_score - row.med_score, axis=1)

In [59]:
scores_not_med = scores[scores.judge_score != scores.med_score]
scores_med     = scores[scores.judge_score == scores.med_score]

In [27]:
def critical_values(n, cv=1.96):
    lower = -cv * np.sqrt(n*0.25) + 0.5*n + 0.5
    upper =  cv * np.sqrt(n*0.25) + 0.5*n - 0.5
    return (lower, upper)

In [64]:
scores_not_med['judge_n'] = scores_not_med.apply(lambda row: 1 if row.country == row.judge_country else -1, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [65]:
scores_not_med['u'] = scores_not_med.apply(lambda row: 1 if row.judge_n * row.dist_med > 0 else 0, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [67]:
scores_not_med['country_high_low'] = scores_not_med.apply(lambda row: 1 if row.is_larger == row.same_country else 0, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [68]:
sum(scores_not_med.country_high_low), sum(scores_not_med.u)

(61106, 61106)

In [88]:
len(scores_not_med), len(scores), len(scores_med)

(108447, 247278, 138831)

In [93]:
n = len(scores_not_med)
1-binom.cdf(61106, n, 0.5), 61106./108447

(1.1102230246251565e-16, 0.5634641806596771)

In [101]:
relscores = scores_not_med[scores_not_med.segment_rank]
s = sum(relscores.country_high_low)
n = len(relscores)
1-binom.cdf(s, n, 0.5), float(s)/n, s, n

(0.00025132609080258383, 0.5175411680474583, 5060, 9777)

In [90]:
for description, is_comp, segment_name in (('short elts', 0, 'short'), ('short comp', 1, 'short'),
                                           ('free elts', 0, 'free'), ('free comp', 1, 'free')):
    print description
    relscores = scores_not_med[scores_not_med.is_comp == is_comp]
    relscores = relscores[relscores.segment_name.str.contains(segment_name)]
    N = len(relscores)
    S = sum(relscores.country_high_low)
    print S, N, '%.20f' % (1 - binom.cdf(S, N, 0.5)), float(S) / N
    relscores = relscores[relscores.same_country == 1]
    N = len(relscores)
    S = sum(relscores.country_high_low)
    print S, N, '%.20f' % (1 - binom.cdf(S, N, 0.5)), float(S) / N

 short elts
11069 19841 0.00000000000000011102 0.557885187239
1321 1672 0.00000000000000011102 0.790071770335
short comp
17744 31391 0.00000000000000011102 0.565257557899
2367 2772 0.00000000000000011102 0.853896103896
free elts
17140 30704 0.00000000000000011102 0.558233454924
2151 2763 0.00000000000000011102 0.778501628664
free comp
15153 26511 0.00000000000000011102 0.571574063596
2147 2532 0.00000000000000011102 0.84794628752


In [102]:
for discipline in scores_not_med.discipline.unique():
    print discipline
    relscores = scores_not_med[scores_not_med.discipline == discipline]
    relscores = relscores[relscores.same_country == 1]
    N = len(relscores)
    S = sum(relscores.country_high_low)
    print S, N, '%.20f' % (1-binom.cdf(S, N, 0.5)), float(S) / N
    relscores = relscores[relscores.segment_rank <= 6]
    N = len(relscores)
    S = sum(relscores.country_high_low)
    print S, N, '%.20f' % binom.cdf(S, N, 0.5), float(S) / N

men
2206 2793 0.00000000000000011102 0.789831722163
1100 1344 0.99999999999999988898 0.818452380952
ladies
2043 2544 0.00000000000000011102 0.803066037736
1066 1294 0.99999999999999988898 0.823802163833
pairs
1685 2111 0.00000000000000011102 0.798199905258
1241 1478 0.99999999999999988898 0.839648173207
ice_dance
2052 2291 0.00000000000000011102 0.895678742907
1110 1194 0.99999999999999988898 0.929648241206


In [42]:
critical_values(n), n * 0.5

((53901.273337327082, 54545.726662672918), 54223.5)

In [39]:
len(scores_not_med), len(scores_med)

(108447, 138831)

In [173]:
scores_low = {}
scores_high = {}
for judge in elts_rus.judge.unique():
    elts = elts_rus[elts_rus.judge == judge]
    comp = comp_rus[comp_rus.judge == judge]
    n = len(elts) + len(comp)
    lower, upper = critical_values(n, 3.646383)
    obs = sum(elts.is_larger) + sum(comp.is_larger)
    if obs <= lower:
        scores_low[judge] = norm.cdf(obs, loc=n*0.5, scale=np.sqrt(n*0.25))
    elif obs >= upper:
        scores_high[judge] = 1 - norm.cdf(obs, loc=n*0.5, scale=np.sqrt(n*0.25))

In [174]:
len(elts_rus.judge.unique()), len(scores_low), len(scores_high)

(188, 30, 26)

In [168]:
len(elts_rus.judge.unique()), len(scores_low), len(scores_high)

(220, 48, 34)

In [193]:
nations['Virpi KUNNAS-HELMINEN'] = 'FIN'
nations['Vessela POPOVA'] = 'BGR'
nations['Yuko OGAWA'] = 'JPN'
nations['Kvetoslava MATEJOVA'] = 'SVK'
nations['Vladimir CUCHRAN'] = 'SVK'
nations['Thomas BIEGLER'] = 'AUT'
nations['Yoko KUNO'] = 'JPN'
nations['Rebecca ANDREW'] = 'AUS'
nations['Hege ROSTO JENSEN'] = 'NOR'
nations['Igor PROKOP'] = 'SVK'
nations['Senem AHISKAL'] = 'TUR'
nations['Margaret WORSFOLD'] = 'GBR'
nations['Ann FINDLAY'] = 'GBR'
nations['Elisabeth BINDER'] = 'AUT'
nations['Elizabeth ALEXANDRE'] = 'AUS'
nations['Tetsuo ABE'] = 'JPN'
nations['Deborah NOYES'] = 'AUS'
nations['Magdalena RUSIECKA'] = 'POL'
nations['Hanna THEN'] = 'POL'

In [195]:
nations['Simona SPALLUTO'] = 'ITA'
nations['Agnieszka SWIDERSKA'] = 'POL'
nations['Osman SIRVAN'] = 'TUR'
nations['Ariadna MORONES NEGRETE'] = 'MEX'
nations['Susan PETRICEVIC'] = 'NZL'
nations['Mona ADOLFSEN'] = 'NOR'
nations['Darja GABROVSEK-POLAJNAR'] = 'SLO'
nations['Nadezhda FIODOROVA'] = 'BLR'
nations['Alison RYAN'] = 'AUS'
nations['Donatella LEONELLI'] = 'SUI'
nations['Adrienn SCHADENBAUER'] = 'AUT'
nations['Neil GARRARD'] = 'ZAF'
nations['Ernestien BAKKER'] = 'NED'
nations['Joanna MILLER'] = 'AUS'
nations['Ebru ANILDI'] = 'TUR'
nations['Edith SCHILLER'] = 'AUT'
nations['Ekaterina SEROVA'] = 'BLR'
nations['Ko-Man TING'] = 'TWN'
nations['Jitka MOKRA'] = 'CZE'
nations['Etsuko AZUMA'] = 'JPN'

In [197]:
nations['Tae Ri LEE'] = 'KOR'
nations['Akiko KOBAYASHI'] = 'JPN'
nations['Ece ESEN'] = 'TUR'
nations['Kersten BELLMANN'] = 'GER'
nations['Roberts KRUTKRAMELIS'] = 'LAT'
nations['Mary CHAPMAN'] = 'GBR'
nations['Josip CEROVAC'] = 'CRO'

In [199]:
with open('pd_data/judge_nations.json', 'w') as f:
    json.dump(nations, f)

In [109]:
sum(elts_not_med[elts_not_med.judge == 'Weiguang CHEN'].is_larger)

279

In [110]:
len(elts_not_med[elts_not_med.judge == 'Weiguang CHEN'])

629

In [70]:
panels = {}  # map segment name to judging panel
judge_names = set()
for season in seasons.values():
    for event in season.events:
        for discipline in event.disciplines:
            for segment in discipline.segments:
                if 'Function' in segment.panel.officials:
                    del segment.panel.officials['Function']
                    print event, segment
                panel = {}
                for function, official in segment.panel.officials.iteritems():
                    name = remove_mr_ms(official.name)
                    name = all_name_fixes.get(name, name)
                    panel[function] = name
                    if len(name) < 5:
                        print event, segment, name
                    if 'Judge' in official.function and season.champ_year in (2017, 2018):
                        judge_names.add(name)
                panels[event.name + '_' + segment.name] = panel

gpusa2016 gpusa2016 pairs_short -


In [71]:
judge_names.remove('-')

In [72]:
judges = {k: ','.join([v['Judge No.{0}'.format(judge_num+1)] for judge_num in xrange(len(v)-6)])
          for k, v in panels.iteritems()}

In [73]:
elts_df['judges'] = elts_df.apply(lambda row: judges[row.segment_name], axis=1)
comp_df['judges'] = comp_df.apply(lambda row: judges[row.segment_name], axis=1)

In [74]:
elts_df['elt_goes'] = elts_df.apply(lambda row: row.elt_goes if row.segment_name != 'gpusa2016_pairs_short'
              else ','.join([goe for goe in row.elt_goes.split(',') if goe != '-']), axis=1)

In [75]:
comp_df['comp_scores'] = comp_df.apply(lambda row: row.comp_scores if row.segment_name != 'gpusa2016_pairs_short'
              else ','.join([goe for goe in row.comp_scores.split(',') if goe != '-']), axis=1)

In [76]:
elts_df['median_goe'] = elts_df.apply(lambda row: np.median(map(float, row.elt_goes.split(','))), axis=1)

In [77]:
comp_df['median_score'] = comp_df.apply(lambda row: np.median(map(float, row.comp_scores.split(','))), axis=1)

In [78]:
elts_df

Unnamed: 0,country,discipline,elt_bonus,elt_bv,elt_finalgoe,elt_goes,elt_info,elt_name,elt_number,elt_points,season,segment_name,segment_rank,skater,start_order,judges,median_goe
0,JPN,men,False,12.30,-0.51,"-1.0,0.0,0.0,-1.0,-1.0,0.0,1.0,-1.0,0.0",,4F,1,11.79,2017,gpusa2016_men_short,1,Shoma UNO,9,"Cynthia BENSON,Lisa JELINEK,Maira ABASOVA,Fran...",0.0
1,JPN,men,False,13.30,-4.00,"-3.0,-3.0,-3.0,-3.0,-3.0,-2.0,-3.0,-3.0,-3.0",<,4T+3T<,2,9.30,2017,gpusa2016_men_short,1,Shoma UNO,9,"Cynthia BENSON,Lisa JELINEK,Maira ABASOVA,Fran...",-3.0
2,JPN,men,False,3.20,0.86,"1.0,2.0,2.0,2.0,0.0,2.0,2.0,2.0,1.0",,FCSp4,3,4.06,2017,gpusa2016_men_short,1,Shoma UNO,9,"Cynthia BENSON,Lisa JELINEK,Maira ABASOVA,Fran...",2.0
3,JPN,men,False,3.00,0.64,"1.0,1.0,1.0,1.0,0.0,2.0,1.0,2.0,2.0",,CSSp4,4,3.64,2017,gpusa2016_men_short,1,Shoma UNO,9,"Cynthia BENSON,Lisa JELINEK,Maira ABASOVA,Fran...",1.0
4,JPN,men,True,9.35,-0.14,"1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0",,3A,5,9.21,2017,gpusa2016_men_short,1,Shoma UNO,9,"Cynthia BENSON,Lisa JELINEK,Maira ABASOVA,Fran...",0.0
5,JPN,men,False,3.30,0.93,"2.0,1.0,2.0,2.0,1.0,3.0,2.0,2.0,2.0",,StSq3,6,4.23,2017,gpusa2016_men_short,1,Shoma UNO,9,"Cynthia BENSON,Lisa JELINEK,Maira ABASOVA,Fran...",2.0
6,JPN,men,False,3.50,1.14,"2.0,1.0,3.0,3.0,1.0,2.0,2.0,3.0,3.0",,CCoSp4,7,4.64,2017,gpusa2016_men_short,1,Shoma UNO,9,"Cynthia BENSON,Lisa JELINEK,Maira ABASOVA,Fran...",2.0
7,USA,men,False,9.60,0.90,"2.0,1.0,1.0,1.0,0.0,1.0,2.0,1.0,2.0",,3F+3T,1,10.50,2017,gpusa2016_men_short,2,Adam RIPPON,10,"Cynthia BENSON,Lisa JELINEK,Maira ABASOVA,Fran...",1.0
8,USA,men,False,8.50,1.29,"1.0,1.0,2.0,2.0,0.0,1.0,1.0,1.0,2.0",,3A,2,9.79,2017,gpusa2016_men_short,2,Adam RIPPON,10,"Cynthia BENSON,Lisa JELINEK,Maira ABASOVA,Fran...",1.0
9,USA,men,False,3.50,1.07,"2.0,1.0,3.0,2.0,0.0,3.0,2.0,2.0,3.0",,CCoSp4,3,4.57,2017,gpusa2016_men_short,2,Adam RIPPON,10,"Cynthia BENSON,Lisa JELINEK,Maira ABASOVA,Fran...",2.0


In [103]:
len(elts_df), len(comp_df)

(158173, 89105)

In [120]:
judge_not_med = {}
judge_med = {}
for judge in judge_names:
    obs = len(elts_not_med[elts_not_med.judge == judge])
    obs += len(comp_not_med[comp_not_med.judge == judge])
    judge_not_med[judge] = obs
    
    obs = len(elts_med[elts_med.judge == judge])
    obs += len(comp_med[comp_med.judge == judge])
    judge_med[judge] = obs

In [121]:
sorted(judge_not_med.iteritems(), key=lambda (k, v): v)

[(u'Helene CUCUPHAT', 66),
 (u'Jennifer MAST', 85),
 (u'Francis BETSCH', 86),
 (u'Rossella CECCATTINI', 96),
 (u'Yu WANG', 96),
 (u'Elena LISOVA', 97),
 (u'Szilard TOTH', 98),
 (u'Marketa HORKLOVA', 101),
 (u'Prisca BINZ-MOSER', 101),
 (u'Roberts KRUTKRAMELIS', 101),
 (u'Taffy HOLLIDAY', 110),
 (u'Ece ESEN', 112),
 (u'Rebecca ANDREW', 113),
 (u'Kristina LUNDGREN', 115),
 (u'Evgeni ROKHIN', 121),
 (u'Anja RIST', 123),
 (u'Simona SPALLUTO', 127),
 (u'Ingrid Charlotte WOLTER', 128),
 (u'Garry HOPPE', 131),
 (u'Vladimir CUCHRAN', 136),
 (u'Vladislav PETUKHOV', 137),
 (u'Kaoru TAKINO', 140),
 (u'Elizabeth ALEXANDRE', 140),
 (u'Deborah ISLAM', 146),
 (u'Natalia KRUGLOVA', 148),
 (u'Vessela POPOVA', 149),
 (u'Makoto KANO', 149),
 (u'Tetsuo ABE', 149),
 (u'Tomie FUKUDOME', 155),
 (u'Xixia LIU', 159),
 (u'Darja GABROVSEK-POLAJNAR', 159),
 (u'Aigul KUANISHEVA', 160),
 (u'Guona ZHAO', 160),
 (u'Kvetoslava MATEJOVA', 162),
 (u'Walter ZUCCARO', 162),
 (u'Gloria MORANDI', 163),
 (u'Zuzana PLESNIKOVA