# Preliminary Analysis/Plotting

In [None]:
from os.path import join
import numpy as np
import pandas as pd
from scipy.stats import spearmanr

from bokeh.plotting import figure, show
from bokeh.layouts import column
from bokeh.io import output_notebook, curdoc
from bokeh.models import ColumnDataSource, Span, FactorRange
from bokeh.transform import jitter
from bokeh.palettes import *

#send Bokeh output to the notebook so show() renders inline
output_notebook()
#use the built-in 'light_minimal' theme for the current Bokeh document
curdoc().theme = 'light_minimal'

### Load Data

I have all the data locally stored in compressed tabular format (parquet), only about 30 MB on disk. The data include all results for the Games and at most the top 2500 people in the Open.

In [None]:
#read the cleaned results table from its parquet file (path relative to this notebook)
xft = pd.read_parquet(join('..', 'data', 'pro', 'cleaned.parquet'))
#label every row with a combined, title-cased 'Competition—Division' category
comp_label = xft['competitionType'].str.title()
div_label = xft['divisionName'].str.title()
xft['compDiv'] = pd.Categorical(comp_label + '—' + div_label)
#discard the rows whose workout name is 'Stage 1 Points'
xft = xft.loc[xft['workoutName'] != 'Stage 1 Points']
xft.head(5)

In [None]:
#marker colors keyed by competition/gender label, picked from the Paired8 palette
compDiv2color = dict(
    zip(
        ('Games—Men', 'Games—Women', 'Open—Men', 'Open—Women'),
        (Paired8[1], Paired8[5], Paired8[3], Paired8[7])
    )
)
#marker size and alpha as decreasing functions of the p-value (cube-root scaling)
def psize(p):
    """Return the marker size for p-value(s) *p*: 10 at p=0, shrinking to 4 at p=1."""
    root = np.power(p, 1/3)
    return 10 - 6*root


def palpha(p):
    """Return the marker alpha for p-value(s) *p*: 1 at p=0, fading to 0.25 at p=1."""
    root = np.power(p, 1/3)
    return 1 - (3/4)*root

### Rank Correlations for Individual Workouts

I compute [Spearman's rank correlation](https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient) between height and workout placement for every workout in the Open and Games, split up by division/gender. I ignore workouts with 5 or fewer athletes, which means that the second stage of the 2020 Games has been dropped. The statistics are not very meaningful with so few people.

In [None]:
#grouping columns
gcols = [
    'year',
    'competitionType',
    'divisionName',
    'workoutName',
    'workoutNumber',
    'compDiv'
]

#empty lists to accumulate the group keys and metrics, one list per output column
wc = {col: [] for col in (*gcols, 'c', 'p', 'N')}

#rank correlations by workout, restricted to rows with divisionNumber < 3
#observed=True: compDiv is categorical, so only visit key combinations that
#actually occur instead of relying on the deprecated observed=False default
for k, group in xft[xft.divisionNumber < 3].groupby(gcols, observed=True):
    #only rows with both a height and a workout rank can be correlated
    group = group[['height', 'workoutRank']].dropna()
    N = len(group)
    #skip groups with 5 or fewer athletes
    if N > 5:
        c, p = spearmanr(group.height.values, group.workoutRank.values)
        #k holds the group key values in the same order as gcols
        for col, key in zip(gcols, k):
            wc[col].append(key)
        wc['c'].append(c)
        wc['p'].append(p)
        wc['N'].append(N)

#construct new DataFrame and include size, transparency, color
wc = pd.DataFrame(wc)
wc['compDivColor'] = wc.compDiv.map(compDiv2color)
wc['psize'] = psize(wc['p'])
wc['palpha'] = palpha(wc['p'])

In [None]:
#fields shown when hovering over a marker
hover_fields = [
    ('year', '@year'),
    ('workout', '@workoutName'),
    ('correlation', '@c'),
    ('p-value', '@p'),
    ('N', '@N')
]
#one categorical x position per competition/division label
p = figure(
    title='Rank Correlation: Athlete Height & Workout Placement',
    y_axis_label='Spearman Rank Correlation',
    height=400,
    width=600,
    x_range=sorted(wc.compDiv.unique()),
    active_scroll='wheel_zoom',
    active_drag='pan',
    tooltips=hover_fields
)
#flip the vertical axis so negative correlations sit higher on the plot
p.y_range.flipped = True
#gray reference line at zero correlation, drawn underneath the markers
span = Span(
    dimension='width',
    location=0,
    line_color='gray',
    line_width=2,
    line_alpha=0.75,
    level='underlay'
)
p.add_layout(span)
#markers are jittered horizontally inside each category
workout_source = ColumnDataSource(wc)
p.scatter(
    x=jitter('compDiv', width=0.6, range=p.x_range),
    y='c',
    size='psize',
    fill_color='compDivColor',
    fill_alpha='palpha',
    line_color='black',
    line_alpha='palpha',
    source=workout_source
)
show(p)

This plot shows the rank correlations for all workouts, split up by gender and competition type (games/open). You can hover over dots to see which workouts they represent, scroll, zoom, and pan. Some things to point out:
* Zero on the vertical axis (gray line) indicates no preference for taller/shorter athletes.
* Negative values, higher on the plot, mean taller athletes finished with lower (i.e., better) rank.
* Positive values, lower on the plot, mean shorter athletes finished with lower (i.e., better) rank.
* The **size and transparency** of the markers relate to the [p-value](https://en.wikipedia.org/wiki/P-value) of each correlation. Small and semi-transparent dots indicate that the correlation could potentially have happened by chance, and doesn't mean very much.

In [None]:
figs = []
#one sorted scatter per competition/division label, stacked vertically
for K, df in wc.groupby('compDiv'):

    #order workouts by correlation; x is just the position in that ordering
    df = df.sort_values(['c'])
    source = dict(x=np.arange(len(df)))
    for col in ('c', 'p', 'year', 'workoutName', 'N', 'palpha', 'psize', 'compDivColor'):
        source[col] = df[col].values

    p = figure(
        title=f'Rank Correlation: Athlete Height & Workout Placement ({K})',
        y_axis_label='Spearman Rank Correlation',
        height=200,
        width=1800,
        tooltips=[
            ('year', '@year'),
            ('workout', '@workoutName'),
            ('correlation', '@c'),
            ('p-value', '@p'),
            ('N', '@N')
        ]
    )
    #scatter() draws circle markers by default; circle() with a size argument
    #is deprecated in recent Bokeh releases
    p.scatter(
        x='x',
        y='c',
        fill_color='compDivColor',
        fill_alpha='palpha',
        line_color='black',
        line_alpha='palpha',
        size='psize',
        source=ColumnDataSource(source)
    )
    #gray reference line at zero correlation, drawn underneath the markers
    span = Span(
        dimension='width',
        location=0,
        line_color='gray',
        line_width=2,
        line_alpha=0.75
    )
    p.add_layout(span)
    p.y_range.flipped = True
    span.level = 'underlay'
    #the x positions are only sort indices, so hide their tick labels
    p.xaxis.major_label_text_font_size = '0pt'
    figs.append(p)
figs = column(figs)
show(figs)


Here is the same information, presented differently: a sorted scatter of all workouts. A few things stand out, but hover your mouse over the dots to explore.
* Open workout 19.1, which was rowing and wall-balls, strongly favored tall athletes and is the most biased open workout on record.
* On the other side, Open workout 12.1, which I think was 7 minutes of burpees, strongly favored short people (no surprise).

By looking at where each set of dots crosses zero, we can see what portion of workouts favor short/tall athletes. For example
* For the Men's Games the line crosses right in the middle, which means about half the workouts favor short athletes and half favor tall athletes
* For the Women's Games, the line crosses further to the right. It looks like about 2/3 of Women's Games workouts favor *taller* athletes.
* The Open is slightly tilted toward shorter athletes in both cases.

In [None]:
df = wc.sort_values(['year', 'compDiv'])
#unique (year, compDiv) factors, built directly as tuples rather than by joining
#and re-splitting strings, so a label containing whitespace cannot be broken apart
x = {(str(year), compDiv) for year, compDiv in zip(df.year, df.compDiv)}
#order by year, then by the reversed label so labels with the same ending are adjacent
x = sorted(x, key=lambda t: t[0] + t[1][::-1])

p = figure(
    title='Rank Correlation: Athlete Height & Workout Placement',
    y_axis_label='Spearman Rank Correlation',
    height=500,
    width=1500,
    x_range=FactorRange(
        *x,
        group_padding=2
    ),
    active_drag='box_zoom',
    tooltips=[
        ('year', '@year'),
        ('workout', '@workoutName'),
        ('correlation', '@c'),
        ('p-value', '@p'),
        ('N', '@N')
    ]
)
for (year, compDiv), g in df.groupby(['year', 'compDiv']):
    #for duplicating x axis locations of all dots/objects
    L = len(g)
    k = (str(year), compDiv)
    #a source for each scatter sub-group
    source = dict(x=[k]*L)
    for col in ('c', 'p', 'year', 'workoutName', 'N', 'psize', 'palpha'):
        source[col] = g[col].values
    #compDivColor already holds compDiv mapped through compDiv2color
    source['color'] = g['compDivColor'].values
    #scatter() draws circle markers by default; circle() with a size argument
    #is deprecated in recent Bokeh releases
    p.scatter(
        x=jitter('x', width=0.5, range=p.x_range),
        y='c',
        fill_color='color',
        fill_alpha='palpha',
        line_color='black',
        line_alpha='palpha',
        size='psize',
        source=ColumnDataSource(source)
    )

#gray reference line at zero correlation, drawn underneath the markers
span = Span(
    dimension='width',
    location=0,
    line_color='gray',
    line_width=2,
    line_alpha=0.75
)
p.add_layout(span)
span.level = 'underlay'
#rotate the factor labels to vertical
p.xaxis.major_label_orientation = np.pi/2
p.y_range.flipped = True
show(p)

This is another way to look at workout-level correlations. This time they've been grouped by year.

In [None]:
figs = []
#bar colors cycle through this palette, indexed by year
palette = Category10_7
for K, df in wc.groupby('compDiv'):

    #sort into a new frame rather than sorting the groupby sub-frame in place
    df = df.sort_values(['year', 'workoutNumber'])
    #nested categorical x positions: (year, workout number)
    source = dict(
        x=list(
            zip(
                df.year.astype(str),
                df.workoutNumber.astype(str)
            )
        )
    )
    for col in ('c', 'p', 'year', 'workoutName', 'N', 'palpha'):
        source[col] = df[col].values
    source['color'] = [palette[year % len(palette)] for year in df.year]

    p = figure(
        title=f'Rank Correlation: Athlete Height & Workout Placement ({K})',
        y_axis_label='Spearman Rank Correlation',
        height=350,
        width=2000,
        x_range=FactorRange(
            *source['x'],
            group_padding=2
        ),
        tooltips=[
            ('year', '@year'),
            ('workout', '@workoutName'),
            ('correlation', '@c'),
            ('p-value', '@p'),
            ('N', '@N')
        ]
    )
    #bar width and transparency both follow palpha, so weaker results look thinner and fainter
    p.vbar(
        x='x',
        top='c',
        width='palpha',
        fill_color='color',
        fill_alpha='palpha',
        line_color='black',
        line_alpha='palpha',
        source=ColumnDataSource(source)
    )
    #gray reference line at zero correlation, drawn underneath the bars
    span = Span(
        dimension='width',
        location=0,
        line_color='gray',
        line_width=2,
        line_alpha=0.75
    )
    p.add_layout(span)
    span.level = 'underlay'
    #hide the per-bar tick labels
    p.xaxis.major_label_text_alpha = 0
    p.y_range.flipped = True
    figs.append(p)
figs = column(figs)
show(figs)

I find myself looking at this sequence of plots a lot. It shows the same correlations as vertical bars, split up by year and Games/Open. Here again, the thin and semi-transparent bars mean the correlation is not very statistically significant. There are potentially lots of observations to make. Looking at the Open a little bit:
* The 2022 Open had no strongly tilted workouts, although they all slightly favored short people.
* Some Open years like 2019 have strong correlations across workouts, sometimes in both directions, but generally favoring shorter athletes.
* We can look for repeat workouts, which should yield very similar correlations. For example, 14.2 and 15.2 are identical workouts and they do have similar coefficients (0.186 and 0.205). It's good that they're so close. I think there are other repeat workouts, but I don't know which ones.

The Games are similarly interesting.
* In the first stage of 2020, all Men's workouts favored short athletes except for the 1 km row, which strongly favored tall ones.
* The past two years of the Games (2021 & 2022) have been pretty favorable for tall people, it seems.
* It's also interesting to see Mary sticking out in 2019 with a very strong tilt toward short people, right before the cuts.
* For whatever reason, the "Ringer 2" workout in 2019 really favored tall women but was pretty neutral for the men. 

### Rank Correlations for Entire Events

In [None]:
#grouping columns for whole events (no workout-level keys)
gcols = [
    'year',
    'competitionType',
    'divisionName',
    'compDiv'
]
#empty lists to accumulate the group keys and metrics, one list per output column
ec = {col: [] for col in (*gcols, 'c', 'p', 'N')}

#rank correlations by event, restricted to rows with divisionNumber < 3
#observed=True: compDiv is categorical, so only visit key combinations that
#actually occur instead of relying on the deprecated observed=False default
for k, group in xft[xft.divisionNumber < 3].groupby(gcols, observed=True):
    group = group.dropna(subset=['overallRank', 'height'])
    #keep one row per (competitorName, height) pair
    #NOTE(review): presumes name + height identifies an athlete within an event - confirm
    group = group.drop_duplicates(subset=['competitorName', 'height'])
    N = len(group)
    #skip groups with 5 or fewer athletes
    if N > 5:
        c, p = spearmanr(group.height.values, group.overallRank.values)
        #k holds the group key values in the same order as gcols
        for col, key in zip(gcols, k):
            ec[col].append(key)
        ec['c'].append(c)
        ec['p'].append(p)
        ec['N'].append(N)

ec = pd.DataFrame(ec)
#NOTE(review): linear p-value -> alpha mapping here, unlike the cube-root
#palpha() applied to the workout-level table - confirm this is intentional
ec['palpha'] = 1 - ec['p']/2
ec['compDivColor'] = ec.compDiv.map(compDiv2color)

In [None]:
df = ec.sort_values(['year', 'compDiv'])
#nested categorical x positions: (year, competition/division label)
source = dict(
    x=list(
        zip(
            df.year.astype(str),
            df.compDiv
        )
    )
)
for col in ('c', 'p', 'N', 'year', 'palpha', 'divisionName', 'competitionType', 'compDivColor', 'compDiv'):
    source[col] = df[col].values

p = figure(
    title='Rank Correlation: Athlete Height & Event Placement',
    y_axis_label='Spearman Rank Correlation',
    height=400,
    width=1200,
    x_range=FactorRange(
        *source['x'],
        group_padding=3
    ),
    tooltips=[
        ('year', '@year'),
        ('division', '@divisionName'),
        ('competition', '@competitionType'),
        ('correlation', '@c'),
        ('p-value', '@p'),
        ('N', '@N')
    ]
)
#bar width and transparency both follow palpha; one legend entry per compDiv label
p.vbar(
    x='x',
    top='c',
    width='palpha',
    fill_color='compDivColor',
    fill_alpha='palpha',
    line_color='black',
    line_alpha='palpha',
    legend_group='compDiv',
    source=ColumnDataSource(source)
)
#gray reference line at zero correlation, drawn underneath the bars
span = Span(
    dimension='width',
    location=0,
    line_color='gray',
    line_width=2,
    line_alpha=0.75
)
p.add_layout(span)
span.level = 'underlay'
#hide the per-bar tick labels
p.xaxis.major_label_text_alpha = 0
p.y_range.flipped = True
show(p)

Here is a final plot showing the correlations for **entire events**, not just individual workouts. The same kinds of observations can be made.
* The first stage of the 2020 Games really favored short people.
* The 2018 season was remarkably neutral with respect to height.
* The Open, as a whole, is not usually that strongly biased. It certainly tends to favor short people, though. The only instance noticeably favoring taller athletes was the 2019 Women's Open.
* The Women's Games (red bars) pretty clearly favor taller athletes in recent years. From 2013 onward, the final results at the Women's Games usually advantage taller athletes, with exceptions in 2018 and 2021.
* For the Men's Games (blue bars) the correlation goes back and forth. Sometimes short people get lucky, sometimes tall people do. Note that 2019 favored tall people, then 2020 (stage 1) strongly favored short people. That was probably a disappointing shock for tall athletes in 2020.