In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import numpy as np
import pandas as pd
import plotnine as p9

import matplotlib.pyplot as plt
import seaborn as sns

# Summarise numpy array reprs once they exceed 50 elements, showing 20 items at each edge.
np.set_printoptions(threshold=50, edgeitems=20)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Small offset added inside np.log(...) in load_tennis_data so that zero values do not yield -inf.
EPS = 0.00001

In [None]:
def load_tennis_data(partition='train', data_dir='data/', train_data = None, save=True, eps=None):
    """Load, combine and clean the mens and womens point files for one partition.

    Reads ``mens_<partition>_file.csv`` and ``womens_<partition>_file.csv`` from
    ``data_dir``, stacks them, and derives the modelling columns.

    Parameters
    ----------
    partition : str
        Token inserted into the file names (e.g. ``'train'`` or ``'test'``).
    data_dir : str
        Directory containing the CSV files; the optional export is written to
        ``<data_dir>/input/``.
    train_data : pandas.DataFrame or None
        When given, every column that is categorical in ``train_data`` is
        converted in the loaded frame using the *same* ordered categories, so
        that category codes line up between the two frames. Values not seen
        in ``train_data`` become missing. When ``None``, the columns listed in
        ``categorical_vars`` are converted to ordered categoricals from the
        loaded data itself.
    save : bool
        If true, write the cleaned frame to ``<data_dir>/input/<partition>_clean``
        in feather format.
    eps : float or None
        Offset added before taking logs. ``None`` (the default) uses the
        module-level ``EPS`` constant.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with a fresh 0..n-1 index, the ``train`` column
        removed, ``clears_net`` / ``expected_winner`` flags added, float64
        columns downcast to float32 and one ``log_<var>`` column per entry of
        ``log_vars``.

    Raises
    ------
    KeyError
        If the ``train`` column (or any expected input column) is missing.
    """
    if eps is None:
        # Resolved lazily so the notebook-level constant remains the default.
        eps = EPS

    # DataFrame.append was removed in pandas 2.0; concat with ignore_index also
    # gives the clean RangeIndex that feather export requires.
    frames = [
        pd.read_csv(os.path.join(data_dir, '%s_%s_file.csv' % (prefix, partition)))
        for prefix in ('mens', 'womens')
    ]
    df = pd.concat(frames, ignore_index=True)
    df.columns = [col.replace('.', '_') for col in df.columns]

    # Is the point an expected winner?
    ## net.clearance (> 0)
    ## outside.sideline (False)
    ## outside.baseline (False)
    df['clears_net'] = df['net_clearance'] > 0
    # `~` is the boolean negation; unary `-` on a boolean Series raises a TypeError.
    df['expected_winner'] = df['clears_net'] & ~df['outside_sideline'] & ~df['outside_baseline']

    log_vars = ["speed", "previous_speed",
                "net_clearance", "previous_net_clearance",
                "depth", "previous_depth",
                "distance_from_sideline", "previous_distance_from_sideline",
                "player_distance_travelled",
                "player_depth", "player_impact_depth", "opponent_depth",
                "player_distance_from_center", "player_impact_distance_from_center", "opponent_distance_from_center",
                "previous_time_to_net"]

    # Define categorical variables
    categorical_vars = ["outcome", "serve", "hitpoint", "outside_sideline",
                       "outside_baseline", "same_side", "previous_hitpoint",
                       "server_is_impact_player", "gender", "clears_net", "expected_winner"]

    if train_data is not None:
        # Inline replacement for the previously un-imported `structured.apply_cats`
        # (intended to mirror fastai's helper): reuse the training categories.
        for col in list(df.columns):
            if col in train_data.columns and train_data[col].dtype.name == 'category':
                df[col] = pd.Categorical(df[col],
                                         categories=train_data[col].cat.categories,
                                         ordered=True)
    else:
        for cat_var in categorical_vars:
            df[cat_var] = df[cat_var].astype('category').cat.as_ordered()

    # Make floats
    for col in df.columns:
        if df[col].dtype == 'float64':
            df[col] = df[col].astype('float32')

    # Drop certain variables (the original train/test branches were identical).
    df = df.drop(columns=['train'])

    # Log variables: the absolute value keeps negative measurements in the log's domain.
    for var in log_vars:
        df[f'log_{var}'] = np.log(np.abs(df[var]) + eps)

    # Output data
    if save:
        out_dir = os.path.join(data_dir, 'input/')
        os.makedirs(out_dir, exist_ok=True)
        df.to_feather(os.path.join(out_dir, f'{partition}_clean'))

    return df

## Load training data

In [None]:
# Build the cleaned training frame in memory only; save=False skips the feather export.
df_train = load_tennis_data(partition='train', save=False)
# df_test = load_tennis_data('test', train_data = df_train, save=False)

# df_test.dtypes

In [None]:
# Re-type two columns in place: `rally` as float32 and the `clears_net` flag as int32.
for column, dtype in (('rally', 'float32'), ('clears_net', 'int32')):
    df_train[column] = df_train[column].astype(dtype)

In [None]:
# One boolean indicator per outcome so each can be correlated with the numeric features.
df_train['is_forced_error'] = df_train['outcome'] == 'FE'
df_train['is_unforced_error'] = df_train['outcome'] == 'UE'
df_train['is_winner'] = df_train['outcome'] == 'W'

# Categorical predictors and the log-transformed duplicates are left out of the matrix.
remove_vars = ["serve", "hitpoint", "outside_sideline",
               "outside_baseline", "same_side", "previous_hitpoint",
               "server_is_impact_player", "gender",
               "clears_net"]
log_vars = [col for col in df_train.columns if col.startswith('log_')]

plot_cols = df_train.drop(remove_vars + log_vars, axis=1).columns.tolist()

# Keep only numeric/boolean columns explicitly: pandas >= 2.0 no longer drops
# non-numeric columns (such as the categorical `outcome`) silently inside .corr().
corrmat = df_train[plot_cols].select_dtypes(include=['number', 'bool']).corr()
f, ax = plt.subplots(figsize=(12, 9))
# Show only the three outcome-indicator columns; draw on the axes created above.
sns.heatmap(corrmat[['is_forced_error', 'is_unforced_error', 'is_winner']],
            vmax=0.8, square=True, center = 0, ax=ax);

* Winner: long previous time to net, high previous shot, slow previous speed, not returned from deep, hit toward the sideline
* Unforced error: high speed, hit from deep, opponent is deep
* Forced error: low speed, previous shot close to sideline, previous shot hit hard

In [None]:
# Density of `previous_time_to_net`, one translucent curve per outcome.
(
    p9.ggplot(data=df_train, mapping=p9.aes(x='previous_time_to_net', fill='outcome'))
    + p9.geom_density(alpha=0.4)
    + p9.labs(title='Outcome by previous time to net')
)

In [None]:
# Density of `previous_speed`, one translucent curve per outcome.
(
    p9.ggplot(data=df_train, mapping=p9.aes(x='previous_speed', fill='outcome'))
    + p9.geom_density(alpha=0.4)
    + p9.labs(title='Outcome by previous speed')
)

In [None]:
# Density of `player_impact_depth`, one translucent curve per outcome.
(
    p9.ggplot(data=df_train, mapping=p9.aes(x='player_impact_depth', fill='outcome'))
    + p9.geom_density(alpha=0.4)
    + p9.labs(title='Outcome by player impact depth')
)

In [None]:
# Density of `distance_from_sideline`, one translucent curve per outcome.
(
    p9.ggplot(data=df_train, mapping=p9.aes(x='distance_from_sideline', fill='outcome'))
    + p9.geom_density(alpha=0.4)
    + p9.labs(title='Outcome by distance from sideline')
)

In [None]:
# Density of `previous_distance_from_sideline`, one translucent curve per outcome.
(
    p9.ggplot(data=df_train, mapping=p9.aes(x='previous_distance_from_sideline', fill='outcome'))
    + p9.geom_density(alpha=0.4)
    + p9.labs(title='Outcome by previous distance from sideline')
)

In [None]:
# Density of `speed`, one translucent curve per outcome.
(
    p9.ggplot(data=df_train, mapping=p9.aes(x='speed', fill='outcome'))
    + p9.geom_density(alpha=0.4)
    + p9.labs(title='Outcome by speed')
)

## Rally length

In [None]:
# Work on a copy and cap `rally` at 16 so the long tail falls into the last histogram bin.
df_train_rally = df_train.copy()
df_train_rally['rally'] = df_train_rally['rally'].clip(upper=16)

# Histogram of the capped rally values, one panel per gender/outcome combination.
(
    p9.ggplot(data=df_train_rally, mapping=p9.aes(x='rally'))
    + p9.geom_histogram(binwidth=2)
    + p9.facet_wrap('~ gender + outcome')
)

In [None]:
# Summary statistics of the capped `rally` values for every gender/outcome pair.
df_train_rally.groupby(by=['gender', 'outcome'])['rally'].describe()

## Final and penultimate shot speed

In [None]:
# 60-bin histogram of `speed` on an outcome (rows) by gender (columns) grid.
(
    p9.ggplot(data=df_train, mapping=p9.aes(x='speed'))
    + p9.geom_histogram(bins=60)
    + p9.facet_grid('outcome ~ gender')
)

Forced errors come off the racket much more slowly than other shots. Winners are hit at the highest pace, although some of them may be lobs and drop shots. Unforced errors can be hit just as hard (but they probably go into the net, wide, or deep).

In [None]:
# 60-bin histogram of `previous_speed` on an outcome (rows) by gender (columns) grid.
(
    p9.ggplot(data=df_train, mapping=p9.aes(x='previous_speed'))
    + p9.geom_histogram(bins=60)
    + p9.facet_grid('outcome ~ gender')
)

Sometimes in a rally, you just mess up. On average, forced errors are caused by a little more pace on your opponent's shot. Winners come after significantly slower shots.

## Shot type

In [None]:
# Share of rows per final-shot `hitpoint`, normalised within each gender.
# count() tallies non-null `rally` values and serves as a row count here
# (assumes `rally` has no missing values - TODO confirm).
df_train_hitpoint_counts = df_train.groupby(['gender', 'hitpoint'])['rally'].count()
# Pass the aggregation by name: handing the builtin `sum` to transform is deprecated in pandas >= 2.1.
df_train_hitpoint = (
    df_train_hitpoint_counts
    / df_train_hitpoint_counts.groupby(['gender']).transform('sum')
).reset_index()


# The plotted column is still called `rally` but now holds a share, so label the axis explicitly.
(p9.ggplot(df_train_hitpoint, p9.aes(x = 'hitpoint', y = 'rally')) +
    p9.geom_bar(stat = 'identity')  +
    p9.facet_wrap('~ gender') +
    p9.labs(title = 'Final hitpoint', y = 'Share of points'))

In [None]:
# Share of rows per `previous_hitpoint`, normalised within each gender.
# count() tallies non-null `rally` values and serves as a row count here
# (assumes `rally` has no missing values - TODO confirm).
df_train_hitpoint_counts = df_train.groupby(['gender', 'previous_hitpoint'])['rally'].count()
# Pass the aggregation by name: handing the builtin `sum` to transform is deprecated in pandas >= 2.1.
df_train_hitpoint = (
    df_train_hitpoint_counts
    / df_train_hitpoint_counts.groupby(['gender']).transform('sum')
).reset_index()


# The plotted column is still called `rally` but now holds a share, so label the axis explicitly.
(p9.ggplot(df_train_hitpoint, p9.aes(x = 'previous_hitpoint', y = 'rally')) +
    p9.geom_bar(stat = 'identity')  +
    p9.facet_wrap('~ gender') +
    p9.labs(title = 'Penultimate hitpoint', y = 'Share of points'))

In [None]:
# Outcome mix for each final-shot `hitpoint`: shares are normalised within every
# gender/hitpoint pair, so each stacked bar sums to 1 where that pair has data.
# count() tallies non-null `rally` values and serves as a row count here
# (assumes `rally` has no missing values - TODO confirm).
df_train_hitpoint_counts = df_train.groupby(['gender', 'hitpoint', 'outcome'])['rally'].count()
# Pass the aggregation by name: handing the builtin `sum` to transform is deprecated in pandas >= 2.1.
df_train_hitpoint = (
    df_train_hitpoint_counts
    / df_train_hitpoint_counts.groupby(['gender', 'hitpoint']).transform('sum')
).reset_index()


# The plotted column is still called `rally` but now holds a share, so label the axis explicitly.
(p9.ggplot(df_train_hitpoint, p9.aes(x = 'hitpoint', y = 'rally', fill = 'outcome')) +
    p9.geom_bar(stat = 'identity')  +
    p9.facet_wrap('~ gender') +
    p9.labs(title = 'Final hitpoint by outcome', y = 'Share of points'))

In [None]:
# Outcome mix for each `previous_hitpoint`: shares are normalised within every
# gender/previous_hitpoint pair, so each stacked bar sums to 1 where that pair has data.
# count() tallies non-null `rally` values and serves as a row count here
# (assumes `rally` has no missing values - TODO confirm).
df_train_penult_hitpoint_counts = df_train.groupby(['gender', 'previous_hitpoint', 'outcome'])['rally'].count()
# Pass the aggregation by name: handing the builtin `sum` to transform is deprecated in pandas >= 2.1.
df_train_penult_hitpoint = (
    df_train_penult_hitpoint_counts
    / df_train_penult_hitpoint_counts.groupby(['gender', 'previous_hitpoint']).transform('sum')
).reset_index()


# Title now distinguishes this outcome breakdown from the plain penultimate-hitpoint chart.
(p9.ggplot(df_train_penult_hitpoint, p9.aes(x = 'previous_hitpoint', y = 'rally', fill = 'outcome')) +
    p9.geom_bar(stat = 'identity')  +
    p9.facet_wrap('~ gender') +
    p9.labs(title = 'Penultimate hitpoint by outcome', y = 'Share of points'))

## Defining what a winner is

A winner _should_ be defined as a shot that is untouched by your opponent that goes over the net and lands inside the baseline and sidelines. Let's see if there is any variance in the HawkEye sensor data...

Maybe we should consider cases where the ball clips the net and drops in for a winner.

In [None]:
# Per-outcome means of the numeric/boolean columns. numeric_only=True is required on
# pandas >= 2.0, where averaging the categorical columns raises instead of being skipped.
df_train.groupby('outcome').mean(numeric_only=True)

In [None]:
# For each outcome, the share of rows flagged / not flagged as `expected_winner`.
df_train_counts = df_train.groupby(['outcome', 'expected_winner'])['rally'].count()
# Pass the aggregation by name: handing the builtin `sum` to transform is deprecated in pandas >= 2.1.
df_train_counts / df_train_counts.groupby(['outcome']).transform('sum')

In [None]:
# For each value of `expected_winner`, the share of rows ending in each outcome.
df_train_counts = df_train.groupby(['expected_winner', 'outcome'])['rally'].count()
# Pass the aggregation by name: handing the builtin `sum` to transform is deprecated in pandas >= 2.1.
df_train_counts / df_train_counts.groupby(['expected_winner']).transform('sum')

13.5% of "expected winners" are not actually winners. This could be a result of faulty sensor data and bad, unchallenged line calls.

In [None]:
# Rows flagged as expected winners whose recorded outcome is not 'W'.
is_expected_winner = df_train['expected_winner'] == True
df_train['faulty_expected_winner'] = is_expected_winner & (df_train['outcome'] != 'W')

# Keep only the expected winners and count how many of them are "faulty".
df_expected_winner = df_train[is_expected_winner]
df_expected_winner.groupby('faulty_expected_winner')['rally'].count()

These shots sometimes have a very large net clearance, a somewhat lower speed, and a higher previous speed.

In [None]:
# Columns excluded from the long-format frame: categorical predictors, the
# derived flags and the target itself.
remove_vars = ["serve", "hitpoint", "outside_sideline",
               "outside_baseline", "same_side", "previous_hitpoint",
               "server_is_impact_player", "gender",
               "clears_net", "expected_winner",
               "outcome", "faulty_expected_winner"]
# startswith states the intent directly (the old `not col.find('log_')` relied on find() returning 0).
log_vars = [col for col in df_train.columns if col.startswith('log_')]
plot_cols = df_train.drop(remove_vars + log_vars, axis=1).columns.tolist()

# Long format: one row per (expected-winner point, variable) pair, keyed by the faulty flag.
plot_df = pd.melt(df_expected_winner, id_vars = ['faulty_expected_winner'], value_vars = plot_cols)
# plot_df.groupby(['faulty_expected_winner', 'variable'])['value'].describe()

In [None]:
# Density of `log_net_clearance` among expected winners, split by the faulty flag.
(
    p9.ggplot(data=df_expected_winner,
              mapping=p9.aes(x='log_net_clearance', fill='faulty_expected_winner'))
    + p9.geom_density(alpha=0.4)
    + p9.labs(title='Faulty expected winner by net clearance')
)

In [None]:
# `speed` against `previous_speed` for expected winners, coloured by the faulty flag.
(
    p9.ggplot(data=df_expected_winner,
              mapping=p9.aes(x='previous_speed', y='speed', color='faulty_expected_winner'))
    + p9.geom_point()
    + p9.labs(title='Faulty expected winner by speed')
)

In [None]:
# (p9.ggplot(plot_df, p9.aes(x = 'value', fill = 'faulty_expected_winner')) +
#     p9.geom_density(alpha = 0.4) +
#     p9.facet_wrap('~ variable', scales='free') +
#     p9.labs(title = 'Distributions by faulty expected winner') +
#     p9.theme(strip_text=p9.element_text(size = 4)))

## Locations

In [None]:
# Histogram of `log_net_clearance` for rows with net_clearance below 5,
# on an outcome (rows) by clears_net (columns) grid.
(
    p9.ggplot(data=df_train.loc[df_train['net_clearance'] < 5],
              mapping=p9.aes(x='log_net_clearance'))
    + p9.geom_histogram()
    + p9.facet_grid('outcome~clears_net')
    + p9.labs(title='Outcome by net clearance\ndoes it clear net?')
)

In [None]:
# Histogram of `log_distance_from_sideline` on an outcome (rows) by outside_sideline (columns) grid.
(
    p9.ggplot(data=df_train, mapping=p9.aes(x='log_distance_from_sideline'))
    + p9.geom_histogram()
    + p9.facet_grid('outcome~outside_sideline')
    + p9.labs(title='Outcome by distance from sideline\nis it inbounds?')
)

In [None]:
# Histogram of `log_depth` on an outcome (rows) by clears_net (columns) grid.
# NOTE(review): the sideline plot facets by `outside_sideline`; confirm that
# `outside_baseline` was not the intended facet variable here.
(
    p9.ggplot(data=df_train, mapping=p9.aes(x='log_depth'))
    + p9.geom_histogram()
    + p9.facet_grid('outcome~clears_net')
    + p9.labs(title='Outcome by distance from baseline\ndoes it clear net?')
)

In [None]:
# Histogram of `player_distance_travelled`, one stacked panel per outcome.
(
    p9.ggplot(data=df_train, mapping=p9.aes(x='player_distance_travelled'))
    + p9.geom_histogram()
    + p9.facet_wrap('~outcome', ncol=1)
    + p9.labs(title='Outcome by player distance travelled')
)

In [None]:
# 20-bin histogram of `log_player_depth` on a hitpoint (rows) by outcome (columns) grid with free scales.
(
    p9.ggplot(data=df_train, mapping=p9.aes(x='log_player_depth'))
    + p9.geom_histogram(bins=20)
    + p9.facet_grid('hitpoint~outcome', scales='free')
    + p9.labs(title='Outcome by player depth')
)

In [None]:
# 20-bin histogram of `log_player_impact_depth` on a hitpoint (rows) by outcome (columns) grid with free scales.
(
    p9.ggplot(data=df_train, mapping=p9.aes(x='log_player_impact_depth'))
    + p9.geom_histogram(bins=20)
    + p9.facet_grid('hitpoint~outcome', scales='free')
    + p9.labs(title='Outcome by player impact depth')
)

In [None]:
# Small, translucent points of `player_impact_depth` against `player_depth`
# on a hitpoint (rows) by outcome (columns) grid with free scales.
(
    p9.ggplot(data=df_train, mapping=p9.aes(x='player_depth', y='player_impact_depth'))
    + p9.geom_point(size=0.5, alpha=0.4)
    + p9.facet_grid('hitpoint~outcome', scales='free')
    + p9.labs(title='Outcome by player depth vs. impact depth\nhitpoint')
)

In [None]:
# Histogram of `player_distance_from_center`, one stacked panel per outcome.
(
    p9.ggplot(data=df_train, mapping=p9.aes(x='player_distance_from_center'))
    + p9.geom_histogram()
    + p9.facet_wrap('~outcome', ncol=1)
    + p9.labs(title='Outcome by player distance from center')
)

In [None]:
# Histogram of `previous_distance_from_sideline`, one stacked panel per outcome.
(
    p9.ggplot(data=df_train, mapping=p9.aes(x='previous_distance_from_sideline'))
    + p9.geom_histogram()
    + p9.facet_wrap('~outcome', ncol=1)
    + p9.labs(title='Outcome by previous distance from sideline')
)

In [None]:
# Histogram of `previous_depth`, one stacked panel per outcome.
(
    p9.ggplot(data=df_train, mapping=p9.aes(x='previous_depth'))
    + p9.geom_histogram()
    + p9.facet_wrap('~outcome', ncol=1)
    + p9.labs(title='Outcome by previous depth')
)

In [None]:
# `log_previous_net_clearance` against `previous_depth`, one stacked panel per outcome.
(
    p9.ggplot(data=df_train, mapping=p9.aes(x='previous_depth', y='log_previous_net_clearance'))
    + p9.geom_point()
    + p9.facet_wrap('~outcome', ncol=1)
    + p9.labs(title='Outcome by previous depth and net clearance')
)

In [None]:
# Histogram of `opponent_distance_from_center` on an outcome (rows) by same_side (columns) grid.
(
    p9.ggplot(data=df_train, mapping=p9.aes(x='opponent_distance_from_center'))
    + p9.geom_histogram()
    + p9.facet_grid('outcome~same_side')
    + p9.labs(title='Outcome by opponent distance from center\non same side?')
)

In [None]:
# `opponent_depth` against `opponent_distance_from_center` on an outcome (rows) by same_side (columns) grid.
(
    p9.ggplot(data=df_train, mapping=p9.aes(x='opponent_distance_from_center', y='opponent_depth'))
    + p9.geom_point()
    + p9.facet_grid('outcome~same_side')
    + p9.labs(title='Outcome by opponent distance and depth\non same side?')
)

In [None]:
# Histogram of `log_previous_time_to_net`, one stacked panel per outcome.
# The title previously read "Outcome by previous depth", copied from the
# previous-depth histogram; it now names the variable actually plotted.
(p9.ggplot(df_train, p9.aes(x = 'log_previous_time_to_net')) +
    p9.geom_histogram() +
    p9.facet_wrap('~outcome', ncol=1) +
    p9.labs(title = 'Outcome by previous time to net'))