### Surface model example

This example shows how to fit the model taking margin of victory and surface into account.

There is a set of convenient functions in `jax_elo.models.covariates_best_of_five_model` which we will use for this.

In [None]:
import os
# Set CUDA_VISIBLE_DEVICES to the empty string before the jax_elo import below
# (presumably to keep JAX on the CPU -- confirm this is still wanted).
os.environ['CUDA_VISIBLE_DEVICES'] = ''
#from jax.config import config

# Debugging aid (disabled): together with the import above, this would turn off JIT compilation.
#config.update('jax_disable_jit', True)

# NOTE(review): in the visible cells `fit` is only referenced in a commented-out call,
# and `get_player_skill_history` / `get_data` are not used by any active code.
from jax_elo.models.covariates_best_of_five_model import fit, calculate_ratings, get_player_skill_history
from jax_elo.utils.data import get_data

In [None]:
# The original example loaded Jeff Sackmann's dataset, which is available here:
# https://github.com/JeffSackmann/tennis_atp
# That loader is disabled below; change the path if you re-enable it.
#data = get_data('/Users/ingramm/Projects/tennis/tennis-data/data/sackmann/tennis_atp/')

# The active data source is the OnCourt dataset for the WTA tour. The flags ask
# the loader not to drop challenger-level or qualifying matches.
from tdata.datasets.oncourt_dataset import OnCourtDataset
from tdata.enums.t_type import Tours

dataset = OnCourtDataset(Tours.wta, drop_challengers=False, drop_qualifying=False)

In [None]:
# Match-level frame; later cells read (among others) the columns start_date,
# surface, tournament_rank, round, score, winner and loser from it.
data = dataset.get_stats_df()

In [None]:
from tdata.datasets.oncourt_dataset import calculate_sp_proportion, is_slam

In [None]:
# Keep matches from 2010 onwards that were not played on carpet and that have a
# positive tournament rank. The explicit .copy() makes to_use an independent
# frame, so the column assignments below (and in later cells) modify it
# directly instead of writing to a slice of `data` (SettingWithCopyWarning).
to_use = data[data['start_date'].dt.year >= 2010]
to_use = to_use[to_use['surface'] != 'carpet']
to_use = to_use[to_use['tournament_rank'] > 0].copy()

# Collapse the numeric tournament rank into three named tiers. Ranks that are
# not listed (e.g. 6, which a later cell drops) are left unchanged by replace().
# The 0 key cannot occur after the rank > 0 filter above; it is kept for safety.
to_use['rank_to_use'] = to_use['tournament_rank'].replace({
    0: 'Ch & Q',
    1: 'Ch & Q',
    2: 'Main tour',
    3: 'Main tour',
    4: 'Slam'
})
# Early rounds (round <= 3) of main-tour events are grouped with the 'Ch & Q' tier.
to_use.loc[(to_use['rank_to_use'] == 'Main tour') & (to_use['round'] <= 3), 'rank_to_use'] = 'Ch & Q'

In [None]:
# Per-match values for winner and loser; a later cell describes them as the
# "% of service points won" (exact definition lives in calculate_sp_proportion).
spw_winner, spw_loser = calculate_sp_proportion(to_use)

In [None]:
# Attach both service-point proportions to the match frame in one step.
to_use = to_use.assign(spw_winner=spw_winner, spw_loser=spw_loser)


In [None]:
# Margin of victory: winner's share of service points won minus the loser's.
margins = to_use['spw_winner'].sub(to_use['spw_loser'])

In [None]:
# Flag retirements and walkovers from the score string. na=False treats a
# missing score as "not a retirement", which keeps the result a plain boolean
# array so that `~was_retirement` in later cells is always well defined.
was_retirement = to_use['score'].str.contains('ret|w/o', na=False).values

In [None]:
import numpy as np

# Drop matches that have no margin and were not retirements/walkovers, plus
# every match with tournament_rank == 6.
to_drop = (np.isnan(margins.values) & (~was_retirement)) | (to_use['tournament_rank'] == 6)

# .copy() keeps to_use an independent frame; a later cell assigns new columns
# (e.g. pred_prob) to it, which would otherwise write to a slice.
to_use = to_use[~to_drop].copy()

# Recompute the per-match arrays so they stay aligned with the filtered frame.
margins = to_use['spw_winner'] - to_use['spw_loser']
# na=False: a missing score counts as "not a retirement" and the array stays boolean.
was_retirement = to_use['score'].str.contains('ret|w/o', na=False).values

In [None]:
# Boolean wildcard flags taken from the seed strings. Passing na=False makes
# str.contains return a real boolean array for missing seeds, instead of an
# object array with NaN that then relies on fillna's deprecated downcasting.
winner_is_wc = to_use['winner_seed'].str.contains('WC', na=False).values
loser_is_wc = to_use['loser_seed'].str.contains('WC', na=False).values

In [None]:
# Per-player covariates handed to calculate_ratings further down.
winner_info = dict(is_wildcard=winner_is_wc)
loser_info = dict(is_wildcard=loser_is_wc)

In [None]:
# Sanity check: largest and smallest margin.
margins.max(), margins.min()

In [None]:
# Fraction of the remaining matches flagged as retirement / walkover.
was_retirement.mean()

In [None]:
# Retirements and walkovers carry no margin information: set their margin to zero.
margins.loc[was_retirement] = 0.

In [None]:
# Ten smallest margins (negative values mean the loser had the higher proportion).
margins.sort_values().head(10)

In [None]:
# NOTE(review): `import os` and the CUDA_VISIBLE_DEVICES assignment repeat the
# first cell; only the jax.numpy import is new here. Consider moving it to the
# import cell at the top.
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''
import jax.numpy as jnp

In [None]:
# Number of matches in each rank_to_use tier.
to_use['rank_to_use'].value_counts()

In [None]:
# Last 30 rows of the filtered frame.
to_use.tail(30)

In [None]:
from jax_elo.core import EloParams

In [None]:
# Fit the model -- this might take a few minutes (but not more than that)
# The fit is disabled; parameters saved from an earlier run are loaded instead.
# NOTE(review): the disabled call passes winner_wildcard / loser_wildcard while
# calculate_ratings below receives winner_info / loser_info -- confirm the
# signature before re-enabling it.
#params, opt_info = fit(to_use['winner'].values, to_use['loser'].values, to_use['surface'].values,
#                       to_use['rank_to_use'].astype(str).values, np.zeros(to_use.shape[0]),
#                       is_challenger=to_use['rank_to_use'].str.contains('Ch').values,
#                       margins=margins.values, was_retirement=was_retirement, verbose=True, 
#                       retirement_skill_factor=False, winner_wildcard=winner_is_wc,
#                       loser_wildcard=loser_is_wc)

#np.savez('2014_with_challengers_wta', **params.theta)

# np.load on an .npz returns a lazily-read archive that keeps the file open;
# the context manager closes it once every array has been copied into JAX.
with np.load('./2014_with_challengers_wta.npz') as npz_file:
    param_array = {x: jnp.array(y) for x, y in npz_file.items()}

params = EloParams(theta=dict(param_array))

In [None]:
# Inspect the loaded parameters.
params

In [None]:
# Run the fitted model over every match to obtain the rating history, the
# final ratings per player and the names of the rating components.
# NOTE(review): the meaning of the all-zeros positional argument is not visible
# in this notebook -- check calculate_ratings' signature.
rating_kwargs = dict(
    is_challenger=to_use['rank_to_use'].str.contains('Ch').values,
    margins=margins.values,
    was_retirement=was_retirement,
    winner_info=winner_info,
    loser_info=loser_info,
)
history, final_rating_dict, mark_names, rank_names = calculate_ratings(
    params,
    to_use['winner'].values,
    to_use['loser'].values,
    to_use['surface'].values,
    to_use['rank_to_use'].astype(str).values,
    np.zeros(to_use.shape[0]),
    **rating_kwargs)

In [None]:
# Matches won by players whose name contains 'Brady'.
to_use[to_use['winner'].str.contains('Brady')]

In [None]:
# Names of the tournament-rank components returned by calculate_ratings.
rank_names

In [None]:
#pd.DataFrame(params.theta['tournament_rank_offsets'], index=rank_names[1:],
#             columns = mark_names.tolist() + rank_names.tolist()).round(2)

In [None]:
# Example history entry. Later cells read the keys 'winner', 'loser',
# 'prior_mu_winner', 'prior_mu_loser' and 'prior_win_prob' from these entries.
history[0]

In [None]:
# Here's an example entry:
import pandas as pd
import matplotlib.pyplot as plt
from tpr.models.utils import to_elo_scale
%matplotlib inline

final_ratings = pd.DataFrame(final_rating_dict, index=mark_names.tolist() + rank_names.tolist()).T

slam_ratings = to_elo_scale(final_ratings['clay'] + final_ratings['Slam'])
final_ratings['Total'] = slam_ratings
#final_ratings['clay'] = to_elo_scale(final_ratings['clay'])
final_ratings['4'] = final_ratings['Total'] - final_ratings['clay']

final_ratings = final_ratings.rename(columns={'clay': 'Clay', '4': 'Slam+'})

In [None]:
# Top 20 players by 'Total', shown as whole numbers.
final_ratings['Total'].sort_values(ascending=False).head(20).astype(int)

In [None]:
# Weighted blend of the surface components (60% hard, 30% clay, 10% grass),
# converted with to_elo_scale and ranked from best to worst.
surface_blend = (
    0.6 * final_ratings['hard']
    + 0.3 * final_ratings['Clay']
    + 0.1 * final_ratings['grass']
)
to_elo_scale(surface_blend).sort_values(ascending=False).round(2)

In [None]:
#np.savez('fit_params_2014_with_chall', **params.theta)

In [None]:
# Top 20 by 'Total' after removing a hand-picked list of players.
# drop() raises KeyError if any of these names is missing from the index.
excluded_players = ['Na Li', 'Naomi Osaka', 'Ashleigh Barty', 'Victoria Azarenka', 'Serena Williams']
ranked = final_ratings.sort_values('Total', ascending=False).drop(excluded_players)
ranked.head(20)[['Clay', 'Slam+', 'Total']].round(0).astype(int)

In [None]:
# Directory holding the OnCourt CSV exports. Change this single constant to
# point at your own copy instead of editing every path.
ONCOURT_DIR = '/home/martin/projects/tennis-data/data/oncourt'

# Today's schedule and the player id -> name table.
today_df = pd.read_csv(os.path.join(ONCOURT_DIR, 'today_wta.csv'))
player_df = pd.read_csv(os.path.join(ONCOURT_DIR, 'players_wta.csv'))

In [None]:
# Schedule rows for tournament id 13350 (magic id -- presumably the US Open,
# judging by the `uso` variable used below; confirm).
today_df[today_df['TOUR'] == 13350]

In [None]:
# Map each player id to the player's name.
lookup = dict(zip(player_df['ID_P'], player_df['NAME_P']))

In [None]:
# Resolve both player ids of every scheduled match to names.
# An id that is missing from `lookup` raises KeyError.
today_df['p1'] = [lookup[player_id] for player_id in today_df['ID1']]
today_df['p2'] = [lookup[player_id] for player_id in today_df['ID2']]

In [None]:
# Matches of tournament 13350 that have no result yet.
not_played = today_df[today_df['RESULT'].isnull()]
uso = not_played[not_played['TOUR'] == 13350]

# Keep only matches where both names are real single players: names containing
# '/' or 'Unknown' are filtered out.
p1_is_single = ~uso['p1'].str.contains('/|Unknown')
p2_is_single = ~uso['p2'].str.contains('/|Unknown')
uso = uso[p1_is_single & p2_is_single]

In [None]:
#not_played[not_played['p1'].str.contains('Altmaier')]

In [None]:
# Work on an independent copy: later cells add columns to `matches`, which
# would otherwise write to a slice of `uso` (SettingWithCopyWarning).
matches = uso[['p1', 'p2']].copy()

In [None]:
# Upcoming matches (player names only at this point).
matches

In [None]:
# Player name -> 'Total' rating, for quick lookups below.
ratings_dict = final_ratings['Total'].to_dict()

In [None]:
# Attach each player's 'Total' rating; players without a rating get 1500.
matches['elo_p1'] = [ratings_dict.get(name, 1500.) for name in matches['p1']]
matches['elo_p2'] = [ratings_dict.get(name, 1500.) for name in matches['p2']]

In [None]:
# Combined strength of the pairing and the absolute rating gap between the players.
matches['elo_sum'] = matches['elo_p1'].add(matches['elo_p2'])
matches['elo_diff'] = (matches['elo_p1'] - matches['elo_p2']).abs()

In [None]:
from tpr.models.elo.basic_elo import EloPredictor

In [None]:
from jax_elo.models.covariates_best_of_five_model import margin_functions_retirement

# Indicator over the rating components (ones at positions 2 and 6).
# NOTE(review): presumably this picks one surface and one tournament-rank
# component -- check against mark_names.tolist() + rank_names.tolist().
a = np.array([0, 0, 1, 0, 0, 0, 1])

# Match context handed to the win-probability function.
cur_y = dict(margin=-1, bo5=0, was_retirement=0)

# Model win probability of p1 for every upcoming match. Unlike the Elo lookup
# with its 1500 default, a player missing from final_rating_dict raises KeyError.
win_probs = []
for match in matches.itertuples():
    win_probs.append(
        margin_functions_retirement.win_prob_fun(
            final_rating_dict[match.p1],
            final_rating_dict[match.p2],
            np.concatenate([a, -a]),
            cur_y,
            params,
        )
    )

# 'quality': EloPredictor.win_probability evaluated at 1900 against the mean
# rating of the two players.
matches['quality'] = EloPredictor.win_probability(1900., matches['elo_sum'] / 2)
matches['win_prob'] = np.array(win_probs)

# Binary entropy of the predicted outcome: highest when the match is a coin flip.
p1_wins = matches['win_prob']
matches['entropy'] = -(1 - p1_wins) * np.log(1 - p1_wins) - p1_wins * np.log(p1_wins)

In [None]:
# Overall interest score: unweighted sum of uncertainty (entropy) and quality.
matches['overall'] = matches['entropy'] + matches['quality']

In [None]:
# The 30 most interesting matches by 'overall', rounded for display.
display_cols = ['p1', 'p2', 'win_prob', 'quality', 'entropy', 'overall']
res = (
    matches
    .sort_values('overall', ascending=False)[display_cols]
    .round(3)
    .head(30)
    .reset_index(drop=True)
)

# Show ranks starting at 1 instead of 0.
res.index += 1

res

In [None]:
# Scratch arithmetic: same numerator as the Kelly expression in the next cell
# (0.36 and 3.95 look like a win probability and decimal odds -- confirm).
0.36 * 3.95 - 1

In [None]:
# Kelly
# Scratch arithmetic with hard-coded inputs: (0.36 * 3.95 - 1) / (3.95 - 1).
(0.36 * (3.95) - 1) / (3.95 - 1)

In [None]:
# Scratch arithmetic; the meaning of these constants is not recorded -- TODO document or remove.
0.14 * 42.89

In [None]:
# Scratch arithmetic: a weighted sum whose weights add up to roughly 1.0
# (looks like a probability-weighted payoff -- TODO document or remove).
0.3712 * (-10) + 0.2688 * 15 + 0.209 * 15 + 0.152* 40

In [None]:
# One-off call with two hard-coded odds values (3.05 and 1.47).
# NOTE(review): the same helper is imported again in a later cell; consider
# moving the import to the top of the notebook.
from tpr.models.utils import odds_prob_equal_allocation

odds_prob_equal_allocation(3.05, 1.47)

In [None]:
# Upcoming matches whose first player name contains 'Ruus'.
matches[matches['p1'].str.contains('Ruus')]

In [None]:
# Player whose rating trajectory is inspected.
to_check = 'Ons Jabeur'

# Walk the history once, collecting this player's matches, the prior rating
# vector before each match and the opponent's name.
cur_matches = []
prior_mu_rows = []
opponent_names = []
for match in history:
    if match['winner'] == to_check:
        prior_mu_rows.append(match['prior_mu_winner'])
        opponent_names.append(match['loser'])
    elif match['loser'] == to_check:
        prior_mu_rows.append(match['prior_mu_loser'])
        opponent_names.append(match['winner'])
    else:
        continue
    cur_matches.append(match)

# Relies on the last two columns of final_ratings being the derived ones
# ('Total', 'Slam+'); the remaining columns are the rating components.
cur_prior_mu = pd.DataFrame(prior_mu_rows, columns=final_ratings.columns[:-2])

slam_skill = to_elo_scale(cur_prior_mu['Clay'] + cur_prior_mu['Slam'])

# Dates of the same matches, taken from the match frame.
# NOTE(review): assumes `history` is in the same order as the rows of to_use.
rel = (to_use['winner'] == to_check) | (to_use['loser'] == to_check)
rel_matches = to_use[rel]
rel_dates = rel_matches['start_date']

In [None]:
# Rating before each match of the selected player, with date and opponent.
# NOTE(review): this rebinds `player_df`, which earlier held the OnCourt player
# table -- a distinct name would avoid confusion on partial re-runs.
player_df = pd.DataFrame({'date': rel_dates.values, 'opponent': opponent_names,
                          'rating': slam_skill})

#player_df.to_csv('/home/martin/projects/courses/d3/first_project/ruud_ratings.csv')

In [None]:
# Prior Clay + Slam rating of the selected player before each of their matches.
slam_skill

In [None]:
# Interactive rating trajectory; hovering over a point shows the opponent.
import plotly.express as pxe

pxe.scatter(x=rel_dates, y=slam_skill, hover_name=opponent_names)

In [None]:
# Pre-match probability the model assigned to the eventual winner, one per match.
# NOTE(review): this rebinds `win_probs`, previously the list of upcoming-match probabilities.
win_probs = pd.Series([x['prior_win_prob'] for x in history])

In [None]:
# Mean log-probability assigned to the winner, restricted to completed matches
# from round 4 onwards at tournaments of rank 2 or higher.
not_ret = ~was_retirement
late_main_tour = (
    not_ret
    & (to_use['round'].values >= 4)
    & (to_use['tournament_rank'].values >= 2)
)
np.mean(np.log(win_probs)[late_main_tour])

In [None]:
# Store the model's winner probability next to each match. Assigned positionally
# via .values, so it relies on `history` having one entry per row of to_use, in order.
to_use['pred_prob'] = win_probs.values

In [None]:
# Model probabilities for the 2021 Australian Open, lowest (biggest upsets) first.
ao_2021 = (
    to_use['tournament_name'].str.contains('Australian Open - Melbourne')
    & (to_use['year'] == 2021)
)
to_use.loc[ao_2021, 'pred_prob'].sort_values()

In [None]:
from tpr.models.utils import odds_prob_equal_allocation

# Winner probability derived from the winner's and loser's odds.
win_probs_odds = odds_prob_equal_allocation(to_use['winner_odds'], to_use['loser_odds'])
to_use['win_prob_odds'] = win_probs_odds

# True where an odds-based probability exists for the match.
odds_available = to_use['win_prob_odds'].notnull()

In [None]:
# Mean log-probability of the model on 2021 Australian Open matches that have odds.
ao_2021_with_odds = (
    to_use['tournament_name'].str.contains('Australian Open - Melbourne')
    & (to_use['year'] == 2021)
    & odds_available
)
np.log(to_use.loc[ao_2021_with_odds, 'pred_prob'].astype(float)).mean()

In [None]:
# Mean log-probability of the odds on the same 2021 Australian Open matches.
ao_2021_with_odds = (
    to_use['tournament_name'].str.contains('Australian Open - Melbourne')
    & (to_use['year'] == 2021)
    & odds_available
)
np.log(to_use.loc[ao_2021_with_odds, 'win_prob_odds'].astype(float)).mean()

In [None]:
# Share of 2021 Australian Open matches (with odds) where the model favoured the winner.
ao_2021_with_odds = (
    to_use['tournament_name'].str.contains('Australian Open - Melbourne')
    & (to_use['year'] == 2021)
    & odds_available
)
(to_use.loc[ao_2021_with_odds, 'pred_prob'] > 0.5).mean()

In [None]:
# Share of 2021 Australian Open matches (with odds) where the odds favoured the winner.
ao_2021_with_odds = (
    to_use['tournament_name'].str.contains('Australian Open - Melbourne')
    & (to_use['year'] == 2021)
    & odds_available
)
(to_use.loc[ao_2021_with_odds, 'win_prob_odds'] > 0.5).mean()

In [None]:
# First index level of the 2021 Australian Open matches with odds (presumably
# the winner names, given the 'bt.' hover labels built in the next cell).
# Computed directly from to_use: `probs_ao_model` is only assigned in the next
# cell, so referencing it here raised NameError on a fresh Restart & Run All.
ao_2021_with_odds = (
    to_use['tournament_name'].str.contains('Australian Open - Melbourne')
    & (to_use['year'] == 2021)
    & odds_available
)
to_use.loc[ao_2021_with_odds, 'pred_prob'].astype(float).index.get_level_values(0)

In [None]:
# Model versus odds-implied winner probability for the 2021 Australian Open.
ao_2021_with_odds = (
    to_use['tournament_name'].str.contains('Australian Open - Melbourne')
    & (to_use['year'] == 2021)
    & odds_available
)
probs_ao_model = to_use.loc[ao_2021_with_odds, 'pred_prob'].astype(float)
probs_ao_odds = to_use.loc[ao_2021_with_odds, 'win_prob_odds'].astype(float)

# Hover label "<index level 0> bt. <index level 1>", built from the frame's index.
match_labels = (
    probs_ao_model.index.get_level_values(0) + ' bt. ' +
    probs_ao_model.index.get_level_values(1)
)
pxe.scatter(x=probs_ao_model, y=probs_ao_odds, hover_name=match_labels,
            labels={'x': 'Model', 'y': 'Odds'})

In [None]:
# TODO: Could fit a logistic regression here to just the AO matches.

In [None]:
# Mean log-probability of the model on the 2020 French Open, round 4 onwards.
fo_2020_late = (
    to_use['tournament_name'].str.contains('French Open')
    & (to_use['year'] == 2020)
    & (to_use['round'] >= 4)
)
np.log(to_use.loc[fo_2020_late, 'pred_prob'].astype(float)).mean()

In [None]:
# Share of 2020 French Open matches (round 4 onwards) where the model favoured the winner.
fo_2020_late = (
    to_use['tournament_name'].str.contains('French Open')
    & (to_use['year'] == 2020)
    & (to_use['round'] >= 4)
)
(to_use.loc[fo_2020_late, 'pred_prob'] > 0.5).mean()

In [None]:
# Mean log-probability of the model on completed 2018-2019 matches at
# tournaments of rank 2 or higher, after round 3.
main_tour_18_19 = (
    to_use.year.isin([2018, 2019])
    & ~was_retirement
    & (to_use['tournament_rank'] >= 2)
    & (to_use['round'] > 3)
)
np.log(to_use.loc[main_tour_18_19, 'pred_prob'].astype(float)).mean()

In [None]:
# Size of the 2018-2019 evaluation subset used in the previous cell.
main_tour_18_19 = (
    to_use.year.isin([2018, 2019])
    & ~was_retirement
    & (to_use['tournament_rank'] >= 2)
    & (to_use['round'] > 3)
)
to_use[main_tour_18_19].shape

In [None]:
# Full match frame (pandas truncates the display); consider .head() to keep the output small.
to_use

In [None]:
# Mean log-probability of the odds on completed clay matches at rank-4
# tournaments, 2018-2020, round 4 onwards, where odds exist.
clay_slam_rows = (
    to_use.year.isin([2018, 2019, 2020])
    & ~was_retirement
    & (to_use['tournament_rank'] == 4)
    & (to_use['round'] >= 4)
    & odds_available
    & (to_use['surface'] == 'clay').values
)
np.log(to_use.loc[clay_slam_rows, 'win_prob_odds'].astype(float)).mean()

In [None]:
# Model counterpart of the previous cell: same subset, using the model's
# winner probabilities.
clay_slam_mask = (
    not_ret
    & (to_use['round'].values >= 4)
    & (to_use['tournament_rank'].values == 4)
    & odds_available.values
    & to_use.year.isin([2018, 2019, 2020]).values
    & (to_use['surface'] == 'clay').values
)
np.mean(np.log(win_probs)[clay_slam_mask])

In [None]:
# Export the match frame, including pred_prob and win_prob_odds, to the working directory.
to_use.to_csv('with_odds_wta.csv')

In [None]:
# Betfair datathon submission template; the cells below use its player_1 and player_2 columns.
# NOTE(review): hard-coded absolute local path -- make it configurable before sharing.
df = pd.read_csv('/home/martin/projects/betfair_datathon/Final draw data pack/submission_file_womens_draw.csv')

In [None]:
# Every player that has a final rating.
known_players = set(final_ratings.index)

In [None]:
# Every player name that appears on either side of the Betfair draw.
betfair = set(df['player_1']).union(df['player_2'])

In [None]:
# For each Betfair name, whether a rating exists under exactly that spelling.
# NOTE(review): rebinds `lookup`, which earlier held the player id -> name dict.
lookup = pd.Series({x: x in known_players for x in betfair})

In [None]:
# Betfair names with no exact match among the rated players.
lookup[~lookup]

In [None]:
# Manual search helper: find how a player is spelled among the rated players.
ps = final_ratings.index
ps[ps.str.contains('Schmiedlova')]

In [None]:
# Spelling differences between the two sources, as (OnCourt name, Betfair name)
# pairs. Names that are absent here are assumed to be spelled identically.
_oc_bf_name_pairs = [
    ('Su-Wei Hsieh', 'Su Wei Hsieh'),
    ('Christina Mchale', 'Christina McHale'),
    ('Irina-Camelia Begu', 'Irina Camelia Begu'),
    ('Liudmila Samsonova', 'Ludmilla Samsonova'),
    ('Bianca Vanessa Andreescu', 'Bianca Andreescu'),
    ('Jil Teichmann', 'Jil Belen Teichmann'),
    ('Daria Kasatkina', 'Darya Kasatkina'),
    ('Barbora Strycova', 'Barbora Zahlavova Strycova'),
    ('Paula Badosa Gibert', 'Paula Badosa'),
    ('Anna Schmiedlova', 'Anna Karolina Schmiedlova'),
]
oc_to_bf_dict = dict(_oc_bf_name_pairs)

In [None]:
# Re-key the final ratings by Betfair spelling; names without a translation are kept as they are.
final_rating_dict_bf = {oc_to_bf_dict.get(x, x): y for x, y in final_rating_dict.items()}

In [None]:
# Betfair names that still have no rating after the translation. Any name
# listed here makes the probability lookup below fail with KeyError.
new_known = set(final_rating_dict_bf)

betfair.difference(new_known)

In [None]:
# Order of the rating components; useful for reading the indicator vector `a` in the next cell.
mark_names.tolist() + rank_names.tolist()

In [None]:
# Indicator over the rating components (ones at positions 2 and 6).
# NOTE(review): presumably this picks one surface and one tournament-rank
# component -- check against mark_names.tolist() + rank_names.tolist().
a = np.array([0, 0, 1, 0, 0, 0, 1])

# Match context handed to the win-probability function.
cur_y = dict(margin=-1, bo5=0, was_retirement=0)

# Model probability that player_1 beats player_2 for every row of the draw.
# A name missing from final_rating_dict_bf raises KeyError.
win_probs = []
for draw_row in df.itertuples():
    win_probs.append(
        margin_functions_retirement.win_prob_fun(
            final_rating_dict_bf[draw_row.player_1],
            final_rating_dict_bf[draw_row.player_2],
            np.concatenate([a, -a]),
            cur_y,
            params,
        )
    )

In [None]:
# Assign the probabilities positionally (one per row, in row order). Wrapping
# them in a fresh pd.Series would align on index labels instead, which silently
# yields NaN or misplaced values if df ever has a non-default index.
df['player_1_win_probability'] = [float(x) for x in win_probs]

In [None]:
# Submission frame with the predicted probabilities filled in.
df

In [None]:
# Write the submission file (without the index column).
# NOTE(review): hard-coded absolute local path -- make it configurable before sharing.
df.to_csv('/home/martin/projects/betfair_datathon/ext_elo_submission_file_womens_draw.csv', index=False)