In [None]:
import pandas as pd
import numpy as np

import psycopg2 as pg
import pandas as pd
import pandas.io.sql as pd_sql
from sqlalchemy import create_engine
import re
import os
import fnmatch

import psutil

### acquiring and moving raw data

Stats csvs listing details about every match played in a calendar year were downloaded from https://github.com/JeffSackmann/tennis_atp. I wanted to work with and store the data on postgresql running on an AWS instance, so I secure copied (scp) the csvs to the AWS instance then began working with them from this notebook.

In [None]:
# Connection settings for the PostgreSQL server running on the AWS instance.
# Every value can be overridden through an environment variable, so the notebook
# can be pointed at another server without editing this cell.
# NOTE(review): the literal fallbacks (in particular the password) are kept only so
# existing runs keep working; the credential should be rotated and supplied through
# TENNIS_DB_PASSWORD instead of living in the notebook.
connection_args = {
    'host': os.environ.get('TENNIS_DB_HOST', '3.22.98.179'),  # We are connecting to AWS version of psql
    'user': os.environ.get('TENNIS_DB_USER', 'ubuntu'), # username
    'port': int(os.environ.get('TENNIS_DB_PORT', 5432)),
    'password': os.environ.get('TENNIS_DB_PASSWORD', 'inushiba')
}

In [None]:
# Build the SQLAlchemy engine for the "tennis" database and open the connection
# shared by the cells below.
# The dialect name has to be "postgresql": SQLAlchemy 1.4+ rejects "postgres://" URLs,
# while "postgresql://" is accepted by every SQLAlchemy version.
engine = create_engine(f'''postgresql://{connection_args["user"]}:{connection_args["password"]}@{connection_args["host"]}:{connection_args["port"]}/tennis''')
connection = engine.connect()

In [None]:
def get_match_csv(start_yr,end_yr,folder):
    '''Return the names of the csv files in `folder` that belong to the given year range.

    Parameters:
        start_yr: first year of the range (inclusive).
        end_yr: last year of the range (inclusive).
        folder: directory that holds the downloaded csv files.

    Returns:
        List of file names (without the folder), grouped by year in ascending order and
        alphabetically within a year. A file is selected for a year when the year appears
        anywhere in its name.
    '''
    # Build a filtered list instead of calling remove() on the list being iterated:
    # removing while iterating skips the entry that follows every removed one, so
    # non-csv files could survive the filter.
    csv_names = sorted(name for name in os.listdir(folder) if fnmatch.fnmatch(name,'*.csv'))
    files = []
    for year in range(start_yr,end_yr+1):
        files.extend(name for name in csv_names if str(year) in name)
    return files


# Every downloaded match csv from 2014 through 2020 found in the local data folder.
csv_list = get_match_csv(start_yr=2014,end_yr=2020,folder='./data')

This loop reads each file, adjusting the tourney_date column to the datetime format year-month-date, and then moving each to a SQL database using the SQLAlchemy engine from before.

In [None]:
# Read each csv, parse every column whose name contains "date" as a YYYYMMDD date,
# and store the result as a SQL table named after the file (replacing an existing table).
folder = './data/'
date_pattern = re.compile('date')
for csv_name in csv_list:
    matches = pd.read_csv(folder+csv_name)

    date_columns = [name for name in matches.columns if date_pattern.search(name)]
    for date_column in date_columns:
        matches[date_column] = pd.to_datetime(matches[date_column],format = '%Y%m%d')

    table_name = csv_name.replace(folder,'').replace('.csv','')
    matches.to_sql(table_name,engine,if_exists='replace',index=False)

### data collating
I want to be sure data types are consistent across the tables so I can join tables if needed.

In [None]:
# Reference column data types, taken from the 2014 table. The rows are compared
# position by position against the other tables, so they are ordered by column position:
# without ORDER BY the row order of information_schema.columns is not guaranteed.
query = """SELECT data_type FROM information_schema.columns
            WHERE table_name = 'atp_matches_2014'
            ORDER BY ordinal_position;"""
col_types = connection.execute(query).fetchall()
col_types

In [None]:
def find_cols_to_fix(filename,col_types_key):
    ''' Compare the column data types of one table against a reference list of data types.

    Parameters:
        filename: one of the csv filenames; the table name is the filename without its extension.
        col_types_key: reference rows of data types (one single-value row per column),
            in column order, against which the table is compared.

    Returns:
        Dict with column name as key and the mismatching (current, reference) data types as
        value, or None when the table's data types are identical to the reference.
    '''
    # splitext removes only the extension; rstrip('.csv') strips any trailing run of the
    # characters ".", "c", "s", "v" and would turn e.g. "stats.csv" into "stat".
    table = os.path.splitext(filename)[0]
    # One query ordered by column position keeps names and types paired and in a
    # well-defined order for the positional comparison below.
    query = f"""SELECT column_name, data_type FROM information_schema.columns
            WHERE table_name = '{table}'
            ORDER BY ordinal_position;"""
    columns = connection.execute(query).fetchall()
    current_types = [row[1] for row in columns]
    reference_types = [row[0] for row in col_types_key]
    if current_types == reference_types:
        return None
    col_to_fix = {}
    for (col_name,current),reference in zip(columns,reference_types):
        if current != reference:
            col_to_fix[col_name] = (current,reference)
    return col_to_fix

    
# Map every csv filename to its mismatched columns, as reported by find_cols_to_fix
# when comparing against the reference column types.
tables_to_fix = {file: find_cols_to_fix(file,col_types) for file in csv_list}

tables_to_fix

Some of the mismatched columns will not merge (numeric type vs. character type) while others are compatible. Those that are character and numeric are mismatched because the winner_seed and loser_seed columns include non-numeric designations.

In [None]:
def filter_cols_to_fix(tables_to_fix_dict):
    '''Keep only the mismatches that involve the 'text' data type.

    Input is the nested dictionary {filename: {column: (current type, reference type)}};
    a filename may also map to None or an empty dict. Output is a new nested dictionary
    holding only the columns whose type pair contains 'text', and only the filenames
    that still have at least one such column.'''
    kept = {}
    for table_file,mismatches in tables_to_fix_dict.items():
        if not mismatches:
            continue
        text_mismatches = {column: type_pair
                           for column,type_pair in mismatches.items()
                           if 'text' in type_pair}
        if text_mismatches:
            kept[table_file] = text_mismatches
    return kept


# Narrow the mismatch report down to the columns that involve the 'text' type.
tables_to_fix = filter_cols_to_fix(tables_to_fix_dict=tables_to_fix)
tables_to_fix

I want the "seed" columns to be numeric only; those that are character are including non-numeric values that I need to find and remove before converting the column to numeric type.

In [None]:
# Collect the distinct purely alphabetic entries stored in the mismatched columns.
text_values = set()
for file,cols_to_fix in tables_to_fix.items():
    # splitext removes only the extension; rstrip('.csv') strips trailing characters
    # from the set {., c, s, v} and can eat into the table name.
    table = os.path.splitext(file)[0]
    for col_name in cols_to_fix:

        query = f"""SELECT DISTINCT {col_name} FROM {table}
                ORDER BY {col_name} DESC;"""

        # Index the row instead of calling row.values(), which newer SQLAlchemy rows lack.
        for row in connection.execute(query):
            value = row[0]
            if value and value.isalpha():
                text_values.add(value)
# Sorted so the list (and the SQL built from it later) is the same on every run.
text_values = sorted(text_values)
text_values

Now, I need to move those non-numeric "seed" column values to the "entry" column, to match the majority of tables. I'll then replace those non-numeric values with null values then change the column data type to double precision.

In [None]:
def update_tables(tables_to_fix_dict,val_to_change):
    """Move non-numeric seed values to the entry columns, then make the seed columns numeric.

    Parameters:
        tables_to_fix_dict: nested dictionary {filename: {column: (current type, reference type)}},
            filtered to only include incompatible data types. Every column must be a key of
            the seed -> entry mapping below ('winner_seed' or 'loser_seed'); any other column
            raises KeyError.
        val_to_change: list of data entries that need to be changed.

    For every listed column this copies the listed values into the matching entry column,
    replaces them with NULL in the original column, and converts the column to double precision.
    """

    ref_dict = {'winner_seed':'winner_entry','loser_seed':'loser_entry'}
    # Render the SQL value list by hand. Interpolating tuple(val_to_change) produces
    # "('Q',)" for a single value and "()" for no values, both of which are invalid SQL.
    # Embedded single quotes are doubled so each value stays a valid SQL string literal.
    in_list = ', '.join("'" + str(val).replace("'","''") + "'" for val in val_to_change)
    for filename,cols_to_fix in tables_to_fix_dict.items():
        # splitext removes only the extension; rstrip('.csv') strips trailing characters
        # from the set {., c, s, v} and can eat into the table name.
        table = os.path.splitext(filename)[0]
        for col in cols_to_fix.keys():
            if in_list:
                # Keep the information first, then blank it out in the seed column.
                query = f"""UPDATE {table}
                        SET {ref_dict[col]} = {col}
                        WHERE {col} IN ({in_list});"""
                connection.execute(query)

                query = f"""UPDATE {table}
                        SET {col} = NULL
                        WHERE {col} IN ({in_list});"""
                connection.execute(query)

            query = f"""ALTER TABLE {table}
                    ALTER COLUMN {col} TYPE FLOAT8 USING {col}::double precision;"""
            connection.execute(query)



# Apply the seed/entry clean-up to every table that still has text-typed seed columns.
update_tables(tables_to_fix_dict=tables_to_fix,val_to_change=text_values)


Now, I want to merge the tables to one giant table.

In [None]:
# Stack the yearly main-tour tables (2014-2020) and the yearly qualifying/challenger
# tables into one table, atp_all_matches. The statement is assembled from the year
# range instead of being spelled out table by table.
years = range(2014,2021)
source_tables = [f'atp_matches_{year}' for year in years] + [f'atp_matches_qual_chall_{year}' for year in years]
selects = '\n        UNION\n        '.join(f'SELECT * from {table}' for table in source_tables)
query = f'CREATE TABLE atp_all_matches AS\n        ({selects}\n        ORDER BY tourney_date ASC);'

connection.execute(query)

### data selecting
Now I need functions to pick out just the relevant rows from the giant SQL table that I'd called atp_all_matches and move them into Python for further analysis. The relevant rows are those that include the Grand Slam draw and the matches those entrants played some time interval prior to the start of the Grand Slam. Note that the tourney_date is unchanging throughout a tournament - that is; it doesn't reflect the precise date a match occurred but rather the official start of the tourney. As such, there is an additional filter that grabs any of the qualifying matches that technically occur the days before the official start of a Grand Slam but are otherwise eliminated by the time interval.

In [None]:
def get_gs(start_year = 2015,end_year = 2020):
    """Return a DataFrame of the distinct Grand Slam ids, names and start dates found in
    atp_all_matches between January 1 of start_year and December 31 of end_year
    (minimum year = 2015), ordered by start date."""
    first_day = str(start_year)+'/01/01'
    last_day = str(end_year)+'/12/31'

    query = f"""SELECT DISTINCT tourney_id, tourney_name, tourney_date FROM atp_all_matches
            WHERE tourney_level = 'G' AND
            tourney_date BETWEEN '{first_day}'::date AND '{last_day}'::date
            ORDER BY tourney_date ASC;"""

    grand_slams = pd_sql.read_sql(query,connection)
    return grand_slams.reset_index(drop=True)

In [None]:
def get_prior_stats(gs_date,weeks=6):
    """Return the matches played by the entrants of one Grand Slam shortly before it started.

    Inputs are the Grand Slam tourney date and the number of weeks before that date from
    which matches are gathered. A temporary table `previous` is created with the matches
    whose tourney_date falls in that window (ending the day before the Grand Slam) plus the
    Grand Slam's own Q1-Q3 rounds; it is joined to the existing `gs_draw` table on player
    name, once as winner and once as loser. `previous` is not dropped here."""
    gs_day = gs_date.date()
    window_start = (gs_date - pd.to_timedelta(weeks,unit='W')).date()
    window_end = (gs_date - pd.to_timedelta(1,unit='D')).date()

    connection.execute(f"""CREATE TEMP TABLE previous AS 
    (SELECT * from atp_all_matches 
    WHERE tourney_date BETWEEN '{window_start}' AND '{window_end}'
    UNION
    SELECT * from atp_all_matches
    WHERE tourney_date = '{gs_day}' AND round in ('Q1','Q2','Q3'));""")

    # Same projection twice: first matching the entrant as the winner, then as the loser.
    player_matches = f"""SELECT gs.GS_player_name,gs.GS_player_id,gs.GS_player_entry,gs.GS_player_seed,
    gs.GS_player_hand,gs.GS_player_ht,gs.GS_player_ioc,gs.GS_player_age,
    gs.GS_player_rank,gs.GS_player_rank_points,gs.GS_tourney_id,gs.GS_tourney_name,gs.GS_surface,
    p.* FROM gs_draw gs
    LEFT JOIN previous p
    ON (gs.GS_player_name = p.winner_name)
    WHERE GS_tourney_date = '{gs_day}'
    UNION
    SELECT gs.GS_player_name,gs.GS_player_id,gs.GS_player_entry,gs.GS_player_seed,
    gs.GS_player_hand,gs.GS_player_ht,gs.GS_player_ioc,gs.GS_player_age,
    gs.GS_player_rank,gs.GS_player_rank_points,gs.GS_tourney_id,gs.GS_tourney_name,gs.GS_surface,
    p.* FROM gs_draw gs
    LEFT JOIN previous p
    ON (gs.GS_player_name = p.loser_name)
    WHERE GS_tourney_date = '{gs_day}'
    ORDER BY GS_tourney_id;
    """

    return pd_sql.read_sql(player_matches,connection)

In [None]:
def get_gs_results(start_year = 2015,end_year = 2020,weeks = 6):
    """Get the draws of all Grand Slams of the desired year(s) plus the entrants' warm-up matches.

    Parameters:
        start_year, end_year: first and last calendar year of Grand Slams to include.
        weeks: length of the warm-up window before each Grand Slam (default=6).

    Returns:
        (stats, gs_draws): `stats` holds, for every player in a Grand Slam first round (R128),
        the matches found by get_prior_stats for that Grand Slam; `gs_draws` holds one row per
        first-round player with a True/False `qf` column for later prediction.

    Side effects: works through the temp tables gs_draw, winners and previous, and closes the
    module-level connection and disposes the engine before returning (also when an error
    occurs), so a fresh connection is needed afterwards.

    NOTE(review): `qf` is set for players who WON a match in round 'R32'. The column name
    suggests "reached the quarterfinal" - confirm that 'R32' is the intended round.
    """
    try:
        query = f"""CREATE TEMP TABLE gs_draw AS
            (SELECT tourney_id AS GS_tourney_id,tourney_name AS GS_tourney_name,surface AS GS_surface,tourney_date AS GS_tourney_date,winner_id AS GS_player_id,
            winner_seed AS GS_player_seed,winner_entry AS GS_player_entry,winner_name AS GS_player_name,winner_hand AS GS_player_hand,winner_ht AS GS_player_ht,
            winner_ioc AS GS_player_ioc,winner_age AS GS_player_age,winner_rank AS GS_player_rank,winner_rank_points AS GS_player_rank_points
            FROM atp_all_matches
            WHERE tourney_level = 'G' AND round = 'R128' AND tourney_date BETWEEN '{str(start_year)+"/01/01"}'::date and '{str(end_year)+"/12/31"}'::date)
            UNION
            (SELECT tourney_id AS GS_tourney_id,tourney_name AS GS_tourney_name,surface AS GS_surface,tourney_date AS GS_tourney_date,loser_id AS GS_player_id,
            loser_seed AS GS_player_seed,loser_entry AS GS_player_entry,loser_name AS GS_player_name,loser_hand AS GS_player_hand,loser_ht AS GS_player_ht,
            loser_ioc AS GS_player_ioc,loser_age AS GS_player_age,loser_rank AS GS_player_rank,loser_rank_points AS GS_player_rank_points
            FROM atp_all_matches
            WHERE tourney_level = 'G' AND round = 'R128' AND tourney_date BETWEEN '{str(start_year)+"/01/01"}'::date and '{str(end_year)+"/12/31"}'::date)
            ORDER BY GS_tourney_date ASC"""

        connection.execute(query)

        query = """ALTER TABLE gs_draw
            ADD COLUMN qf BOOL DEFAULT False;"""
        connection.execute(query)

        # Players of the draw who also appear as the winner of an R32 match with the same tourney date.
        query = """CREATE TEMP TABLE winners AS
            SELECT GS_player_name,GS_tourney_date FROM gs_draw
            INTERSECT
            SELECT winner_name,tourney_date FROM atp_all_matches
            WHERE round = 'R32';"""
        connection.execute(query)

        query = """UPDATE gs_draw
            SET qf = True
            FROM winners
            WHERE gs_draw.GS_player_name = winners.GS_player_name 
            AND gs_draw.GS_tourney_date = winners.GS_tourney_date;"""
        connection.execute(query)

        query = "DROP TABLE winners;"
        connection.execute(query)

        query = "SELECT * FROM gs_draw ORDER BY GS_tourney_date ASC;"
        gs_draws = pd_sql.read_sql(query,connection)

        query = "SELECT DISTINCT GS_tourney_date FROM gs_draw ORDER BY GS_tourney_date ASC;"
        gs_dates = pd_sql.read_sql(query,connection)

        # Collect the per-tournament frames and concatenate once: DataFrame.append was
        # removed in pandas 2.0 and re-copies the accumulated frame on every iteration.
        frames = []
        for date in gs_dates.gs_tourney_date:
            frames.append(get_prior_stats(date,weeks=weeks))
            # get_prior_stats leaves its temp table behind; drop it before the next date.
            query = "DROP TABLE previous;"
            connection.execute(query)

        query = "DROP TABLE gs_draw"
        connection.execute(query)
    finally:
        # Always release the connection, so temp tables left over from a failed run
        # do not stay alive on an open session.
        connection.close()
        engine.dispose()

    stats = pd.concat(frames) if frames else pd.DataFrame()
    stats = stats.reset_index(drop=True)
    return stats,gs_draws

In [None]:
# Grand Slam draws for 2015-2020 and the matches from the six weeks before each one.
stats,gs_draws = get_gs_results(start_year=2015,end_year=2020,weeks=6)

To delete the temporary table gs_draw, you have to close the connection and begin anew, which occurs in the last few lines of get_gs_results(). To rerun the previous cell, you have to open a fresh connection first.

In [None]:
# get_gs_results() closes the shared connection and disposes the engine before it
# returns, so a new connection is opened here for any further queries.
connection = engine.connect()

### analysis + feature engineering
Now I have draws which are the 128 players in the main draw (past qualifying rounds) of each Grand Slam. I also have info regarding the matches those players played from the 6 weeks leading up to the Grand Slam. I need to connect that match-specific data to the Grand Slam draws.

Things I want to account for include:
* number of matches: calculate number of matches played in prior weeks' interval

* number of wins: wins in prior weeks' interval

* surface familiarity: of those warmup matches, how many were on the same surface as the Grand Slam

* home court advantage: true/false if Grand Slam is in a player's home country

In [None]:
# Flag, for every warm-up match row, whether it was played on the Grand Slam's surface
# and whether the Grand Slam entrant was the winner.
stats['same_surface'] = stats.gs_surface == stats.surface
stats['wins'] = stats.gs_player_name == stats.winner_name

# Totals per (Grand Slam, player). count() only counts rows with a tourney_name, so the
# rows the LEFT JOIN yields for entrants without a warm-up match count as 0 matches.
player_keys = ['gs_tourney_id','gs_player_name']
warmups = stats.groupby(by=player_keys)
same_surface = warmups['same_surface'].sum()
num_matches = warmups['tourney_name'].count()
num_wins = warmups['wins'].sum()

# Attach the totals with a single merge on the keys. The previous row-by-row loop used
# .iloc with index labels and positional series[0] lookups, which only works on a
# default RangeIndex and is deprecated in recent pandas.
total_cols = ['num_matches','same_surface','num_wins']
warmup_totals = pd.DataFrame({'num_matches': num_matches,
                              'same_surface': same_surface,
                              'num_wins': num_wins}).reset_index()
# Dropping first keeps the cell re-runnable (no _x/_y duplicate columns on a second run).
gs_draws = gs_draws.drop(columns=total_cols,errors='ignore').merge(warmup_totals,on=player_keys,how='left')
gs_draws[total_cols] = gs_draws[total_cols].fillna(0).astype(int)

# True when the Grand Slam is held in the player's own country (IOC code).
gs_draws['home_country'] = gs_draws.gs_tourney_name.replace(to_replace={'Roland Garros':'FRA','Australian Open':'AUS','Wimbledon':'GBR','US Open':'USA'}) == gs_draws.gs_player_ioc

I also want to total match-specific stats in those warmup matches. I have to reorganize the columns for each match to be centered around the player in the Grand Slam draw - instead of winner and loser stats, it must be as (Grand Slam) player and opponent stats.

In [None]:
# Re-express every match from the point of view of the Grand Slam entrant: the entrant's
# own numbers become ace/df/.../player_rank, the other player's ranking becomes opponent_*.
serve_stats = ['ace','df','svpt','1stIn','1stWon','2ndWon','SvGms','bpSaved','bpFaced']

# Column renames to apply when the entrant won the match ...
as_winner = {f'w_{name}': name for name in serve_stats}
as_winner.update({'winner_rank':'player_rank','winner_rank_points':'player_rank_points',
                  'loser_rank':'opponent_rank','loser_rank_points':'opponent_rank_points'})
# ... and when the entrant lost it.
as_loser = {f'l_{name}': name for name in serve_stats}
as_loser.update({'winner_rank':'opponent_rank','winner_rank_points':'opponent_rank_points',
                 'loser_rank':'player_rank','loser_rank_points':'player_rank_points'})

# The join creates the new columns (filled for wins only); update() then fills in the losses.
won = stats.gs_player_name == stats.winner_name
stats = stats.join(stats.loc[won,list(as_winner)].rename(columns=as_winner))

lost = stats.gs_player_name == stats.loser_name
stats.update(stats.loc[lost,list(as_loser)].rename(columns=as_loser))

# The winner/loser-oriented columns are no longer needed.
player_fields = ['id','seed','entry','name','hand','ht','ioc','age']
obsolete = ([f'winner_{field}' for field in player_fields]
            + [f'loser_{field}' for field in player_fields]
            + ['score','best_of','round','minutes']
            + [f'w_{name}' for name in serve_stats]
            + [f'l_{name}' for name in serve_stats]
            + ['winner_rank','winner_rank_points','loser_rank','loser_rank_points'])
stats = stats.drop(columns=obsolete)

I'm also turning the tourney levels of these warmup matches into dummy variables and joining those to the stats DataFrame.

In [None]:
# One indicator column per tourney level (tourney_level_<code>) replaces the original column.
level_dummies = pd.get_dummies(stats.tourney_level,prefix='tourney_level')
stats = stats.join(level_dummies).drop(columns=['tourney_level'])


In [None]:
# Sum, mean and count of every warm-up statistic per (Grand Slam, player).
# The aggregations are passed by name: handing np.sum / np.mean to agg() is deprecated in
# recent pandas, while the strings give the same 'sum' / 'mean' column labels.
match_stats = stats.groupby(by=['gs_tourney_id','gs_player_name'])[['ace','df','svpt','1stIn','1stWon','2ndWon','SvGms',
        'bpSaved','bpFaced','player_rank','player_rank_points','opponent_rank','opponent_rank_points','tourney_level_A',
        'tourney_level_C','tourney_level_G','tourney_level_M']].agg(['sum','mean','count'])

As most of the stats are based on serve, I decided to normalize those by the total number of points on serve (svpt). I also normalized the number of service points by the number of service games (SvGms) played.

In [None]:
match_stats_labels = ['ace_per_svpt','df_per_svpt','svpt_per_SvGms','1stIn_per_svpt','1stWon_per_svpt',
'2ndWon_per_svpt','bpSaved_per_svpt','bpFaced_per_svpt','mean_player_rank','mean_player_rank_points','mean_opponent_rank',
'mean_opponent_rank_points','count_A','count_C','count_G','count_M']

# Build each feature as a Series indexed by (gs_tourney_id, gs_player_name), then attach
# all of them with one merge. The previous row-by-row loops used .iloc with index labels
# and wrote floats into integer columns, which recent pandas warns about or rejects.
features = {}
for label in match_stats_labels:
    if label.startswith('mean'):
        # mean_<stat>: average of the stat over the warm-up matches
        features[label] = match_stats[label.replace('mean_','')]['mean']
    elif label.endswith('per_svpt'):
        # <stat>_per_svpt: total of the stat divided by the total service points
        features[label] = match_stats[label.replace('_per_svpt','')]['sum']/match_stats['svpt']['sum']
    elif label.endswith('per_SvGms'):
        # <stat>_per_SvGms: total of the stat divided by the total service games
        features[label] = match_stats[label.replace('_per_SvGms','')]['sum']/match_stats['SvGms']['sum']
    elif label.startswith('count'):
        # count_<level>: number of warm-up matches at that tourney level
        features[label] = match_stats[label.replace('count_','tourney_level_')]['sum']
    else:
        # Labels that match no rule stay at 0.
        features[label] = 0

player_features = pd.DataFrame(features,index=match_stats.index).reset_index()
# Dropping first keeps the cell re-runnable (no _x/_y duplicate columns on a second run).
gs_draws = gs_draws.drop(columns=match_stats_labels,errors='ignore').merge(
    player_features,on=['gs_tourney_id','gs_player_name'],how='left')

The normalizing and averaging can lead to NaN values, e.g., if svpt == 0. I need to find the columns with NaN values and decide how best to fill them.

In [None]:
# Show the columns that contain at least one NaN.
has_nan = gs_draws.isna().any().to_list()
print(gs_draws.columns[has_nan])

For the match-specific stats, I could get rid of the NaNs by increasing the past weeks' data interval but the idea in using a limited window was to reflect any players seemingly on a hot streak. As such, I'll just fill those with zeros - no matches played.

In [None]:
# Missing serve ratios are filled with 0 for each of the normalised columns.
values = dict.fromkeys(['ace_per_svpt','df_per_svpt','svpt_per_SvGms','1stIn_per_svpt',
                        '1stWon_per_svpt','2ndWon_per_svpt','bpSaved_per_svpt','bpFaced_per_svpt'],0)
gs_draws = gs_draws.fillna(value=values)

The GS seeds will have many NaN values since seeds are only the top 32 in the draw. Sometimes players will be numbered above 32 (e.g., 33) if someone in the original 32 seeds withdraws late but before the start of play. Those seedings are based on the overall ATP tour ranking, which itself is based on the players' running points total, though the precise methodology is not the same for each Grand Slam. I could have filled those NaNs with some nominal constant, but instead I'm sorting by the ATP rankings and points totals, then using that to seed the rest of the draw.

In [None]:
# Give every unseeded player of a draw the next free seed numbers, in order of ATP rank
# (best first) and, for equal ranks, rank points (highest first).
unique_tourney_ids = gs_draws.gs_tourney_id.unique()
for tourney_id in unique_tourney_ids:
    draw = gs_draws[gs_draws.gs_tourney_id == tourney_id]
    top_seed = draw.gs_player_seed.max()
    # A draw without any seed has a NaN maximum; start from 1 instead of propagating NaN.
    next_seed = 1 if pd.isna(top_seed) else top_seed+1
    ranked = draw.sort_values(by=['gs_player_rank','gs_player_rank_points'],ascending=[True,False])
    # Index labels of the unseeded players, still in ranking order. Labels are used with
    # .loc; the previous .iloc lookup treated them as positions.
    unseeded = ranked.index[ranked.gs_player_seed.isna()]
    gs_draws.loc[unseeded,'gs_player_seed'] = np.arange(len(unseeded))+next_seed

The GS player entry is a mostly empty column because it only labels qualifiers, wild cards, and lucky losers, all of which make up a small fraction of the main draw of 128. I've turned these into dummy columns also.

In [None]:
# One indicator column per entry type replaces the gs_player_entry column.
entry_dummies = pd.get_dummies(gs_draws.gs_player_entry)
gs_draws = gs_draws.join(entry_dummies).drop(columns=['gs_player_entry'])

There are a small number of entrants without any points and thus no ranking. This probably means a player has not been playing ATP-level matches for a year due to injury (the rolling points total is calculated over a year) but qualified for the main draw due to a protected ranking. So, rank points NaN values can be filled with zeros, and then I'm filling the ranking NaN values with the maximum ranking in the draw + 1.

In [None]:
# Entrants without rank points get 0 points.
gs_draws['gs_player_rank_points'] = gs_draws['gs_player_rank_points'].fillna(value=0)

In [None]:
# Unranked entrants get the worst ranking found in their own draw, plus one.
unique_tourney_ids = gs_draws.gs_tourney_id.unique()
for tourney_id in unique_tourney_ids:
    in_draw = gs_draws.gs_tourney_id == tourney_id
    # One combined mask instead of chaining two boolean selections: the second mask of the
    # chained form no longer matched the filtered frame and had to be reindexed by pandas.
    unranked = in_draw & gs_draws.gs_player_rank.isna()
    if unranked.any():
        max_ranking = gs_draws.loc[in_draw,'gs_player_rank'].max()+1
        gs_draws.loc[unranked,'gs_player_rank'] = max_ranking


I'm dropping the mean_player_rank and mean_player_rank_points columns since I'm already incorporating both rank and points with the gs_player_rank and gs_player_rank_points columns. I'm also dropping the mean_opponent_rank column since I don't have a good way to fill those NaNs. I'm keeping the mean_opponent_rank_points and filling those NaN values with zeros - no prior opponents means no opponent points.

In [None]:
# Remove the averaged ranking columns that are not used as features.
gs_draws = gs_draws.drop(columns=['mean_player_rank','mean_player_rank_points','mean_opponent_rank'])

# No prior opponents means no opponent points. Assign the filled column back:
# fillna(inplace=True) on a column selected from the frame acts on a temporary object
# and does not update gs_draws when pandas Copy-on-Write is active.
gs_draws['mean_opponent_rank_points'] = gs_draws['mean_opponent_rank_points'].fillna(0)

The player_hand column lists R or L if the player is right- or left-handed. There are a handful of players listed as U, which I'm guessing means unknown at the time. I'm just going to replace the U with R - it's far more common for players to be right-handed. I'm then going to convert that column into a dummy variable.

In [None]:
# How often each playing-hand code occurs in the draws.
gs_draws['gs_player_hand'].value_counts()

In [None]:
# Treat the 'U' hand code as right-handed, then encode the hand as indicator columns
# (first category dropped) in place of the original column.
gs_draws['gs_player_hand'] = gs_draws['gs_player_hand'].replace('U','R')
hand_dummies = pd.get_dummies(gs_draws['gs_player_hand'],drop_first=True)
gs_draws = gs_draws.join(hand_dummies).drop(columns='gs_player_hand')

For the player height, I want to fill the NaN values with the average player height in the draw. If I were filling the entirety with a single value, I'd be concerned with first splitting into train, validation, and test sets so as to avoid any data leakage. However, I'm splitting along tournaments and filling any NaNs with the average player height in that tournament, so I can fill those now.

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Replace missing heights with the mean height of the same tournament's draw.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

for tourney_id in gs_draws.gs_tourney_id.unique():
    idx = gs_draws.index[gs_draws.gs_tourney_id == tourney_id]
    # The imputer expects a 2-D array: one column holding this draw's heights.
    heights = gs_draws.loc[idx,'gs_player_ht'].to_numpy().reshape(-1,1)
    gs_draws.loc[idx,'gs_player_ht'] = imp.fit_transform(heights)


### fitting estimators

In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# %matplotlib inline

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,recall_score,f1_score,precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,BaggingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

from boruta import BorutaPy


In [None]:
def get_user_split_data(df,user_column,y_column,test_size=.25,seed=87):
    """Split a dataset into train and test portions along a grouping label, so that every
    group (a user name or, in this case, a tournament) ends up entirely in one portion.

    Inputs are a dataframe df, the grouping column user_column and the result column
    y_column as strings, the fraction of groups to hold out (test_size) and the random seed.
    The number of held-out groups is the number of unique groups times test_size, rounded down.
    Outputs are X_train, X_test (df without y_column) and y_train, y_test."""
    rng = np.random.RandomState(seed)

    groups = df[user_column].unique()
    n_test_groups = int(groups.shape[0] * test_size)
    held_out = rng.choice(groups,size=n_test_groups,replace=False)

    in_test = df[user_column].isin(held_out)
    train_part, test_part = df[~in_test], df[in_test]

    return (train_part.drop([y_column],axis=1),
            test_part.drop([y_column],axis=1),
            train_part[y_column],
            test_part[y_column])


def plot_features(df,y_col,sample_size=300):
    """Draw a pair plot of a random sample of df, coloring the points by the class in y_col.

    sample_size rows are sampled with a fixed random state. Requires seaborn to be
    imported as `sns` (that import is commented out in this notebook)."""
    sns.pairplot(df.sample(sample_size, random_state=87),
                 hue=y_col,
                 plot_kws={'alpha': .3, 'edgecolor': 'none'})

    
def make_confusion_matrix(model,x_data,y_data,threshold=0.5):
    """Plot a confusion-matrix heatmap for ``model`` at a chosen probability threshold.

    A row is predicted as class 1 when its class-1 probability from
    ``model.predict_proba(x_data)`` is at least ``threshold``. The predictions
    are compared against ``y_data`` and the matrix is drawn with actual classes
    on the y axis and predicted classes on the x axis.
    """
    positive_proba = model.predict_proba(x_data)[:, 1]
    predicted = positive_proba >= threshold
    counts = confusion_matrix(y_data, predicted)

    plt.figure(dpi=50,figsize=[6.4,4.8])
    sns.set(font_scale=1.2)
    sns.heatmap(
        counts,
        cmap=plt.cm.YlGnBu,
        annot=True,
        square=False,
        fmt='d',
        xticklabels=['0', '1'],
        yticklabels=['0', '1'],
    )
    plt.xlabel('prediction')
    plt.ylabel('actual')

I'm now dropping the columns that I don't want to use for the analysis, keeping the names in case I want to look back at them later. I also want to remove the results from the 2020 Australian Open, which was ongoing at the time when I originally wrote this code. I'll use those results as a second "test" set as the tourney had ended by the time I'd finished this.

In [None]:
# Keep the player names for later look-ups before the column is dropped.
gs_draws_names = gs_draws['gs_player_name']
gs_draws_names_2020 = gs_draws.loc[gs_draws['gs_tourney_id'] == '2020-580','gs_player_name']

# Remove the columns that are not used as model features.
gs_draws = gs_draws.drop(
    [
        'gs_tourney_name',
        'gs_surface',
        'gs_tourney_date',
        'gs_player_id',
        'gs_player_name',
        'gs_player_ioc',
    ],
    axis=1,
)

# Tournament '2020-580' is set aside as a second test set; everything else is used for modelling.
gs_draws_2020 = gs_draws.loc[gs_draws['gs_tourney_id'] == '2020-580']
gs_draws = gs_draws.loc[gs_draws['gs_tourney_id'] != '2020-580']

In [None]:
# Shared pipeline steps and CV splitter reused by every grid-search cell below.
ros = RandomOverSampler(random_state=87)  # 'sampling' step of each pipeline; seeded for repeatability
scaler = StandardScaler()  # 'scaling' step of each pipeline
group_kfold = GroupKFold(n_splits=5)  # 5 folds; the grid-search cells pass gs_tourney_id as the groups


Using the plot_features function, I can try to see which features might be most important in distinguishing the two classes. It takes a while to run, and it is difficult to read all the plots at once, so it is better to plot a subset of the features. To run this, you must first uncomment the seaborn and matplotlib imports.

In [None]:
# Pair-plot a hand-picked subset of features against the 'qf' class column.
# Uncomment other column names below to swap features in; 'qf' must stay in the selection.
plot_features(gs_draws[[#'gs_player_seed','gs_player_ht','gs_player_age',
#                        'gs_player_rank','gs_player_rank_points',#'num_matches','same_surface','num_wins',
#                        'home_country','ace_per_svpt',
                        'df_per_svpt','svpt_per_SvGms','1stIn_per_svpt','1stWon_per_svpt',
#                        '2ndWon_per_svpt','bpSaved_per_svpt','bpFaced_per_svpt','mean_opponent_rank_points',
#                        'count_A','count_C','count_G','count_M','LL','Q','WC','R',
                        'qf']],'qf')

#### full feature set
I wanted to use GridSearchCV to determine the best fitting parameters for each of the estimators tested. This can be time- and CPU-intensive so I sometimes specified the n_jobs parameter to run multiple processors in parallel. This in itself became an issue; there seems to be a bug when having loaded matplotlib and employing multiprocessing (n_jobs != 1) in Python 3.7 and MacOS >= Catalina 10.15 (I'm on Big Sur 11.0.1). See [here](https://github.com/matplotlib/matplotlib/issues/15410) for more info. My current workaround was to not import matplotlib until absolutely needed, otherwise I can't use multiprocessing without restarting the kernel.

For GridSearch I wasn't sure which scoring metric I wanted to optimize, so I set it to optimize recall, precision, and f1 and would decide afterwards how best to choose.


In [None]:

# k-nearest neighbours on the full feature set: grid-search the number of neighbours and the
# weighting scheme, keeping the best pipeline for each scoring metric in knn_.
knn_gs = KNeighborsClassifier()
param_grid = {
    'classification__n_neighbors':range(1,16),
    'classification__weights':['uniform','distance'],
}

model = Pipeline([('scaling',scaler),
        ('sampling', ros),
        ('classification', knn_gs)
    ])

knn_ = {}

# The split is seeded, so it is identical for every metric: build it (and the folds) once.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
# split() returns a one-shot generator; a list lets the same folds be reused for each search.
gkf = list(group_kfold.split(X=X_train,y=y_train,groups=X_train.gs_tourney_id))
# The tournament id is only needed for grouping, not as a feature.
X_train = X_train.drop(columns=['gs_tourney_id'])

for scoring_metrics in ['recall','precision','f1']:
    clf = GridSearchCV(model,param_grid,scoring=scoring_metrics,cv=gkf)
    clf.fit(np.array(X_train),np.array(y_train))
    knn_[scoring_metrics] = clf.best_estimator_

In [None]:
# Logistic regression on the full feature set: grid-search the regularisation strength C,
# keeping the best pipeline for each scoring metric in logit_.
cs = [10**-3,10**-2,10**-1,10**0,10**1,10**2,10**3]
logit_gs = LogisticRegression(max_iter=1000,solver='liblinear',random_state=87)
param_grid = {'classification__C':cs}

model = Pipeline([('scaling',scaler),
        ('sampling', ros),
        ('classification', logit_gs)
    ])

logit_ = {}

# The split is seeded, so it is identical for every metric: build it (and the folds) once.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
# split() returns a one-shot generator; a list lets the same folds be reused for each search.
gkf = list(group_kfold.split(X=X_train,y=y_train,groups=X_train.gs_tourney_id))
# The tournament id is only needed for grouping, not as a feature.
X_train = X_train.drop(columns=['gs_tourney_id'])

for scoring_metrics in ['recall','precision','f1']:
    clf = GridSearchCV(model,param_grid,scoring=scoring_metrics,cv=gkf)
    clf.fit(np.array(X_train),np.array(y_train))
    logit_[scoring_metrics] = clf.best_estimator_

In [None]:
# Decision tree on the full feature set: grid-search depth, features per split and split
# criterion, keeping the best pipeline for each scoring metric in tree_.
tree_gs = DecisionTreeClassifier(random_state=87)
# 'sqrt' is what 'auto' meant for classifiers; 'auto' is rejected by newer scikit-learn.
param_grid = {
    'classification__max_depth':range(1,13),
    'classification__max_features':['sqrt','log2',None],
    'classification__criterion':['gini','entropy'],
}

model = Pipeline([('scaling',scaler),
        ('sampling', ros),
        ('classification', tree_gs)
    ])

tree_ = {}

# The split is seeded, so it is identical for every metric: build it (and the folds) once.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
# split() returns a one-shot generator; a list lets the same folds be reused for each search.
gkf = list(group_kfold.split(X=X_train,y=y_train,groups=X_train.gs_tourney_id))
# The tournament id is only needed for grouping, not as a feature.
X_train = X_train.drop(columns=['gs_tourney_id'])

for scoring_metrics in ['recall','precision','f1']:
    clf = GridSearchCV(model,param_grid,scoring=scoring_metrics,cv=gkf,n_jobs=-2)
    clf.fit(np.array(X_train),np.array(y_train))
    tree_[scoring_metrics] = clf.best_estimator_

In [None]:
# Random forest on the full feature set: grid-search depth, features per split and forest
# size, keeping the best pipeline for each scoring metric in forest_.
forest_gs = RandomForestClassifier(random_state=87)
# 'sqrt' is what 'auto' meant for classifiers; 'auto' is rejected by newer scikit-learn.
param_grid = {
    'classification__max_depth':range(1,13),
    'classification__max_features':['sqrt','log2',None],
    'classification__n_estimators':[10,50,100,200],
}

model = Pipeline([('scaling',scaler),
        ('sampling', ros),
        ('classification', forest_gs)
    ])

forest_ = {}

# The split is seeded, so it is identical for every metric: build it (and the folds) once.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
# split() returns a one-shot generator; a list lets the same folds be reused for each search.
gkf = list(group_kfold.split(X=X_train,y=y_train,groups=X_train.gs_tourney_id))
# The tournament id is only needed for grouping, not as a feature.
X_train = X_train.drop(columns=['gs_tourney_id'])

for scoring_metrics in ['recall','precision','f1']:
    clf = GridSearchCV(model,param_grid,scoring=scoring_metrics,cv=gkf,n_jobs=-2)
    clf.fit(np.array(X_train),np.array(y_train))
    forest_[scoring_metrics] = clf.best_estimator_

In [None]:
# Support vector classifier on the full feature set: grid-search the kernel and C, keeping
# the best pipeline for each scoring metric in svm_.
cs = [10**-3,10**-2,10**-1,10**0,10**1,10**2,10**3]
# probability=True so the fitted model exposes predict_proba for the later ranking step.
svm_gs = SVC(random_state=87,gamma='auto',probability=True)
param_grid = {
    'classification__kernel':['linear','rbf','sigmoid'],
    'classification__C':cs,
}

model = Pipeline([('scaling',scaler),
        ('sampling', ros),
        ('classification', svm_gs)
    ])

svm_ = {}

# The split is seeded, so it is identical for every metric: build it (and the folds) once.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
# split() returns a one-shot generator; a list lets the same folds be reused for each search.
gkf = list(group_kfold.split(X=X_train,y=y_train,groups=X_train.gs_tourney_id))
# The tournament id is only needed for grouping, not as a feature.
X_train = X_train.drop(columns=['gs_tourney_id'])

for scoring_metrics in ['recall','precision','f1']:
    clf = GridSearchCV(model,param_grid,scoring=scoring_metrics,cv=gkf,n_jobs=-2)
    clf.fit(np.array(X_train),np.array(y_train))
    svm_[scoring_metrics] = clf.best_estimator_

In [None]:
# Polynomial-kernel SVC on the full feature set: grid-search the polynomial degree and C,
# keeping the best pipeline for each scoring metric in poly_.
cs = [10**-3,10**-2,10**-1,10**0,10**1,10**2,10**3]
# probability=True so the fitted model exposes predict_proba for the later ranking step.
poly_gs = SVC(random_state=87,kernel='poly',gamma='auto',probability=True)
param_grid = {
    'classification__degree':[2,3,4],
    'classification__C':cs,
}

model = Pipeline([('scaling',scaler),
        ('sampling', ros),
        ('classification', poly_gs)
    ])

poly_ = {}

# The split is seeded, so it is identical for every metric: build it (and the folds) once.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
# split() returns a one-shot generator; a list lets the same folds be reused for each search.
gkf = list(group_kfold.split(X=X_train,y=y_train,groups=X_train.gs_tourney_id))
# The tournament id is only needed for grouping, not as a feature.
X_train = X_train.drop(columns=['gs_tourney_id'])

for scoring_metrics in ['recall','precision','f1']:
    clf = GridSearchCV(model,param_grid,scoring=scoring_metrics,cv=gkf,n_jobs=-2)
    clf.fit(np.array(X_train),np.array(y_train))
    poly_[scoring_metrics] = clf.best_estimator_

In [None]:
# Extra-trees on the full feature set: grid-search depth, features per split and ensemble
# size, keeping the best pipeline for each scoring metric in xt_.
xt_gs = ExtraTreesClassifier(random_state=87,bootstrap=True)
# 'sqrt' is what 'auto' meant for classifiers; 'auto' is rejected by newer scikit-learn.
param_grid = {
    'classification__max_depth':range(1,13),
    'classification__max_features':['sqrt','log2',None],
    'classification__n_estimators':[10,50,100,200],
}

model = Pipeline([('scaling',scaler),
        ('sampling', ros),
        ('classification', xt_gs)
    ])

xt_ = {}

# The split is seeded, so it is identical for every metric: build it (and the folds) once.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
# split() returns a one-shot generator; a list lets the same folds be reused for each search.
gkf = list(group_kfold.split(X=X_train,y=y_train,groups=X_train.gs_tourney_id))
# The tournament id is only needed for grouping, not as a feature.
X_train = X_train.drop(columns=['gs_tourney_id'])

for scoring_metrics in ['recall','precision','f1']:
    clf = GridSearchCV(model,param_grid,scoring=scoring_metrics,cv=gkf,n_jobs=-2)
    clf.fit(np.array(X_train),np.array(y_train))
    xt_[scoring_metrics] = clf.best_estimator_

In [None]:
# XGBoost on the full feature set: grid-search depth, learning rate and number of boosting
# rounds, keeping the best pipeline for each scoring metric in xg_.
# (No `probability` argument: that is an SVC option, not an XGBClassifier parameter.)
xg_gs = XGBClassifier(random_state=87)
param_grid = {
    'classification__max_depth':range(1,13),
    'classification__learning_rate':[.1,.5,1],
    'classification__n_estimators':[50,100,500,1000],
}

model = Pipeline([('scaling',scaler),
        ('sampling', ros),
        ('classification', xg_gs)
    ])

xg_ = {}

# The split is seeded, so it is identical for every metric: build it (and the folds) once.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
# split() returns a one-shot generator; a list lets the same folds be reused for each search.
gkf = list(group_kfold.split(X=X_train,y=y_train,groups=X_train.gs_tourney_id))
# The tournament id is only needed for grouping, not as a feature.
X_train = X_train.drop(columns=['gs_tourney_id'])

for scoring_metrics in ['recall','precision','f1']:
    clf = GridSearchCV(model,param_grid,scoring=scoring_metrics,cv=gkf,n_jobs=-2)
    clf.fit(np.array(X_train),np.array(y_train))
    xg_[scoring_metrics] = clf.best_estimator_

Since only 16 of the 128 players will make the round of 16 (R16), it doesn't make sense to set a probability threshold above which the estimator predicts True and below which it predicts False. Instead, I'm going to use the estimator to calculate the probability of each player qualifying for R16 and take the top 16 as the predicted qualifiers. In theory, this means the number of false positives would equal the number of false negatives. However, there could be multiple players with the same probability, especially for a tree-based estimator, so some estimators may predict more than 16 players qualifying for R16, in which case precision, recall, and f1 wouldn't be equal. Rather than collate precision, recall, and f1, I'm just calculating precision and recall and then taking the minimum of the two. That way, if I am predicting more than 16 players making the R16, I am not artificially inflating the scoring metric.

In [None]:
# Held-out evaluation (full feature set): for every tuned pipeline, rank the players of each
# test tournament by predicted probability, mark the top 16 (plus any ties at the cut-off) as
# predicted qualifiers, and record min(precision, recall) over all test tournaments.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
X_test = X_test.drop(columns=['gs_tourney_id'])

fittings = {'knn':knn_,'logit':logit_,'tree':tree_,'forest':forest_,'svm':svm_,'poly':poly_,'xt':xt_,'xg':xg_}
# X_test.index holds gs_draws row *labels*, so the rows are looked up with .loc; .iloc would
# read the labels as positions and return the wrong rows as soon as the index has gaps.
unique_tourney_ids = gs_draws.loc[X_test.index].gs_tourney_id.unique()

def sc(n):
    """Return the results-table column name for scoring metric ``n``."""
    return n+'_as_scorer'

metrics = ['precision','recall','f1']
# Rows: estimators; columns: the metric GridSearchCV optimised. Filled in by the loop below.
results_ = pd.DataFrame(index=list(fittings),columns=list(map(sc,metrics)),dtype=float)

for scorer in results_.columns:

    for label in results_.index:
        # 'recall_as_scorer' -> 'recall', the key the tuned pipeline was stored under.
        fitting = fittings[label][scorer.split('_')[0]]

        # Test rows with their predicted class-1 probability, best first within each tournament.
        # predict_proba gets a plain array, the same form the pipelines were fitted on.
        Xy_test_with_predictions = (
            gs_draws.loc[X_test.index]
            .reset_index()
            .assign(predict_proba=fitting.predict_proba(np.array(X_test))[:,1])
            .sort_values(by=['gs_tourney_id','predict_proba'],ascending=[True,False])
            .loc[:,['index','gs_tourney_id','gs_player_seed','qf','predict_proba']]
            .reset_index(drop=True)
        )

        # Per tournament, the 16th-highest probability is the cut-off. Everyone at or above it
        # is a predicted qualifier, so ties at the cut-off can push the count past 16. A
        # tournament with fewer than 16 rows simply has all of its rows selected.
        cutoffs = Xy_test_with_predictions.groupby('gs_tourney_id')['predict_proba'].agg(lambda p: p.nlargest(16).min())
        Xy_test_with_predictions.insert(
            3,
            'predict',
            Xy_test_with_predictions['predict_proba'] >= Xy_test_with_predictions['gs_tourney_id'].map(cutoffs),
        )

        results_.loc[label,scorer] = min(
            precision_score(Xy_test_with_predictions['qf'],Xy_test_with_predictions['predict']),
            recall_score(Xy_test_with_predictions['qf'],Xy_test_with_predictions['predict']),
        )
        

In [None]:
results_

The most successful estimator is XGBoost when using recall score as the metric against which GridSearch optimizes. How does this compare to the results one would get when predicting using just the built-in seeds?

In [None]:
# Baseline: predict that the 16 best-seeded players of each test tournament qualify, and score
# it the same way as the estimators (min of precision and recall).
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
X_test = X_test.drop(columns=['gs_tourney_id'])

# X_test.index holds gs_draws row *labels*, so the rows are looked up with .loc; .iloc would
# read the labels as positions and return the wrong rows as soon as the index has gaps.
unique_tourney_ids = gs_draws.loc[X_test.index].gs_tourney_id.unique()

Xy_test_with_predictions = (
    gs_draws.loc[X_test.index]
    .reset_index()
    .sort_values(by=['gs_tourney_id','gs_player_seed'],ascending=[True,True])
    .loc[:,['index','gs_tourney_id','gs_player_seed','qf']]
    .reset_index(drop=True)
)
# Rows are ordered by seed within each tournament, so the first 16 rows of every group are
# its best seeds. Counting within the group avoids assuming exactly 128 rows per tournament.
Xy_test_with_predictions.insert(
    3,
    'predict',
    Xy_test_with_predictions.groupby('gs_tourney_id').cumcount() < 16,
)

result_seeds = min(
    precision_score(Xy_test_with_predictions['qf'],Xy_test_with_predictions['predict']),
    recall_score(Xy_test_with_predictions['qf'],Xy_test_with_predictions['predict']),
)
print(result_seeds)

So simply predicting the R16 qualifiers from their seeded positions at the start of the tourney, which is one of the features in the data, outperforms the best estimator. Looking at the feature importances from the best estimator, I can see that its results rely heavily on the seed value and largely ignore the other features.

In [None]:
# Pair each feature importance of the recall-tuned XGBoost classifier with its column name.
[
    (importance, feature)
    for importance, feature in zip(xg_['recall']['classification'].feature_importances_, X_test.columns)
]

#### reduce features with BorutaPy

I originally used all 26 features but looking at the feature_importances_ or coef_ attributes in some of the  estimators, I could see that there were unused features. Knowing also that the seeding for tournaments alone, which itself is reliant on the ranking points, is designed to find those qualifying to the R16, it would seem that I could improve the results using fewer features. Using all features may increase the variance in the resulting model, so I used the Boruta feature selection method described [here](https://danielhomola.com/feature%20selection/phd/borutapy-an-all-relevant-feature-selection-method/).

In [None]:
# Boruta feature selection on the training tournaments only, wrapped around a shallow random forest.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
X_train = X_train.drop('gs_tourney_id',axis=1)  # grouping key, not a feature

forest = RandomForestClassifier(
    n_estimators=200,
    max_depth=5,
    random_state=87,
    n_jobs=-2,
)
bp = BorutaPy(forest,random_state=87,verbose=0)
# BorutaPy is given plain arrays rather than pandas objects.
bp.fit(X_train.values,y_train.values)

The results picked out 10 of the 26 features to be important, most of which are ranking and seeding.

In [None]:
# Show the names of the columns Boruta kept (bp.support_ is used here as a mask over the columns).
X_train.columns[bp.support_]

In [None]:
# The 10 features used for the reduced-feature models.
# NOTE(review): presumably copied by hand from the Boruta output in the previous cell;
# re-check this list if the data, the seed or the Boruta settings change.
feat_subset = ['gs_player_seed', 'gs_player_ht', 'gs_player_age', 'gs_player_rank',
       'gs_player_rank_points', 'df_per_svpt', '1stWon_per_svpt',
       'bpFaced_per_svpt', 'mean_opponent_rank_points', 'count_M']

#### fitting estimators with reduced features
Using the reduced set of features, I repeated the fitting steps with GridSearchCV as before.

In [None]:

# k-nearest neighbours on the reduced feature set: grid-search the number of neighbours and
# the weighting scheme, keeping the best pipeline for each scoring metric in knn_clf.
knn_gs = KNeighborsClassifier()
param_grid = {
    'classification__n_neighbors':range(1,16),
    'classification__weights':['uniform','distance'],
}

model = Pipeline([('scaling',scaler),
        ('sampling', ros),
        ('classification', knn_gs)
    ])

knn_clf = {}

# The split is seeded, so it is identical for every metric: build it (and the folds) once.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
# split() returns a one-shot generator; a list lets the same folds be reused for each search.
gkf = list(group_kfold.split(X=X_train,y=y_train,groups=X_train.gs_tourney_id))
# Drop the grouping key, then keep only the selected features.
X_train = X_train.drop(columns=['gs_tourney_id'])[feat_subset]

for scoring_metrics in ['recall','precision','f1']:
    clf = GridSearchCV(model,param_grid,scoring=scoring_metrics,cv=gkf)
    clf.fit(np.array(X_train),np.array(y_train))
    knn_clf[scoring_metrics] = clf.best_estimator_

In [None]:
# Logistic regression on the reduced feature set: grid-search the regularisation strength C,
# keeping the best pipeline for each scoring metric in logit_clf.
cs = [10**-3,10**-2,10**-1,10**0,10**1,10**2,10**3]
logit_gs = LogisticRegression(max_iter=1000,solver='liblinear',random_state=87)
param_grid = {'classification__C':cs}

model = Pipeline([('scaling',scaler),
        ('sampling', ros),
        ('classification', logit_gs)
    ])

logit_clf = {}

# The split is seeded, so it is identical for every metric: build it (and the folds) once.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
# split() returns a one-shot generator; a list lets the same folds be reused for each search.
gkf = list(group_kfold.split(X=X_train,y=y_train,groups=X_train.gs_tourney_id))
# Drop the grouping key, then keep only the selected features.
X_train = X_train.drop(columns=['gs_tourney_id'])[feat_subset]

for scoring_metrics in ['recall','precision','f1']:
    clf = GridSearchCV(model,param_grid,scoring=scoring_metrics,cv=gkf)
    clf.fit(np.array(X_train),np.array(y_train))
    logit_clf[scoring_metrics] = clf.best_estimator_

In [None]:
# Decision tree on the reduced feature set: grid-search depth, features per split and split
# criterion, keeping the best pipeline for each scoring metric in tree_clf.
tree_gs = DecisionTreeClassifier(random_state=87)
# 'sqrt' is what 'auto' meant for classifiers; 'auto' is rejected by newer scikit-learn.
param_grid = {
    'classification__max_depth':range(1,13),
    'classification__max_features':['sqrt','log2',None],
    'classification__criterion':['gini','entropy'],
}

model = Pipeline([('scaling',scaler),
        ('sampling', ros),
        ('classification', tree_gs)
    ])

tree_clf = {}

# The split is seeded, so it is identical for every metric: build it (and the folds) once.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
# split() returns a one-shot generator; a list lets the same folds be reused for each search.
gkf = list(group_kfold.split(X=X_train,y=y_train,groups=X_train.gs_tourney_id))
# Drop the grouping key, then keep only the selected features.
X_train = X_train.drop(columns=['gs_tourney_id'])[feat_subset]

for scoring_metrics in ['recall','precision','f1']:
    clf = GridSearchCV(model,param_grid,scoring=scoring_metrics,cv=gkf,n_jobs=-2)
    clf.fit(np.array(X_train),np.array(y_train))
    tree_clf[scoring_metrics] = clf.best_estimator_

In [None]:
# Random forest on the reduced feature set: grid-search depth, features per split and forest
# size, keeping the best pipeline for each scoring metric in forest_clf.
forest_gs = RandomForestClassifier(random_state=87)
# 'sqrt' is what 'auto' meant for classifiers; 'auto' is rejected by newer scikit-learn.
param_grid = {
    'classification__max_depth':range(1,13),
    'classification__max_features':['sqrt','log2',None],
    'classification__n_estimators':[10,50,100,200],
}

model = Pipeline([('scaling',scaler),
        ('sampling', ros),
        ('classification', forest_gs)
    ])

forest_clf = {}

# The split is seeded, so it is identical for every metric: build it (and the folds) once.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
# split() returns a one-shot generator; a list lets the same folds be reused for each search.
gkf = list(group_kfold.split(X=X_train,y=y_train,groups=X_train.gs_tourney_id))
# Drop the grouping key, then keep only the selected features.
X_train = X_train.drop(columns=['gs_tourney_id'])[feat_subset]

for scoring_metrics in ['recall','precision','f1']:
    clf = GridSearchCV(model,param_grid,scoring=scoring_metrics,cv=gkf,n_jobs=-2)
    clf.fit(np.array(X_train),np.array(y_train))
    forest_clf[scoring_metrics] = clf.best_estimator_

In [None]:
# Support vector classifier on the reduced feature set: grid-search the kernel and C, keeping
# the best pipeline for each scoring metric in svm_clf.
cs = [10**-3,10**-2,10**-1,10**0,10**1,10**2,10**3]
# probability=True so the fitted model exposes predict_proba for the later ranking step.
svm_gs = SVC(random_state=87,gamma='auto',probability=True)
param_grid = {
    'classification__kernel':['linear','rbf','sigmoid'],
    'classification__C':cs,
}

model = Pipeline([('scaling',scaler),
        ('sampling', ros),
        ('classification', svm_gs)
    ])

svm_clf = {}

# The split is seeded, so it is identical for every metric: build it (and the folds) once.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
# split() returns a one-shot generator; a list lets the same folds be reused for each search.
gkf = list(group_kfold.split(X=X_train,y=y_train,groups=X_train.gs_tourney_id))
# Drop the grouping key, then keep only the selected features.
X_train = X_train.drop(columns=['gs_tourney_id'])[feat_subset]

for scoring_metrics in ['recall','precision','f1']:
    clf = GridSearchCV(model,param_grid,scoring=scoring_metrics,cv=gkf,n_jobs=-2)
    clf.fit(np.array(X_train),np.array(y_train))
    svm_clf[scoring_metrics] = clf.best_estimator_

In [None]:
# Polynomial-kernel SVC on the reduced feature set: grid-search the polynomial degree and C,
# keeping the best pipeline for each scoring metric in poly_clf.
cs = [10**-3,10**-2,10**-1,10**0,10**1,10**2,10**3]
# probability=True so the fitted model exposes predict_proba for the later ranking step.
poly_gs = SVC(random_state=87,kernel='poly',gamma='auto',probability=True)
param_grid = {
    'classification__degree':[2,3,4],
    'classification__C':cs,
}

model = Pipeline([('scaling',scaler),
        ('sampling', ros),
        ('classification', poly_gs)
    ])

poly_clf = {}

# The split is seeded, so it is identical for every metric: build it (and the folds) once.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
# split() returns a one-shot generator; a list lets the same folds be reused for each search.
gkf = list(group_kfold.split(X=X_train,y=y_train,groups=X_train.gs_tourney_id))
# Drop the grouping key, then keep only the selected features.
X_train = X_train.drop(columns=['gs_tourney_id'])[feat_subset]

for scoring_metrics in ['recall','precision','f1']:
    clf = GridSearchCV(model,param_grid,scoring=scoring_metrics,cv=gkf,n_jobs=-2)
    clf.fit(np.array(X_train),np.array(y_train))
    poly_clf[scoring_metrics] = clf.best_estimator_

In [None]:
# Extra-trees on the reduced feature set: grid-search depth, features per split and ensemble
# size, keeping the best pipeline for each scoring metric in xt_clf.
xt_gs = ExtraTreesClassifier(random_state=87,bootstrap=True)
# 'sqrt' is what 'auto' meant for classifiers; 'auto' is rejected by newer scikit-learn.
param_grid = {
    'classification__max_depth':range(1,13),
    'classification__max_features':['sqrt','log2',None],
    'classification__n_estimators':[10,50,100,200],
}

model = Pipeline([('scaling',scaler),
        ('sampling', ros),
        ('classification', xt_gs)
    ])

xt_clf = {}

# The split is seeded, so it is identical for every metric: build it (and the folds) once.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
# split() returns a one-shot generator; a list lets the same folds be reused for each search.
gkf = list(group_kfold.split(X=X_train,y=y_train,groups=X_train.gs_tourney_id))
# Drop the grouping key, then keep only the selected features.
X_train = X_train.drop(columns=['gs_tourney_id'])[feat_subset]

for scoring_metrics in ['recall','precision','f1']:
    clf = GridSearchCV(model,param_grid,scoring=scoring_metrics,cv=gkf,n_jobs=-2)
    clf.fit(np.array(X_train),np.array(y_train))
    xt_clf[scoring_metrics] = clf.best_estimator_

In [None]:
# XGBoost on the reduced feature set: grid-search depth, learning rate and number of boosting
# rounds, keeping the best pipeline for each scoring metric in xg_clf.
# (No `probability` argument: that is an SVC option, not an XGBClassifier parameter.)
xg_gs = XGBClassifier(random_state=87)
param_grid = {
    'classification__max_depth':range(1,13),
    'classification__learning_rate':[.1,.5,1],
    'classification__n_estimators':[50,100,500,1000],
}

model = Pipeline([('scaling',scaler),
        ('sampling', ros),
        ('classification', xg_gs)
    ])

xg_clf = {}

# The split is seeded, so it is identical for every metric: build it (and the folds) once.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
# split() returns a one-shot generator; a list lets the same folds be reused for each search.
gkf = list(group_kfold.split(X=X_train,y=y_train,groups=X_train.gs_tourney_id))
# Drop the grouping key, then keep only the selected features.
X_train = X_train.drop(columns=['gs_tourney_id'])[feat_subset]

for scoring_metrics in ['recall','precision','f1']:
    clf = GridSearchCV(model,param_grid,scoring=scoring_metrics,cv=gkf,n_jobs=-2)
    clf.fit(np.array(X_train),np.array(y_train))
    xg_clf[scoring_metrics] = clf.best_estimator_

In [None]:
# Held-out evaluation (reduced feature set): for every tuned pipeline, rank the players of each
# test tournament by predicted probability, mark the top 16 (plus any ties at the cut-off) as
# predicted qualifiers, and record min(precision, recall) over all test tournaments.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
X_test = X_test.drop(columns=['gs_tourney_id'])[feat_subset]

fittings = {'knn':knn_clf,'logit':logit_clf,'tree':tree_clf,'forest':forest_clf,'svm':svm_clf,'poly':poly_clf,'xt':xt_clf,'xg':xg_clf}
# X_test.index holds gs_draws row *labels*, so the rows are looked up with .loc; .iloc would
# read the labels as positions and return the wrong rows as soon as the index has gaps.
unique_tourney_ids = gs_draws.loc[X_test.index].gs_tourney_id.unique()

def sc(n):
    """Return the results-table column name for scoring metric ``n``."""
    return n+'_as_scorer'

metrics = ['precision','recall','f1']
# Rows: estimators; columns: the metric GridSearchCV optimised. Filled in by the loop below.
results_clf = pd.DataFrame(index=list(fittings),columns=list(map(sc,metrics)),dtype=float)

for scorer in results_clf.columns:

    for label in results_clf.index:
        # 'recall_as_scorer' -> 'recall', the key the tuned pipeline was stored under.
        fitting = fittings[label][scorer.split('_')[0]]

        # Test rows with their predicted class-1 probability, best first within each tournament.
        # predict_proba gets a plain array, the same form the pipelines were fitted on.
        Xy_test_with_predictions = (
            gs_draws.loc[X_test.index]
            .reset_index()
            .assign(predict_proba=fitting.predict_proba(np.array(X_test))[:,1])
            .sort_values(by=['gs_tourney_id','predict_proba'],ascending=[True,False])
            .loc[:,['index','gs_tourney_id','gs_player_seed','qf','predict_proba']]
            .reset_index(drop=True)
        )

        # Per tournament, the 16th-highest probability is the cut-off. Everyone at or above it
        # is a predicted qualifier, so ties at the cut-off can push the count past 16. A
        # tournament with fewer than 16 rows simply has all of its rows selected.
        cutoffs = Xy_test_with_predictions.groupby('gs_tourney_id')['predict_proba'].agg(lambda p: p.nlargest(16).min())
        Xy_test_with_predictions.insert(
            3,
            'predict',
            Xy_test_with_predictions['predict_proba'] >= Xy_test_with_predictions['gs_tourney_id'].map(cutoffs),
        )

        results_clf.loc[label,scorer] = min(
            precision_score(Xy_test_with_predictions['qf'],Xy_test_with_predictions['predict']),
            recall_score(Xy_test_with_predictions['qf'],Xy_test_with_predictions['predict']),
        )
        

In [None]:
results_clf

Reducing the set of features only slightly improves the estimator performance compared to the full feature set, but either maintains or improves each estimator. The best overall estimator ends up being a logistic regression with a reduced set of features.

In [None]:
# Pair each coefficient of the precision-tuned logistic regression with its feature name.
[
    (weight, feature)
    for weight, feature in zip(logit_clf['precision']['classification'].coef_.ravel(), feat_subset)
]

The seed, ranking, and points were still the most influential features, but interestingly, service game stats (double faults, break points faced, and first serves won) were revealed to be more important than seemingly trivial stats like player height and age.
#### bagging
I then tried another method about which I'd read: instead of reducing the feature set, bagging can reduce the variance. I took the best estimator from the GridSearch results and used that as my base estimator.

In [None]:
# Bagging on the full feature set, using the recall-tuned XGBoost classifier as the base
# estimator; grid-search ensemble size and the fraction of features/samples per member,
# keeping the best pipeline for each scoring metric in bag_.
# The base estimator is passed positionally because its keyword differs between scikit-learn
# releases (base_estimator in older ones, estimator in newer ones).
bag_gs = BaggingClassifier(xg_['recall']['classification'],random_state=87)
param_grid = {
    'classification__n_estimators':[10,50,100,200],
    'classification__max_features':[.25,.5,.75,1.0],
    'classification__max_samples':[.25,.5,.75,1.0],
}

model = Pipeline([('scaling',scaler),
        ('sampling', ros),
        ('classification', bag_gs)
    ])

bag_ = {}

# The split is seeded, so it is identical for every metric: build it (and the folds) once.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
# split() returns a one-shot generator; a list lets the same folds be reused for each search.
gkf = list(group_kfold.split(X=X_train,y=y_train,groups=X_train.gs_tourney_id))
# The tournament id is only needed for grouping, not as a feature.
X_train = X_train.drop(columns=['gs_tourney_id'])

for scoring_metrics in ['recall','precision','f1']:
    clf = GridSearchCV(model,param_grid,scoring=scoring_metrics,cv=gkf,n_jobs=-2)
    clf.fit(np.array(X_train),np.array(y_train))
    bag_[scoring_metrics] = clf.best_estimator_

In [None]:
# Held-out evaluation (full feature set, now including the bagged model): for every tuned
# pipeline, rank the players of each test tournament by predicted probability, mark the top 16
# (plus any ties at the cut-off) as predicted qualifiers, and record min(precision, recall).
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
X_test = X_test.drop(columns=['gs_tourney_id'])

fittings = {'knn':knn_,'logit':logit_,'tree':tree_,'forest':forest_,'svm':svm_,'poly':poly_,'xt':xt_,'xg':xg_,'bag':bag_}
# X_test.index holds gs_draws row *labels*, so the rows are looked up with .loc; .iloc would
# read the labels as positions and return the wrong rows as soon as the index has gaps.
unique_tourney_ids = gs_draws.loc[X_test.index].gs_tourney_id.unique()

def sc(n):
    """Return the results-table column name for scoring metric ``n``."""
    return n+'_as_scorer'

metrics = ['precision','recall','f1']
# Rows: estimators; columns: the metric GridSearchCV optimised. Filled in by the loop below.
results_ = pd.DataFrame(index=list(fittings),columns=list(map(sc,metrics)),dtype=float)

for scorer in results_.columns:

    for label in results_.index:
        # 'recall_as_scorer' -> 'recall', the key the tuned pipeline was stored under.
        fitting = fittings[label][scorer.split('_')[0]]

        # Test rows with their predicted class-1 probability, best first within each tournament.
        # predict_proba gets a plain array, the same form the pipelines were fitted on.
        Xy_test_with_predictions = (
            gs_draws.loc[X_test.index]
            .reset_index()
            .assign(predict_proba=fitting.predict_proba(np.array(X_test))[:,1])
            .sort_values(by=['gs_tourney_id','predict_proba'],ascending=[True,False])
            .loc[:,['index','gs_tourney_id','gs_player_seed','qf','predict_proba']]
            .reset_index(drop=True)
        )

        # Per tournament, the 16th-highest probability is the cut-off. Everyone at or above it
        # is a predicted qualifier, so ties at the cut-off can push the count past 16. A
        # tournament with fewer than 16 rows simply has all of its rows selected.
        cutoffs = Xy_test_with_predictions.groupby('gs_tourney_id')['predict_proba'].agg(lambda p: p.nlargest(16).min())
        Xy_test_with_predictions.insert(
            3,
            'predict',
            Xy_test_with_predictions['predict_proba'] >= Xy_test_with_predictions['gs_tourney_id'].map(cutoffs),
        )

        results_.loc[label,scorer] = min(
            precision_score(Xy_test_with_predictions['qf'],Xy_test_with_predictions['predict']),
            recall_score(Xy_test_with_predictions['qf'],Xy_test_with_predictions['predict']),
        )
        

In [None]:
results_

Indeed, the bootstrap aggregated best estimator slightly improved upon the result. I then repeated the bagging step for the reduced feature set.

In [None]:
# Bagging on the reduced feature set, using the precision-tuned logistic regression as the
# base estimator; grid-search ensemble size and the fraction of features/samples per member,
# keeping the best pipeline for each scoring metric in bag_clf.
# The base estimator is passed positionally because its keyword differs between scikit-learn
# releases (base_estimator in older ones, estimator in newer ones).
bag_gs = BaggingClassifier(logit_clf['precision']['classification'],random_state=87)

param_grid = {
    'classification__n_estimators':[10,50,100,200],
    'classification__max_features':[.25,.5,.75,1.0],
    'classification__max_samples':[.25,.5,.75,1.0],
}

model = Pipeline([('scaling',scaler),
        ('sampling', ros),
        ('classification', bag_gs)
    ])

bag_clf = {}

# The split is seeded, so it is identical for every metric: build it (and the folds) once.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
# split() returns a one-shot generator; a list lets the same folds be reused for each search.
gkf = list(group_kfold.split(X=X_train,y=y_train,groups=X_train.gs_tourney_id))
# Drop the grouping key, then keep only the selected features.
X_train = X_train.drop(columns=['gs_tourney_id'])[feat_subset]

for scoring_metrics in ['recall','precision','f1']:
    clf = GridSearchCV(model,param_grid,scoring=scoring_metrics,cv=gkf,n_jobs=-2)
    clf.fit(np.array(X_train),np.array(y_train))
    bag_clf[scoring_metrics] = clf.best_estimator_

In [None]:
# Held-out evaluation (reduced feature set, now including the bagged model): for every tuned
# pipeline, rank the players of each test tournament by predicted probability, mark the top 16
# (plus any ties at the cut-off) as predicted qualifiers, and record min(precision, recall).
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
X_test = X_test.drop(columns=['gs_tourney_id'])[feat_subset]

fittings = {'knn':knn_clf,'logit':logit_clf,'tree':tree_clf,'forest':forest_clf,'svm':svm_clf,'poly':poly_clf,'xt':xt_clf,'xg':xg_clf,'bag':bag_clf}
# X_test.index holds gs_draws row *labels*, so the rows are looked up with .loc; .iloc would
# read the labels as positions and return the wrong rows as soon as the index has gaps.
unique_tourney_ids = gs_draws.loc[X_test.index].gs_tourney_id.unique()

def sc(n):
    """Return the results-table column name for scoring metric ``n``."""
    return n+'_as_scorer'

metrics = ['precision','recall','f1']
# Rows: estimators; columns: the metric GridSearchCV optimised. Filled in by the loop below.
results_clf = pd.DataFrame(index=list(fittings),columns=list(map(sc,metrics)),dtype=float)

for scorer in results_clf.columns:

    for label in results_clf.index:
        # 'recall_as_scorer' -> 'recall', the key the tuned pipeline was stored under.
        fitting = fittings[label][scorer.split('_')[0]]

        # Test rows with their predicted class-1 probability, best first within each tournament.
        # predict_proba gets a plain array, the same form the pipelines were fitted on.
        Xy_test_with_predictions = (
            gs_draws.loc[X_test.index]
            .reset_index()
            .assign(predict_proba=fitting.predict_proba(np.array(X_test))[:,1])
            .sort_values(by=['gs_tourney_id','predict_proba'],ascending=[True,False])
            .loc[:,['index','gs_tourney_id','gs_player_seed','qf','predict_proba']]
            .reset_index(drop=True)
        )

        # Per tournament, the 16th-highest probability is the cut-off. Everyone at or above it
        # is a predicted qualifier, so ties at the cut-off can push the count past 16. A
        # tournament with fewer than 16 rows simply has all of its rows selected.
        cutoffs = Xy_test_with_predictions.groupby('gs_tourney_id')['predict_proba'].agg(lambda p: p.nlargest(16).min())
        Xy_test_with_predictions.insert(
            3,
            'predict',
            Xy_test_with_predictions['predict_proba'] >= Xy_test_with_predictions['gs_tourney_id'].map(cutoffs),
        )

        results_clf.loc[label,scorer] = min(
            precision_score(Xy_test_with_predictions['qf'],Xy_test_with_predictions['predict']),
            recall_score(Xy_test_with_predictions['qf'],Xy_test_with_predictions['predict']),
        )
        

In [None]:
# Show min(precision, recall) per estimator (rows) and tuning scorer (columns).
results_clf

The bagged best estimator from the reduced feature set yielded the best results so far.

### results

The fact that my best classifier underperformed the simple seeds left me wondering, who did it misclassify and why?
#### incorrect predictions with reduced features

I started by comparing the distributions of individual features across true positives, false positives, and false negatives, beginning with the most influential features.

In [None]:
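# seaborn is used as `sns` by the plotting cells below and is assumed to be imported earlier in the notebook.
# If it is not, uncomment the next line:
# import seaborn as sns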

In [None]:
# Rebuild the hold-out split and score it with the bagged estimator tuned for precision.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
X_test = X_test.drop(columns=['gs_tourney_id'])[feat_subset]
unique_tourney_ids = gs_draws.iloc[X_test.index].gs_tourney_id.unique()

bag_proba = pd.Series(bag_clf['precision'].predict_proba(X_test)[:,1],name='predict_proba')

# All columns are kept (no column subset) because the cells below plot individual features.
Xy_test_with_predictions = (
    gs_draws.iloc[X_test.index]
    .join(gs_draws_names)
    .reset_index()
    .join(bag_proba)
    .sort_values(by=['gs_tourney_id','gs_player_seed'],ascending=True)
)

# Seed baseline: the first 16 rows of each tournament in seed order are predicted to qualify.
# Counting positions per tournament gives the same result as a fixed 16-True / 112-False pattern
# for 128-row draws, but does not break when a tournament has a different number of rows.
seed_position = Xy_test_with_predictions.groupby('gs_tourney_id').cumcount()
Xy_test_with_predictions.insert(3,'seed_predict',seed_position < 16)


In [None]:
# Order each tournament by descending predicted probability.
# Dropping any earlier 'predict' column first lets this cell be re-run without insert() failing.
Xy_test_with_predictions = (
    Xy_test_with_predictions
    .drop(columns=['predict'],errors='ignore')
    .sort_values(by=['gs_tourney_id','predict_proba'],ascending=[True,False])
)

# Tie-inclusive top 16 per tournament: a row is selected when fewer than 16 rows of the same
# tournament have a strictly higher probability (descending 'min' rank <= 16). This also copes
# with tournaments that have fewer than 16 rows, where a pop-the-highest loop would index an
# empty array.
proba_rank = Xy_test_with_predictions.groupby('gs_tourney_id')['predict_proba'].rank(method='min',ascending=False)
Xy_test_with_predictions.insert(3,'predict',proba_rank <= 16)

In [None]:
# Pair each coefficient of the precision-tuned logistic classifier with its feature name.
logit_coefs = logit_clf['precision']['classification'].coef_.flatten()
list(zip(logit_coefs,feat_subset))

In [None]:
# Box plot of player seed for the rows with qf == True, overlaid with the estimator's misses.
qualifiers = Xy_test_with_predictions[Xy_test_with_predictions.qf == True]
false_negatives = qualifiers[qualifiers.predict == False]
false_positives = Xy_test_with_predictions[(Xy_test_with_predictions.qf == False) & (Xy_test_with_predictions.predict == True)]

ax = sns.boxplot(data=qualifiers[['gs_player_seed']],orient='h',boxprops={'facecolor':'w'},fliersize=3)
ax.set_title('R16 qualifiers determined by reduced feature set',fontdict={'fontsize':16})
ax.figure.set_size_inches(8,5)

# True-positive overlay, currently disabled:
# sns.swarmplot(x='gs_player_seed',data=qualifiers[qualifiers.predict == True],orient='h',alpha=0.75,s=8,marker='o',color='g',label='true positive',ax=ax)
sns.swarmplot(x='gs_player_seed',data=false_negatives,orient='h',alpha=1,s=10,marker='v',linewidth=1.5,color='w',edgecolor='g',label='false negative',ax=ax)
sns.swarmplot(x='gs_player_seed',data=false_positives,orient='h',alpha=1,s=10,marker='+',linewidth=2.5,color='g',label='false positive',ax=ax)

ax.set_xlabel('player seed',fontsize=14)
ax.tick_params(labelsize=14)
ax.legend()
ax.legend_.prop.set_size(12);


In [None]:
# Write the figure behind `ax` (set by the plotting cell above) to a JPEG in the working directory.
ax.figure.savefig('reduced_features_player_seed_boxplot.jpg')

In [None]:
# Box plot of player rank for the rows with qf == True, overlaid with the estimator's misses.
qualifiers = Xy_test_with_predictions[Xy_test_with_predictions.qf == True]
false_negatives = qualifiers[qualifiers.predict == False]
false_positives = Xy_test_with_predictions[(Xy_test_with_predictions.qf == False) & (Xy_test_with_predictions.predict == True)]

ax = sns.boxplot(data=qualifiers[['gs_player_rank']],orient='h',boxprops={'facecolor':'w'},fliersize=3)
ax.set_title('R16 qualifiers determined by reduced feature set',fontdict={'fontsize':16})
ax.figure.set_size_inches(8,5)

# True-positive overlay, currently disabled:
# sns.swarmplot(x='gs_player_rank',data=qualifiers[qualifiers.predict == True],orient='h',alpha=0.75,s=8,marker='o',color='g',label='true positive',ax=ax)
sns.swarmplot(x='gs_player_rank',data=false_negatives,orient='h',alpha=1,s=10,marker='v',linewidth=1.5,color='w',edgecolor='g',label='false negative',ax=ax)
sns.swarmplot(x='gs_player_rank',data=false_positives,orient='h',alpha=1,s=10,marker='+',linewidth=2.5,color='g',label='false positive',ax=ax)

ax.set_xlabel('player rank',fontsize=14)
ax.tick_params(labelsize=14)
ax.legend()
ax.legend_.prop.set_size(12);


In [None]:
# Box plot of ranking points for the rows with qf == True, overlaid with the estimator's misses.
qualifiers = Xy_test_with_predictions[Xy_test_with_predictions.qf == True]
false_negatives = qualifiers[qualifiers.predict == False]
false_positives = Xy_test_with_predictions[(Xy_test_with_predictions.qf == False) & (Xy_test_with_predictions.predict == True)]

ax = sns.boxplot(data=qualifiers[['gs_player_rank_points']],orient='h',boxprops={'facecolor':'w'},fliersize=3)
ax.set_title('R16 qualifiers determined by reduced feature set',fontdict={'fontsize':16})
ax.figure.set_size_inches(8,5)

# True-positive overlay, currently disabled:
# sns.swarmplot(x='gs_player_rank_points',data=qualifiers[qualifiers.predict == True],orient='h',alpha=0.75,s=8,marker='o',color='g',label='true positive',ax=ax)
sns.swarmplot(x='gs_player_rank_points',data=false_negatives,orient='h',alpha=1,s=10,marker='v',linewidth=1.5,color='w',edgecolor='g',label='false negative',ax=ax)
sns.swarmplot(x='gs_player_rank_points',data=false_positives,orient='h',alpha=1,s=10,marker='+',linewidth=2.5,color='g',label='false positive',ax=ax)

ax.set_xlabel('player ranking points',fontsize=14)
ax.tick_params(labelsize=14)
ax.legend()
ax.legend_.prop.set_size(12);


Stats from the entrants' previous matches played didn't have the same clear delineation between true positives and false positives.

In [None]:
# Box plot of break points faced per service point for the rows with qf == True,
# overlaid with the estimator's misses.
qualifiers = Xy_test_with_predictions[Xy_test_with_predictions.qf == True]
false_negatives = qualifiers[qualifiers.predict == False]
false_positives = Xy_test_with_predictions[(Xy_test_with_predictions.qf == False) & (Xy_test_with_predictions.predict == True)]

ax = sns.boxplot(data=qualifiers[['bpFaced_per_svpt']],orient='h',boxprops={'facecolor':'w'},fliersize=3)
ax.set_title('R16 qualifiers determined by reduced feature set',fontdict={'fontsize':16})
ax.figure.set_size_inches(8,5)

# True-positive overlay, currently disabled:
# sns.swarmplot(x='bpFaced_per_svpt',data=qualifiers[qualifiers.predict == True],orient='h',alpha=0.75,s=8,marker='o',color='g',label='true positive',ax=ax)
sns.swarmplot(x='bpFaced_per_svpt',data=false_negatives,orient='h',alpha=1,s=10,marker='v',linewidth=1.5,color='w',edgecolor='g',label='false negative',ax=ax)
sns.swarmplot(x='bpFaced_per_svpt',data=false_positives,orient='h',alpha=1,s=10,marker='+',linewidth=2.5,color='g',label='false positive',ax=ax)

ax.set_xlabel('break points faced per service point',fontsize=14)
ax.tick_params(labelsize=14)
ax.legend()
ax.legend_.prop.set_size(12);


In [None]:
# Write the figure behind `ax` (set by the plotting cell above) to a JPEG in the working directory.
ax.figure.savefig('reduced_features_break_points_faced_boxplot.jpg')

In [None]:
# Box plot of 1st serves won per service point for the rows with qf == True,
# overlaid with the estimator's misses.
qualifiers = Xy_test_with_predictions[Xy_test_with_predictions.qf == True]
false_negatives = qualifiers[qualifiers.predict == False]
false_positives = Xy_test_with_predictions[(Xy_test_with_predictions.qf == False) & (Xy_test_with_predictions.predict == True)]

ax = sns.boxplot(data=qualifiers[['1stWon_per_svpt']],orient='h',boxprops={'facecolor':'w'},fliersize=3)
ax.set_title('R16 qualifiers determined by reduced feature set',fontdict={'fontsize':16})
ax.figure.set_size_inches(8,5)

# True-positive overlay, currently disabled:
# sns.swarmplot(x='1stWon_per_svpt',data=qualifiers[qualifiers.predict == True],orient='h',alpha=0.75,s=8,marker='o',color='g',label='true positive',ax=ax)
sns.swarmplot(x='1stWon_per_svpt',data=false_negatives,orient='h',alpha=1,s=10,marker='v',linewidth=1.5,color='w',edgecolor='g',label='false negative',ax=ax)
sns.swarmplot(x='1stWon_per_svpt',data=false_positives,orient='h',alpha=1,s=10,marker='+',linewidth=2.5,color='g',label='false positive',ax=ax)

ax.set_xlabel('1st serves won per service point',fontsize=14)
ax.tick_params(labelsize=14)
ax.legend()
ax.legend_.prop.set_size(12);


In [None]:
# Box plot of mean opponent ranking points for the rows with qf == True,
# overlaid with the estimator's misses.
qualifiers = Xy_test_with_predictions[Xy_test_with_predictions.qf == True]
false_negatives = qualifiers[qualifiers.predict == False]
false_positives = Xy_test_with_predictions[(Xy_test_with_predictions.qf == False) & (Xy_test_with_predictions.predict == True)]

ax = sns.boxplot(data=qualifiers[['mean_opponent_rank_points']],orient='h',boxprops={'facecolor':'w'},fliersize=3)
ax.set_title('R16 qualifiers determined by reduced feature set',fontdict={'fontsize':16})
ax.figure.set_size_inches(8,5)

# True-positive overlay, currently disabled:
# sns.swarmplot(x='mean_opponent_rank_points',data=qualifiers[qualifiers.predict == True],orient='h',alpha=0.75,s=8,marker='o',color='g',label='true positive',ax=ax)
sns.swarmplot(x='mean_opponent_rank_points',data=false_negatives,orient='h',alpha=1,s=10,marker='v',linewidth=1.5,color='w',edgecolor='g',label='false negative',ax=ax)
sns.swarmplot(x='mean_opponent_rank_points',data=false_positives,orient='h',alpha=1,s=10,marker='+',linewidth=2.5,color='g',label='false positive',ax=ax)

# Axis label spelling corrected ('opponent').
ax.set_xlabel('mean opponent ranking points',fontsize=14)
ax.tick_params(labelsize=14)
ax.legend()
ax.legend_.prop.set_size(12);


In [None]:
# Box plot of double faults per service point for the rows with qf == True,
# overlaid with the estimator's misses.
qualifiers = Xy_test_with_predictions[Xy_test_with_predictions.qf == True]
false_negatives = qualifiers[qualifiers.predict == False]
false_positives = Xy_test_with_predictions[(Xy_test_with_predictions.qf == False) & (Xy_test_with_predictions.predict == True)]

ax = sns.boxplot(data=qualifiers[['df_per_svpt']],orient='h',boxprops={'facecolor':'w'},fliersize=3)
ax.set_title('R16 qualifiers determined by reduced feature set',fontdict={'fontsize':16})
ax.figure.set_size_inches(8,5)

# True-positive overlay, currently disabled:
# sns.swarmplot(x='df_per_svpt',data=qualifiers[qualifiers.predict == True],orient='h',alpha=1,s=12,marker='o',linewidth=2,color='w',edgecolor='k',label='true positive',ax=ax)
sns.swarmplot(x='df_per_svpt',data=false_negatives,orient='h',alpha=1,s=10,marker='v',linewidth=1.5,color='w',edgecolor='g',label='false negative',ax=ax)
sns.swarmplot(x='df_per_svpt',data=false_positives,orient='h',alpha=1,s=10,marker='+',linewidth=2.5,color='g',label='false positive',ax=ax)

ax.set_xlabel('double faults per service point',fontsize=14)
ax.tick_params(labelsize=14)
ax.legend()
ax.legend_.prop.set_size(12);


#### stats only

If I want to explore the role of recent match stats, I have to remove the rankings/seeds/etc. that dominate the predictions and recalculate the estimators.

In [None]:
def find_stats_columns(s):
    '''Return True when column name `s` starts with neither 'gs' nor 'qf', False otherwise.'''
    return not s.startswith(('gs','qf'))

# Columns of gs_draws that are neither 'gs'-prefixed nor the 'qf' target.
stats_subset = [col for col in gs_draws.columns if find_stats_columns(col)]

In [None]:
# Base estimators and hyper-parameter grids for the stats-only feature set.
# Every grid key is prefixed with 'classification__' because the estimator is the
# 'classification' step of the pipeline built in the loop below.

# Regularisation strengths shared by the logistic, SVM and polynomial-SVM grids.
cs = [10**-3,10**-2,10**-1,10**0,10**1,10**2,10**3]

knn_gs = KNeighborsClassifier()
knn_param_grid = {'classification__n_neighbors':range(1,16),'classification__weights':['uniform','distance']}

logit_gs = LogisticRegression(max_iter=1000,solver='liblinear',random_state=87)
logit_param_grid = {'classification__C':cs}

# 'sqrt' replaces 'auto': for classifiers the two are equivalent, and 'auto' is no longer
# accepted by newer scikit-learn releases.
tree_gs = DecisionTreeClassifier(random_state=87)
tree_param_grid = {'classification__max_depth':range(1,13),'classification__max_features':['sqrt','log2',None],
                   'classification__criterion':['gini','entropy']}

forest_gs = RandomForestClassifier(random_state=87)
forest_param_grid = {'classification__max_depth':range(1,13),'classification__max_features':['sqrt','log2',None],
                     'classification__n_estimators':[10,50,100,200]}

# probability=True is required so the SVMs expose predict_proba for the ranking step.
svm_gs = SVC(random_state=87,gamma='auto',probability=True)
svm_param_grid = {'classification__kernel':['linear','rbf','sigmoid'],'classification__C':cs}

poly_gs = SVC(random_state=87,kernel='poly',gamma='auto',probability=True)
poly_param_grid = {'classification__degree':[2,3,4],'classification__C':cs}

xt_gs = ExtraTreesClassifier(random_state=87,bootstrap=True)
xt_param_grid = {'classification__max_depth':range(1,13),'classification__max_features':['sqrt','log2',None],
                 'classification__n_estimators':[10,50,100,200]}

# `probability` is an SVC option, not an XGBClassifier parameter, so it is not passed here.
xg_gs = XGBClassifier(random_state=87)
xg_param_grid = {'classification__max_depth':range(1,13),'classification__learning_rate':[.1,.5,1],
                 'classification__n_estimators':[50,100,500]}

estimators_and_grids = {'knn':(knn_gs,knn_param_grid),
                        'logit':(logit_gs,logit_param_grid),
                        'tree':(tree_gs,tree_param_grid),
                        'forest':(forest_gs,forest_param_grid),
                        'svm':(svm_gs,svm_param_grid),
                        'poly':(poly_gs,poly_param_grid),
                        'xt':(xt_gs,xt_param_grid),
                        'xg':(xg_gs,xg_param_grid)}

# 'clf' starts with placeholder zeros and is filled with the best fitted pipeline per scoring metric.
stats_fittings = {name:{'base':base,'params':grid,'clf':{'precision':0,'recall':0,'f1':0}}
                  for name,(base,grid) in estimators_and_grids.items()}


for f in stats_fittings.keys():
    model = Pipeline([('scaling',scaler),
                      ('sampling', ros),
                      ('classification', stats_fittings[f]['base'])])

    for scoring_metrics in ['recall','precision','f1']:
        X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
        # Folds are grouped by tournament id, so build the split before that column is dropped.
        gkf = group_kfold.split(X=X_train,y=y_train,groups=X_train.gs_tourney_id)
        X_train = X_train.drop(columns=['gs_tourney_id'])[stats_subset]

        clf = GridSearchCV(model,stats_fittings[f]['params'],scoring=scoring_metrics,cv=gkf,n_jobs=-2)
        clf.fit(np.array(X_train),np.array(y_train))
        stats_fittings[f]['clf'][scoring_metrics] = clf.best_estimator_

In [None]:
# Rebuild the hold-out split; the stats-only columns are selected at prediction time.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
X_test = X_test.drop(columns=['gs_tourney_id'])

unique_tourney_ids = gs_draws.iloc[X_test.index].gs_tourney_id.unique()

# sc() (metric name -> '<metric>_as_scorer') is already defined by the reduced-feature
# evaluation cell earlier in the notebook, so it is not redefined here.
metrics = ['precision','recall','f1']
# One row per estimator family, one column per scoring metric used for tuning.
results_stats = pd.DataFrame(index=list(stats_fittings),columns=[sc(m) for m in metrics],dtype=float)

# The test rows and stats-only features are identical for every estimator, so build them once.
test_rows = gs_draws.iloc[X_test.index].reset_index()
X_test_stats = X_test[stats_subset]

for metric in metrics:
    scorer = sc(metric)

    for label in stats_fittings:
        fitting = stats_fittings[label]['clf'][metric]

        proba = pd.Series(fitting.predict_proba(X_test_stats)[:,1],name='predict_proba')
        Xy_test_with_predictions = (
            test_rows.join(proba)
            .sort_values(by=['gs_tourney_id','predict_proba'],ascending=[True,False])
            [['index','gs_tourney_id','gs_player_seed','qf','predict_proba']]
            .reset_index(drop=True)
        )

        # Tie-inclusive top 16 per tournament: a row is selected when fewer than 16 rows of the
        # same tournament have a strictly higher probability (descending 'min' rank <= 16).
        # This cannot run out of candidates when a tournament has fewer than 16 rows.
        proba_rank = Xy_test_with_predictions.groupby('gs_tourney_id')['predict_proba'].rank(method='min',ascending=False)
        Xy_test_with_predictions.insert(3,'predict',proba_rank <= 16)

        # Score each estimator by the weaker of its precision and recall.
        results_stats.loc[label,scorer] = min(precision_score(Xy_test_with_predictions.qf,Xy_test_with_predictions.predict),recall_score(Xy_test_with_predictions.qf,Xy_test_with_predictions.predict))
        

In [None]:
# Show min(precision, recall) per stats-only estimator (rows) and tuning scorer (columns).
results_stats

As before, I'm taking the best estimator (logistic) and utilizing it as the base estimator in an ensemble.

In [None]:
# Bagging ensemble built on the stats-only logistic classifier that was tuned with the f1 scorer.
# Only the 'classification' step of the fitted pipeline is used as the base estimator, matching
# the reduced-feature bagging cell: the outer pipeline below already scales and oversamples, so
# wrapping the whole fitted pipeline would nest a second scaler and sampler inside every bag.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to `estimator` - confirm the installed version.
bag_gs = BaggingClassifier(base_estimator=stats_fittings['logit']['clf']['f1']['classification'],random_state=87)

# Grid searched over the bagging step (the 'classification' stage of the pipeline below).
param_grid = {'classification__n_estimators':[10,50,100,200],'classification__max_features':[.25,.5,.75,1.0],
             'classification__max_samples':[.25,.5,.75,1.0]}

model = Pipeline([('scaling',scaler),
        ('sampling', ros),
        ('classification', bag_gs)
    ])

# Best fitted pipeline per scoring metric.
bag_stats = {}

for scoring_metrics in ['recall','precision','f1']:
    X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
    # Folds are grouped by tournament id, so build the split before that column is dropped.
    gkf = group_kfold.split(X=X_train,y=y_train,groups=X_train.gs_tourney_id)
    X_train = X_train.drop(columns=['gs_tourney_id'])[stats_subset]

    clf = GridSearchCV(model,param_grid,scoring=scoring_metrics,cv=gkf,n_jobs=-2)
    clf.fit(np.array(X_train),np.array(y_train))
    bag_stats[scoring_metrics] = clf.best_estimator_

In [None]:
# Rebuild the hold-out split; the stats-only columns are selected at prediction time.
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
X_test = X_test.drop(columns=['gs_tourney_id'])

unique_tourney_ids = gs_draws.iloc[X_test.index].gs_tourney_id.unique()

# The test rows and stats-only features are identical for every estimator, so build them once.
test_rows = gs_draws.iloc[X_test.index].reset_index()
X_test_stats = X_test[stats_subset]

# Score of the bagged estimator per tuning scorer, keyed by results-table column name.
# sc() (metric name -> '<metric>_as_scorer') is defined by an earlier evaluation cell.
s = {}

for scorer,fitted_bag in bag_stats.items():

    proba = pd.Series(fitted_bag.predict_proba(X_test_stats)[:,1],name='predict_proba')
    Xy_test_with_predictions = (
        test_rows.join(proba)
        .sort_values(by=['gs_tourney_id','predict_proba'],ascending=[True,False])
        [['index','gs_tourney_id','gs_player_seed','qf','predict_proba']]
        .reset_index(drop=True)
    )

    # Tie-inclusive top 16 per tournament: a row is selected when fewer than 16 rows of the
    # same tournament have a strictly higher probability (descending 'min' rank <= 16).
    proba_rank = Xy_test_with_predictions.groupby('gs_tourney_id')['predict_proba'].rank(method='min',ascending=False)
    Xy_test_with_predictions.insert(3,'predict',proba_rank <= 16)

    s[sc(scorer)] = min(precision_score(Xy_test_with_predictions.qf,Xy_test_with_predictions.predict),recall_score(Xy_test_with_predictions.qf,Xy_test_with_predictions.predict))

# DataFrame.append was removed in pandas 2.0, so the 'bag' row is added with pd.concat.
# Dropping any existing 'bag' row first keeps the cell idempotent when it is re-run.
bag_row = pd.Series(s,name='bag').to_frame().T
results_stats = pd.concat([results_stats.drop(index='bag',errors='ignore'),bag_row])

In [None]:
# Show the stats-only score table, now including the 'bag' row added by the previous cell.
results_stats

There was no improvement in scores with bagging, and all estimators are less successful than the reduced feature set from Boruta. Next, I compared the misclassified results from the stats-only subset of features versus the reduced feature set.

In [None]:
# Rebuild the hold-out split and score it with both bagged estimators (reduced features and stats only).
X_train,X_test,y_train,y_test = get_user_split_data(gs_draws,'gs_tourney_id','qf')
X_test = X_test.drop(columns=['gs_tourney_id'])
unique_tourney_ids = gs_draws.iloc[X_test.index].gs_tourney_id.unique()

reduced_proba = pd.Series(bag_clf['precision'].predict_proba(X_test[feat_subset])[:,1],name='predict_proba')
stats_proba = pd.Series(bag_stats['precision'].predict_proba(X_test[stats_subset])[:,1],name='stats_predict_proba')

Xy_test_with_predictions = (
    gs_draws.iloc[X_test.index]
    .join(gs_draws_names)
    .reset_index()
    .join(reduced_proba)
    .join(stats_proba)
    .sort_values(by=['gs_tourney_id','gs_player_seed'],ascending=True)
)

# Seed baseline: the first 16 rows of each tournament in seed order are predicted to qualify.
# Counting positions per tournament gives the same result as a fixed 16-True / 112-False pattern
# for 128-row draws, but does not break when a tournament has a different number of rows.
seed_position = Xy_test_with_predictions.groupby('gs_tourney_id').cumcount()
Xy_test_with_predictions.insert(3,'seed_predict',seed_position < 16)


In [None]:
# Flag the tie-inclusive top 16 per tournament for both probability columns.
# A row is selected when fewer than 16 rows of the same tournament have a strictly higher
# probability (descending 'min' rank <= 16); this cannot run out of candidates when a
# tournament has fewer than 16 rows. Dropping earlier flag columns first lets the cell be
# re-run without insert() failing.
Xy_test_with_predictions = (
    Xy_test_with_predictions
    .drop(columns=['predict','stats_predict'],errors='ignore')
    .sort_values(by=['gs_tourney_id','predict_proba'],ascending=[True,False])
)
reduced_rank = Xy_test_with_predictions.groupby('gs_tourney_id')['predict_proba'].rank(method='min',ascending=False)
Xy_test_with_predictions.insert(3,'predict',reduced_rank <= 16)

# Re-sort by the stats-only probability; this is the row order the frame is left in.
Xy_test_with_predictions = Xy_test_with_predictions.sort_values(by=['gs_tourney_id','stats_predict_proba'],ascending=[True,False])
stats_rank = Xy_test_with_predictions.groupby('gs_tourney_id')['stats_predict_proba'].rank(method='min',ascending=False)
Xy_test_with_predictions.insert(3,'stats_predict',stats_rank <= 16)


In [None]:
# Coefficients of the precision-tuned stats-only logistic classifier paired with their
# feature names, ordered by decreasing absolute value.
stats_logit_coefs = stats_fittings['logit']['clf']['precision']['classification'].coef_.flatten()
[(stats_logit_coefs[x],stats_subset[x]) for x in abs(stats_logit_coefs).argsort()][::-1]

In [None]:
# Coefficients of the precision-tuned reduced-feature logistic classifier paired with their
# feature names, ordered by decreasing absolute value.
reduced_logit_coefs = logit_clf['precision']['classification'].coef_.flatten()
[(reduced_logit_coefs[x],feat_subset[x]) for x in abs(reduced_logit_coefs).argsort()][::-1]

Looking at the feature importance values, in the absence of the seed/rank/points terms, the remaining features from the reduced set are of greater weight here. I then plotted histograms of the false positives and false negatives for each estimator (stats only, the reduced feature set, and seeds only) and found that while the stats only prediction was able to pick out 3 additional players outside the top 16 seeds who made it to the R16, it did so at the expense of many false positives that the other two feature sets did not misclassify.

In [None]:
# Seed histograms of the false negatives (qf == True but not predicted) for each prediction column.
seed_bins = list(range(1,82,4))
is_qualifier = Xy_test_with_predictions.qf == True
layers = [
    ('stats_predict','stats only',{'ec':'black','fc':'blue','alpha':0.5}),
    ('predict','reduced features',{'ec':'black','fc':'green','alpha':0.5}),
    ('seed_predict','seeds',{'fill':False,'linewidth':2,'ec':'black','alpha':0.5,'hatch':'x'}),
]
for flag_col,layer_label,layer_kws in layers:
    selected = is_qualifier & (Xy_test_with_predictions[flag_col] == False)
    ax = sns.distplot(Xy_test_with_predictions[selected]['gs_player_seed'],hist_kws=layer_kws,label=layer_label,kde=False,bins=seed_bins)

ax.set_title('R16 qualifying false negative predictions',fontdict={'fontsize':16})
ax.figure.set_size_inches(8,5)
ax.set_xlabel('player seed',fontsize=14)
ax.set_ylabel('count',fontsize=14)
ax.tick_params(labelsize=14)
ax.legend()
ax.legend_.prop.set_size(12);


In [None]:
# Write the figure behind `ax` (set by the plotting cell above) to a JPEG in the working directory.
ax.figure.savefig('false_negatives.jpg')

In [None]:
# Seed histograms of the false positives (predicted but qf == False) for each prediction column.
seed_bins = list(range(1,82,4))
not_qualifier = Xy_test_with_predictions.qf == False
layers = [
    ('stats_predict','stats only',{'ec':'black','fc':'blue','alpha':0.5}),
    ('predict','reduced features',{'fc':'green','ec':'black','alpha':0.5}),
    ('seed_predict','seeds',{'fill':False,'linewidth':2,'ec':'black','alpha':0.5,'hatch':'/'}),
]
for flag_col,layer_label,layer_kws in layers:
    selected = not_qualifier & (Xy_test_with_predictions[flag_col] == True)
    ax = sns.distplot(Xy_test_with_predictions[selected]['gs_player_seed'],hist_kws=layer_kws,label=layer_label,kde=False,bins=seed_bins)

ax.set_title('R16 qualifying false positive predictions',fontdict={'fontsize':16})
ax.figure.set_size_inches(8,5)
ax.set_xlabel('player seed',fontsize=14)
ax.set_ylabel('count',fontsize=14)
ax.tick_params(labelsize=14)
ax.legend()
ax.legend_.prop.set_size(12);

In [None]:
# Write the figure behind `ax` (set by the plotting cell above) to a JPEG in the working directory.
ax.figure.savefig('false_positives.jpg')

In [None]:
# Seed histograms of the true positives (predicted and qf == True) for each prediction column.
seed_bins = list(range(1,54,4))
is_qualifier = Xy_test_with_predictions.qf == True
layers = [
    ('stats_predict','stats only',{'ec':'black','fc':'blue','alpha':0.5}),
    ('predict','reduced features',{'fc':'green','ec':'black','alpha':0.5}),
    ('seed_predict','seeds',{'fill':False,'linewidth':2,'ec':'black','alpha':0.5,'hatch':'/'}),
]
for flag_col,layer_label,layer_kws in layers:
    selected = is_qualifier & (Xy_test_with_predictions[flag_col] == True)
    ax = sns.distplot(Xy_test_with_predictions[selected]['gs_player_seed'],hist_kws=layer_kws,label=layer_label,kde=False,bins=seed_bins)

ax.set_title('R16 qualifying true positive predictions',fontdict={'fontsize':16})
ax.figure.set_size_inches(8,5)
ax.set_xlabel('player seed',fontsize=14)
ax.set_ylabel('count',fontsize=14)
ax.tick_params(labelsize=14)
ax.legend()
ax.legend_.prop.set_size(12);

#### 2020 Australian Open

In [None]:
# Score the 2020 draw with both bagged estimators (reduced features and stats only).
reduced_proba_2020 = pd.Series(bag_clf['precision'].predict_proba(gs_draws_2020[feat_subset])[:,1],name='predict_proba')
stats_proba_2020 = pd.Series(bag_stats['precision'].predict_proba(gs_draws_2020[stats_subset])[:,1],name='stats_predict_proba')

Xy_2020 = (
    gs_draws_2020[['gs_player_seed','qf']]
    .join(gs_draws_names_2020)
    .reset_index(drop=True)
    .join(reduced_proba_2020)
    .join(stats_proba_2020)
    .sort_values(by='gs_player_seed',ascending=True)
)

# Seed baseline: the first 16 rows in seed order are predicted to qualify. Building the flags
# from the frame length gives the same result for a 128-row draw, without raising on a length
# mismatch if the draw has a different number of rows.
Xy_2020.insert(3,'seed_predict',np.arange(len(Xy_2020)) < 16)
# Placeholder flag columns, filled in by the next cell.
Xy_2020.insert(3,'predict',False)
Xy_2020.insert(3,'stats_predict',False)
Xy_2020 = Xy_2020.sort_values(by='predict_proba',ascending=False).reset_index(drop=True)

The seeded predictions outperformed my estimator on both precision and recall. Both outperformed the estimator without the seeds, points, and rankings.

In [None]:
# Flag the tie-inclusive top 16 for each probability column: a row is selected when fewer than
# 16 rows have a strictly higher probability (descending 'min' rank <= 16). This cannot run out
# of candidates when the frame has fewer than 16 rows.
Xy_2020['predict'] = Xy_2020.predict_proba.rank(method='min',ascending=False) <= 16
Xy_2020['stats_predict'] = Xy_2020.stats_predict_proba.rank(method='min',ascending=False) <= 16

# Precision and recall of each set of predictions against the actual qf outcome.
print('reduced feature set estimator results: precision=',precision_score(Xy_2020.qf,Xy_2020.predict),'recall=',recall_score(Xy_2020.qf,Xy_2020.predict))
print('stats estimator results: precision=',precision_score(Xy_2020.qf,Xy_2020.stats_predict),'recall=',recall_score(Xy_2020.qf,Xy_2020.stats_predict))
print('seed results: precision=',precision_score(Xy_2020.qf,Xy_2020.seed_predict),'recall=',recall_score(Xy_2020.qf,Xy_2020.seed_predict))