In [1]:
import pandas as pd
import numpy as np
import os
import shutil
from collections import OrderedDict
from datetime import datetime, timedelta
import time
import math
import random

from joblib import dump, load
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import confusion_matrix

from scipy.stats import rankdata
from tabulate import tabulate

# Get All Input Files

In [2]:
# Directory that holds the .DRF input files to score
input_path = './predict_inputs/'
# os.listdir returns entries in arbitrary order; sort so the files (and therefore
# the rows built from them) are always processed in the same order
input_files = sorted(input_path + name for name in os.listdir(input_path) if name.endswith('DRF'))
print(input_files)

['./predict_inputs/PIM0518.DRF']


# Add Number of Entrants to Input File

In [3]:
def add_entrants(input_file):
    '''
        Count the rows (entrants) of each race and append that count to every row
        of the race as a new last column. The file is rewritten in place.
        
        If the file's last column already holds exactly these counts (a previous
        run appended them), the file is left untouched, so re-running the notebook
        does not keep adding columns.
        
        Args:
            input_file (string): path to a headerless CSV input file; column 2 is
                used as the race number
            
        Returns:
            Nothing
    '''
    # Load file
    input_df = pd.read_csv(input_file, header=None)
    
    # Column holding the race number
    race_col = 2
    
    # Rows per race number, broadcast back to every row of that race. Kept as
    # float so the values are written the same way as before (e.g. 16.0)
    race_sizes = input_df[race_col].value_counts()
    entrants = input_df[race_col].map(race_sizes).astype(float)
    
    # Skip the rewrite when a previous run already appended these counts
    last_col = input_df.columns.max()
    previous = pd.to_numeric(input_df[last_col], errors='coerce')
    if np.array_equal(previous.values, entrants.values):
        return
    
    # Append as last column and save back to file
    input_df[last_col + 1] = entrants
    input_df.to_csv(input_file, header=False, index=False)
    
# Append the per-race entrant count to every input file found above
for drf_path in input_files:
    add_entrants(drf_path)

# Parse Files for Input Features

In [4]:
# Maps column positions in the input file to feature names.
# NOTE(review): master_df's column order follows this mapping's iteration order, and
# the feature vector handed to the models is later built from master_df.columns --
# confirm the resulting order matches the order the models were trained with.
input_map = OrderedDict({
        1: 'date',
        2: 'race_num',
        3: 'post_pos',
        1435: 'num_entrants',
        44: 'horse_name',
        42: 'entry',
        855: 'last_speed_rating',
        216: 'speed_par',
        33: 'app_weight_alw',
        64: 'starts_at_dist',
        96: 'lt_starts',
        100: 'lt_earnings',
        43: 'ml_odds',
        28: 'trainer_starts',
        29: 'trainer_wins',
        34: 'jockey_starts',
        35: 'jockey_wins'
    })

# Columns to read and the names to give them (identical for every file)
input_cols = [k for k in input_map.keys()]
input_names = [input_map[col] for col in input_cols]

# Iterate through all input files, collecting one frame per file
input_frames = []
for file in input_files:
    # Open files to dataframe -- Take only columns that are necessary
    input_tmp = pd.read_csv(file, header=None)[input_cols]
    
    # Rename cols
    input_tmp.columns = input_names
    
    # Get Track Code (first three characters of the file name)
    track_code = os.path.basename(file)[:3]
    input_tmp['track_code'] = track_code
    
    input_frames.append(input_tmp)

# Concatenate once instead of growing the frame with DataFrame.append inside the
# loop (append is deprecated and was removed in pandas 2.0)
master_df = pd.concat(input_frames) if input_frames else pd.DataFrame()
    
master_df.head()

Unnamed: 0,starts_at_dist,date,race_num,post_pos,lt_earnings,app_weight_alw,lt_starts,entry,ml_odds,horse_name,jockey_starts,jockey_wins,last_speed_rating,speed_par,num_entrants,trainer_starts,trainer_wins,track_code
0,1,20190518,1,1,19410,,8,1,12.0,SHELLY ISLAND,9,1,92.0,78,16.0,5,1,PIM
1,1,20190518,1,2,7371,,1,2,4.0,FRIENDLY FIRE,6,0,89.0,78,16.0,8,0,PIM
2,0,20190518,1,3,3760,,5,3,30.0,BROOKLYN SARDY,1,0,70.0,78,16.0,1,0,PIM
3,1,20190518,1,4,7388,,7,4,10.0,NO WORRIES MATE,8,3,59.0,78,16.0,4,1,PIM
4,0,20190518,1,5,1080,,1,5,20.0,WILLIE THE WHALE,7,0,69.0,78,16.0,1,0,PIM


# Derive Additional Features

In [5]:
def get_jockey_win_pct(df):
    '''
        Replace the jockey win/start counts with a single win percentage column
        
        Args:
            df (pd.DataFrame): frame with 'jockey_wins' and 'jockey_starts' columns
            
        Returns:
            (pd.DataFrame) new frame with 'jockey_win_pct' appended and the two
            count columns removed; df itself is not modified
    '''
    # Zero starts become NaN (not inf) so a bad row cannot distort later statistics
    starts = df['jockey_starts'].where(df['jockey_starts'] != 0)
    win_pct = df['jockey_wins'] / starts
    return df.assign(jockey_win_pct=win_pct).drop(['jockey_wins', 'jockey_starts'], axis=1)

def get_trainer_win_pct(df):
    '''
        Replace the trainer win/start counts with a single win percentage column
        
        Args:
            df (pd.DataFrame): frame with 'trainer_wins' and 'trainer_starts' columns
            
        Returns:
            (pd.DataFrame) new frame with 'trainer_win_pct' appended and the two
            count columns removed; df itself is not modified
    '''
    # Zero starts become NaN (not inf) so a bad row cannot distort later statistics
    starts = df['trainer_starts'].where(df['trainer_starts'] != 0)
    win_pct = df['trainer_wins'] / starts
    return df.assign(trainer_win_pct=win_pct).drop(['trainer_wins', 'trainer_starts'], axis=1)


# Swap the raw win/start counts for jockey and trainer win percentages, then
# zero-fill whatever NaNs remain in the frame
master_df = get_trainer_win_pct(get_jockey_win_pct(master_df)).fillna(value=0)
master_df.head()

Unnamed: 0,starts_at_dist,date,race_num,post_pos,lt_earnings,app_weight_alw,lt_starts,entry,ml_odds,horse_name,last_speed_rating,speed_par,num_entrants,track_code,jockey_win_pct,trainer_win_pct
0,1,20190518,1,1,19410,0.0,8,1,12.0,SHELLY ISLAND,92.0,78,16.0,PIM,0.111111,0.2
1,1,20190518,1,2,7371,0.0,1,2,4.0,FRIENDLY FIRE,89.0,78,16.0,PIM,0.0,0.0
2,0,20190518,1,3,3760,0.0,5,3,30.0,BROOKLYN SARDY,70.0,78,16.0,PIM,0.0,0.0
3,1,20190518,1,4,7388,0.0,7,4,10.0,NO WORRIES MATE,59.0,78,16.0,PIM,0.375,0.25
4,0,20190518,1,5,1080,0.0,1,5,20.0,WILLIE THE WHALE,69.0,78,16.0,PIM,0.0,0.0


# Get Past Performance Data

In [6]:
# Map for past performance fields
# Key columns: column position -> name. These identify a horse's entry and are the
# columns the past performance frame is later merged back on.
input_keys = OrderedDict({
    1: 'date',
    2: 'race_num',
    44: 'horse_name',
})

# Past performance fields: column position of the most recent value -> base name.
# get_past_performance_data adds 0..num_races-1 to every position here and appends
# the same number to the name (e.g. 615 -> 'pp_finish_pos_0', 616 -> 'pp_finish_pos_1').
# NOTE(review): that offset is applied to 'todays_distance' as well, so
# 'todays_distance_1'/'todays_distance_2' are read from the columns after position 5;
# only 'todays_distance_0' is used by the derivations below -- confirm this is intended.
input_pp_map = OrderedDict({
    615 : 'pp_finish_pos',
    345 : 'pp_num_entrants',
    5: 'todays_distance',
    315: 'pp_distance',
    #535 : 'pp_race_class',
    255 : 'pp_race_date',
    1045: 'pp_claimed',
    1125: 'pp_favorite',
    605: 'pp_stretch_pos',
    113: 'pp_workout_time',
    465: 'pp_winners_margin'
})

def get_past_performance_data(input_files, num_races=3):
    '''
        Go through input files and get past performance data
        
        Args: 
            input_files (list): list of input file names 
            num_races (int): number of races back to grab data for
            
        Returns:
            pd.DataFrame with past performance data for each horse. Rows from all
            files are stacked and labelled 0..n-1, so every row has a unique index
            label. An empty frame is returned when input_files is empty.
    '''
    # Create new mapping for this number of races: every past performance field is
    # read from consecutive columns, one per race back (suffix 0 is the first)
    updated_pp_map = OrderedDict({k+race:'{}_{}'.format(v, race) for k,v in input_pp_map.items() for race in range(num_races)})
    
    # Only include key/past performance columns
    incl_cols = list(input_keys.keys()) + list(updated_pp_map.keys())
    
    # Column position -> name for every included column. Key names go in first so
    # a past performance name wins if a position appears in both maps
    col_names = dict(input_keys)
    col_names.update(updated_pp_map)
    
    frames = []
    for file in input_files:
        # Open input file to df, keeping only the included columns
        df = pd.read_csv(file, header=None)[incl_cols]
        
        # Rename columns
        df.columns = [col_names[col] for col in df.columns]
        
        frames.append(df)
    
    if not frames:
        return pd.DataFrame()
    
    # Concatenate once (DataFrame.append is deprecated and removed in pandas 2.0).
    # ignore_index gives rows unique labels across files, which the label-based
    # assignments in the derivation steps rely on
    return pd.concat(frames, ignore_index=True)

# Number of most recent races to pull past performance data for
num_races = 3
pp_df = get_past_performance_data(input_files, num_races=num_races)
pp_df.head()

Unnamed: 0,date,race_num,horse_name,pp_race_date_1,pp_race_date_2,todays_distance_0,todays_distance_1,todays_distance_2,pp_winners_margin_0,pp_winners_margin_1,...,pp_finish_pos_1,pp_finish_pos_2,pp_favorite_2,pp_workout_time_0,pp_workout_time_1,pp_workout_time_2,pp_distance_0,pp_distance_1,pp_distance_2,pp_race_date_0
0,20190518,1,SHELLY ISLAND,20181018.0,20180928.0,1100,T,,0.25,2.0,...,7.0,3.0,0.0,62.6,49.6,50.8,1210.0,1760.0,1100.0,20190425.0
1,20190518,1,FRIENDLY FIRE,,,1100,T,,0.13,,...,,,,61.8,62.0,63.0,1210.0,,,20190425.0
2,20190518,1,BROOKLYN SARDY,20190120.0,20181208.0,1100,T,,0.25,5.25,...,10.0,9.0,0.0,51.6,39.4,63.2,1210.0,1760.0,1210.0,20190216.0
3,20190518,1,NO WORRIES MATE,20181017.0,20180927.0,1100,T,,8.0,0.5,...,3.0,6.0,0.0,49.0,50.0,49.6,1320.0,1540.0,1760.0,20181208.0
4,20190518,1,WILLIE THE WHALE,,,1100,T,,5.75,,...,,,,48.2,49.0,49.6,1320.0,,,20190329.0


# Clean Past Performance Data

In [7]:
def clean_finish_pos_cols(pp_df, num_races=3):
    '''
        Convert the past finish position columns to integers
        
        Values that cannot be converted (missing or non-numeric entries) are
        replaced by 92, the placeholder used for "did not finish"
        
        Args:
            pp_df (pd.DataFrame): past performance dataframe
            num_races (int): number of races for which we have past performances
            
        Returns:
            (pd.DataFrame) pp_df, with the finish position columns converted in place
    '''
    # Placeholder position for a race without a usable finish position
    did_not_finish = 92
    
    def clean_val(val):
        # If value is numeric, return its integer rep
        try:
            return int(val)
        except (TypeError, ValueError, OverflowError):
            # NaN, infinity, None or non-numeric text
            return did_not_finish
    
    # Clean columns
    for race in range(num_races):
        col = 'pp_finish_pos_{}'.format(race)
        pp_df[col] = pp_df[col].apply(clean_val)
    
    return pp_df
    
def clean_stretch_pos_cols(pp_df, num_races=3):
    '''
        Convert the past stretch position columns to integers
        
        Values that cannot be converted (missing or non-numeric entries) are
        replaced by 92, the placeholder used for "did not finish"
        
        Args:
            pp_df (pd.DataFrame): past performance dataframe
            num_races (int): number of races for which we have past performances
            
        Returns:
            (pd.DataFrame) pp_df, with the stretch position columns converted in place
    '''
    # Placeholder position for a race without a usable stretch position
    did_not_finish = 92
    
    def clean_val(val):
        # If value is numeric, return its integer rep
        try:
            return int(val)
        except (TypeError, ValueError, OverflowError):
            # NaN, infinity, None or non-numeric text
            return did_not_finish
    
    # Clean columns
    for race in range(num_races):
        col = 'pp_stretch_pos_{}'.format(race)
        pp_df[col] = pp_df[col].apply(clean_val)
    
    return pp_df
    
def clean_favorite_cols(pp_df, num_races=3):
    '''
        Convert the past favorite flag columns to integers
        
        Missing values, and values that are not numeric at all, become 0
        
        Args:
            pp_df (pd.DataFrame): past performance dataframe
            num_races (int): number of races for which we have past performances
            
        Returns:
            (pd.DataFrame) pp_df, with the favorite columns converted in place
    '''
    for race in range(num_races):
        col = 'pp_favorite_{}'.format(race)
        # Coerce first so stray text turns into NaN instead of raising
        flags = pd.to_numeric(pp_df[col], errors='coerce')
        pp_df[col] = flags.fillna(0).astype(int)
        
    return pp_df
    
    
def clean_pp_df(pp_df, num_races=3):
    '''
        Run every column cleaner over the past performance dataframe
        
        Args:
            pp_df (pd.DataFrame): dataframe to clean
            num_races (int): number of races for which we have past performances
            
        Returns:
            (pd.DataFrame) cleaned version of pp_df
    '''
    # Applied in this order: finish positions, stretch positions, favorite flags
    cleaners = (clean_finish_pos_cols, clean_stretch_pos_cols, clean_favorite_cols)
    for cleaner in cleaners:
        pp_df = cleaner(pp_df, num_races)
    
    return pp_df

# Clean the position and favorite columns for the races that were loaded
pp_df = clean_pp_df(pp_df, num_races=num_races)
pp_df.head(10)

Unnamed: 0,date,race_num,horse_name,pp_race_date_1,pp_race_date_2,todays_distance_0,todays_distance_1,todays_distance_2,pp_winners_margin_0,pp_winners_margin_1,...,pp_finish_pos_1,pp_finish_pos_2,pp_favorite_2,pp_workout_time_0,pp_workout_time_1,pp_workout_time_2,pp_distance_0,pp_distance_1,pp_distance_2,pp_race_date_0
0,20190518,1,SHELLY ISLAND,20181018.0,20180928.0,1100,T,,0.25,2.0,...,7,3,0,62.6,49.6,50.8,1210.0,1760.0,1100.0,20190425.0
1,20190518,1,FRIENDLY FIRE,,,1100,T,,0.13,,...,92,92,0,61.8,62.0,63.0,1210.0,,,20190425.0
2,20190518,1,BROOKLYN SARDY,20190120.0,20181208.0,1100,T,,0.25,5.25,...,10,9,0,51.6,39.4,63.2,1210.0,1760.0,1210.0,20190216.0
3,20190518,1,NO WORRIES MATE,20181017.0,20180927.0,1100,T,,8.0,0.5,...,3,6,0,49.0,50.0,49.6,1320.0,1540.0,1760.0,20181208.0
4,20190518,1,WILLIE THE WHALE,,,1100,T,,5.75,,...,92,92,0,48.2,49.0,49.6,1320.0,,,20190329.0
5,20190518,1,FERNWOOD DRIVE,20190308.0,,1100,T,,0.25,0.5,...,6,92,0,46.6,48.6,-48.0,1320.0,1320.0,,20190425.0
6,20190518,1,SECOND ENCORE,20190413.0,20190222.0,1100,T,,0.13,6.25,...,7,8,0,49.0,36.4,50.6,1210.0,1320.0,1760.0,20190425.0
7,20190518,1,TURNSTYLE,20180706.0,20170714.0,1100,T,,0.06,2.0,...,5,2,0,61.2,49.0,50.0,1320.0,1540.0,1100.0,20180727.0
8,20190518,1,CYCLOBOMB,20190316.0,20190127.0,1100,T,,2.5,1.25,...,2,6,0,61.8,75.6,62.0,1760.0,1540.0,1320.0,20190418.0
9,20190518,1,TOP HAT HUSTLE,,,1100,T,,,,...,92,92,0,50.0,37.0,37.2,,,,


# Derive Metrics with Cleaned Data

In [8]:
def get_improved_finish_pos(pp_df, num_races=3):
    '''
        Calculate if horse is improving its finish position over the past races
        
        A horse counts as improved when its most recent finish position is lower
        than the average of its earlier ones. The pp_finish_pos_* columns are
        expected to hold fractional positions (finish position / entrants): any
        value above 1 is treated as a race the horse did not finish, and a horse
        with such a value, or with a missing value, is never marked as improved.
        
        Args:
            pp_df (pd.DataFrame): past performance dataframe
            num_races (int): number of races for which we have past performances
            
        Returns:
            (pd.DataFrame) an updated version of pp_df with a column for improved
            finish position
    '''
    # Get past finish pos cols, most recent race first
    finish_pos_cols = ['pp_finish_pos_{}'.format(race) for race in range(num_races)]
    finish_pos = pp_df[finish_pos_cols].astype(float)
    
    # Only horses with a known, finished (<= 1) result in every race qualify
    usable = finish_pos.notnull().all(axis=1) & (finish_pos <= 1).all(axis=1)
    
    # Determine if last finish was better than previous average finish
    most_recent = finish_pos[finish_pos_cols[0]]
    avg_prev_finish = finish_pos[finish_pos_cols[1:]].mean(axis=1)
    improved = usable & (most_recent < avg_prev_finish)
    
    # Assign by position so the result lines up whatever index pp_df carries
    pp_df['imp_finish_pos'] = improved.values
    
    return pp_df

def get_recent_race(pp_df, threshold=21):
    '''
        Determine if a horse has had a previous race within threshold days
        
        Args:
            pp_df (pd.DataFrame): past performance dataframe; 'date' and
                'pp_race_date_0' hold dates as YYYYMMDD numbers
            threshold (int): days to look back
            
        Returns:
            (pd.DataFrame) updated pp_df containing bool column stating whether
            each horse had a recent race or not
    '''
    
    def to_dates(col):
        # Missing dates are set to 1970-01-01 before parsing
        as_int = pd.to_numeric(col).fillna(19700101).astype(int)
        return pd.to_datetime(as_int.astype(str), format='%Y%m%d')
    
    today = to_dates(pp_df['date'])
    last_race = to_dates(pp_df['pp_race_date_0'])
    
    # Recent when the last race falls on or after the cutoff date
    cutoff = today - timedelta(days=threshold)
    pp_df['recent_race'] = (cutoff <= last_race).values
    
    return pp_df
    
def get_past_finish_pos(pp_df, num_races=3):
    '''
        Calculate previous finish positions as finish_pos/num_entrants
        
        The untouched finish position is kept in a 'raw_' prefixed copy of each
        column. When that copy already exists it is used as the numerator, so
        running this step again gives the same result instead of dividing twice.
        
        Args:
            pp_df (pd.DataFrame) past performance dataframe
            num_races (int): number of races for which we have past performance
        
        Returns:
            (pd.DataFrame) updated pp_df with finish positions calculated
    '''
    for race in range(num_races):
        f_col = 'pp_finish_pos_{}'.format(race)
        e_col = 'pp_num_entrants_{}'.format(race)
        raw_col = 'raw_' + f_col
        
        # Save a raw copy of finish position the first time through only
        if raw_col not in pp_df.columns:
            pp_df[raw_col] = pp_df[f_col]
        
        # Divide finish pos by num entrants
        pp_df[f_col] = pp_df[raw_col] / pp_df[e_col]
    
    return pp_df

def get_claimed_in_past(pp_df, num_races=3):
    '''
        Flag horses that were claimed in any of the past num_races
        
        Args:
            pp_df (pd.DataFrame): past performance dataframe
            num_races (int): number of races for which we have past performances
            
        Returns:
            (pd.DataFrame) pp_df with a boolean 'was_claimed' column
    '''
    claimed_cols = ['pp_claimed_{}'.format(race) for race in range(num_races)]
    
    # A 'c' in any of the claimed columns marks a claim in that race
    claim_flags = pp_df[claimed_cols].isin(['c'])
    pp_df['was_claimed'] = claim_flags.any(axis=1)
    
    return pp_df

def was_favorite(pp_df, num_races=3):
    '''
        Flag horses that were the favorite in any of their recent races
        
        Args:
            pp_df (pd.DataFrame): past performance dataframe
            num_races (int): number of races for which we have past performances
            
        Returns:
            (pd.DataFrame) pp_df with a boolean 'was_favorite' column
    '''
    favorite_cols = ['pp_favorite_{}'.format(race) for race in range(num_races)]
    
    # A 1 in any favorite column means the horse was favorite in that race
    favorite_hits = pp_df[favorite_cols] == 1
    pp_df['was_favorite'] = favorite_hits.any(axis=1)
    
    return pp_df

def improved_stretch_pos(pp_df, num_races=3):
    '''
        Determine if a horse improved its finish position down the stretch in the most
        recent race
        
        The stretch position is divided by the number of entrants and compared
        directly with 'pp_finish_pos_0', so that column is expected to already
        hold the fractional finish position (finish position / entrants).
        
        Args:
            pp_df (pd.DataFrame): past performance dataframe
            num_races (int): number of races for which we have past performances
                (not used; only the most recent race is looked at)
                
        Returns:
            (pd.DataFrame) pp_df with a boolean 'improved_stretch_pos' column
    '''
    # Get fractional stretch position
    stretch_frac = pp_df['pp_stretch_pos_0'] / pp_df['pp_num_entrants_0']
    
    # Determine if stretch pos > finish pos
    pp_df['improved_stretch_pos'] = stretch_frac > pp_df['pp_finish_pos_0']
    
    return pp_df

def had_bullet_workout(pp_df, num_races=3):
    '''
        Flag horses whose most recent workout was a bullet, i.e. whose
        'pp_workout_time_0' value is negative
        
        Args:
            pp_df (pd.DataFrame): past performance dataframe
            num_races (int): number of races for which we have past performances
                (not used; only the first workout column is looked at)
                
        Returns:
            (pd.DataFrame) pp_df with a boolean 'had_bullet' column
    '''
    last_workout = pp_df['pp_workout_time_0']
    pp_df['had_bullet'] = last_workout.lt(0)
    
    return pp_df

def won_by_margin(pp_df, num_races=3, margin=5):
    '''
        Determine if a horse won last race by at least margin lengths
        
        Args:
            pp_df (pd.DataFrame): past performance dataframe; needs the
                'raw_pp_finish_pos_0' and 'pp_winners_margin_0' columns
            num_races (int): number of races for which we have past performances
                (not used; only the last race is looked at)
            margin (float): number of lengths by which horse had to win
            
        Returns:
            (pd.DataFrame) updated version of pp_df
    '''
    # Winners of their last race ...
    won_last = pp_df['raw_pp_finish_pos_0'] == 1
    # ... whose winning margin was at least margin lengths
    big_margin = pp_df['pp_winners_margin_0'] >= margin
    
    # Row-wise mask assigned by position, so repeated index labels cannot flag
    # the wrong rows
    pp_df['won_by_margin'] = (won_last & big_margin).values
    
    return pp_df

def won_at_similar_distance(pp_df, num_races=3):
    '''
        Determine if a horse won a recent race at the same distance as today's race
        
        Args:
            pp_df (pd.DataFrame): past performance dataframe; needs
                'todays_distance_0' plus the 'pp_distance_*' and
                'raw_pp_finish_pos_*' columns for each race
            num_races (int): number of races for which we have past performances
            
        Returns:
            (pd.DataFrame) updated version of pp_df
    '''
    today_dist = pp_df['todays_distance_0']
    
    # Accumulate a positional mask over the recent races so repeated index
    # labels cannot flag the wrong rows
    won = np.zeros(len(pp_df), dtype=bool)
    for race in range(num_races):
        same_dist = pp_df['pp_distance_{}'.format(race)] == today_dist
        finished_first = pp_df['raw_pp_finish_pos_{}'.format(race)] == 1
        won |= (same_dist & finished_first).values
        
    # True for horses that won at today's distance in any recent race
    pp_df['won_at_similar_dist'] = won

    return pp_df
    
def derive_pp_metrics(pp_df, num_races=3):
    '''
        Run every feature derivation over the past performance dataframe
        
        The order matters: get_past_finish_pos runs first because it creates the
        'raw_pp_finish_pos_*' columns and turns 'pp_finish_pos_*' into fractions,
        both of which later steps read.
        
        Args:
            pp_df (pd.DataFrame): cleaned past performance dataframe
            num_races (int): number of races for which we have past performances
            
        Returns:
            (pd.DataFrame) pp_df with the derived feature columns added
    '''
    # Past Finish Positions
    pp_df = get_past_finish_pos(pp_df, num_races)
    # Improved Finish Position
    pp_df = get_improved_finish_pos(pp_df, num_races)
    # Recent race
    pp_df = get_recent_race(pp_df, 21)
    # Claimed in Past
    pp_df = get_claimed_in_past(pp_df, num_races)
    # Favorite in past
    pp_df = was_favorite(pp_df, num_races)
    # Improved stretch pos
    pp_df = improved_stretch_pos(pp_df, num_races)
    # Workout Rating
    pp_df = had_bullet_workout(pp_df, num_races)
    # Won last race by 4+ lengths
    # NOTE(review): won_by_margin defaults to margin=5 and this comment used to say
    # 5+ lengths -- confirm that 4 is the value the models were trained with
    pp_df = won_by_margin(pp_df, num_races, margin=4)
    # Won at similar distance
    pp_df = won_at_similar_distance(pp_df, num_races)    
    
    return pp_df
    
# Pass num_races explicitly so this step uses the same look-back as the loading
# and cleaning steps instead of silently falling back to the default
pp_df = derive_pp_metrics(pp_df, num_races)
pp_df.head()

Unnamed: 0,date,race_num,horse_name,pp_race_date_1,pp_race_date_2,todays_distance_0,todays_distance_1,todays_distance_2,pp_winners_margin_0,pp_winners_margin_1,...,raw_pp_finish_pos_1,raw_pp_finish_pos_2,imp_finish_pos,recent_race,was_claimed,was_favorite,improved_stretch_pos,had_bullet,won_by_margin,won_at_similar_dist
0,20190518,1,SHELLY ISLAND,20181018.0,20180928.0,1100,T,,0.25,2.0,...,7,3,True,False,False,False,True,False,False,False
1,20190518,1,FRIENDLY FIRE,,,1100,T,,0.13,,...,92,92,False,False,False,False,False,False,False,False
2,20190518,1,BROOKLYN SARDY,20190120.0,20181208.0,1100,T,,0.25,5.25,...,10,9,False,False,False,False,False,False,False,False
3,20190518,1,NO WORRIES MATE,20181017.0,20180927.0,1100,T,,8.0,0.5,...,3,6,False,False,False,False,True,False,False,False
4,20190518,1,WILLIE THE WHALE,,,1100,T,,5.75,,...,92,92,False,False,False,False,False,False,False,False


# Clean Fields used in Derivations

In [9]:
def clean_deriv_columns(pp_df, num_races=3):
    '''
        Remove from pp_df every per-race column that only served as an
        intermediate during feature derivation
        
        Args:
            pp_df (pd.DataFrame): past performance dataframe
            num_races (int): number of races for which we have past performances
            
        Returns:
            (pd.DataFrame): a cleaned version of pp_df
    '''
    # Base names of the per-race columns; each exists once per race as <name>_<race>
    prefixes = [
        'pp_claimed',
        'pp_race_date',
        'pp_distance',
        'pp_num_entrants',
        'pp_favorite',
        'pp_finish_pos',
        'pp_winners_margin',
        'raw_pp_finish_pos',
        'pp_stretch_pos',
        'todays_distance',
        'pp_workout_time',
    ]
    drop_cols = ['{}_{}'.format(prefix, race)
                 for prefix in prefixes
                 for race in range(num_races)]
    
    return pp_df.drop(drop_cols, axis=1)

# Keep only the key columns and the derived features
pp_df = clean_deriv_columns(pp_df, num_races=num_races)
pp_df.head()

Unnamed: 0,date,race_num,horse_name,imp_finish_pos,recent_race,was_claimed,was_favorite,improved_stretch_pos,had_bullet,won_by_margin,won_at_similar_dist
0,20190518,1,SHELLY ISLAND,True,False,False,False,True,False,False,False
1,20190518,1,FRIENDLY FIRE,False,False,False,False,False,False,False,False
2,20190518,1,BROOKLYN SARDY,False,False,False,False,False,False,False,False
3,20190518,1,NO WORRIES MATE,False,False,False,False,True,False,False,False
4,20190518,1,WILLIE THE WHALE,False,False,False,False,False,False,False,False


# Merge Past Performance with Input Features

In [10]:
# Attach each horse's derived past performance features to its input row;
# a left join keeps every input row
merge_keys = ['date', 'race_num', 'horse_name']
master_df = master_df.merge(pp_df, how='left', on=merge_keys)
master_df.head()

Unnamed: 0,starts_at_dist,date,race_num,post_pos,lt_earnings,app_weight_alw,lt_starts,entry,ml_odds,horse_name,...,jockey_win_pct,trainer_win_pct,imp_finish_pos,recent_race,was_claimed,was_favorite,improved_stretch_pos,had_bullet,won_by_margin,won_at_similar_dist
0,1,20190518,1,1,19410,0.0,8,1,12.0,SHELLY ISLAND,...,0.111111,0.2,True,False,False,False,True,False,False,False
1,1,20190518,1,2,7371,0.0,1,2,4.0,FRIENDLY FIRE,...,0.0,0.0,False,False,False,False,False,False,False,False
2,0,20190518,1,3,3760,0.0,5,3,30.0,BROOKLYN SARDY,...,0.0,0.0,False,False,False,False,False,False,False,False
3,1,20190518,1,4,7388,0.0,7,4,10.0,NO WORRIES MATE,...,0.375,0.25,False,False,False,False,True,False,False,False
4,0,20190518,1,5,1080,0.0,1,5,20.0,WILLIE THE WHALE,...,0.0,0.0,False,False,False,False,False,False,False,False


# Convert Boolean Columns to Binary

In [11]:
# Derived True/False features that are turned into 1/0
bool_cols = [
    'imp_finish_pos', 'recent_race', 'was_claimed', 'was_favorite',
    'improved_stretch_pos', 'had_bullet', 'won_by_margin',
    'won_at_similar_dist',
]

for col in bool_cols:
    master_df[col] = master_df[col].map(int)

master_df.head()

Unnamed: 0,starts_at_dist,date,race_num,post_pos,lt_earnings,app_weight_alw,lt_starts,entry,ml_odds,horse_name,...,jockey_win_pct,trainer_win_pct,imp_finish_pos,recent_race,was_claimed,was_favorite,improved_stretch_pos,had_bullet,won_by_margin,won_at_similar_dist
0,1,20190518,1,1,19410,0.0,8,1,12.0,SHELLY ISLAND,...,0.111111,0.2,1,0,0,0,1,0,0,0
1,1,20190518,1,2,7371,0.0,1,2,4.0,FRIENDLY FIRE,...,0.0,0.0,0,0,0,0,0,0,0,0
2,0,20190518,1,3,3760,0.0,5,3,30.0,BROOKLYN SARDY,...,0.0,0.0,0,0,0,0,0,0,0,0
3,1,20190518,1,4,7388,0.0,7,4,10.0,NO WORRIES MATE,...,0.375,0.25,0,0,0,0,1,0,0,0
4,0,20190518,1,5,1080,0.0,1,5,20.0,WILLIE THE WHALE,...,0.0,0.0,0,0,0,0,0,0,0,0


# Standardize Data

In [12]:
def standardize_series(s):
    ''' Center s on its mean and scale it by its standard deviation '''
    centered = s - s.mean()
    spread = s.std()
    return centered / spread

# Columns that are left as they are; every other column is standardized
no_std = ['race_num', 'date', 'horse_name', 'track_code', 'odds', 'top3', 't3_conf', 'win_conf', 'win', 'entry', 'h2h']

# Carry the morning line odds under the name 'odds' (it ends up as the last column)
master_df = master_df.assign(odds=master_df['ml_odds']).drop(['ml_odds'], axis=1)

excluded = set(no_std)
std_cols = [col for col in master_df.columns if col not in excluded]

# NOTE(review): mean and standard deviation come from the rows being scored here;
# confirm this matches how the training data was scaled
for col in std_cols:
    scaled = standardize_series(master_df[col])
    master_df[col] = scaled.fillna(0)

# Load Models from Disk

In [13]:
# Paths of the three saved models
itm_file = './models/itm_gbc.joblib'
win_file = './models/win_gbc.joblib'
h2h_file = './models/h2h_gbc.joblib'

# NOTE: joblib.load unpickles the file, which can run arbitrary code -- only load
# model files from a trusted source
itm_model, win_model, h2h_model = [
    load(model_file) for model_file in (itm_file, win_file, h2h_file)
]

In [14]:
# DO ITM and WIN predictions
# Score every entry in one batch per model instead of one predict_proba call per
# row; the same feature columns, in the same order, are used for both models
features = master_df[std_cols].values

# First predict if the horse will be in top 3
top3_pred = itm_model.predict_proba(features)
# argmax is the column position of the most likely class and max its probability;
# the position is stored as float, as the row-by-row assignment did
master_df['top3'] = np.argmax(top3_pred, axis=1).astype(float)
master_df['t3_conf'] = np.max(top3_pred, axis=1)

# Predict if winner
win_pred = win_model.predict_proba(features)
master_df['win'] = np.argmax(win_pred, axis=1).astype(float)
master_df['win_conf'] = np.max(win_pred, axis=1)
    
master_df.head(20)

Entry: 144/144

Unnamed: 0,starts_at_dist,date,race_num,post_pos,lt_earnings,app_weight_alw,lt_starts,entry,horse_name,last_speed_rating,...,was_favorite,improved_stretch_pos,had_bullet,won_by_margin,won_at_similar_dist,odds,top3,t3_conf,win,win_conf
0,-0.508623,20190518,1,-1.37744,-0.631925,-0.219656,-0.348597,1,SHELLY ISLAND,0.651907,...,-0.806612,1.664362,-0.41176,-0.168443,-0.511205,12.0,0.0,0.631036,0.0,0.519293
1,-0.508623,20190518,1,-1.104229,-0.681367,-0.219656,-1.088356,2,FRIENDLY FIRE,0.499842,...,-0.806612,-0.596658,-0.41176,-0.168443,-0.511205,4.0,0.0,0.69087,0.0,0.768742
2,-0.789242,20190518,1,-0.831018,-0.696197,-0.219656,-0.665637,3,BROOKLYN SARDY,-0.463234,...,-0.806612,-0.596658,-0.41176,-0.168443,-0.511205,30.0,0.0,0.697155,0.0,0.766321
3,-0.508623,20190518,1,-0.557806,-0.681297,-0.219656,-0.454277,4,NO WORRIES MATE,-1.020805,...,-0.806612,1.664362,-0.41176,-0.168443,-0.511205,10.0,0.0,0.532079,1.0,0.588549
4,-0.789242,20190518,1,-0.284595,-0.707203,-0.219656,-1.088356,5,WILLIE THE WHALE,-0.513922,...,-0.806612,-0.596658,-0.41176,-0.168443,-0.511205,20.0,0.0,0.697155,0.0,0.766321
5,-0.789242,20190518,1,-0.011384,-0.699646,5.748347,-0.982676,6,FERNWOOD DRIVE,0.145025,...,1.231144,-0.596658,-0.41176,-0.168443,-0.511205,3.5,0.0,0.560992,0.0,0.570004
6,0.052616,20190518,1,0.261827,-0.683757,-0.219656,0.074123,7,SECOND ENCORE,0.347778,...,-0.806612,1.664362,-0.41176,-0.168443,-0.511205,15.0,0.0,0.602197,0.0,0.709167
7,-0.789242,20190518,1,0.535039,-0.628476,-0.219656,-0.771317,8,TURNSTYLE,0.094336,...,-0.806612,-0.596658,-0.41176,-0.168443,-0.511205,8.0,0.0,0.675818,0.0,0.785179
8,-0.508623,20190518,1,0.80825,-0.610011,2.764346,-0.454277,9,CYCLOBOMB,-0.361858,...,-0.806612,-0.596658,-0.41176,-0.168443,-0.511205,10.0,0.0,0.703405,0.0,0.740457
9,-0.789242,20190518,1,1.081461,-0.711638,3.957946,-1.194036,10,TOP HAT HUSTLE,-4.011411,...,-0.806612,-0.596658,-0.41176,-0.168443,-0.511205,15.0,0.0,0.645555,0.0,0.607449


# Get Horse-to-Horse Comparisons

In [15]:
def find_horse_index(date, race_num, track_code, post_pos):
    '''
        Look up the master_df index label of one horse in one race.

        Reads the notebook-level master_df dataframe.

        Args:
            date: value matched against master_df.date
            race_num: value matched against master_df.race_num
            track_code: value matched against master_df.track_code
            post_pos: value matched against master_df.post_pos

        Returns:
            Index label of the first master_df row matching all four values.
            An IndexError is raised if no row matches.
    '''
    # Narrow down to the race first, then to the horse within that race
    in_race = (master_df.date == date) & \
              (master_df.race_num == race_num) & \
              (master_df.track_code == track_code)
    matching_rows = master_df.loc[in_race & (master_df.post_pos == post_pos)]
    return matching_rows.index[0]


# Get all races
unique_races = master_df.groupby(['date', 'race_num', 'track_code']).size().reset_index()
# Add h2h columns to dataframe
master_df['h2h_count'] = pd.Series(0, index=master_df.index)
master_df['h2h'] = pd.Series('', index=master_df.index)

# Columns of the merged (horse_x vs horse_y) frame that are not fed to the h2h model
cols_to_drop = {'date', 'race_num', 'track_code', 'horse_name_x', 'horse_name_y', 'num_entrants_y',
                'odds_x', 'odds_y', 'entry_x', 'entry_y', 'h2h', 'win_x', 'win_y', 'top3_x', 'top3_y',
                't3_conf_x', 't3_conf_y', 'win_conf_x', 'win_conf_y', 'h2h_count_x', 'h2h_count_y',
                'h2h_x', 'h2h_y'}

# Iterate through each race
for ii, row in unique_races.iterrows():
    print('Race: {}/{}'.format(ii+1, len(unique_races)), end='\r', flush=True)
    # Get race characteristics
    date = row.date
    track_code = row.track_code
    race_num = row.race_num
    # Get master_df slice for this race
    race_df = master_df.loc[(master_df.date == date) &
                           (master_df.race_num == race_num) &
                           (master_df.track_code == track_code)]
    # Only want to do comparisons between horses once. Both loops run over every
    # post position, and a pair is skipped unless h1_pos > h2_pos. This results
    # in each combination of post positions being evaluated exactly once.
    post_positions = sorted(race_df.post_pos)
    for h1_pos in post_positions:
        # Iterate through all other post positions
        for h2_pos in post_positions:
            # Don't compare to self or any positions that have already been evaluated
            if h1_pos <= h2_pos:
                continue

            # Get horses' data and merge
            h1 = race_df.loc[race_df.post_pos == h1_pos]
            h2 = race_df.loc[race_df.post_pos == h2_pos]

            # h1_merge has h1 as the "_x" horse, h2_merge has h2 as the "_x" horse
            h1_merge = h1.merge(h2, left_on=['date', 'race_num', 'track_code'],
                                              right_on=['date', 'race_num', 'track_code'])
            h2_merge = h2.merge(h1, left_on=['date', 'race_num', 'track_code'],
                                              right_on=['date', 'race_num', 'track_code'])

            # Drop columns not used by h2h model. The feature order must be
            # deterministic: building it from a set difference gives an order that
            # changes between kernel sessions (string hashing is randomized).
            # Prefer the order recorded on the fitted model when it is available.
            model_features = getattr(h2h_model, 'feature_names_in_', None)
            if model_features is not None:
                cols = list(model_features)
            else:
                # TODO confirm this column order matches the one used to fit h2h_model
                cols = [col for col in h1_merge.columns if col not in cols_to_drop]
            h1_data = h1_merge[cols]
            h2_data = h2_merge[cols]

            # Run horses through model (take the probabilities of the first row)
            h1_h2h = h2h_model.predict_proba(h1_data)[0]
            h2_h2h = h2h_model.predict_proba(h2_data)[0]

            # Get prediction and confidence level for each horse
            # NOTE(review): argmax position is used directly as the 0/1 label, which
            # assumes h2h_model.classes_ is [0, 1] -- confirm
            h1_pred = np.argmax(h1_h2h)
            h1_conf = h1_h2h[h1_pred]
            h2_pred = np.argmax(h2_h2h)
            h2_conf = h2_h2h[h2_pred]

            # race_df is a slice of master_df, so the single-horse frames already
            # carry the master_df index labels; no need to search master_df again
            h1_idx = h1.index[0]
            h2_idx = h2.index[0]
            # If labels are different, then increment count for each horse based
            # solely on label
            if h1_pred != h2_pred:
                master_df.loc[[h1_idx], 'h2h_count'] += h1_pred
                master_df.loc[[h2_idx], 'h2h_count'] += h2_pred
            else:
                # Give label priority to horse with higher prediction confidence;
                # the other horse receives the opposite label
                if h1_conf >= h2_conf:
                    master_df.loc[[h1_idx], 'h2h_count'] += h1_pred
                    master_df.loc[[h2_idx], 'h2h_count'] += 1 if h2_pred == 0 else 0
                else:
                    master_df.loc[[h1_idx], 'h2h_count'] += 1 if h1_pred == 0 else 0
                    master_df.loc[[h2_idx], 'h2h_count'] += h2_pred

    # Turn counts into fractions: count / (number of other horses in the race)
    num_opponents = len(race_df) - 1
    master_df.loc[race_df.index, 'h2h'] = \
                        master_df.loc[race_df.index, 'h2h_count'].apply(lambda x: '{}/{}'.format(x, num_opponents))

master_df = master_df.drop(['h2h_count'], axis=1)
master_df.head()

Race: 14/14

Unnamed: 0,starts_at_dist,date,race_num,post_pos,lt_earnings,app_weight_alw,lt_starts,entry,horse_name,last_speed_rating,...,improved_stretch_pos,had_bullet,won_by_margin,won_at_similar_dist,odds,top3,t3_conf,win,win_conf,h2h
0,-0.508623,20190518,1,-1.37744,-0.631925,-0.219656,-0.348597,1,SHELLY ISLAND,0.651907,...,1.664362,-0.41176,-0.168443,-0.511205,12.0,0.0,0.631036,0.0,0.519293,1/15
1,-0.508623,20190518,1,-1.104229,-0.681367,-0.219656,-1.088356,2,FRIENDLY FIRE,0.499842,...,-0.596658,-0.41176,-0.168443,-0.511205,4.0,0.0,0.69087,0.0,0.768742,8/15
2,-0.789242,20190518,1,-0.831018,-0.696197,-0.219656,-0.665637,3,BROOKLYN SARDY,-0.463234,...,-0.596658,-0.41176,-0.168443,-0.511205,30.0,0.0,0.697155,0.0,0.766321,9/15
3,-0.508623,20190518,1,-0.557806,-0.681297,-0.219656,-0.454277,4,NO WORRIES MATE,-1.020805,...,1.664362,-0.41176,-0.168443,-0.511205,10.0,0.0,0.532079,1.0,0.588549,4/15
4,-0.789242,20190518,1,-0.284595,-0.707203,-0.219656,-1.088356,5,WILLIE THE WHALE,-0.513922,...,-0.596658,-0.41176,-0.168443,-0.511205,20.0,0.0,0.697155,0.0,0.766321,10/15


# Print Predictions

In [16]:
# Run through day/track combination
track_date_combos = master_df.groupby(['date','track_code']).size().reset_index() # df
cols_to_keep = ['horse_name', 'entry', 'win', 'win_conf','h2h', 'odds']#, 'top3', 't3_conf']

for idx, row in track_date_combos.iterrows():
    # Create a separate table for each race
    track_code = row['track_code']
    date = row['date']
    track_df = master_df.loc[(master_df.date == date) & (master_df.track_code == track_code)]
    out = ''
    for num in range(1, track_df.race_num.max()+1):
        # Get data for this race; copy so adding a column does not write into a
        # view of master_df (avoids SettingWithCopyWarning)
        race_df = track_df.loc[track_df['race_num'] == num, cols_to_keep].copy()
        # Rank horses by odds; ties share the lowest rank
        race_df['ml_rank'] = rankdata(race_df['odds'], method='min')
        # Get predicted winners first and sort by confidence level
        win_df = race_df.loc[race_df.win == 1]
        win_df = win_df.sort_values(['win_conf'], ascending=False)
        # Get not predicted winners and sort by h2h.
        # 'h2h' holds strings such as '10/15'; sorting the strings directly is
        # lexicographic ('9/15' > '10/15'), so sort on the numeric numerator
        # instead. The denominator is the same for every horse in a race.
        h2h_df = race_df.loc[race_df.win == 0]
        h2h_wins = pd.to_numeric(h2h_df['h2h'].str.split('/').str[0], errors='coerce')
        # Stable descending sort by position; unparsable values (NaN) go last
        h2h_df = h2h_df.iloc[np.argsort(-h2h_wins.values, kind='stable')]

        # DataFrame.append was removed in pandas 2.0; concat works on all versions
        race_df = pd.concat([win_df, h2h_df])

        out += 'RACE: {}'.format(num)
        out += '\n'
        out += tabulate(race_df, headers='keys', tablefmt='psql', showindex=False)
        out += '\n\n'

    filename= './predictions/{}{}_predictions.txt'.format(track_code, date)
    # Make sure the output directory exists before writing
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    # Context manager closes the file even if the write fails
    with open(filename, 'w') as output_file:
        output_file.write(out)

# Save Predictions to CSV
Save the predictions to a CSV file so they can be used for betting analysis.

In [17]:
#cols_to_keep = ['date', 'track_code', 'race_num', 'horse_name', 'entry', 'win', 'win_conf','h2h', 'odds']
#preds_df = master_df[cols_to_keep]

#filename = './predictions/predictions.csv'
#preds_df.to_csv(filename, index=False)

In [18]:
#df = pd.read_csv(filename)
#df.head()