In [1]:
import pandas as pd
import numpy as np
import os
from collections import OrderedDict
from datetime import datetime, timedelta
import math

import torch
from torch import nn, optim
import torch.nn.functional as F

from tqdm import tqdm

# Find All Pairs of Input/Results Files

In [2]:
def find_file_pairs(input_path, results_path):
    '''
        Find matching input/output files
        
        Args:
            input_path (string): relative path to input files
            results_path (string): relative path to results files
            
        Returns:
            (list) list of (input_file,results_file) pairs
    '''
    # Get a list of all input files and results files
    input_files = [input_path + file for file in os.listdir(input_path) if file.endswith(".DRF")]
    results_files = [results_path + file for file in os.listdir(results_path) if file.endswith('.CHT')]
    
    # Get names of files w/o directories/extensions
    input_names = sorted([file[len(input_path) : -4] for file in input_files])
    results_names = sorted([file[len(results_path) : -4] for file in results_files])
    
    # Find matches 
    matches = [name for name in input_names if name in results_names]
    
    # Create list of input/results file pairs -- [(input_file, output_file),(...),...]
    file_pairs = [('{}{}.DRF'.format(input_path, name), '{}{}.CHT'.format(results_path, name)) for name in matches]
    
    return file_pairs

file_pairs = find_file_pairs(input_path='./input_files/', results_path='./results_files/')

In [3]:
file_pairs

[('./input_files/FPK04232019.DRF', './results_files/FPK04232019.CHT'),
 ('./input_files/IND04232019.DRF', './results_files/IND04232019.CHT')]

# Add Number of Entrants to Results Files

In [4]:
def add_results_entrants(file):
    '''
        Find highest post position for each race and use as number of entrants
        
        Args:
            file (string): path to results file
            
        Returns:
            Nothing
    '''
    # Load results to dataframe
    df = pd.read_csv(file, header=None)
    
    # Find how many races are in this results file
    race_col = 2
    num_races = df[race_col].max()
    
    # For each race, count entrants, append as last column
    last_col = df.columns.max() + 1
    for race in range(1,num_races+1):
        entrants = df.loc[df[race_col] == race].shape[0]
        iloc = df.loc[df[race_col] == race].index
        df.loc[iloc, last_col] = pd.Series(entrants, index=iloc)
        
    # Save results back to file
    df.to_csv(file, header=False, index=False)

    
# Iterate through all results files in file_pairs and add number of entrants
for pair in file_pairs:
    res_file = pair[1]
    add_results_entrants(res_file)

# Parse Input and Results Files for Relevant Fields
These fields do not include past performance data -- these past performance fields will be handled by a separate process.

In [5]:
# Maps for input/results file structures -- colnum:colname
input_map = OrderedDict({
        1: 'date',
        2: 'race_num',
        44: 'horse_name',
        855: 'last_speed_rating',
        216: 'speed_par',
        33: 'app_weight_alw',
        100: 'lt_earnings',
        43: 'ml_odds',
        28: 'trainer_starts',
        29: 'trainer_wins',
        34: 'jockey_starts',
        35: 'jockey_wins'
    })

results_map = OrderedDict({
    1: 'date',
    2: 'race_num',
    19: 'horse_name',
    29: 'finish_pos',
    17: 'post_pos', 
    48: 'num_entrants'
})

In [6]:
input_df = pd.DataFrame()
results_df = pd.DataFrame()

for pair in file_pairs:
    #Get input/results file names
    input_file = pair[0]
    results_file = pair[1]
    
    # Open files to dataframe -- Take only columns that are necessary
    input_cols = [k for k in input_map.keys()]
    results_cols = [k for k in results_map.keys()]
    input_tmp = pd.read_csv(input_file, header=None)[input_cols]
    results_tmp = pd.read_csv(results_file, header=None)[results_cols]
    
    # Rename cols
    input_tmp.columns = [input_map[col] for col in input_tmp.columns]
    results_tmp.columns = [results_map[col] for col in results_tmp.columns]
    
    # Add these inputs/results to dataframes
    input_df = input_df.append(input_tmp)
    results_df = results_df.append(results_tmp)
    
# Merge input and results dfs together
master_df = pd.merge(input_df, results_df,
                     how='left', left_on=['date', 'race_num', 'horse_name'],
                     right_on=['date','race_num', 'horse_name'])
master_df.head()

Unnamed: 0,date,race_num,jockey_wins,lt_earnings,app_weight_alw,ml_odds,horse_name,jockey_starts,last_speed_rating,speed_par,trainer_starts,trainer_wins,num_entrants,post_pos,finish_pos
0,20190423,1,1,66843,,3.0,TRYST CAT,6,81.0,80.0,0,0,4.0,1.0,4.0
1,20190423,1,0,70304,,0.8,MY FOUROONE KPLAN,0,77.0,80.0,2,1,4.0,2.0,1.0
2,20190423,1,2,60185,,2.0,CAROL'S A CASE,7,81.0,80.0,0,0,4.0,3.0,3.0
3,20190423,1,0,39680,,12.0,RUN AWAY GAL,8,56.0,80.0,4,0,4.0,4.0,2.0
4,20190423,2,2,2360,,3.0,CEE R BEE,7,75.0,74.0,1,1,5.0,1.0,1.0


In [7]:
def get_jockey_win_pct(df):
    df['jockey_win_pct'] = df['jockey_wins'] / df['jockey_starts']
    return df.drop(['jockey_wins', 'jockey_starts'], axis=1)

def get_trainer_win_pct(df):
    df['trainer_win_pct'] = df['trainer_wins'] / df['trainer_starts']
    return df.drop(['trainer_wins', 'trainer_starts'], axis=1)

# Calculate jockey/trainer win percentages
master_df = get_jockey_win_pct(master_df)
master_df = get_trainer_win_pct(master_df)

# Clean NaN cols
master_df = master_df.fillna(value=0)
master_df.head()

Unnamed: 0,date,race_num,lt_earnings,app_weight_alw,ml_odds,horse_name,last_speed_rating,speed_par,num_entrants,post_pos,finish_pos,jockey_win_pct,trainer_win_pct
0,20190423,1,66843,0.0,3.0,TRYST CAT,81.0,80.0,4.0,1.0,4.0,0.166667,0.0
1,20190423,1,70304,0.0,0.8,MY FOUROONE KPLAN,77.0,80.0,4.0,2.0,1.0,0.0,0.5
2,20190423,1,60185,0.0,2.0,CAROL'S A CASE,81.0,80.0,4.0,3.0,3.0,0.285714,0.0
3,20190423,1,39680,0.0,12.0,RUN AWAY GAL,56.0,80.0,4.0,4.0,2.0,0.0,0.0
4,20190423,2,2360,0.0,3.0,CEE R BEE,75.0,74.0,5.0,1.0,1.0,0.285714,1.0


# Get Past Performance Data

In [8]:
# Map for past performance fields
input_keys = OrderedDict({
    1: 'date',
    2: 'race_num',
    44: 'horse_name',
})

input_pp_map = OrderedDict({
    615 : 'pp_finish_pos',
    345 : 'pp_num_entrants',
    5: 'todays_distance',
    315: 'pp_distance',
    #535 : 'pp_race_class',
    255 : 'pp_race_date',
    1045: 'pp_claimed',
    1125: 'pp_favorite',
    605: 'pp_stretch_pos',
    113: 'pp_workout_time',
    465: 'pp_winners_margin'
})

name_to_pp_col = OrderedDict({v:k for k,v in input_pp_map.items()})

In [21]:
def get_past_performance_data(input_files, num_races=3):
    '''
        Go through input files and get past performance data
        
        Args: 
            input_files (list): list of input file names 
            num_races (int): number of races back to grab data for
            
        Returns:
            pd.DataFrame with past performance data for each horse
    '''
    pp_df = pd.DataFrame()
    
    # Create new mapping for this number of races
    updated_pp_map = OrderedDict({k+race:'{}_{}'.format(v, race) for k,v in input_pp_map.items() for race in range(num_races)})
    
    for file in input_files:
        # Open input file to df
        df = pd.read_csv(file, header=None)
        
        # Only include key/past performance columns
        incl_cols = list(input_keys.keys()) + list(updated_pp_map.keys())
        df = df[incl_cols]
        
        # Rename columns
        renamed_cols = []
        for col in df.columns:
            try:
                # PP Column
                rn_col = updated_pp_map[col]
            except:
                # Key Column 
                rn_col = input_keys[col]    
            renamed_cols.append(rn_col)
        df.columns = renamed_cols
        
        # Append this file's df to pp_df
        pp_df = pp_df.append(df)
    
    return pp_df

input_files = [pair[0] for pair in file_pairs]
num_races = 3
pp_df = get_past_performance_data(input_files, num_races)
pp_df.head()

Unnamed: 0,date,race_num,horse_name,pp_race_date_1,pp_race_date_2,todays_distance_0,todays_distance_1,todays_distance_2,pp_winners_margin_0,pp_winners_margin_1,...,pp_finish_pos_1,pp_finish_pos_2,pp_favorite_2,pp_workout_time_0,pp_workout_time_1,pp_workout_time_2,pp_distance_0,pp_distance_1,pp_distance_2,pp_race_date_0
0,20190423,1,TRYST CAT,20181101.0,20181010.0,1100,D,,4.25,0.13,...,3.0,1.0,0.0,49.0,50.8,-48.8,1320.0,1430.0,1320.0,20181117.0
1,20190423,1,MY FOUROONE KPLAN,20180811.0,20180724.0,1100,D,,4.0,1.25,...,1.0,4.0,1.0,-36.2,38.0,38.0,1320.0,1320.0,1320.0,20190406.0
2,20190423,1,CAROL'S A CASE,20180915.0,20180825.0,1100,D,,3.5,8.25,...,4.0,1.0,1.0,-35.6,50.6,39.2,1320.0,1320.0,1320.0,20181011.0
3,20190423,1,RUN AWAY GAL,20181011.0,20180821.0,1100,D,,2.75,3.5,...,8.0,5.0,0.0,36.6,49.0,39.6,1830.0,1320.0,1320.0,20181031.0
4,20190423,2,CEE R BEE,20181123.0,20181101.0,1100,D,,1.25,3.0,...,6.0,5.0,0.0,-61.2,63.8,50.0,1320.0,1320.0,1320.0,20190406.0


# Clean Past Performance DataFrame

In [22]:
def clean_finish_pos_cols(pp_df, num_races=3):
        # Get finish position column names
        finish_pos_cols = ['pp_finish_pos_{}'.format(race) for race in range(num_races)]
        
        def clean_val(val):
            # If value is numeric, return its integer rep
            try:
                return int(val)
            except:
                # If value not numeric, return 92 (did not finish)
                return int(92)
            
        # Clean columns
        for col in finish_pos_cols:
            pp_df[col] = pp_df[col].apply(clean_val)
            
        return pp_df
    
def clean_stretch_pos_cols(pp_df, num_races=3):
        # Get stretch position column names
        stretch_pos_cols = ['pp_stretch_pos_{}'.format(race) for race in range(num_races)]
        
        def clean_val(val):
            # If value is numeric, return its integer rep
            try:
                return int(val)
            except:
                # If value not numeric, return 92 (did not finish)
                return int(92)
            
        
        # Clean columns
        for col in stretch_pos_cols:
            pp_df[col] = pp_df[col].apply(clean_val)
            
        return pp_df
    
def clean_favorite_cols(pp_df, num_races=3):
    def conv(val):
        if math.isnan(val):
            return int(0)
        else:
            return int(val)
        
    # Get favorite column names
    favorite_columns = ['pp_favorite_{}'.format(race) for race in range(num_races)]
    
    # Convert to integers
    for col in favorite_columns:
        pp_df[col] = pp_df[col].apply(conv)
        
    return pp_df
    
    
def clean_pp_df(pp_df, num_races=3):
    '''
        Clean fields in pp_df
        
        Args:
            pp_df (pd.DataFrame): dataframe to clean
            
        Returns:
            (pd.DataFrame) cleaned version of pp_df
    '''
    pp_df = clean_finish_pos_cols(pp_df, num_races)
    pp_df = clean_stretch_pos_cols(pp_df, num_races)
    pp_df = clean_favorite_cols(pp_df, num_races)
    
    return pp_df

pp_df = clean_pp_df(pp_df, num_races)
pp_df.head()

Unnamed: 0,date,race_num,horse_name,pp_race_date_1,pp_race_date_2,todays_distance_0,todays_distance_1,todays_distance_2,pp_winners_margin_0,pp_winners_margin_1,...,pp_finish_pos_1,pp_finish_pos_2,pp_favorite_2,pp_workout_time_0,pp_workout_time_1,pp_workout_time_2,pp_distance_0,pp_distance_1,pp_distance_2,pp_race_date_0
0,20190423,1,TRYST CAT,20181101.0,20181010.0,1100,D,,4.25,0.13,...,3,1,0,49.0,50.8,-48.8,1320.0,1430.0,1320.0,20181117.0
1,20190423,1,MY FOUROONE KPLAN,20180811.0,20180724.0,1100,D,,4.0,1.25,...,1,4,1,-36.2,38.0,38.0,1320.0,1320.0,1320.0,20190406.0
2,20190423,1,CAROL'S A CASE,20180915.0,20180825.0,1100,D,,3.5,8.25,...,4,1,1,-35.6,50.6,39.2,1320.0,1320.0,1320.0,20181011.0
3,20190423,1,RUN AWAY GAL,20181011.0,20180821.0,1100,D,,2.75,3.5,...,8,5,0,36.6,49.0,39.6,1830.0,1320.0,1320.0,20181031.0
4,20190423,2,CEE R BEE,20181123.0,20181101.0,1100,D,,1.25,3.0,...,6,5,0,-61.2,63.8,50.0,1320.0,1320.0,1320.0,20190406.0


# Derive Past Performance Metrics

In [23]:
def get_improved_finish_pos(pp_df, num_races=3):
    '''
        Calculate if horse is improving its finish position over the past races
        
        Args:
            pp_df (pd.DataFrame): past performance dataframe
            
        Returns:
            (pd.DataFrame) an updated version of pp_df with a column for improved
            finish position
    '''
    
    def all_seq_vals_less_than_one(seq):
        ''' Returns true if all values in the passed sequence are <=1 '''
        for elt in seq:
            if elt > 1:
                return False
        return True

    # Get cols relevant to past finish position
    finish_pos_cols = ['pp_finish_pos_{}'.format(race) for race in range(num_races)]
    # Concat all finish positions in a sequential list starting with most recent race for each horse
    finish_pos_seqs = pp_df[finish_pos_cols].values.tolist()
    
    imp_finish_pos = []
    for seq in finish_pos_seqs:
        # Skip any sequence where NaN appears
        if np.nan in seq:
            imp_finish_pos.append(False)
        # Skip any seq where horse didn't finish (fractional finish pos > 1)
        elif not all_seq_vals_less_than_one(seq):
            imp_finish_pos.append(False)
        else:
            most_recent = seq[0]
            avg_prev_finish = np.mean(seq[1:])
            imp_finish_pos.append(most_recent < avg_prev_finish)
            
    # Assign improved finish position series to pp_df
    pp_df['imp_finish_pos'] = pd.Series(imp_finish_pos)
    
    return pp_df

def get_recent_race(pp_df, threshold=21):
    '''
        Determine if a horse has had a previous race within threshold days
        
        Args:
            pp_df (pd.DataFrame): past performance dataframe
            threshold (int): days to look back
            
        Returns:
            (pd.DataFrame) updated pp_df containing bool column stating whether
            each horse had a recent race or not
    '''
    
    def conv_date(val):
        if math.isnan(float(val)):
            val = 19700101
        val = str(int(val))
        return datetime.strptime(val, '%Y%m%d')
    
    # Get date columns
    date_cols = ['pp_race_date_{}'.format(race) for race in range(num_races)]
    
    # Determine if race most recent race is within threshold
    pp_df['today'] = pp_df['date'].apply(conv_date)
    pp_df['last_race'] = pp_df['pp_race_date_0'].apply(conv_date)
    pp_df['timedelta'] = pd.Series(timedelta(days=threshold), index=pp_df.index)
    pp_df['cutoff'] = pp_df['today'] - pp_df['timedelta']
    pp_df['recent_race'] = pp_df['cutoff'] <= pp_df['last_race']
    
    # Drop intermediate columns
    drop_cols = ['today', 'last_race', 'timedelta', 'cutoff'] + date_cols
    return pp_df.drop(drop_cols, axis=1)
    
def get_past_finish_pos(pp_df, num_races=3):
    '''
        Calculate previous finish positions as finish_pos/num_entrants
        
        Args:
            pp_df (pd.DataFrame) past performance dataframe
            num_races (int): number of races for which we have past performance
        
        Returns:
            (pd.DataFrame) updated pp_df with finish positions calculated
    '''
    # Get finish position/entrants columns names
    finish_pos_cols = ['pp_finish_pos_{}'.format(race) for race in range(num_races)]
    entrants_cols = ['pp_num_entrants_{}'.format(race) for race in range(num_races)]
    cols = list(zip(finish_pos_cols, entrants_cols))
    
    # Divide finish pos by num entrants, save a raw copy of finish position
    for f_col, e_col in cols:
        pp_df['raw_'+f_col] = pp_df[f_col]
        pp_df[f_col] = pp_df[f_col] / pp_df[e_col]
    
    return pp_df

def get_claimed_in_past(pp_df, num_races=3):
    '''
        Determine if a horse was claimed in the past num_races
        
        Args:
            pp_df (pd.DataFrame): past performance dataframe
            num_races (int): number of races for which we have past performances
            
        Returns:
            (pd.DataFrame) updated version of pp_df
    '''
    # Claimed column names
    claimed_cols = ['pp_claimed_{}'.format(race) for race in range(num_races)]
    claimed_seqs = pp_df[claimed_cols].values.tolist()
    
    # Determine if claimed in recent races    
    claimed = [True if 'c' in seq else False for seq in claimed_seqs]
    
    # Assign to pp_df
    pp_df['was_claimed'] = pd.Series(claimed, index=pp_df.index)
    
    # Drop claimed cols
    return pp_df.drop(claimed_cols, axis=1)

def was_favorite(pp_df, num_races=3):
    '''
        Determine if a horse was the favorite in any of its recent races
        
        Args:
            pp_df (pd.DataFrame): past performance dataframe
            num_races (int): number of races for which we have past performances
    '''
    # Get favorite columns
    favorite_cols = ['pp_favorite_{}'.format(race) for race in range(num_races)]
    favorite_seqs = pp_df[favorite_cols].values.tolist()
    
    # If 1 in sequence, horse was favorite in recent race
    was_favorite = [True if 1 in seq else False for seq in favorite_seqs]
    
    pp_df['was_favorite'] = pd.Series(was_favorite, index=pp_df.index)
    
    # Drop favorite cols
    return pp_df.drop(favorite_cols, axis=1)

def improved_stretch_pos(pp_df, num_races=3):
    '''
        Determine if a horse improved its finish position down the stretch in the most
        recent race
        
        Args:
            pp_df (pd.DataFrame): past performance dataframe
            num_races (int): number of races for which we have past performances
    '''
    # Stretch position columns
    stretch_pos_cols = ['pp_stretch_pos_{}'.format(race) for race in range(num_races)]
    
    # Get fractional stretch position
    pp_df['pp_stretch_pos'] = pp_df['pp_stretch_pos_0'] / pp_df['pp_num_entrants_0']
    
    # Determine if stretch pos > finish pos
    pp_df['improved_stretch_pos'] = pp_df['pp_stretch_pos'] > pp_df['pp_finish_pos_0']
        
    # Drop unnecessary columns
    finish_pos_cols = ['pp_finish_pos_{}'.format(race) for race in range(num_races)]
    entrants_cols = ['pp_num_entrants_{}'.format(race) for race in range(num_races)]
    drop_cols = stretch_pos_cols + ['pp_stretch_pos'] + finish_pos_cols + entrants_cols
    return pp_df.drop(drop_cols, axis=1)

def had_bullet_workout(pp_df, num_races=3):
    '''
        Determine if a horse had a bullet workout in previous workout
        
        Args:
            pp_df (pd.DataFrame): past performance dataframe
            num_races (int): number of races for which we have past performances
    '''
    # Get workout time columns
    workout_cols = ['pp_workout_time_{}'.format(race) for race in range(num_races)]
    
    # Determine if last workout was bullet
    pp_df['had_bullet'] = pp_df['pp_workout_time_0'] < 0
    
    # Drop workout columns
    return pp_df.drop(workout_cols, axis=1)

def won_by_margin(pp_df, num_races=3, margin=5):
    '''
        Determine if a horse won last race by at least margin lengths
        
        Args:
            pp_df (pd.DataFrame): past performance dataframe
            num_races (int): number of races for which we have past performances
            margin (float): number of lengths by which horse had to win
            
        Returns:
            (pd.DataFrame) updated version of pp_df
    '''
    # Find all winners of last race
    winners = pp_df.loc[pp_df.raw_pp_finish_pos_0 == 1]
    
    # Find all winners with at least margin length victory
    big_winners = winners.loc[winners.pp_winners_margin_0 >= margin].index
    
    # Create new col, default to false. Set True for all horses in big_winners index
    pp_df['won_by_margin'] = pd.Series(False, index=pp_df.index)
    pp_df.loc[big_winners, 'won_by_margin'] = True
    
    # Drop intermediate cols
    margin_cols = ['pp_winners_margin_{}'.format(race) for race in range(num_races)]
    drop_cols = margin_cols
    return pp_df.drop(drop_cols, axis=1)

def won_at_similar_distance(pp_df, num_races=3):
    '''
        Determine if a horse won a recent race at the same distance as today's race
        
        Args:
            pp_df (pd.DataFrame): past performance dataframe
            num_races (int): number of races for which we have past performances
            
        Returns:
            (pd.DataFrame) updated version of pp_df
    '''
    # Get distance and finish pos cols
    distance_cols = ['pp_distance_{}'.format(race) for race in range(num_races)]
    raw_finish_cols = ['raw_pp_finish_pos_{}'.format(race) for race in range(num_races)]
    
    # Find winners at similar distance by iterating through each recent race
    winners = []
    for race in range(num_races):
        dist_col = 'pp_distance_{}'.format(race)
        finish_col = 'raw_pp_finish_pos_{}'.format(race)
        today_dist_col = 'todays_distance_0'
        
        winners += pp_df.loc[(pp_df[dist_col] == pp_df[today_dist_col]) & (pp_df[finish_col] == 1)].index.tolist()
        
    # Create new column to denote if won at similar distance -- default False, True for 
    # horses in winners
    pp_df['won_at_similar_dist'] = pd.Series(False, index=pp_df.index)
    pp_df.loc[winners, 'won_at_similar_dist'] = True
    
    # Drop unnecessary cols
    todays_distance_cols = ['todays_distance_{}'.format(race) for race in range(num_races)]
    drop_cols = distance_cols + todays_distance_cols + raw_finish_cols    
    return pp_df.drop(drop_cols, axis=1)
    
def derive_pp_metrics(pp_df, num_races=3):
    # Past Finish Positions
    pp_df = get_past_finish_pos(pp_df, num_races)
    # Improved Finish Position
    pp_df = get_improved_finish_pos(pp_df, num_races)
    # Recent race
    pp_df = get_recent_race(pp_df, 21)
    # Claimed in Past
    pp_df = get_claimed_in_past(pp_df, num_races)
    # Favorite in past
    pp_df = was_favorite(pp_df, num_races)
    # Improved stretch pos
    pp_df = improved_stretch_pos(pp_df, num_races)
    # Workout Rating
    pp_df = had_bullet_workout(pp_df, num_races)
    # Won last race by 5+ lengths
    pp_df = won_by_margin(pp_df, num_races, margin=4)
    # Won at similar distance
    pp_df = won_at_similar_distance(pp_df, num_races)    
    
    return pp_df
    
pp_df = derive_pp_metrics(pp_df)
pp_df.head()

Unnamed: 0,date,race_num,horse_name,imp_finish_pos,recent_race,was_claimed,was_favorite,improved_stretch_pos,had_bullet,won_by_margin,won_at_similar_dist
0,20190423,1,TRYST CAT,False,False,False,False,True,False,False,False
1,20190423,1,MY FOUROONE KPLAN,True,True,False,True,False,True,True,False
2,20190423,1,CAROL'S A CASE,False,False,False,True,False,True,False,False
3,20190423,1,RUN AWAY GAL,True,False,False,False,False,False,False,False
4,20190423,2,CEE R BEE,True,True,False,False,False,True,False,False


# Merge Past Performance with Master DF

In [24]:
master_df = pd.merge(master_df, pp_df,
                     how='left', left_on=['date', 'race_num', 'horse_name'],
                     right_on=['date','race_num', 'horse_name'])
master_df.head()

Unnamed: 0,date,race_num,lt_earnings,app_weight_alw,ml_odds,horse_name,last_speed_rating,speed_par,num_entrants,post_pos,...,jockey_win_pct,trainer_win_pct,imp_finish_pos,recent_race,was_claimed,was_favorite,improved_stretch_pos,had_bullet,won_by_margin,won_at_similar_dist
0,20190423,1,66843,0.0,3.0,TRYST CAT,81.0,80.0,4.0,1.0,...,0.166667,0.0,False,False,False,False,True,False,False,False
1,20190423,1,70304,0.0,0.8,MY FOUROONE KPLAN,77.0,80.0,4.0,2.0,...,0.0,0.5,True,True,False,True,False,True,True,False
2,20190423,1,60185,0.0,2.0,CAROL'S A CASE,81.0,80.0,4.0,3.0,...,0.285714,0.0,False,False,False,True,False,True,False,False
3,20190423,1,39680,0.0,12.0,RUN AWAY GAL,56.0,80.0,4.0,4.0,...,0.0,0.0,True,False,False,False,False,False,False,False
4,20190423,2,2360,0.0,3.0,CEE R BEE,75.0,74.0,5.0,1.0,...,0.285714,1.0,True,True,False,False,False,True,False,False


# Add Label 

In [25]:
def add_label(master_df):
    master_df['label'] = master_df['finish_pos'] / master_df['num_entrants']
    return master_df

master_df = add_label(master_df)
master_df.head()

Unnamed: 0,date,race_num,lt_earnings,app_weight_alw,ml_odds,horse_name,last_speed_rating,speed_par,num_entrants,post_pos,...,trainer_win_pct,imp_finish_pos,recent_race,was_claimed,was_favorite,improved_stretch_pos,had_bullet,won_by_margin,won_at_similar_dist,label
0,20190423,1,66843,0.0,3.0,TRYST CAT,81.0,80.0,4.0,1.0,...,0.0,False,False,False,False,True,False,False,False,1.0
1,20190423,1,70304,0.0,0.8,MY FOUROONE KPLAN,77.0,80.0,4.0,2.0,...,0.5,True,True,False,True,False,True,True,False,0.25
2,20190423,1,60185,0.0,2.0,CAROL'S A CASE,81.0,80.0,4.0,3.0,...,0.0,False,False,False,True,False,True,False,False,0.75
3,20190423,1,39680,0.0,12.0,RUN AWAY GAL,56.0,80.0,4.0,4.0,...,0.0,True,False,False,False,False,False,False,False,0.5
4,20190423,2,2360,0.0,3.0,CEE R BEE,75.0,74.0,5.0,1.0,...,1.0,True,True,False,False,False,True,False,False,0.2
