In [1]:
import pandas as pd
import numpy as np
import os
from collections import OrderedDict

import torch
from torch import nn, optim
import torch.nn.functional as F

from tqdm import tqdm

# Pre-process DRF Files

In [29]:
def generate_master_df(path, num_races=3):
    '''
        Generate the master dataframe from which we will create our training/testing data
        
        Args:
            path (string): Path to directory containing DRF files to parse
            num_races (int): Number of races to use in each sequence (how many races back
                             are we looking?)
        
        Returns: Dataframe containing all data from each DRF concatted together
    '''
    # Cap num_races
    num_races = min(num_races, 9) # Only have max of 9 prev race's data
    
    # Get all DRF files in data directory
    filenames = [path+file for file in os.listdir(path) if file.endswith(".DRF")]
    
    # Iterate through each file and concat data to master df
    master_df = None
    for ii, file in tqdm(enumerate(filenames)): 
        if ii == 0:
            # First pass through just create master df
            df = pd.read_csv(file, header=None)
            master_df = slice_df(df, num_races)
        else:
            # All other passes, append sliced dataframe to master
            df = pd.read_csv(file, header=None)
            df = slice_df(df, num_races)
            master_df = master_df.append(df, ignore_index=True)
            
    # Drop all rows containing NaN values (these horses didn't have enough prev races)
    master_df = master_df.dropna().reset_index().drop(['index'], axis=1)
    # Derive additional fields from input data
    master_df = derive_fields(master_df, num_races)
    # Clean data and return
    return clean_data(master_df, num_races)

def slice_df(df, num_races=3):
    '''
        This will take the 1400 field dataframe created from parsing input file CSVs 
        and cut it down to only include information relevant to training the network
        
        Args:
            df (pd.DataFrame): the unedited, 1400 field dataframe
            num_races (int): the number of races into the past we are gathering data for
            
        Returns:
            pd.DataFrame containing only fields relevant to network training
    '''
    # Define columns to grab
    column_ids = OrderedDict({
        'horse_age': (45,46),
        'horse_name': (44,45),
        'lifetime_starts': (96, 97),
        'lifetime_wins': (97, 98),
        'lifetime_places': (98, 99),
        'lifetime_shows': (99, 100),
        'days_since_prev_race': (265, 265+num_races),
        'distance': (315, 315+num_races),
        'num_entrants': (345, 345+num_races),
        'post_position': (355, 355+num_races),
        'weight': (505, 505+num_races),
        'winner_name': (405, 405+num_races),
        'place_name': (415, 415+num_races),
        'show_name': (425, 425+num_races),
        'label': (1035, 1036) # Finish time of current race
    })

    # Select all of our column ranges
    rng = []
    col_names = []
    for k,v in column_ids.items():
        # Append range to rng -- special case for single field
        if v[1] - v[0] == 1:
            for i in range(num_races):
                rng += [v[0]]
                col_names.append('{}_{}'.format(k, i))
        else:
            # Handle column ranges
            rng += range(v[0],v[1])
            for ii in range(v[0], v[1]):
                col_names.append('{}_{}'.format(k, ii-v[0]))

    # Slice df on columns
    ret = df.loc[:, rng]
    ret.columns = col_names
    return ret

def derive_fields(df, num_races=3):
    '''
        Derive fields such as lifetime wins/places/shows at each race in the past
        and return dataframe containing derived fields, and not containing fields
        required to do derivation.
        
        Args:
            df (pd.DataFrame): DataFrame containing all fields necessary for derivations
            num_races (int): the number of races into the past we are looking (includes today)
            
        Returns:
            pd.DataFrame containing all data required to train network
    '''
    # Can skip Race 0 for all derived metrics since these are already calculated for us in the lifetime
    # stats fields
    drop_columns = []
    for race in range(0, num_races):
        # Calculate starts
        start_col = 'lifetime_starts_{}'.format(race)
        df[start_col] = df['lifetime_starts_0'] - race
        
        # Determine if a horse got win/place/show for previous races
        win_col = 'winner_name_{}'.format(race)
        place_col = 'place_name_{}'.format(race)
        show_col = 'show_name_{}'.format(race)
        horse_name_col = 'horse_name_{}'.format(race)
        
        # Can skip race 0, but still want to remove its intermediate cols
        if race != 0:
            horse_won_col = 'horse_won_{}'.format(race)
            horse_placed_col = 'horse_placed_{}'.format(race)
            horse_showed_col = 'horse_showed_{}'.format(race)

            df[horse_won_col] = df[win_col] == df[horse_name_col]
            df[horse_placed_col] = df[place_col] == df[horse_name_col]
            df[horse_showed_col] = df[show_col] == df[horse_name_col]

            # Calculate point-in-time lifetime stats using the above metrics
            prev_lt_win_col = 'lifetime_wins_{}'.format(race-1)
            lt_win_col = 'lifetime_wins_{}'.format(race)

            prev_lt_place_col = 'lifetime_places_{}'.format(race-1)
            lt_place_col = 'lifetime_places_{}'.format(race)

            prev_lt_show_col = 'lifetime_shows_{}'.format(race-1)
            lt_show_col = 'lifetime_shows_{}'.format(race)

            # If horse won/placed/showed, decrement stats accordingly
            df.loc[df[horse_won_col] == True, lt_win_col] = df[prev_lt_win_col] - 1
            df.loc[df[horse_won_col] == False, lt_win_col] = df[prev_lt_win_col]

            df.loc[df[horse_placed_col] == True, lt_place_col] = df[prev_lt_place_col] - 1
            df.loc[df[horse_placed_col] == False, lt_place_col] = df[prev_lt_place_col]

            df.loc[df[horse_showed_col] == True, lt_show_col] = df[prev_lt_show_col] - 1
            df.loc[df[horse_showed_col] == False, lt_show_col] = df[prev_lt_show_col]
        
            # Append intermediate columns to drop list
            drop_columns += [horse_won_col, horse_placed_col, horse_showed_col, win_col, place_col, show_col, horse_name_col]
        else:
            drop_columns += [win_col, place_col, show_col, horse_name_col]
            
    # Drop unnecessary columns
    df = df.drop(drop_columns, axis=1)    
        
    return df

def clean_data(df, num_races=3):
    '''
        Clean the data of any outliers or incorrect values
    '''
    # Drop horses that didn't finish one of their previous races (time == 0)
    label_cols = ['label_{}'.format(race) for race in range(num_races)]
    for col in label_cols:
        df = df.loc[df[col] != 0]
        
    # Drop horses who have a negative lifetime stat (think these are a result of dqs)
    for race in range(num_races):
        show_col = 'lifetime_shows_{}'.format(race)
        place_col = 'lifetime_places_{}'.format(race)
        win_col = 'lifetime_wins_{}'.format(race)
        
        df = df.loc[df[show_col] >= 0]
        df = df.loc[df[place_col] >= 0]
        df = df.loc[df[win_col] >= 0]
        
    # Drop horses who have negative distance values for their races (bad data)
    distance_cols = ['distance_{}'.format(race) for race in range(num_races)]
    for col in distance_cols:
        df = df.loc[df[col] >= 0]
    
    return df
    
    
    

In [30]:
# Pre-process input files to generate master dataframe
#pd.set_option('display.max_columns', None) # Remove
past_races_included = 3
master_df = generate_master_df('./input_files/', num_races=past_races_included)
master_df.head(5)

print(master_df.min())

419it [00:39,  9.00it/s]


lifetime_shows_0            0.00
lifetime_shows_1            0.00
lifetime_shows_2            0.00
post_position_0             0.00
post_position_1             0.00
post_position_2             0.00
num_entrants_0              3.00
num_entrants_1              3.00
num_entrants_2              3.00
weight_0                  107.00
weight_1                  106.00
weight_2                  106.00
lifetime_wins_0             0.00
lifetime_wins_1             0.00
lifetime_wins_2             0.00
days_since_prev_race_0      1.00
days_since_prev_race_1      1.00
days_since_prev_race_2      3.00
lifetime_places_0           0.00
lifetime_places_1           0.00
lifetime_places_2           0.00
label_0                     6.57
label_1                     6.57
label_2                     6.57
horse_age_0                 4.00
horse_age_1                 4.00
horse_age_2                 4.00
distance_0                100.00
distance_1                100.00
distance_2                100.00
lifetime_s

In [31]:
# Generate data and labels for dataset
# Label column gets repeated as many times as there are races due to loading process
label_cols = ['label_{}'.format(race) for race in range(past_races_included)]
    
master_data = master_df.drop(label_cols, axis=1)
master_labels = master_df['label_0']

22688


# Create Dataloader

In [10]:
def batch_dataloader(data, labels, batch_size=10):
    # Only want full batches
    num_full_batches = data.shape[0] // batch_size
    total_entries = num_full_batches * batch_size
    data = data[:total_entries]
    labels = labels[:total_entries]
    
    for ii in range(0, total_entries, batch_size):
        # Get only data and labels needed for this batch
        batch_data = data[ii:ii+batch_size]
        batch_labels = labels[ii:ii+batch_size]
        
        yield batch_data, batch_labels
        
def dataloader(data, labels):
    for ii in range(len(data)):
        # Turn data and label into tensor
        yield torch.tensor(data.iloc[ii]), torch.tensor(labels.iloc[ii])

In [11]:
# Test dataloader
sample_data, sample_label = next(iter(dataloader(master_data, master_labels)))
print(sample_data, sample_label)

tensor([4.0000e+00, 4.0000e+00, 4.0000e+00, 8.0000e+00, 4.0000e+00, 3.0000e+00,
        9.0000e+00, 1.1000e+01, 8.0000e+00, 1.5000e+02, 1.4400e+02, 1.5000e+02,
        3.0000e+00, 3.0000e+00, 3.0000e+00, 4.3000e+01, 2.1000e+01, 1.4000e+01,
        2.0000e+00, 2.0000e+00, 2.0000e+00, 7.0000e+00, 7.0000e+00, 7.0000e+00,
        2.6400e+03, 3.9600e+03, 3.7400e+03, 2.9000e+01, 2.8000e+01, 2.7000e+01]) tensor(160.2200)


# Create Training/Test Datasets

In [44]:
# Drop all rows where label is 0

train_prop = 0.8 # Proportion of training data to testing data
train_end_idx = int(len(master_data) * 0.8)

train_data = master_data[:train_end_idx]
train_labels = master_labels[:train_end_idx]

test_data = master_data[train_end_idx:]
test_labels = master_labels[train_end_idx:]

# Standardize Test and Training Sets

In [45]:
# Will subtract mean of each column from each element in that column, then divide difference by std of column
train_data = (train_data.subtract(train_data.mean()))/train_data.std()
test_data = (test_data.subtract(test_data.mean()))/test_data.std()

print((np.mean(train_data), np.std(train_data)), (np.mean(test_data), np.std(test_data)))

(lifetime_shows_0          1.922368e-15
lifetime_shows_1          1.900482e-15
lifetime_shows_2          8.314378e-16
post_position_0           1.244979e-16
post_position_1          -1.907014e-16
post_position_2          -1.982742e-16
num_entrants_0           -4.606818e-16
num_entrants_1           -1.232904e-15
num_entrants_2           -2.707534e-15
weight_0                 -1.019692e-16
weight_1                  2.030209e-17
weight_2                 -1.561333e-15
lifetime_wins_0          -1.022457e-15
lifetime_wins_1           2.363472e-15
lifetime_wins_2           4.749136e-15
days_since_prev_race_0   -3.360887e-16
days_since_prev_race_1   -1.403530e-17
days_since_prev_race_2    3.791522e-17
lifetime_places_0         4.346447e-16
lifetime_places_1        -1.843423e-15
lifetime_places_2         1.822087e-15
horse_age_0               4.287455e-15
horse_age_1               4.287455e-15
horse_age_2               4.287455e-15
distance_0                6.249298e-16
distance_1              

# Define the Model

In [50]:
input_features = train_data.shape[1]
output_size = 1
drop_prob = 0.2

model = nn.Sequential(
    nn.Linear(input_features, 512),
    nn.ReLU(),
    nn.Dropout(drop_prob),
    nn.Linear(512, 256),
    nn.ReLU(),
    nn.Dropout(drop_prob),
    nn.Linear(256, output_size)
)

print(model)

Sequential(
  (0): Linear(in_features=30, out_features=512, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.2)
  (3): Linear(in_features=512, out_features=256, bias=True)
  (4): ReLU()
  (5): Dropout(p=0.2)
  (6): Linear(in_features=256, out_features=1, bias=True)
)


# Train Model

In [47]:
# Use GPU if possible
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Define Training/Validation Loop
epochs = 2
lr = 0.003
print_every = 500
clip = 5

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

training_losses = [x for x in range(epochs)]
validation_losses = [x for x in range(epochs)]

for epoch in range(epochs):
    print('Starting Epoch {}'.format(epoch+1))
    entries_processed = 0
    
    # Begin Training Loop
    train_losses = []
    val_losses = []
    for data, label in dataloader(train_data, train_labels):
        entries_processed += 1
        
        # Set tensors to correct device
        data, label = data.to(device), label.to(device)
        
        # Zero out gradients
        model.zero_grad()
        
        # Perform pass through network
        train_out = model(data)
        
        # Calculate Loss and perform backprop -- clip gradients if necessary
        train_loss = criterion(train_out, label)
        train_loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        # Log Loss
        train_losses.append(train_loss.item())
    
        # Take optimizer step
        optimizer.step()
        
        if entries_processed % print_every == 0:
            # Validation loop
            model.eval()
            for data, label in dataloader(test_data, test_labels):
                # Set tensors to correct device
                data, label = data.to(device), label.to(device)
                
                # Generate prediction
                val_out = model(data)
                
                # Calculate and log loss
                val_loss = criterion(val_out, label)
                val_losses.append(val_loss.item())
                
                # Set back to training mode
                model.train()
                
            # Print Metrics
            print(
                'Epoch: {}/{}...'.format(epoch+1, epochs),
                'Entries Processed: {}'.format(entries_processed),
                'Training Loss: {:.6f}...'.format(np.mean(train_losses)),
                'Validation Loss: {:.6f}...'.format(np.mean(val_losses))
                 )
            
        # Log Epoch-level metrics
        training_losses[epoch] = np.mean(train_losses)
        validation_losses[epoch] = np.mean(val_losses)

Starting Epoch 1


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Epoch: 1/2... Entries Processed: 500 Training Loss: 1416.314964... Validation Loss: 330.915187...
Epoch: 1/2... Entries Processed: 1000 Training Loss: 842.478808... Validation Loss: 372.504630...
Epoch: 1/2... Entries Processed: 1500 Training Loss: 616.243961... Validation Loss: 329.914970...
Epoch: 1/2... Entries Processed: 2000 Training Loss: 517.455395... Validation Loss: 307.274836...
Epoch: 1/2... Entries Processed: 2500 Training Loss: 453.350680... Validation Loss: 329.357551...
Epoch: 1/2... Entries Processed: 3000 Training Loss: 395.453992... Validation Loss: 390.240710...
Epoch: 1/2... Entries Processed: 3500 Training Loss: 356.729074... Validation Loss: 379.116907...
Epoch: 1/2... Entries Processed: 4000 Training Loss: 324.779290... Validation Loss: 346.515146...
Epoch: 1/2... Entries Processed: 4500 Training Loss: 298.913446... Validation Loss: 339.747958...
Epoch: 1/2... Entries Processed: 5000 Training Loss: 284.645911... Validation Loss: 316.347209...
Epoch: 1/2... Entrie

KeyboardInterrupt: 

# Plot Performance

In [None]:
import matplotlib.pyplot as plt

# Plot training losses
plt.title('Training Losses')
plt.plot(training_losses)
plt.show()

# Plot Validation Losses
plt.title('Validation Losses')
plt.plot(validation_losses)
plt.show() 