# Part 4: Horse Race Prediction
## Regression Modelling
- In this section, we want to predict the finishing times of horses in a race, and then use it to predict the winner.
- We will use RMSE to evaluate, then after classification of the horse with the fastest time, find the accuracy of our prediction.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.svm import SVC

from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score, roc_curve, auc, confusion_matrix, roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve

from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor

import time
import joblib

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [2]:
# Read the train and test files
df_train = pd.read_csv(#uploadedfilelocation)
df_test = pd.read_csv(#uploadedfilelocation)
df_unseen = pd.read_csv(#uploadedfilelocation)

In [3]:
# View the shape of the train and test files
print(df_train.shape)
print(df_test.shape)
print(df_unseen.shape)

(21139, 27)
(5286, 27)
(2939, 27)


In [4]:
# View the first 2 rows of the train file
df_train.head(2)

Unnamed: 0,finishing_position,horse_number,horse_name,horse_id,jockey,trainer,actual_weight,declared_horse_weight,draw,length_behind_winner,...,running_position_6,race_id,recent_6_runs,recent_ave_rank,race_distance,HorseWin,HorseRankTop3,HorseRankTop50Percent,jockey_ave_rank,trainer_ave_rank
0,1,1.0,DOUBLE DRAGON,K019,B Prebble,D Cruz,133,1032,1,-,...,,2014-001,1,1.0,1400,1,1,1,6.058185,7.369681
1,2,2.0,PLAIN BLUE BANNER,S070,D Whyte,D E Ferraris,133,1075,13,2,...,,2014-001,2,2.0,1400,0,1,1,5.795734,6.721234


In [5]:
# View the first 2 rows of the test files
df_test.head(2)

Unnamed: 0,finishing_position,horse_number,horse_name,horse_id,jockey,trainer,actual_weight,declared_horse_weight,draw,length_behind_winner,...,running_position_6,race_id,recent_6_runs,recent_ave_rank,race_distance,HorseWin,HorseRankTop3,HorseRankTop50Percent,jockey_ave_rank,trainer_ave_rank
0,1,11.0,SUPER FLUKE,T382,J Moreira,D Cruz,118,1118,6,-,...,,2016-138,1/5/3/2/4/5,4.941176,1650,1,1,1,4.070707,8.052941
1,2,10.0,PEARL WARM WARM,T183,S Clipperton,P O'Sullivan,118,1099,12,SH,...,,2016-138,2/3/8/7/1/6,5.266667,1650,0,1,1,6.714859,6.858209


In [6]:
# View the first 2 rows of the unseen file
df_unseen.head(2)

Unnamed: 0,finishing_position,horse_number,horse_name,horse_id,jockey,trainer,actual_weight,declared_horse_weight,draw,length_behind_winner,...,running_position_6,race_id,recent_6_runs,recent_ave_rank,race_distance,HorseWin,HorseRankTop3,HorseRankTop50Percent,jockey_ave_rank,trainer_ave_rank
0,1,8.0,PRINCE HARMONY,V012,D Whyte,W Y So,119,1142,2,-,...,,2016-564,1/13/9/6/10,7.8,1200,1,1,1,5.795734,7.117705
1,2,2.0,GRACYDAD,V200,Z Purton,D J Hall,133,1092,7,SH,...,,2016-564,2/1/6/3/3/3,6.352941,1200,0,1,1,5.343498,6.639836


### Preprocessing of Train and Test Data

In [7]:
X_train = df_train[['actual_weight', 'declared_horse_weight',
                    'draw', #'win_odds',
                    'jockey_ave_rank','trainer_ave_rank',
                    'recent_ave_rank','race_distance']]

# Define the target
y_train = df_train['finish_time']

# Convert the target to seconds
y_train = y_train.apply(lambda x: x.split('.'))
y_train = y_train.apply(lambda x: int(x[0])*60 + int(x[1]) + int(x[2])/100)

In [8]:
y_train.head()

0    82.33
1    82.65
2    82.66
3    82.66
4    83.02
Name: finish_time, dtype: float64

In [9]:
# Define the testing set
X_test = df_test[['actual_weight', 'declared_horse_weight',
                    'draw', #'win_odds', 
                    'jockey_ave_rank', 'trainer_ave_rank',
                    'recent_ave_rank', 'race_distance']]

In [10]:
# Define the target
y_test = df_test['finish_time']

# Convert the target to seconds
y_test = y_test.apply(lambda x: x.split('.'))
y_test = y_test.apply(lambda x: int(x[0])*60 + int(x[1]) + int(x[2])/100)

In [11]:
# Define the unseen set
X_unseen = df_unseen[['actual_weight', 'declared_horse_weight',
                    'draw', #'win_odds',
                    'jockey_ave_rank', 'trainer_ave_rank',
                    'recent_ave_rank', 'race_distance']]

### Define functions to run and evaluate models

In [12]:
# This function finds the accuracy of the model for predicting the Top and Top 3 finishers
def find_prob(y_pred):
    
    i=0
    count_top_winners = 0
    count_top_correct = 0

    count_top3_winners = 0
    count_top3_correct = 0

    for column in ['HorseWin', 'HorseRankTop3']:
            
        for race in df_test['race_id'].unique():
            
            # Create temp dataframe
            temp = df_test[df_test['race_id']==race]

            # Get the index of the temp dataframe
            temp_index = temp.index

            # Find the index of the winners from the temp dataframe
            if i == 0:
                winners_index = temp[temp['finishing_position']==1].index
            else:
                winners_index = temp[temp['finishing_position']<=3].index

            # Create a temp dataframe for the predicted probabilities
            temp_pred = y_pred.iloc[temp_index]

            # Sort the temp dataframe by the predicted timings
            temp_pred = temp_pred.sort_values(by=temp_pred.columns[0])

            # Get the index of the winners from the temp pred dataframe
            if i == 0:
                winners_pred_index = temp_pred[:1].index
            else:
                winners_pred_index = temp_pred[:3].index

            # Count the number of winners and correct predictions
            if i == 0:
                count_top_winners += len(winners_index)
                count_top_correct += len(set(winners_index).intersection(set(winners_pred_index)))
            else:
                count_top3_winners += len(winners_index)
                count_top3_correct += len(set(winners_index).intersection(set(winners_pred_index)))
        i+=1
    
    # Calculate the accuracy
    top_accuracy = round(count_top_correct/count_top_winners, 3)
    top3_accuracy = round(count_top3_correct/count_top3_winners, 3)

    return top_accuracy, top3_accuracy

In [13]:
# Create a dataframe to store the results
results = pd.DataFrame(columns=['Model', 'RMSE_train', 'RMSE_test', 
                                'Generalization', 'Top1_Train_Accuracy', 'Top1_Test_Accuracy',
                                'Top3_Train_Accuracy', 'Top3_Test_Accuracy'])

In [14]:
# Define function to run the model
def run_model(model, X_train, y_train, X_test, y_test, X_unseen):

        # Store model name
        model_name = model.__class__.__name__

        # Scale the data
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        X_unseen = scaler.transform(X_unseen)

        # Fit the model         
        model.fit(X_train, y_train)
        
        # Predict on the training set
        y_train_pred = model.predict(X_train)
        y_train_pred = pd.DataFrame(y_train_pred)

        # Predict on the testing set
        y_test_pred = model.predict(X_test)
        y_test_pred = pd.DataFrame(y_test_pred)

        # Calculate the RMSE
        train_rmse = round(math.sqrt(mean_squared_error(y_train, y_train_pred)), 3)
        test_rmse = round(math.sqrt(mean_squared_error(y_test, y_test_pred)), 3)
        
        # Calculate the accuracy
        train_accuracy, train_accuracy_top3 = find_prob(y_train_pred)
        test_accuracy, test_accuracy_top3 = find_prob(y_test_pred)

        # Calculate generalization error percentage
        generalization_error = round((test_rmse - train_rmse)/train_rmse*100, 3)

        # Print the results
        print('Model results for', model_name, ':')
        print('Train RMSE: ', train_rmse)
        print('Test RMSE: ', test_rmse)
        print('Generalization Error: ', generalization_error, '%', '\n')

        print('Train Accuracy for finding Top position: ', train_accuracy)
        print('Test Accuracy for finding Top position: ', test_accuracy, '\n')

        print('Train Accuracy for finding Top 3 positions: ', train_accuracy_top3)
        print('Test Accuracy for finding Top 3 positions: ', test_accuracy_top3)

        # Append the results to the dataframe
        results.loc[len(results)] = [model_name, train_rmse, test_rmse, generalization_error,
                                  train_accuracy, test_accuracy, train_accuracy_top3, test_accuracy_top3]
        
        # predict on unseen data
        y_unseen_pred = model.predict(X_unseen)
        y_unseen_pred = pd.DataFrame(y_unseen_pred)

        return y_unseen_pred
        


### Model 1: Ridge Regression

In [15]:
# Run the model
ridge = Ridge(alpha=2600)

In [16]:
# Run the model
ridge_pred = run_model(ridge, X_train, y_train, X_test, y_test, X_unseen)

Model results for Ridge :
Train RMSE:  2.425
Test RMSE:  2.482
Generalization Error:  2.351 % 

Train Accuracy for finding Top position:  0.165
Test Accuracy for finding Top position:  0.245 

Train Accuracy for finding Top 3 positions:  0.375
Test Accuracy for finding Top 3 positions:  0.403


### Model 2: K-Nearest Neighbors Regressor

In [17]:
# KNN
knn = KNeighborsRegressor(n_neighbors=500)

knn_pred = run_model(knn, X_train, y_train, X_test, y_test, X_unseen)


Model results for KNeighborsRegressor :
Train RMSE:  6.171
Test RMSE:  6.666
Generalization Error:  8.021 % 

Train Accuracy for finding Top position:  0.156
Test Accuracy for finding Top position:  0.132 

Train Accuracy for finding Top 3 positions:  0.331
Test Accuracy for finding Top 3 positions:  0.304


### Model 3: Random Forest Regressor

In [18]:
# Run the model
rf = RandomForestRegressor(n_estimators=30, max_depth=4, random_state=42, max_features=5,
                            min_samples_split=20, min_samples_leaf=200, n_jobs=-1)

rf_pred = run_model(rf, X_train, y_train, X_test, y_test, X_unseen)

Model results for RandomForestRegressor :
Train RMSE:  1.993
Test RMSE:  2.056
Generalization Error:  3.161 % 

Train Accuracy for finding Top position:  0.191
Test Accuracy for finding Top position:  0.224 

Train Accuracy for finding Top 3 positions:  0.364
Test Accuracy for finding Top 3 positions:  0.391


### Model 4: Light Gradient Boosting Machine (LightGBM)

In [19]:
# Run the model
lgbm = LGBMRegressor(n_estimators=20, max_depth=5, random_state=42, num_leaves=100,
                     min_child_samples=10, min_child_weight=10, n_jobs=-1)

lgbm_pred = run_model(lgbm, X_train, y_train, X_test, y_test, X_unseen)

Model results for LGBMRegressor :
Train RMSE:  2.518
Test RMSE:  2.573
Generalization Error:  2.184 % 

Train Accuracy for finding Top position:  0.219
Test Accuracy for finding Top position:  0.285 

Train Accuracy for finding Top 3 positions:  0.415
Test Accuracy for finding Top 3 positions:  0.489


### View results of our 4 regression models

In [20]:
# View the results
results

Unnamed: 0,Model,RMSE_train,RMSE_test,Generalization,Top1_Train_Accuracy,Top1_Test_Accuracy,Top3_Train_Accuracy,Top3_Test_Accuracy
0,Ridge,2.425,2.482,2.351,0.165,0.245,0.375,0.403
1,KNeighborsRegressor,6.171,6.666,8.021,0.156,0.132,0.331,0.304
2,RandomForestRegressor,1.993,2.056,3.161,0.191,0.224,0.364,0.391
3,LGBMRegressor,2.518,2.573,2.184,0.219,0.285,0.415,0.489


Since our objective is to have low RMSE, good generalisation, and good training accuracy, the LGBMRegressor meets all the criteria and we will choose it as our final model for backtesting.

In [21]:
# Save predictions
ridge_pred.to_csv('./predictions/ridge_pred.csv')
knn_pred.to_csv('./predictions/knn_pred.csv')
rf_pred.to_csv('./predictions/rf_pred.csv')
lgbm_pred.to_csv('./predictions/lgbm_pred.csv')

In [22]:
# Save the model
joblib.dump(lgbm, './model/lgbm_model.pkl')

['./model/lgbm_model.pkl']

In [23]:
# Save the results
results.to_csv('./results/reg_results.csv')