In [3]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split # splits data into training and testing sets
from sklearn.metrics import accuracy_score # evaluates how well the model predicts wins

data = pd.read_csv(r"..\features\features_encoded.csv", parse_dates=['Date'])
data

Unnamed: 0,Date,Track,Race Number,Horse,Barrier,Jockey,Trainer,SP,Finish Position,Race Distance,...,SP_missing,JockeyWinRate,TrainerWinRate,HorseWinRate,Jockey_ID,Trainer_ID,Horse_ID,Jockey_Enc,Trainer_Enc,Horse_Enc
0,2023-01-01,PORT MACQUARIE,1,COLLIDED,10,MATTHEW BENNETT,SALLY TAYLOR,18.0,5.0,1512,...,0,0.060606,0.068182,0.000000,818,3546,7847,0.060606,0.068182,0.000000
1,2023-01-01,PORT MACQUARIE,1,GOLD MERCHANT,9,ASHLEY MORGAN,PAUL SHAILER,5.5,5.0,1512,...,0,0.113247,0.099644,0.045455,85,3076,15368,0.113247,0.099644,0.045455
2,2023-01-01,PORT MACQUARIE,1,PARTY STOP,7,PETER GRAHAM,STEPHEN FARLEY,11.0,5.0,1512,...,0,0.027505,0.025641,0.000000,957,3737,30037,0.027505,0.025641,0.000000
3,2023-01-01,PORT MACQUARIE,1,SUKHBIR,3,AARON BULLOCK,KRIS LEES,2.6,1.0,1512,...,0,0.173797,0.087165,0.104167,5,2129,39694,0.173797,0.087165,0.104167
4,2023-01-01,PORT MACQUARIE,1,EL CABALLO,1,GRANT BUCKLEY,MATTHEW ROBINSON,6.5,3.0,1512,...,0,0.068706,0.048246,0.000000,421,2555,11416,0.068706,0.048246,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442645,2025-10-20,MOUNT ISA,7,ROMAN DAGGER,7,SHANE MCGOVERN,KERRY KROGH,101.0,5.0,1450,...,0,0.013514,0.016393,0.181818,1106,2070,33949,0.013514,0.016393,0.181818
442646,2025-10-20,MOUNT ISA,7,INFINITE PRINCE,8,DENICIOUS SMITH,EMMA MORTON,5.5,4.0,1450,...,0,0.012552,0.052632,0.062500,329,1089,18774,0.012552,0.052632,0.062500
442647,2025-10-20,MOUNT ISA,7,MAGIC GAME,6,CHRIS WHITELEY,TANYA PARRY,23.0,5.0,1450,...,0,0.088586,0.079681,0.083333,233,3883,23860,0.088586,0.079681,0.083333
442648,2025-10-20,MOUNT ISA,7,MEGHAN,3,MADDIE MANKELOW,TANYA PARRY,3.5,2.0,1450,...,0,0.127660,0.079681,0.030303,793,3883,25003,0.127660,0.079681,0.030303


# Setup the Features

## Preprocess the Features

In [4]:
# Features - List of columns used as inputs to predict Win
features = [
    'Barrier', 'SP', 'Race Distance', 'Class', 'Track Condition', 'Weather',
    'Total Runners', 'JockeyWinRate', 'TrainerWinRate', 'Jockey_ID', 'Trainer_ID',
    'Horse_ID', 'HorseWinRate'
]

# Target - column trying to predict (0 or 1)
target = 'Win'

x = data[features] # extracts features for the model
y = data[target] # extracts target labels

# Tells LightGBM which columns are categorical (non-numeric). LightGBM can handle these natively without one-hot encoding
categorical_features = ['Class', 'Track Condition', 'Weather', 'Jockey_ID', 'Trainer_ID', 'Horse_ID']
for col in categorical_features:
    x[col] = x[col].astype('category')

print(x)

        Barrier     SP  Race Distance   Class Track Condition Weather  \
0            10   18.0           1512  MAIDEN           GOOD4   OCAST   
1             9    5.5           1512  MAIDEN           GOOD4   OCAST   
2             7   11.0           1512  MAIDEN           GOOD4   OCAST   
3             3    2.6           1512  MAIDEN           GOOD4   OCAST   
4             1    6.5           1512  MAIDEN           GOOD4   OCAST   
...         ...    ...            ...     ...             ...     ...   
442645        7  101.0           1450    BM65            GOOD    FINE   
442646        8    5.5           1450    BM65            GOOD    FINE   
442647        6   23.0           1450    BM65            GOOD    FINE   
442648        3    3.5           1450    BM65            GOOD    FINE   
442649        5   34.0           1450    BM65            GOOD    FINE   

        Total Runners  JockeyWinRate  TrainerWinRate Jockey_ID Trainer_ID  \
0                  11       0.060606        0.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x[col] = x[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x[col] = x[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x[col] = x[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_in

## Split the Data

In [5]:
# train_test_split - Splits the data into training set (80%) and test set (20%).
# random_state=42 - ensures the split is reproducible
# stratify=y - keeps the same proportion of winners and non-winners in both sets (dataset is imbalanced).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

## Train the Model

In [6]:
# Converts data into LightGBM’s optimized format
lgb_train = lgb.Dataset(x_train, label=y_train, categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(x_test, label=y_test, reference=lgb_train, categorical_feature=categorical_features)

params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbose': -1,
    'feature_fraction': 0.9,  # each tree sees 90% of features randomly
    'bagging_fraction': 0.8,
    'bagging_freq': 5
}

model = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_train, lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=10)]
)
# model.save_model('poc_model.txt', num_iteration=model.best_iteration)

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[77]	training's binary_logloss: 0.163253	valid_1's binary_logloss: 0.180516


# Save model

In [7]:
import joblib

joblib.dump(model, "model_with_sp.pkl")

['model_with_sp.pkl']

## Make the Predictions

In [8]:
y_pred_prob = model.predict(x_test, num_iteration=model.best_iteration)
y_pred = (y_pred_prob > 0.5).astype(int) # converts probabilities to 0 or 1 (threshold = 0.5)

accuracy = accuracy_score(y_test, y_pred) # compares predictions with actual results to calculate accuracy
print("Accuracy:", np.floor(accuracy*100), "%")

Accuracy: 92.0 %


## Simulate the Betting ROI

In [9]:
# Copy test data and include relevant info
test_results = x_test.copy()
test_results['Win'] = y_test.values
test_results['Predicted'] = y_pred
test_results['SP'] = data.loc[x_test.index, 'SP']      # odds
test_results['Track'] = data.loc[x_test.index, 'Track']  # track
test_results['Horse'] = data.loc[x_test.index, 'Horse']  # horse name
test_results['Jockey'] = data.loc[x_test.index, 'Jockey']  # Jockey name
test_results['Trainer'] = data.loc[x_test.index, 'Trainer']  # Trainer name
test_results['RaceNumber'] = data.loc[x_test.index, 'Race Number']  # race number
test_results['Date'] = data.loc[x_test.index, 'Date']    # race date

standard_bet_size = 10

# Flat $10 bet only on predicted winners
test_results['Bet'] = test_results['Predicted'] * standard_bet_size
test_results['Payout'] = test_results['Bet'] * (test_results['Win'] * test_results['SP'])
test_results['Profit'] = test_results['Payout'] - test_results['Bet']

# Keep only races where a bet was made
bet_summary_flat = test_results[test_results['Predicted'] == 1][[
    'Date', 'Track', 'RaceNumber', 'Horse', 'Jockey', 'Trainer', 'SP', 'Bet', 'Profit',
]]

# Sort by date and race for clarity
bet_summary_flat = bet_summary_flat.sort_values(['Date', 'RaceNumber'])

bet_summary_flat.to_csv("./results/bet_summary_flat.csv", sep=",", index=False)

# Display
bet_summary_flat

Unnamed: 0,Date,Track,RaceNumber,Horse,Jockey,Trainer,SP,Bet,Profit
700,2023-01-01,SUNSHINE COAST,2,ATLANTIC EAGLE,KYLE WILSON-TAYLOR,PAUL DUNCAN,2.90,10,19.0
144,2023-01-01,INVERELL,6,PATENTED,EMILY LANG,PAT WEBSTER,3.90,10,-10.0
1134,2023-01-02,TERANG,4,MUSGRAVE,JOHN ALLEN,C MAHER & D EUSTACE,2.00,10,-10.0
1420,2023-01-02,ESPERANCE,7,KORANIS,ANGELA JOHNSTON,CAREY MARTIN,5.00,10,40.0
1445,2023-01-03,TAREE,3,HANDSOME,DYLAN GIBBONS,KRIS LEES,1.35,10,3.5
...,...,...,...,...,...,...,...,...,...
440378,2025-10-18,RANDWICK,7,KA YING RISING,ZAC PURTON,DAVID A HAYES,2.00,10,10.0
442231,2025-10-19,SUNSHINE COAST,1,BOOMELLI,BEN THOMPSON,TONY GOLLAN,1.20,10,2.0
441980,2025-10-19,MOE,5,VAN GERWEN,JASON MASKIELL,PETER GELAGOTIS,2.90,10,-10.0
442283,2025-10-19,SUNSHINE COAST,6,ITALIAN RIVIERA,JAG GUTHMANN-CHEST,STUART KENDRICK,3.80,10,-10.0


### With Variable bet size

In [10]:
# Variable bet size
max_bet = 100

# Compute payout and profit
test_results['Bet'] = y_pred_prob * max_bet  # higher probability → bigger bet
test_results['Payout'] = test_results['Bet'] * (test_results['Win'] * test_results['SP'])
test_results['Profit'] = np.floor(test_results['Payout'] - test_results['Bet'])

# Keep only races where a bet was made
bet_summary_variable = test_results[test_results['Predicted'] == 1][[
    'Date', 'Track', 'RaceNumber', 'Horse', 'Jockey', 'Trainer', 'SP', 'Bet', 'Profit',
]]

# Sort by date and race for clarity
bet_summary_variable = bet_summary_variable.sort_values(['Date', 'RaceNumber'])

bet_summary_variable.to_csv("./results/bet_summary_variable.csv", sep=",", index=False)

# Display
bet_summary_variable



Unnamed: 0,Date,Track,RaceNumber,Horse,Jockey,Trainer,SP,Bet,Profit
700,2023-01-01,SUNSHINE COAST,2,ATLANTIC EAGLE,KYLE WILSON-TAYLOR,PAUL DUNCAN,2.90,64.048334,121.0
144,2023-01-01,INVERELL,6,PATENTED,EMILY LANG,PAT WEBSTER,3.90,58.781316,-59.0
1134,2023-01-02,TERANG,4,MUSGRAVE,JOHN ALLEN,C MAHER & D EUSTACE,2.00,59.740573,-60.0
1420,2023-01-02,ESPERANCE,7,KORANIS,ANGELA JOHNSTON,CAREY MARTIN,5.00,57.043970,228.0
1445,2023-01-03,TAREE,3,HANDSOME,DYLAN GIBBONS,KRIS LEES,1.35,59.275739,20.0
...,...,...,...,...,...,...,...,...,...
440378,2025-10-18,RANDWICK,7,KA YING RISING,ZAC PURTON,DAVID A HAYES,2.00,66.097726,66.0
442231,2025-10-19,SUNSHINE COAST,1,BOOMELLI,BEN THOMPSON,TONY GOLLAN,1.20,91.869559,18.0
441980,2025-10-19,MOE,5,VAN GERWEN,JASON MASKIELL,PETER GELAGOTIS,2.90,53.804063,-54.0
442283,2025-10-19,SUNSHINE COAST,6,ITALIAN RIVIERA,JAG GUTHMANN-CHEST,STUART KENDRICK,3.80,52.892158,-53.0
