In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("../raw_data/horse_racing_raw.csv")
data.head(2)

Unnamed: 0,id,f_id,f_ko,f_track,f_going,f_racetype,f_horse,f_jockey,f_trainer,f_distance,...,f_pm_05m,f_pm_03m,f_pm_02m,f_pm_01m,f_bsp_p_back,f_bsp_p_lay,f_pm_01m_p_back,f_pm_01m_p_lay,f_pm_15m_p_back,f_pm_15m_p_lay
0,16916847000316,16916847000002,2023-08-10 17:25:00,YARMOUTH,GD,Other Handicap,Porfin,Molly Presland,Phil McEntee,7.0,...,9.0,10.0,9.8,7.6,-1.0,0.95,-1.0,0.95,-1.0,0.95
1,16916953800243,16916953800013,2023-08-10 20:23:00,SALISBURY,GD,Other Handicap,Manxman,Robert Havlin,Simon & Ed Crisford,14.0,...,2.02,1.96,2.04,2.0,0.96,-1.01,0.95,-1.02,1.06,-1.14


In [3]:
irish_tracks = [
    "SLIGO", "LIMERICK", "NAVAN", "WEXFORD", "CURRAGH",
    "GALWAY", "KILBEGGAN", "GOWRAN PARK", "BELLEWSTOWN",
    "LISTOWEL", "THURLES", "BALLINROBE", "TRAMORE",
    "LEOPARDSTOWN", "DOWN ROYAL", "ROSCOMMON", "CORK",
    "DUNDALK", "KILLARNEY", "LAYTOWN", "TIPPERARY",
    "FAIRYHOUSE", "NAAS", "DOWNPATRICK", "CLONMEL",
    "PUNCHESTOWN"
]

In [4]:
data['country'] = data['f_track'].apply(lambda x: 'IRE' if x in irish_tracks else 'GB')

In [5]:
# Assuming you have a DataFrame named 'data'

# Calculate mean ratings for each 'f_id' group
mean_ratings_by_id = data.groupby('f_id')['f_rating_or'].mean()

# Define the mapping of mean ratings to f_class values
rating_to_f_class_mapping = {
    (96, float('inf')): 1,
    (86, 96): 2,
    (76, 86): 3,
    (66, 76): 4,
    (56, 66): 5,
    (46, 56): 6,
    (-float('inf'), 46): 7
}

# Function to map mean ratings to f_class values
def map_rating_to_f_class(mean_rating):
    for rating_range, f_class_value in rating_to_f_class_mapping.items():
        if rating_range[0] <= mean_rating <= rating_range[1]:
            return f_class_value

# Apply the mapping to fill NULL values in 'f_class' column based on mean ratings
data['f_class'] = data.apply(lambda row: map_rating_to_f_class(mean_ratings_by_id.get(row['f_id'])), axis=1)

# Now the 'f_class' column should be filled based on the specified mapping using mean ratings


In [6]:
# Merge the mean ratings back into the original DataFrame based on 'f_id'
data = data.merge(mean_ratings_by_id, how='left', left_on='f_id', right_index=True)

# Rename the merged mean rating column for clarity
data.rename(columns={'f_rating_or_y': 'mean_f_rating_or', 'f_rating_or_x' : 'f_rating_or' }, inplace=True)

In [7]:
data[['f_track', 'f_class', 'country', 'f_id', 'f_rating_or', 'mean_f_rating_or']].head(10)

Unnamed: 0,f_track,f_class,country,f_id,f_rating_or,mean_f_rating_or
0,YARMOUTH,5,GB,16916847000002,64.0,56.888889
1,SALISBURY,5,GB,16916953800013,62.0,62.272727
2,YARMOUTH,6,GB,16916829000014,53.0,51.333333
3,CHEPSTOW,3,GB,16916940000027,79.0,82.375
4,SLIGO,1,IRE,16916835000029,80.0,109.0
5,SANDOWN,4,GB,16916949000036,74.0,70.166667
6,BRIGHTON,6,GB,16916814000040,43.0,47.2
7,ROSCOMMON,3,IRE,16915194000000,95.0,85.692308
8,RIPON,6,GB,16915218000001,54.0,55.285714
9,ROSCOMMON,3,IRE,16915212000011,85.0,80.928571


In [8]:
data['or_rating_vs_avg'] = data['f_rating_or'] - data['mean_f_rating_or']

In [9]:
data[['f_track', 'f_class', 'country', 'f_id', 'f_rating_or', 'mean_f_rating_or', 'or_rating_vs_avg']].head(10)

Unnamed: 0,f_track,f_class,country,f_id,f_rating_or,mean_f_rating_or,or_rating_vs_avg
0,YARMOUTH,5,GB,16916847000002,64.0,56.888889,7.111111
1,SALISBURY,5,GB,16916953800013,62.0,62.272727,-0.272727
2,YARMOUTH,6,GB,16916829000014,53.0,51.333333,1.666667
3,CHEPSTOW,3,GB,16916940000027,79.0,82.375,-3.375
4,SLIGO,1,IRE,16916835000029,80.0,109.0,-29.0
5,SANDOWN,4,GB,16916949000036,74.0,70.166667,3.833333
6,BRIGHTON,6,GB,16916814000040,43.0,47.2,-4.2
7,ROSCOMMON,3,IRE,16915194000000,95.0,85.692308,9.307692
8,RIPON,6,GB,16915218000001,54.0,55.285714,-1.285714
9,ROSCOMMON,3,IRE,16915212000011,85.0,80.928571,4.071429


In [18]:
pd.set_option('display.max_columns', None)

In [12]:
data['15m_odds_prob'] = 1 / data['f_pm_15m']
data['5m_odds_prob'] = 1 / data['f_pm_05m']

In [14]:
data['15to5m_odds_move_perc'] = (data['5m_odds_prob'] / data['15m_odds_prob'] - 1)

In [15]:
data['15to5m_odds_move_raw'] = (data['5m_odds_prob'] - data['15m_odds_prob'])

In [17]:
# Convert 'f_ko' column to datetime
data['f_ko'] = pd.to_datetime(data['f_ko'])

# Sort the DataFrame by 'f_ko' in ascending order
data.sort_values(by='f_ko', inplace=True)

# Calculate 'prev_jockey_runs' column
data['prev_jockey_runs'] = (
    data.groupby('f_jockey').cumcount()
)

# Calculate 'prev_horse_runs' column
data['prev_horse_runs'] = (
    data.groupby('f_horse').cumcount()
)

# Calculate 'prev_trainer_runs' column
data['prev_trainer_runs'] = (
    data.groupby('f_trainer').cumcount()
)

# Reset index after sorting and groupby operations
data.reset_index(drop=True, inplace=True)

In [19]:
data.head()

Unnamed: 0,id,f_id,f_ko,f_track,f_going,f_racetype,f_horse,f_jockey,f_trainer,f_distance,f_class,f_age,f_pace,f_weight,f_runners,pred_isp,f_rating_rbd,f_rating_or,id.1,f_id.1,f_place,f_bsp,f_ip_min,f_ip_max,f_pm_15m,f_pm_10m,f_pm_05m,f_pm_03m,f_pm_02m,f_pm_01m,f_bsp_p_back,f_bsp_p_lay,f_pm_01m_p_back,f_pm_01m_p_lay,f_pm_15m_p_back,f_pm_15m_p_lay,country,mean_f_rating_or,or_rating_vs_avg,15m_odds_prob,5m_odds_prob,15to5m_odds_move_perc,15to5m_odds_move_raw,prev_jockey_runs,prev_horse_runs,prev_trainer_runs
0,16042788000037,16042788000007,2020-11-02 01:00:00,CURRAGH,HTS,Maiden-Flat,Baseman,Oisin Orr,D K Weld,7.0,7,2,7.0,132,18,4.0,111.0,0.0,16042788000037,16042788000007,15,3.2,3.05,1000.0,3.2,3.1,3.3,3.45,3.65,3.3,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,0.0,0.0,0.3125,0.30303,-0.030303,-0.00947,0.0,0,0.0
1,16042788000098,16042788000007,2020-11-02 01:00:00,CURRAGH,HTS,Maiden-Flat,Diamil,Shane Foley,M Halford,7.0,7,2,0.0,132,18,26.0,7.0,0.0,16042788000098,16042788000007,9,77.79,50.0,1000.0,75.0,90.0,100.0,90.0,90.0,90.0,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,0.0,0.0,0.013333,0.01,-0.25,-0.003333,0.0,0,0.0
2,16042788000265,16042788000007,2020-11-02 01:00:00,CURRAGH,HTS,Maiden-Flat,Many Words,Robbie Colgan,Ms Sheila Lavery,7.0,7,2,0.0,132,18,51.0,75.0,0.0,16042788000265,16042788000007,12,390.95,100.0,1000.0,250.0,490.0,480.0,420.0,360.0,360.0,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,0.0,0.0,0.004,0.002083,-0.479167,-0.001917,0.0,0,0.0
3,16042788000225,16042788000007,2020-11-02 01:00:00,CURRAGH,HTS,Maiden-Flat,Karpen,Ben Coen,Timothy Doyle,7.0,7,2,0.0,132,18,201.0,7.0,0.0,16042788000225,16042788000007,16,742.14,340.0,1000.0,320.0,340.0,390.0,330.0,330.0,360.0,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,0.0,0.0,0.003125,0.002564,-0.179487,-0.000561,0.0,0,0.0
4,16042788000305,16042788000007,2020-11-02 01:00:00,CURRAGH,HTS,Maiden-Flat,New Reality,Kevin Manning,J S Bolger,7.0,7,2,0.0,132,18,11.0,7.0,0.0,16042788000305,16042788000007,10,51.33,38.0,1000.0,50.0,80.0,90.0,85.0,80.0,55.0,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,0.0,0.0,0.02,0.011111,-0.444444,-0.008889,0.0,0,0.0


In [24]:
data['prev_jockey_wins'] = (
    data[data['f_place']==1.0].groupby('f_jockey').cumcount())

In [30]:
data[data['f_place']==1.0].groupby('f_jockey').cumcount()

7           0.0
23          0.0
29          0.0
38          0.0
60          0.0
          ...  
345305     39.0
345327     85.0
345333     47.0
345349     24.0
345356    185.0
Length: 36648, dtype: float64

In [28]:
data.tail(20)

Unnamed: 0,id,f_id,f_ko,f_track,f_going,f_racetype,f_horse,f_jockey,f_trainer,f_distance,f_class,f_age,f_pace,f_weight,f_runners,pred_isp,f_rating_rbd,f_rating_or,id.1,f_id.1,f_place,f_bsp,f_ip_min,f_ip_max,f_pm_15m,f_pm_10m,f_pm_05m,f_pm_03m,f_pm_02m,f_pm_01m,f_bsp_p_back,f_bsp_p_lay,f_pm_01m_p_back,f_pm_01m_p_lay,f_pm_15m_p_back,f_pm_15m_p_lay,country,mean_f_rating_or,or_rating_vs_avg,15m_odds_prob,5m_odds_prob,15to5m_odds_move_perc,15to5m_odds_move_raw,prev_jockey_runs,prev_horse_runs,prev_trainer_runs,prev_jockey_wins
345343,16925517000084,16925517000003,2023-08-20 18:15:00,SANDOWN,GD,Other Handicap,Chips And Rice,Daniel Muscutt,James Fanshawe,9.0,3,4,7.0,135,13,11.0,104.0,78.0,16925517000084,16925517000003,9,13.61,11.5,1000.0,8.6,9.0,9.8,9.4,10.0,11.5,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,78.8,-0.8,0.116279,0.102041,-0.122449,-0.014238,2078.0,12,656.0,
345344,16925517000258,16925517000003,2023-08-20 18:15:00,SANDOWN,GD,Other Handicap,Miss Down Under,Rob Hornby,Amanda Perrett,9.0,3,4,9.0,132,13,26.0,102.0,75.0,16925517000258,16925517000003,8,19.25,12.0,1000.0,28.0,29.0,34.0,30.0,28.0,23.0,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,78.8,-3.8,0.035714,0.029412,-0.176471,-0.006303,1693.0,11,398.0,
345345,16925517000439,16925517000003,2023-08-20 18:15:00,SANDOWN,GD,Other Handicap,Zarga,Richard Kingscote,Sir Michael Stoute,9.0,3,3,3.0,134,13,3.25,121.0,84.0,16925517000439,16925517000003,4,2.81,2.0,1000.0,2.96,2.78,2.8,2.9,2.8,2.86,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,78.8,5.2,0.337838,0.357143,0.057143,0.019305,2198.0,6,733.0,
345346,16925517000103,16925517000003,2023-08-20 18:15:00,SANDOWN,GD,Other Handicap,Crystal Casque,Oliver Searle,Rod Millman,9.0,3,8,3.0,140,13,8.0,111.0,82.0,16925517000103,16925517000003,7,17.02,10.5,1000.0,13.0,14.0,12.5,14.0,16.0,15.5,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,78.8,3.2,0.076923,0.08,0.04,0.003077,81.0,29,876.0,
345347,16925517000417,16925517000003,2023-08-20 18:15:00,SANDOWN,GD,Other Handicap,Victoria Grove,Jefferson Smith,Henry Spiller,9.0,3,4,5.0,131,13,34.0,102.0,74.0,16925517000417,16925517000003,10,110.0,100.0,1000.0,85.0,85.0,90.0,75.0,90.0,95.0,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,78.8,-4.8,0.011765,0.011111,-0.055556,-0.000654,171.0,13,408.0,
345348,16925517000388,16925517000003,2023-08-20 18:15:00,SANDOWN,GD,Other Handicap,Tango Tonight,Oisin Murphy,Hughie Morrison,9.0,3,4,6.0,131,13,17.0,101.0,74.0,16925517000388,16925517000003,2,9.26,1.66,210.0,12.5,12.5,12.0,10.0,9.6,9.2,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,78.8,-4.8,0.08,0.083333,0.041667,0.003333,1509.0,8,799.0,
345349,16925517000240,16925517000003,2023-08-20 18:15:00,SANDOWN,GD,Other Handicap,Marinara,Kaiya Fraser,Henry Spiller,9.0,3,3,6.0,133,13,5.0,110.0,83.0,16925517000240,16925517000003,1,5.3,1.01,16.0,4.9,5.1,5.1,5.5,5.9,5.7,4.09,-4.3,4.47,-4.8,3.71,-4.0,GB,78.8,4.2,0.204082,0.196078,-0.039216,-0.008003,141.0,5,409.0,24.0
345350,16925517000005,16925517000003,2023-08-20 18:15:00,SANDOWN,GD,Other Handicap,Aiming High,Hayley Turner,David Simcock,9.0,3,4,7.0,135,13,21.0,105.0,78.0,16925517000005,16925517000003,6,14.87,4.3,1000.0,15.0,14.0,14.0,12.5,11.5,11.5,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,78.8,-0.8,0.066667,0.071429,0.071429,0.004762,1081.0,13,920.0,
345351,16925517000370,16925517000003,2023-08-20 18:15:00,SANDOWN,GD,Other Handicap,Sly Madam,Trevor Whelan,Sheena West,9.0,3,5,2.0,140,13,13.0,106.0,82.0,16925517000370,16925517000003,5,22.0,20.0,1000.0,17.5,19.5,19.5,18.0,19.0,20.0,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,78.8,3.2,0.057143,0.051282,-0.102564,-0.005861,1065.0,34,278.0,
345352,16925517000396,16925517000003,2023-08-20 18:15:00,SANDOWN,GD,Other Handicap,Thebeautifulgame,Saffie Osborne,Tom Clover,9.0,3,4,5.0,135,13,26.0,99.0,78.0,16925517000396,16925517000003,3,26.0,20.0,1000.0,50.0,48.0,48.0,40.0,40.0,38.0,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,78.8,-0.8,0.02,0.020833,0.041667,0.000833,1171.0,9,592.0,


In [31]:
data['win'] = data['f_place'].apply(lambda x: 1 if x ==1 else 0)

In [32]:
data.head(20)

Unnamed: 0,id,f_id,f_ko,f_track,f_going,f_racetype,f_horse,f_jockey,f_trainer,f_distance,f_class,f_age,f_pace,f_weight,f_runners,pred_isp,f_rating_rbd,f_rating_or,id.1,f_id.1,f_place,f_bsp,f_ip_min,f_ip_max,f_pm_15m,f_pm_10m,f_pm_05m,f_pm_03m,f_pm_02m,f_pm_01m,f_bsp_p_back,f_bsp_p_lay,f_pm_01m_p_back,f_pm_01m_p_lay,f_pm_15m_p_back,f_pm_15m_p_lay,country,mean_f_rating_or,or_rating_vs_avg,15m_odds_prob,5m_odds_prob,15to5m_odds_move_perc,15to5m_odds_move_raw,prev_jockey_runs,prev_horse_runs,prev_trainer_runs,prev_jockey_wins,win
0,16042788000037,16042788000007,2020-11-02 01:00:00,CURRAGH,HTS,Maiden-Flat,Baseman,Oisin Orr,D K Weld,7.0,7,2,7.0,132,18,4.0,111.0,0.0,16042788000037,16042788000007,15,3.2,3.05,1000.0,3.2,3.1,3.3,3.45,3.65,3.3,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,0.0,0.0,0.3125,0.30303,-0.030303,-0.00947,0.0,0,0.0,,0
1,16042788000098,16042788000007,2020-11-02 01:00:00,CURRAGH,HTS,Maiden-Flat,Diamil,Shane Foley,M Halford,7.0,7,2,0.0,132,18,26.0,7.0,0.0,16042788000098,16042788000007,9,77.79,50.0,1000.0,75.0,90.0,100.0,90.0,90.0,90.0,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,0.0,0.0,0.013333,0.01,-0.25,-0.003333,0.0,0,0.0,,0
2,16042788000265,16042788000007,2020-11-02 01:00:00,CURRAGH,HTS,Maiden-Flat,Many Words,Robbie Colgan,Ms Sheila Lavery,7.0,7,2,0.0,132,18,51.0,75.0,0.0,16042788000265,16042788000007,12,390.95,100.0,1000.0,250.0,490.0,480.0,420.0,360.0,360.0,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,0.0,0.0,0.004,0.002083,-0.479167,-0.001917,0.0,0,0.0,,0
3,16042788000225,16042788000007,2020-11-02 01:00:00,CURRAGH,HTS,Maiden-Flat,Karpen,Ben Coen,Timothy Doyle,7.0,7,2,0.0,132,18,201.0,7.0,0.0,16042788000225,16042788000007,16,742.14,340.0,1000.0,320.0,340.0,390.0,330.0,330.0,360.0,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,0.0,0.0,0.003125,0.002564,-0.179487,-0.000561,0.0,0,0.0,,0
4,16042788000305,16042788000007,2020-11-02 01:00:00,CURRAGH,HTS,Maiden-Flat,New Reality,Kevin Manning,J S Bolger,7.0,7,2,0.0,132,18,11.0,7.0,0.0,16042788000305,16042788000007,10,51.33,38.0,1000.0,50.0,80.0,90.0,85.0,80.0,55.0,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,0.0,0.0,0.02,0.011111,-0.444444,-0.008889,0.0,0,0.0,,0
5,16042788000115,16042788000007,2020-11-02 01:00:00,CURRAGH,HTS,Maiden-Flat,Earlswood,N G McCullagh,John M Oxx,7.0,7,2,2.0,132,18,15.0,92.0,0.0,16042788000115,16042788000007,3,21.9,1.74,950.0,19.0,24.0,25.0,20.0,18.5,21.0,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,0.0,0.0,0.052632,0.04,-0.24,-0.012632,0.0,0,0.0,,0
6,16042788000401,16042788000007,2020-11-02 01:00:00,CURRAGH,HTS,Maiden-Flat,Sir Lamorak,Wayne Lordan,A P O´Brien,7.0,7,2,5.0,132,18,5.0,97.0,0.0,16042788000401,16042788000007,4,6.47,3.0,950.0,5.9,5.8,5.3,5.2,5.6,6.6,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,0.0,0.0,0.169492,0.188679,0.113208,0.019188,0.0,0,0.0,,0
7,16042788000027,16042788000007,2020-11-02 01:00:00,CURRAGH,HTS,Maiden-Flat,Arturo Toscanini,Seamie Heffernan,A P O´Brien,7.0,7,2,0.0,132,18,7.0,7.0,0.0,16042788000027,16042788000007,1,13.54,1.01,36.0,10.5,11.0,10.5,10.0,12.0,13.0,11.91,-12.54,11.4,-12.5,9.03,-10.0,GB,0.0,0.0,0.095238,0.095238,0.0,0.0,0.0,0,1.0,0.0,1
8,16042788000233,16042788000007,2020-11-02 01:00:00,CURRAGH,HTS,Maiden-Flat,Krypton Gold,Mark Enright,Ms Sheila Lavery,7.0,7,2,2.0,132,18,101.0,63.0,0.0,16042788000233,16042788000007,13,399.07,60.0,1000.0,170.0,230.0,260.0,220.0,240.0,440.0,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,0.0,0.0,0.005882,0.003846,-0.346154,-0.002036,0.0,0,1.0,,0
9,16042788000070,16042788000007,2020-11-02 01:00:00,CURRAGH,HTS,Maiden-Flat,Champion Green,Shane Crosse,Joseph Patrick O´Brien,7.0,7,2,1.0,132,18,8.0,95.0,0.0,16042788000070,16042788000007,5,10.34,6.0,500.0,12.0,12.0,11.0,10.0,9.2,10.5,-1.0,0.95,-1.0,0.95,-1.0,0.95,GB,0.0,0.0,0.083333,0.090909,0.090909,0.007576,0.0,0,0.0,,0
