In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import time
from datetime import datetime, timedelta
from topaz import TopazAPI
from sklearn.preprocessing import MinMaxScaler 
import numpy as np
import requests
import os
import concurrent.futures
import math

import mapping


In [2]:
from ft_sec_key import SECKEY,TOPAZ, PW
api_key = TOPAZ #Insert your API key 
topaz_api = TopazAPI(api_key)

In [3]:
import betfairlightweight
from betfairlightweight import filters
import pandas as pd
from datetime import timedelta
from nltk.tokenize import regexp_tokenize
import warnings
import json
# data_tools.ft_sec_key import SECKEY

warnings.filterwarnings('ignore', message='The behavior of DataFrame concatenation with empty or all-NA entries is deprecated.*')

def bflw_trading():
    '''
    This function loads the credentials file, and passes the credentials into the betfairlightweight instance
    '''

    username = 'nickbarlow@live.com.au'
    password = 'un6/chxe!N!?adsp'
    app_key = 'JFWqJHqB4Akfi5hK'

    # Define the betfairlightweight client
    trading = betfairlightweight.APIClient(username, password, app_key=app_key)

    return trading

def login(trading):
    # login to the API
    trading.login_interactive()

def greyhound_market_filter():
    # Define the greyhound market filter
    market_filter = filters.market_filter(
        event_type_ids=[4339],  # For horse racing
        market_countries=['AU'],  # For Australia
        market_type_codes=['WIN']  # For win markets
    )

    return market_filter

def process_runner_books(runner_books):
    # Define the fields required from the runner book
    selection_ids = [runner_book.selection_id for runner_book in runner_books]

    df = pd.DataFrame({
        'selectionId': selection_ids,
    })
    return df

def generate_greyhound_catalogue(trading,market_filter):
    # Load the greyhound market catalogues from the Betfair API
    greyhound_market_catalogues = trading.betting.list_market_catalogue(
    filter=market_filter,
    market_projection=['RUNNER_DESCRIPTION', 'EVENT', 'MARKET_DESCRIPTION'],
    max_results='200')

    print(f"Found {len(greyhound_market_catalogues)} markets.")

    return greyhound_market_catalogues

RUNNER_DATA_COLUMNS = [
            'marketStart',
            'track',
            'raceNumber',
            'raceType',
            'winMarketId',
            'selectionId',
            'rugNumber',
            'boxNumber',
            'dogName'
            ]

def initilise_dataframe():
    # Create the empty dataframe
    data = pd.DataFrame(columns=RUNNER_DATA_COLUMNS)

    return data

PATTERN1 = r'(?<=<br>Dog ).+?(?= starts)'

PATTERN2 = r"(?<=\bbox no. )(\w+)"

def process_market_clarifications(runners_df,clarifications):
    '''
    This function accesses the market clarifications field which explains which box the reserve runner will be starting from (if any) and parses the information using regex
    We utilise this information rather than the Topaz API data because Betfair markets only use final field information

    A clarification will look like: "<br>Box changes:<br>Dog 9. Tralee Blaze starts from box no. 8<br><br>Dog 6. That Other One starts from box no. 2<br><br>"
    '''
    # Define the clarifications dataframe
    market_clarifications = pd.DataFrame(regexp_tokenize(clarifications, PATTERN1), columns = ['dogName'])

    # Remove dog name from runner_number
    market_clarifications['rugNumber'] = market_clarifications['dogName'].str.split(r'. ').str[0]

    # Extract box number from clarifications
    market_clarifications['boxNumber'] = regexp_tokenize(clarifications, PATTERN2)

    # Keep only boxNumber and rugNumber
    market_clarifications=market_clarifications[['rugNumber','boxNumber']]

    # Merge the clarifications with the original dataframe
    runners_df = pd.merge(runners_df,market_clarifications,how='left',on=['rugNumber'])

    # Any runners with no clarifications will start in the box that matches the rugNumber
    runners_df['boxNumber'].fillna(runners_df['rugNumber'],inplace=True)

    return runners_df


def collect_greyhound_market_data(trading,greyhound_market_catalogues,data):
    '''
    This function will process the greyhound market catalogue to access information about the market including:
     - Market ID
     - Market Name
     - Event Name
     - Start Time
     - Clarifications

    It will then process each individual market book to gather the runner information, following by some operations to put market information into the dataframe columns including adjusting the timezone from UTC to AEST
    Finally it will then perform some string splitting operations to generate more useful market/runner information:
     - Track
     - Race Number
     - Race Type
     - Rug Number
     - Dog Name

    These operations may be useful depending on whether the betting intention is for a specific subset of races. It is also possible to split out race distance from the market name
    '''
    # Initiate the for loop
    for market_catalogue in greyhound_market_catalogues:

        # Name variables for market parameters
        market_id = market_catalogue.market_id
        market_name = market_catalogue.market_name
        event_name = market_catalogue.event.name
        market_start_time = market_catalogue.description.market_time

        # Try to access clarifications and replace a known string replacement to prepare it for our regex functuon
        try:
            clarifications = market_catalogue.description.clarifications.replace("<br> Dog","<br>Dog")
        except AttributeError:
            clarifications = None

        # Generate our market_books list
        market_books = trading.betting.list_market_book(market_ids=[market_id])

        # Generate our runner_catalogues list
        runner_catalogues = market_catalogue.runners

        # Initiate the market_books for loop
        for market_book in market_books:

            # Call the process_runner_books function
            runners_df = process_runner_books(market_book.runners)

            # Get the runner catalogue
            for runner in market_book.runners:

                # define the runner catalogue
                runner_catalogue = next((rd for rd in runner_catalogues if rd.selection_id == runner.selection_id), None)

                # define the runner name for non-empty runner_catalogues
                if runner_catalogue is not None:
                    runner_name = runner_catalogue.runner_name
                    runners_df.loc[runners_df['selectionId'] == runner.selection_id, 'dogName'] = runner_name

            # Assign market variables to the dataframe
            runners_df['winMarketId'] = market_id
            runners_df['marketName'] = market_name
            runners_df['eventName'] = event_name
            runners_df['marketStart'] = market_start_time

            # Adjust the timezone from UTC to AEST
            runners_df['marketStart'] = runners_df['marketStart'] + timedelta(hours=10)

            # Perform string split operations 
            runners_df['track']=runners_df['eventName'].str.split(' \(').str[0]
            runners_df['raceNumber']=runners_df['marketName'].str.split(r' ').str[0]
            runners_df['raceNumber']=runners_df['raceNumber'].str.split('R').str[1]
            runners_df['raceType']=runners_df['marketName'].str.split(r'm ').str[1]
            runners_df['rugNumber']=runners_df['dogName'].str.split(r'. ').str[0]
            runners_df['dogName']=runners_df['dogName'].str.split('\. ').str[1]

            # Call the process_market_clarifications function. If there no reserve runners running then the boxNumber = rugNumber
            try:
                runners_df = process_market_clarifications(runners_df,clarifications)
            except TypeError:
                runners_df['boxNumber'] = runners_df['rugNumber']

            # concatenate the dataframes together
            data=pd.concat([data,runners_df], sort=False)

    # Keep only required columns
    data = data[RUNNER_DATA_COLUMNS]
    data = pd.DataFrame(data)

    print(data.head)

    return data

def download_betfair_market_data():
    '''
    This function combines all our previously defined functions to generate our market csv from the Betfair API
    '''
    trading = bflw_trading()

    login(trading)

    market_filter = greyhound_market_filter()

    greyhound_market_catalogues = generate_greyhound_catalogue(trading,market_filter)

    data = initilise_dataframe()

    data = collect_greyhound_market_data(trading,greyhound_market_catalogues,data)

    return data

betfair_data = download_betfair_market_data()

# def upcoming_topaz_data(codes,datatype,betfair_data):
#     '''
#     This function loads our upcoming races, discards the Topaz API boxNumber and adds the boxNumber information retrieved from the Betfair API
#     '''
#     # Load today's race information
#     TodaysTopazData = load_topaz_data(codes,datatype)

#     # Keep only required Betfair information
#     betfair_fields = betfair_data[['track','raceNumber','rugNumber','boxNumber']]

#     # Discard the Topaz API boxNumber information
#     TodaysTopazData.drop(columns=['boxNumber'], inplace=True)

#     # Merge the Betfair boxNumber information
#     TodaysTopazData = pd.merge(TodaysTopazData,betfair_fields,how='left',on=['track','raceNumber','rugNumber'])

#     return TodaysTopazData

# TodaysTopazData = (JURISDICTION_CODES,'UPCOMING',betfair_data)

# def concatenate_data(TopazDataHistorical,TodaysTopazData):

#     # Concatenate the last 12 months of Topaz Data with today's races
#     TopazDataPreProcessing = pd.concat([TopazDataHistorical,TodaysTopazData])

#     return TopazDataPreProcessing

# TopazDataPreProcessing = concatenate_data(TopazDataHistorical,TodaysTopazData)

Found 131 markets.
<bound method NDFrame.head of            marketStart        track raceNumber raceType  winMarketId  \
0  2024-05-19 12:13:00     Capalaba          2     Heat  1.229082532   
1  2024-05-19 12:13:00     Capalaba          2     Heat  1.229082532   
2  2024-05-19 12:13:00     Capalaba          2     Heat  1.229082532   
3  2024-05-19 12:13:00     Capalaba          2     Heat  1.229082532   
4  2024-05-19 12:13:00     Capalaba          2     Heat  1.229082532   
..                 ...          ...        ...      ...          ...   
3  2024-05-19 22:02:00  Broken Hill         10      Gr5  1.229080620   
4  2024-05-19 22:02:00  Broken Hill         10      Gr5  1.229080620   
5  2024-05-19 22:02:00  Broken Hill         10      Gr5  1.229080620   
6  2024-05-19 22:02:00  Broken Hill         10      Gr5  1.229080620   
7  2024-05-19 22:02:00  Broken Hill         10      Gr5  1.229080620   

   selectionId rugNumber boxNumber         dogName  
0     69503288         1         

In [38]:
# new_race_runs['dogName'] = new_race_runs['dogName'].str.upper()
betfair_data['dogName'] = betfair_data['dogName'].str.replace('.','').str.replace("'",'').str.replace('"','').str.upper()

In [40]:
model_name = "silvery-resonance-793"
pred_df = pd.read_feather(f'../model_all_price/predsVIC-{model_name} - val_all_price_df.fth')

In [41]:
pred_df['dog_name']

0         SORRY IM LATE
1            SODA DOBBY
2                RIBERO
3            BOURNE SID
4        JOHNSON STREET
             ...       
275           FAST LEGS
276           LITERALLY
277    PYRENEES CHATEAU
278              DEMURE
279           REPAYMENT
Name: dog_name, Length: 280, dtype: object

In [42]:
pred_df.track.value_counts()

track
Sale           96
Ballarat       96
Healesville    88
Name: count, dtype: int64

In [43]:
pred_df.columns

Index(['index', 'raw_margins', 'correct', 'simple', 'win_price', 'relu',
       'bet_amount_model', 'output_price', 'pred_prob', 'pred_prob2', 'prices',
       'imp_prob', 'betfair_log_loss', 'log_loss', 'label_loss', 'pred_price',
       'pred_price2', 'classes', 'track', 'onehot_win', 'dogID', 'dog_name',
       'dog_box', 'raceID', 'date', 'entropy', 'mutual_info', 'race_num',
       'loss', 'loss_bfsp', 'favorite_correct', 'one_hot_win'],
      dtype='object')

In [44]:
pred_df_simple = pred_df[['dog_name','pred_prob','pred_prob2']].rename(columns={'dog_name':'dogName'})

In [45]:
pred_df.date.max()

datetime.date(2024, 5, 19)

In [46]:
date = datetime.today().date()
pred_df['date'] = pd.to_datetime(pred_df.date).dt.date
pred_df=pred_df[pred_df['date']  == date]
date

datetime.date(2024, 5, 19)

In [47]:
pred_df['date'].iloc[0]

datetime.date(2024, 5, 19)

In [48]:
pred_df_fin = pred_df_simple.merge(betfair_data, on = 'dogName')
pred_df_fin.to_csv('betfair_model_predictions_nbarlow_18_05_2024.csv')

In [49]:
pred_df_fin

Unnamed: 0,dogName,pred_prob,pred_prob2,marketStart,track,raceNumber,raceType,winMarketId,selectionId,rugNumber,boxNumber
0,SORRY IM LATE,0.209668,0.217282,2024-05-19 16:57:00,Sale,2,Gr6/7,1.229082472,69503247,1,1
1,SODA DOBBY,0.019669,0.009261,2024-05-19 16:57:00,Sale,2,Gr6/7,1.229082472,69503248,2,2
2,RIBERO,0.301506,0.352675,2024-05-19 16:57:00,Sale,2,Gr6/7,1.229082472,69241965,3,3
3,BOURNE SID,0.033659,0.018958,2024-05-19 16:57:00,Sale,2,Gr6/7,1.229082472,63471457,4,4
4,JOHNSON STREET,0.239234,0.259066,2024-05-19 16:57:00,Sale,2,Gr6/7,1.229082472,69503249,5,5
...,...,...,...,...,...,...,...,...,...,...,...
234,FAST LEGS,0.415598,0.512106,2024-05-19 14:21:00,Ballarat,3,Mdn,1.229083568,69503333,4,4
235,LITERALLY,0.176426,0.163385,2024-05-19 14:21:00,Ballarat,3,Mdn,1.229083568,69242104,5,5
236,PYRENEES CHATEAU,0.015256,0.006247,2024-05-19 14:21:00,Ballarat,3,Mdn,1.229083568,52148496,6,6
237,DEMURE,0.142068,0.122403,2024-05-19 14:21:00,Ballarat,3,Mdn,1.229083568,55782568,7,7


In [50]:
submission = pd.read_csv('../submissions/Greyhound_Racing_Datathon_2024_Submission_Form_Model_Name_20240519.csv')

In [51]:
submission.columns

Index(['market_start', 'venue', 'race_no', 'win_market_id', 'selection_id',
       'tab_number', 'runner_name', 'probability'],
      dtype='object')

In [52]:
submission_merged = submission.merge(pred_df_fin[['selectionId','pred_prob','pred_prob2']].rename(columns={'selectionId':'selection_id'}), on='selection_id',how='inner')

In [54]:
submission_merged.to_csv('../submissions/Greyhound_Racing_Datathon_2024_Submission_InLimbo_20240519.csv',index=False)

s

In [None]:
sub = pd.read_excel('../submissions/Greyhound_Racing_Datathon_2024_Submission_InLimbo_20240518.xlsx')

FileNotFoundError: [Errno 2] No such file or directory: '../submissions/Greyhound_Racing_Datathon_2024_Submission_InLimbo_20240518.xlsx'

In [None]:
sub

Unnamed: 0,market_start,venue,race_no,win_market_id,selection_id,tab_number,runner_name,probability
0,2024-05-17 15:02:00,Bendigo,1,1.228987,69421826,1,Evies Promise,0.402649
1,2024-05-17 15:02:00,Bendigo,1,1.228987,68235127,2,Calamity Kait,0.034972
2,2024-05-17 15:02:00,Bendigo,1,1.228987,69421827,3,Merls Lad,0.223170
3,2024-05-17 15:02:00,Bendigo,1,1.228987,69421828,4,Borough Bound,0.049710
4,2024-05-17 15:02:00,Bendigo,1,1.228987,64458909,5,Little Tic,0.135942
...,...,...,...,...,...,...,...,...
234,2024-05-17 22:48:00,Geelong,12,1.228987,61028130,4,Ventura Bale,0.173609
235,2024-05-17 22:48:00,Geelong,12,1.228987,48821690,5,Talk Out Loud,0.036829
236,2024-05-17 22:48:00,Geelong,12,1.228987,43256898,6,Who Told Mel,0.058196
237,2024-05-17 22:48:00,Geelong,12,1.228987,49514111,7,Crackerjack Pot,0.081224


In [None]:
pred_df.dog_name

0      DESTINI PALADIN
1       DUSTY OLD ROAD
2              no_name
3              no_name
4              no_name
            ...       
275     JAYVILLE MOLLY
276     WREAKING HAVOC
277      DARNUM DIESEL
278     CHEAP RED WINE
279            no_name
Name: dog_name, Length: 280, dtype: object

In [None]:
sub

Unnamed: 0,market_start,venue,race_no,win_market_id,selection_id,tab_number,runner_name,probability
0,2024-05-17 15:02:00,Bendigo,1,1.228987,69421826,1,Evies Promise,0.402649
1,2024-05-17 15:02:00,Bendigo,1,1.228987,68235127,2,Calamity Kait,0.034972
2,2024-05-17 15:02:00,Bendigo,1,1.228987,69421827,3,Merls Lad,0.223170
3,2024-05-17 15:02:00,Bendigo,1,1.228987,69421828,4,Borough Bound,0.049710
4,2024-05-17 15:02:00,Bendigo,1,1.228987,64458909,5,Little Tic,0.135942
...,...,...,...,...,...,...,...,...
234,2024-05-17 22:48:00,Geelong,12,1.228987,61028130,4,Ventura Bale,0.173609
235,2024-05-17 22:48:00,Geelong,12,1.228987,48821690,5,Talk Out Loud,0.036829
236,2024-05-17 22:48:00,Geelong,12,1.228987,43256898,6,Who Told Mel,0.058196
237,2024-05-17 22:48:00,Geelong,12,1.228987,49514111,7,Crackerjack Pot,0.081224


In [None]:
sub['dog_name'] = sub['runner_name'].str.replace('.','').str.replace("'",'').str.replace('"','').str.upper()

In [None]:
sub.merge(pred_df, left_on='dog_name', right_on='dog_name', how='left').to_excel('../submissions/test.xlsx',index=False)

In [None]:
# new_race_runs_merged = new_race_runs.merge(betfair_df_dogs_only ,how='inner', on='dogName')

In [None]:
import torch.nn.functional as F
import torch
def pd_softmax(x):
    x = torch.tensor(list(x))
    x = F.softmax(x, dim=0)
    return x.numpy()

In [None]:
submission_merged.groupby('win_market_id',sort=False).apply(lambda x: pd_softmax(x['pred_prob'])).tolist()

[array([0.18849   , 0.17420626, 0.17957641, 0.21921717, 0.23851018],
       dtype=float32),
 array([0.15281205, 0.17257346, 0.14641863, 0.19442493, 0.14678824,
        0.18698272], dtype=float32),
 array([0.11840022, 0.11177149, 0.10957627, 0.16575077, 0.1304922 ,
        0.11106802, 0.1260849 , 0.1268561 ], dtype=float32),
 array([0.14836417, 0.11660397, 0.12521355, 0.14293487, 0.11786979,
        0.11987338, 0.11591206, 0.11322825], dtype=float32),
 array([0.15806764, 0.1806787 , 0.14583397, 0.142339  , 0.17156258,
        0.20151813], dtype=float32),
 array([0.1119795 , 0.1421925 , 0.13150015, 0.12330826, 0.11311845,
        0.11136733, 0.11974572, 0.14678818], dtype=float32),
 array([0.11356873, 0.11434497, 0.13475898, 0.11062954, 0.15119211,
        0.12775455, 0.13510752, 0.11264369], dtype=float32),
 array([0.14117089, 0.15423873, 0.1428199 , 0.14148337, 0.13182694,
        0.14364566, 0.14481442], dtype=float32),
 array([0.11192398, 0.11129102, 0.12452698, 0.13055874, 0.1109516

In [None]:
runs_df = pd.concat([pd.read_feather('server_data/'+x) for x in os.listdir('server_data/')])
runs_df = runs_df.dropna(subset=['place'], how='all')    
runs_df.drop_duplicates(inplace=True)
runs_df['date'] = pd.to_datetime(runs_df['meetingDate']).dt.date
runs_df['year-month'] = pd.to_datetime(runs_df['meetingDate']).dt.to_period('M')
runs_df['year'] = pd.to_datetime(runs_df['meetingDate']).dt.to_period('Y')

In [None]:
TrackDict = {
    'Auckland (NZ)':'Manukau',
    'Christchurch (NZ)':'Addington',
    'Dport @ HOB':'Hobart',
    'Dport @ LCN':'Launceston',
    'Meadows (MEP)':'The Meadows',
    'Otago (NZ)':'Forbury Park',
    'Palmerston Nth (NZ)':'Manawatu',
    'Sandown (SAP)':'Sandown Park',
    'Southland (NZ)':'Ascot Park',
    'Tokoroa (NZ)':'Tokoroa',
    'Waikato (NZ)':'Cambridge',
    'Wanganui (NZ)':'Hatrick',
    'Taranaki (NZ)':'Taranaki',
    'Ashburton (NZ)':'Ashburton',
    'Richmond (RIS)':'Richmond Straight',
    'Murray Bridge (MBR)':'Murray Bridge',
    'Murray Bridge (MBS)':'Murray Bridge Straight'
}
TopazData = runs_df.copy()
TopazData['track'] = TopazData['track'].replace(TrackDict)
TopazData['meetingDate'] = pd.to_datetime(TopazData['meetingDate'])
TopazData['dateWhelped'] = pd.to_datetime(TopazData['dateWhelped'])

TopazData['dogName']=TopazData['dogName'].str.replace("'","")
TopazData['sireName']=TopazData['sireName'].str.replace("'","")
TopazData['damName']=TopazData['damName'].str.replace("'","")
state_map = {x['trackName']:x['State'] for x in mapping.trackCodes}
TopazData['state'] = TopazData['track'].map(state_map)

In [None]:
TopazData = TopazData.query('state == "VIC"')

In [None]:
TopazData.date.max()

In [None]:
start_date = datetime(2024,4,27)

In [None]:
new_races = topaz_api.get_races(start_date)
new_races

In [None]:
new_races.columns

In [None]:
new_race_ids = new_races['raceId'].unique()
new_race_ids

In [None]:
race_run = topaz_api.get_race_runs(race_id = 1029602600)

In [None]:
race_run['meetingDate'] = pd.to_datetime(race_run['meetingDate'])

In [None]:
race_run

In [None]:
TopazData.columns

In [None]:
new_w_form = pd.concat([TopazData,race_run])

In [None]:
new_w_form

In [None]:
TopazData = Topazbackup

In [None]:
Topazbackup = TopazData.copy()


In [None]:
TopazData = new_w_form.copy()

In [None]:
TopazData

In [None]:
# TopazData['last5'] = TopazData['last5'].astype(str)
scaler = MinMaxScaler()
TopazData['track'] = TopazData['track'].replace(TrackDict)
TopazData['meetingDate'] = pd.to_datetime(TopazData['meetingDate'])
TopazData['dateWhelped'] = pd.to_datetime(TopazData['dateWhelped'])

TopazData['dogName']=TopazData['dogName'].str.replace("'","")
TopazData['sireName']=TopazData['sireName'].str.replace("'","")
TopazData['damName']=TopazData['damName'].str.replace("'","")
# # Convert the 'pir' column to string
try:
    TopazData['pir'] = TopazData['pir'].fillna(0)
    TopazData['pir'] = TopazData['pir'].astype(int).astype(str)
except ValueError:
    print('Error converting pir to string')
    TopazData['pir'] = '000'

# # Extract the second last letter and create a new column '2ndLastPIR'
TopazData['2ndLastPIR'] = TopazData['pir'].apply(lambda x: x[-2] if len(x) >= 2 else None)
TopazData['2ndLastPIR'] = TopazData['2ndLastPIR'].fillna(TopazData['place']).fillna(0)
TopazData['2ndLastPIR'] = TopazData['2ndLastPIR'].astype(int)

# # Create a feature that calculates places gained/conceded in the home straight
TopazData['finishingPlaceMovement'] = TopazData['2ndLastPIR'] - TopazData['place']

TopazData['weightInKgScaled'] = TopazData.groupby('raceId')['weightInKg'].transform(lambda x: scaler.fit_transform(x.values.reshape(-1, 1)).flatten() if x.nunique() > 1 else 0)
# TopazData['weightInKgScaled'] = TopazData.groupby('raceId')['weightInKg'].transform(lambda x: scaler.fit_transform(x.values.reshape(-1, 1)).flatten() if x.nunique() > 1 else 0)
#Scale values as required
TopazData['prizemoneyLog'] = np.log10(TopazData['prizeMoney'] + 1)
TopazData['placeLog'] = np.log10(TopazData['place'] + 1)
TopazData['marginLog'] = np.log10(TopazData['resultMargin'] + 1)

# Calculate median winner time per track/distance
win_results = TopazData[TopazData['place'] == 1]

grouped_data = win_results.groupby(['track', 'distance', 'meetingDate'])['resultTime'].median().reset_index()

median_win_time = pd.DataFrame(grouped_data.groupby(['track', 'distance']).apply(lambda x: x.sort_values('meetingDate').set_index('meetingDate')['resultTime'].shift(1).rolling('365D', min_periods=1).median())).reset_index()
median_win_time.rename(columns={"resultTime": "runTimeMedian"},inplace=True)

median_win_time['speedIndex'] = (median_win_time['runTimeMedian'] / median_win_time['distance'])
median_win_time['speedIndex'] = MinMaxScaler().fit_transform(median_win_time[['speedIndex']])

TopazData = TopazData.merge(median_win_time, how='left', on=['track', 'distance','meetingDate'])

TopazData['runTimeNorm'] = (TopazData['runTimeMedian'] / TopazData['resultTime']).clip(0.8, 1.2)
TopazData['runTimeNorm'] = MinMaxScaler().fit_transform(TopazData[['runTimeNorm']])

#Same for split time
split_win_results = TopazData[TopazData['position_1'] == 1]
grouped_data = split_win_results.groupby(['track', 'distance', 'meetingDate'])['time_1'].median().reset_index()

median_split_win_time = pd.DataFrame(grouped_data.groupby(['track', 'distance']).apply(lambda x: x.sort_values('meetingDate').set_index('meetingDate')['time_1'].shift(1).rolling('365D', min_periods=1).median())).reset_index()
median_split_win_time.rename(columns={'time_1': 'split_time_median'},inplace=True)

median_split_win_time['split_speedIndex'] = median_split_win_time['split_time_median']
median_split_win_time['split_speedIndex'] = MinMaxScaler().fit_transform(median_split_win_time[['split_time_median']])

# Merge with median winner time
TopazData = TopazData.merge(median_split_win_time, how='left', on=['track', 'distance','meetingDate'])

# Normalise time comparison
TopazData['split_runTimeNorm'] = (TopazData['split_time_median'] / TopazData['time_1']).clip(0.5, 1.5)
TopazData['split_runTimeNorm'] = MinMaxScaler().fit_transform(TopazData[['split_runTimeNorm']])

min_run_time = TopazData.groupby('raceId')[['time_1']].min().reset_index().rename(columns={'time_1':'min_run_time'})
TopazData = TopazData.merge(min_run_time, on='raceId')
TopazData['split_time_margin'] = TopazData['time_1']-TopazData['min_run_time']
# Same for runhome time
TopazData['run_home_time'] = TopazData['resultTime'] - TopazData['time_1']
win_results = TopazData[TopazData['place'] == 1]


grouped_data = win_results.groupby(['track', 'distance', 'meetingDate'])['run_home_time'].median().reset_index()

median_win_time = pd.DataFrame(grouped_data.groupby(['track', 'distance']).apply(lambda x: x.sort_values('meetingDate').set_index('meetingDate')['run_home_time'].shift(1).rolling('365D', min_periods=1).median())).reset_index()
median_win_time.rename(columns={'run_home_time': "run_home_TimeMedian"},inplace=True)

median_win_time['run_home_speedIndex'] = (median_win_time['run_home_TimeMedian'] / median_win_time['distance'])
median_win_time['run_home_speedIndex'] = MinMaxScaler().fit_transform(median_win_time[['run_home_speedIndex']])

TopazData = TopazData.merge(median_win_time, how='left', on=['track', 'distance','meetingDate'])

TopazData['run_home_TimeNorm'] = (TopazData['run_home_TimeMedian'] / TopazData['run_home_time']).clip(0.8, 1.2)
TopazData['run_home_TimeNorm'] = MinMaxScaler().fit_transform(TopazData[['runTimeNorm']])

# Sort the DataFrame by 'RaceId' and 'Box'
TopazData = TopazData.sort_values(by=['raceId', 'boxNumber'])

# Check if there is an entry equal to boxNumber + 1
TopazData['hasEntryBoxNumberPlus1'] = (TopazData.groupby('raceId')['boxNumber'].shift(1) == TopazData['boxNumber'] + 1) | (TopazData['boxNumber'] == 8)
TopazData['hasEntryBoxNumberMinus1'] = (TopazData.groupby('raceId')['boxNumber'].shift(-1) == TopazData['boxNumber'] - 1)
# Convert boolean values to 1
TopazData['hasEntryBoxNumberPlus1'] = TopazData['hasEntryBoxNumberPlus1'].astype(int)
TopazData['hasEntryBoxNumberMinus1'] = TopazData['hasEntryBoxNumberMinus1'].astype(int)
# Display the resulting DataFrame which shows adjacent Vacant Boxes
# Box 1 is treated as having a vacant box to the left always as we are looking how much space the dog has to move.
TopazData['adjacentVacantBoxes'] = 2 - TopazData['hasEntryBoxNumberPlus1'] - TopazData['hasEntryBoxNumberMinus1']
# Calculate 'hasAtLeast1VacantBox'
TopazData['hasAtLeast1VacantBox'] = (TopazData['adjacentVacantBoxes'] > 0).astype(int)

TopazData['win'] = TopazData['place'].apply(lambda x: 1 if x == 1 else 0)

grouped_data = TopazData.groupby(['track', 'distance', 'boxNumber', 'hasAtLeast1VacantBox', 'meetingDate'])['win'].mean().reset_index()
grouped_data.set_index('meetingDate', inplace=True)

# Apply rolling mean calculation to the aggregated data
box_win_percent = grouped_data.groupby(['track', 'distance', 'boxNumber', 'hasAtLeast1VacantBox']).apply(lambda x: x.sort_values('meetingDate')['win'].shift(1).rolling('365D', min_periods=1).mean()).reset_index()

# Reset index and rename columns
box_win_percent.columns = ['track', 'distance', 'boxNumber', 'hasAtLeast1VacantBox', 'meetingDate', 'rolling_box_win_percentage']

# Add to dog results dataframe
TopazData = TopazData.merge(box_win_percent, on=['track', 'distance', 'meetingDate','boxNumber','hasAtLeast1VacantBox'], how='left')

# resultMargin has the same value for 1st and 2nd placed dogs, but should be 0 for the 1st placed dog.
TopazData.loc[TopazData['place'] == 1, ['resultMargin']] = 0

TopazData['dogAge'] = (TopazData['meetingDate'] - TopazData['dateWhelped']).dt.days
scaler = MinMaxScaler()
TopazData['dogAgeScaled'] = TopazData.groupby('raceId')['dogAge'].transform(lambda x: scaler.fit_transform(x.values.reshape(-1, 1)).flatten())
TopazData['averageSpeed'] = TopazData['distance'] / TopazData['resultTime']

In [None]:
TopazData['split_time_margin'] 

In [None]:
for i,c in enumerate(TopazData.columns):
    print(i,c,TopazData[c].nunique())

In [None]:
TopazData.speedIndex

In [None]:
TopazData['averageSpeed'] = TopazData['distance'] / TopazData['resultTime']
TopazData['splitMargin_1'] = np.where(TopazData['position_1'] == 1, 0, TopazData['splitMargin_1'])
TopazData['margin_from_lengths'] = pd.to_numeric(TopazData['resultMarginLengths'].str.replace('L',''))
# TopazData['win']

In [None]:
TopazData['resultMargin'] = TopazData['margin_from_lengths']

In [None]:
TopazData.shape

In [None]:
TopazData.margin_from_lengths

In [None]:
TopazData.columns

In [None]:
def rolling_last(x):
    return x.iloc[-1]

In [None]:
import itertools

dataset = TopazData.copy()
print(f"dataset shape: {dataset.shape}")
dataset['meetingDate'] = pd.to_datetime(dataset['meetingDate'])

# Calculate values for dog, trainer, dam and sire
subsets = ['dog', 'trainer', 'dam', 'sire']
# subsets = ['dog']

# Use rolling window of 28, 91 and 365 days
# rolling_windows = ['28D','91D', '365D']
rolling_windows = [1,'28D','91D', '365D']
rolling_windows = [1,'365D']
# rolling_windows = [1]

# Features to use for rolling windows calculation
features = ['distance','boxNumber','runTimeNorm', 'placeLog', 'prizemoneyLog', 
            'marginLog','finishingPlaceMovement','splitMargin_1','split_runTimeNorm','run_home_TimeNorm','finishingPlaceMovement',
            'time_1','averageSpeed']

features = ['distance','boxNumber','runTimeNorm', 'place', 'resultMargin',
            'split_time_margin','split_runTimeNorm','time_1','run_home_TimeNorm','finishingPlaceMovement',
            'averageSpeed', 'win']


dam_features = ['distance','boxNumber','runTimeNorm', 'place', 'resultMargin','split_time_margin','split_runTimeNorm',
            'time_1','averageSpeed', 'win']


# Aggregation functions to apply

aggregates = ['mean']

# Keep track of generated feature names
feature_cols = []

for i in subsets:
    # Generate rolling window features
    idnumber = i + 'Id'

    subset_dataframe = dataset[['meetingDate',idnumber] + features]
    average_df = pd.DataFrame()

    for feature in features:
        # Group by 'damId' and 'meetingDate' and calculate the average of the current feature
        feature_average_df = subset_dataframe.groupby([idnumber, 'meetingDate'])[feature].mean().reset_index()
        # Rename the feature column to indicate it's the average of that feature
        feature_average_df.rename(columns={feature: f'{feature}{i}DayAverage'}, inplace=True)

        # If average_df is empty, assign the feature_average_df to it
        if average_df.empty:
            average_df = feature_average_df
        else:
            # Otherwise, merge feature_average_df with average_df
            average_df = pd.merge(average_df, feature_average_df, on=[idnumber, 'meetingDate'],how='left')

        # Assuming df is your DataFrame
    column_names = average_df.columns.tolist()
    # Columns to exclude
    columns_to_exclude = [idnumber,'meetingDate']
    # Exclude specified columns from the list
    column_names_filtered = [col for col in column_names if col not in columns_to_exclude]

    average_df.drop_duplicates(inplace=True)
    average_df['meetingDate'] = pd.to_datetime(average_df['meetingDate'])
    average_df = average_df.set_index([idnumber, 'meetingDate']).sort_index() 


    #Process Dog Stats
    for rolling_window in rolling_windows:
        print(f"dataset shape: {dataset.shape}")
        print(f'Processing {i} rolling window {rolling_window} days')

        rolling_result = (
            average_df
            .reset_index(level=0)
            .groupby(idnumber)[column_names_filtered]
            .rolling(rolling_window)  # Use timedelta for rolling window
            .agg(aggregates)
            .groupby(level=0)
            .shift(1)
        )

        # Generate list of rolling window feature names (eg: RunTime_norm_min_365D)
        agg_features_cols = [f'{i}_{f}_{a}_{rolling_window}' for f, a in itertools.product(features, aggregates)]
        # Add features to dataset
        average_df[agg_features_cols] = rolling_result
        # Keep track of generated feature names
        feature_cols.extend(agg_features_cols)
        average_df.fillna(0, inplace=True)

    

    average_df.reset_index(inplace=True)
    dataset = pd.merge(dataset,average_df,on=[idnumber, 'meetingDate'],how='left')

# Only keep data 12 months after the start date of your dataset since we've used a 365D rolling timeframe for some features
# feature_cols = np.unique(feature_cols).tolist()
# dataset = dataset[dataset['meetingDate'] >= '2021-01-01']

dataset = dataset[[
                'meetingDate',
                'state',
                'track',
                'distance',
                'raceId',
                'raceTypeCode',
                'raceNumber',
                'boxNumber',
                'rugNumber',
                'runId',
                'dogId',
                'dogName',
                'weightInKg',
                'sex',
                'trainerId',
                'trainerState',
                'damId',
                'damName',
                'sireId',
                'sireName',
                'win',
                'place',
                'resultTime',
                'resultMargin',
                'resultMarginLengths',
                'dogAgeScaled',
                'startPrice',
                # 'lastFiveWinPercentage',
                # 'lastFivePlacePercentage',
                'weightInKgScaled',
                'rolling_box_win_percentage',
                'hasEntryBoxNumberPlus1', 
                'hasEntryBoxNumberMinus1',]
                 + feature_cols
                ]

feature_cols.extend(['dogAgeScaled',
                     'boxNumber',
                # 'lastFiveWinPercentage',
                # 'lastFivePlacePercentage',
                'weightInKgScaled',
                'hasEntryBoxNumberPlus1', 'hasEntryBoxNumberMinus1',
                'rolling_box_win_percentage'])

#The below line will output your dataframe to a csv but may be too large to open in Excel.
#dataset.to_csv('testing.csv',index=False)

In [None]:
feature_cols = ['dogAgeScaled',
                     'boxNumber',
                # 'lastFiveWinPercentage',
                # 'lastFivePlacePercentage',
                'weightInKgScaled',
                'hasEntryBoxNumberPlus1', 'hasEntryBoxNumberMinus1',
                'rolling_box_win_percentage'] + [x for x in feature_cols if (('dog' in x) and ('_1' in x)) or (('dog' not in x) and ('_365' in x))]
feature_cols

In [None]:
feature_cols = ['dogAgeScaled',
 'boxNumber',
 'weightInKgScaled',
 'hasEntryBoxNumberPlus1',
 'hasEntryBoxNumberMinus1',
 'rolling_box_win_percentage',
 'dog_distance_mean_1',
 'dog_boxNumber_mean_1',
 'dog_runTimeNorm_mean_1',
 'dog_place_mean_1',
 'dog_resultMargin_mean_1',
 'dog_split_time_margin_mean_1',
 'dog_split_runTimeNorm_mean_1',
 'dog_time_1_mean_1',
 'dog_run_home_TimeNorm_mean_1',
 'dog_finishingPlaceMovement_mean_1',
 'dog_averageSpeed_mean_1',
 'dog_win_mean_1',
 'trainer_distance_mean_365D',
 'trainer_boxNumber_mean_365D',
 'trainer_runTimeNorm_mean_365D',
 'trainer_place_mean_365D',
 'trainer_resultMargin_mean_365D',
 'trainer_split_time_margin_mean_365D',
 'trainer_split_runTimeNorm_mean_365D',
 'trainer_time_1_mean_365D',
 'trainer_run_home_TimeNorm_mean_365D',
 'trainer_finishingPlaceMovement_mean_365D',
 'trainer_averageSpeed_mean_365D',
 'trainer_win_mean_365D',
 'dam_distance_mean_365D',
 'dam_boxNumber_mean_365D',
 'dam_runTimeNorm_mean_365D',
 'dam_place_mean_365D',
 'dam_resultMargin_mean_365D',
 'dam_split_time_margin_mean_365D',
 'dam_split_runTimeNorm_mean_365D',
 'dam_time_1_mean_365D',
 'dam_run_home_TimeNorm_mean_365D',
 'dam_finishingPlaceMovement_mean_365D',
 'dam_averageSpeed_mean_365D',
 'dam_win_mean_365D',
 'sire_distance_mean_365D',
 'sire_boxNumber_mean_365D',
 'sire_runTimeNorm_mean_365D',
 'sire_place_mean_365D',
 'sire_resultMargin_mean_365D',
 'sire_split_time_margin_mean_365D',
 'sire_split_runTimeNorm_mean_365D',
 'sire_time_1_mean_365D',
 'sire_run_home_TimeNorm_mean_365D',
 'sire_finishingPlaceMovement_mean_365D',
 'sire_averageSpeed_mean_365D',
 'sire_win_mean_365D']

In [None]:
dataset_pred = dataset.query('raceId == 1029602600')

In [None]:
dataset.dog_split_time_margin_mean_1.value_counts()

In [None]:
single_dog = dataset.query('dogId == 713187529')
single_dog

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Assume df is your DataFrame and 'date' is your date column
df = TopazData.copy()
df['date'] = pd.to_datetime(df['date']).dt.to_period('M')

# List of relevant columns
columns = ['distance','boxNumber','runTimeNorm', 'place', 'resultMargin','splitMargin_1','split_runTimeNorm',
            'time_1','averageSpeed', 'win']  # replace with your actual column names

# Create a separate plot for each column
for col in columns:
    # Group by 'date' and calculate the percentage of missing values in each month for the column
    missing_data = df.groupby('date')[col].apply(lambda x: x.isnull().mean() * 100)

    # Plot the results
    missing_data.plot(kind='bar', figsize=(12, 6))
    plt.title(f'Percentage of missing values per month for {col}')
    plt.ylabel('Percentage of missing values')
    plt.show()

In [None]:
dataset.to_feather('topaz_data_dog.fth')

In [None]:
dataset.dogName

In [None]:
betfair_df = pd.read_feather('../data_tools/DATA/df-betfairSP.fth')

In [None]:
betfair_df['date'] =  (pd.to_datetime(betfair_df['EVENT_DT'],dayfirst=True) + pd.Timedelta(hours=7)).dt.date
betfair_df['dogName'] = betfair_df.dog.str.strip().str.upper().str.replace('.','').replace("'",'')
# betfair_df.to_csv('betfair_bsp.csv')

In [None]:
dataset['date'] = pd.to_datetime(dataset['meetingDate']).dt.date
dataset['dogName'] = dataset['dogName'].str.upper().str.replace('.','').str.replace("'",'')

In [None]:
dataset_bsp = dataset.merge(betfair_df,on=['date','dogName'],how='left')

In [None]:
def generate_prev_race(df_in, df_g, rolling_window=10, factor=''):
    df = df_in.copy()
    original_cols = df_in.columns
    df[f'prev_race'] = df_g['raceId'].shift(1).fillna('-1').astype('string')
    df[f'prev_race_date'] = df_g['date'].shift(1).fillna('-1').astype('string')
    df[f'prev_race_track'] = df_g['track'].shift(1).fillna('-1').astype('string')
    df[f'prev_race_state'] = df_g['state'].shift(1).fillna('-1').astype('string')
    df[f'next_race'] = df_g['raceId'].shift(-1).fillna('-1').astype('string')
    return(df)

In [None]:
dataset_bsp = generate_prev_race(dataset_bsp,dataset_bsp.groupby('dogId'))

In [None]:
feature_cols

In [None]:
feature_cols = ['dog_distance_min_1',
 'dog_boxNumber_min_1',
 'dog_runTimeNorm_min_1',
 'dog_place_min_1',
 'dog_resultMargin_min_1',
 'dog_split_time_margin_min_1',
 'dog_split_runTimeNorm_min_1',
 'dog_time_1_min_1',
 'dog_averageSpeed_min_1',
 'dog_win_min_1',
 'dogAgeScaled',
 'boxNumber',
 'weightInKgScaled',
 'hasEntryBoxNumberPlus1',
 'hasEntryBoxNumberMinus1',
 'rolling_box_win_percentage']

In [None]:
feature_cols[0:17]

In [None]:
for i in dataset.columns:
    print(i)

In [None]:
stat_values =  pd.Series( dataset_bsp[feature_cols].fillna(-1.0).astype('float32').values.tolist())
dataset_bsp['stats_topaz'] = stat_values
dataset_bsp['dogid'] = dataset_bsp['dogId'].astype('str')
dataset_bsp['raceid'] = dataset_bsp['raceId'].astype('str')
dataset_bsp['stats_cols'] = str(feature_cols)

In [None]:
# dataset_bsp.to_feather('topaz_data_w_bsp.fth')

In [None]:
# dataset_bsp = pd.read_feather('topaz_data_w_bsp.fth')

In [None]:
def hash_trackname(trackname, hash_size=1024):
    hashes = [hash(x) % hash_size for x in trackname]
    return hashes

In [None]:
hashes = {x: hash(x)%1024 for x in dataset_bsp.track.unique()}

In [None]:
dataset_bsp['track_hash'] = dataset_bsp['track'].map(hashes)

In [None]:
dataset_bsp.to_feather('topaz_data_w_bsp.fth')

In [None]:
dataset_bsp.columns

In [None]:
dataset.query('raceId == 618363326').place

In [None]:
len(dataset_bsp['stats_topaz'].iloc[0]

In [None]:
df = dataset_bsp.copy()
col = 'BSP'
df = df[df['state']!='NZ']
df['date'] = pd.to_datetime(df['date']).dt.to_period('Y')
missing_data = df.groupby(['date','state'])[col].apply(lambda x: x.isnull().mean() * 100)

# Plot the results
missing_data.plot(kind='bar', figsize=(40, 6))
plt.title(f'Percentage of missing values per month for {col}')
plt.ylabel('Percentage of missing values')
plt.show()

In [None]:
dataset = pd.read_feather('../data/topaz_data_w_bsp.fth')

In [None]:
for i,c in enumerate(dataset.columns):
    print(i,c)

In [None]:
dataset.dog_split_runTimeNorm_min_1.value_counts()

In [None]:
dataset.to_csv('../data/topaz_data_w_bsp.csv')

In [None]:
dataset.shape

In [None]:
feature_cols

In [None]:
for col in dataset.columns:
    print(col)

In [None]:
feature_cols

In [None]:
dataset.raceId.nunique()


In [None]:
# Your existing function to generate date range
def generate_date_range(start_date, end_date):
    start_date = start_date
    end_date = end_date

    date_list = []
    current_date = start_date
    while current_date <= end_date:
        date_list.append(current_date.strftime("%Y-%m-%d"))
        current_date += timedelta(days=31)

    return date_list

# Example usage:
start_date = datetime(2024,1,1)
end_date = (datetime.today() + timedelta(days=31))

# Generate the date range
date_range = generate_date_range(start_date, end_date)

In [None]:


def topaz_races_threaded(buckets, topaz_api, progress):
    all_races = []
    # print(f"{buckets=}")
    errors = []
    for bucket in buckets:
        start_date, end_date, state = bucket
        # print(bucket)
        try:
            races = topaz_api.get_races(from_date=start_date, to_date=end_date, owning_authority_code=state)
            races['state'] = state
            all_races.append(races)
        except requests.HTTPError as http_err:
            print(f'HTTP error occurred: {http_err}')
            errors.append(bucket)
            pass
        # time.sleep(2)
        progress.update()
    return all_races,errors

def get_topaz_races(start_date, end_date, states, topaz_api:TopazAPI):
    date_range = generate_date_range(start_date, end_date)
    starts = date_range[:-1]
    ends = date_range[1:]
    date_range_states = [(start, end, state) for start, end in zip(starts, ends) for state in states]

    print(f"Created {len(date_range_states)} date ranges for {len(states)} states")

    num_workers = min(6, len(date_range_states))  # Adjust this value based on your system's capabilities
    chunk_size = math.ceil(len(date_range_states) / num_workers)

    chunks = [date_range_states[i:i + chunk_size] for i in range(0, len(date_range_states), chunk_size)]
    
    print(chunks)
    print(len(chunks))
    _process_jobs = []
    bars = []
    results = []
    errors = []
    for i in range(num_workers):
        bars.append(tqdm(total=len(chunks[i]), position=i)) 
        # time.sleep(2)
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:


        for i,chunk in enumerate(chunks):
            _process_jobs.append(executor.submit(topaz_races_threaded, chunk, topaz_api, bars[i]))

        # results = []
        for job in concurrent.futures.as_completed(_process_jobs):
            result,error = job.result()
            errors.extend(error)
            results.append(result)

    

    # results = []
    print(errors)
    return results

In [None]:
start_date = datetime(2015,1,1)
end_date = (datetime.today() + timedelta(days=31))
states = states = ['NSW', 'VIC', 'NZ', 'QLD', 'SA', 'WA', 'TAS', 'NT', 'ACT']
states = ['NZ']
states = ['SA']

In [None]:
output = get_topaz_races(start_date, end_date, states, topaz_api)
output_flat = [item for sublist in output for item in sublist]
all_races_df = pd.concat(output_flat,ignore_index=True).reset_index(drop=True)
all_races_df.to_csv('all_races_topas_SA.csv',index=False)

In [None]:
import torch
from torch import nn

# Assume you have 1000 unique track names and you want to create an embedding of size 50 for each track name
num_tracknames = 1000
embedding_dim = 50

# Create an embedding layer
embedding = nn.Embedding(num_tracknames, embedding_dim)

# Assume trackname_indices is a tensor of integers, where each integer is the index of a track name in the dictionary
# For example, you can create it by replacing each track name in your data with its index in the dictionary of track names
trackname_indices = torch.tensor([0, 1, 2, 3, 4])  # replace with your actual data

# Get the embeddings of the track names
trackname_embeddings = embedding(trackname_indices)

print(trackname_embeddings)

In [None]:
all_races_df['date'] = pd.to_datetime(all_races_df['raceStart']).dt.date

In [None]:
meeting_ids = list(all_races_df['meetingId'].unique())

In [None]:
def topaz_meeting_runs_threaded(chunk,topaz_api:TopazAPI,progress):
    race_runs = []
    race_results = []
    errors = []
    for race_id in chunk:
        try:
            # race_run = topaz_api.get_race_runs(race_id=race_id)
            # race_runs.append(race_run)
            time.sleep(0.3)
            race_result_json = topaz_api.get_race_result(race_id = race_id)
            try:
                race_run.to_feather(f"race_runs/{race_id}_run.fth")
                race_result_df = pd.DataFrame.from_dict([race_result_json])
                race_result_df.to_feather(f"results/{race_id}_results.fth")
            except Exception as e:
                print(e)

        except requests.HTTPError as http_err:
            print(f'HTTP error occurred: {http_err}')
            if http_err.response.status_code == 429:

                time.sleep(120)
            errors.append(race_id)
            pass
        progress.update()

    return race_runs,race_results,errors

def topaz_meeting_run_getter(race_id_list,topaz_api:TopazAPI):

    print(f"Fetching data for  {len(race_id_list)}")

    num_workers = 6
    chunk_size = math.ceil(len(race_id_list) / num_workers)

    chunks = [race_id_list[i:i + chunk_size] for i in range(0, len(race_id_list), chunk_size)]
    
    print(chunks)
    print(len(chunks))
    _process_jobs = []
    bars = []
    race_runs = []
    results = []
    errors = []
    for i in range(num_workers):
        bars.append(tqdm(total=len(chunks[i]), position=i)) 
        # time.sleep(2)
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:


        for i,chunk in enumerate(chunks):
            _process_jobs.append(executor.submit(topaz_meeting_runs_threaded, chunk, topaz_api, bars[i]))

        # results = []
        for job in concurrent.futures.as_completed(_process_jobs):
            race_run,result_json,error = job.result()
            race_runs.extend(race_run)
            errors.extend(error)
            results.extend(result_json)

    

    # results = []
    print(errors)
    return race_runs,results,errors

In [None]:
race_ids = list(all_races_df['raceId'].unique())

In [None]:
race_id = 837931333

In [None]:
race_run = topaz_api.get_race_runs(race_id=race_id)
race_result_json,response = topaz_api.get_race_result(race_id=race_id)

In [None]:
rate_lim_left = int(response.headers['ratelimit-remaining'])
reset_time = int(response.headers['ratelimit-reset'])

In [None]:
reset_time

In [None]:
race_run_from_json = pd.DataFrame(race_result_json['runs'])

In [None]:
race_run_from_json 

In [None]:
split_times = pd.DataFrame(race_result_json['splitTimes'])
split_times_1 = split_times[split_times['splitTimeMarker'] == 1][['runId','time','position','splitMargin']]
split_time_2 = split_times[split_times['splitTimeMarker'] == 2][['runId','time','position','splitMargin']]
split_times = split_times_1.merge(split_time_2, on='runId',suffixes=('_1','_2'),how='left')

In [None]:
split_times

In [None]:
race_run_from_json 

In [None]:
race_result_json

In [None]:
race_result_json_meeting = topaz_api.get_meeting_details(meeting_id = 809592457)

In [None]:
race_result_df

In [None]:
def topaz_race_runs_threaded(chunk,topaz_api:TopazAPI,progress):
    race_runs = []
    race_results = []
    errors = []
    for race_id in chunk:
        try:
            race_run = topaz_api.get_race_runs(race_id=race_id)
            race_runs.append(race_run)
            time.sleep(0.3)
            race_result_json = topaz_api.get_race_result(race_id = race_id)
            try:
                race_run.to_feather(f"race_runs/{race_id}_run.fth")
                race_result_df = pd.DataFrame.from_dict([race_result_json])
                race_result_df.to_feather(f"results/{race_id}_results.fth")
            except Exception as e:
                print(e)

        except requests.HTTPError as http_err:
            print(f'HTTP error occurred: {http_err}')
            if http_err.response.status_code == 429:

                time.sleep(120)
            errors.append(race_id)
            pass
        progress.update()

    return race_runs,race_results,errors

In [None]:
def topaz_race_run_getter(race_id_list,topaz_api:TopazAPI):

    print(f"Fetching data for  {len(race_id_list)}")

    num_workers = 6
    chunk_size = math.ceil(len(race_id_list) / num_workers)

    chunks = [race_id_list[i:i + chunk_size] for i in range(0, len(race_id_list), chunk_size)]
    
    print(chunks)
    print(len(chunks))
    _process_jobs = []
    bars = []
    race_runs = []
    results = []
    errors = []
    for i in range(num_workers):
        bars.append(tqdm(total=len(chunks[i]), position=i)) 
        # time.sleep(2)
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:


        for i,chunk in enumerate(chunks):
            _process_jobs.append(executor.submit(topaz_race_runs_threaded, chunk, topaz_api, bars[i]))

        # results = []
        for job in concurrent.futures.as_completed(_process_jobs):
            race_run,result_json,error = job.result()
            race_runs.extend(race_run)
            errors.extend(error)
            results.extend(result_json)

    

    # results = []
    print(errors)
    return race_runs,results,errors

In [None]:
all_races_df = pd.read_csv('all_races_topas.csv', header=0)
i = 0
race_ids = list(all_races_df['raceId'].unique())
# subset_ids = race_ids[i:min(len(race_ids),i+100)]
# race_runs,results,errors = topaz_race_run_getter(subset_ids,topaz_api)

In [None]:
all_races_df

In [None]:
test_race_run_df = pd.read_feather('race_runs/837931333_run.fth')

In [None]:
test_race_results_json_df = pd.read_feather('results/837931333_results.fth')

In [None]:
test_race_results_json_df

In [None]:
for i in range(0,len(race_ids),100):
    subset_ids = race_ids[i:min(len(race_ids),i+1000)]
    race_runs,results,errors = topaz_race_run_getter(subset_ids,topaz_api)

    results_df = pd.DataFrame.from_dict(results)
    all_race_runs = pd.concat(race_runs,ignore_index=True).reset_index(drop=True)
    all_race_runs.to_feather(f'race_runs/{i}_topaz_race_runs.fth')
    results_df.to_feather(f"results/{i}_topaz_results.fth")
    # with 

In [None]:
results

In [None]:
race_ids = list(all_races_df['raceId'].unique())
code = "NSW-VIC"
for race_id in tqdm(race_ids, desc="Processing races", unit="race"):
    result_retries = 10

    while result_retries > 0:
        # Use tqdm to create a progress bar
        # Get race run data
        try:
            race_run = topaz_api.get_race_runs(race_id=race_id)
            race_result_json = topaz_api.get_race_result(race_id=race_id)
            file_path = code + '_DATA.csv'
            file_exists = os.path.isfile(file_path)
            header_param = not file_exists

            race_result = pd.DataFrame.from_dict([race_result_json])
            split_times_df = pd.DataFrame(race_result['splitTimes'].tolist(),index=race_result.index)

            splits_dict = split_times_df.T.stack().to_frame()
            splits_dict.reset_index(drop=True, inplace= True)
            splits_normalised = pd.json_normalize(splits_dict[0])

            if len(splits_normalised) == 0:
                race_run.to_csv(code + '_DATA.csv', mode='a', header=header_param, index=False)
                break

            first_split = splits_normalised[splits_normalised['splitTimeMarker'] == 1]
            first_split = first_split[['runId','position','time']]
            first_split = first_split.rename(columns={'position':'firstSplitPosition','time':'firstSplitTime'})
            second_split = splits_normalised[splits_normalised['splitTimeMarker'] == 2]
            second_split = second_split[['runId','position','time']]
            second_split = second_split.rename(columns={'position':'secondSplitPosition','time':'secondSplitTime'})

            split_times = splits_normalised[['runId']]
            split_times = pd.merge(split_times,first_split,how='left',on=['runId'])
            split_times = pd.merge(split_times,second_split,how='left',on=['runId'])

            race_run = pd.merge(race_run,split_times,how='left',on=['runId'])
            race_run.drop_duplicates(inplace=True)
            race_run.to_csv(code + '_DATA.csv', mode='a', header=header_param, index=False)
            break
        except requests.HTTPError as http_err:
            if http_err.response.status_code == 404:
                file_path = code + '_DATA.csv'
                file_exists = os.path.isfile(file_path)
                header_param = not file_exists
                race_run.to_csv(code + '_DATA.csv', mode='a', header=header_param, index=False)
                break
        except Exception as e:
            print(race_id)
            result_retries -= 1
            if result_retries > 0:
                time.sleep(15)
            else:
                time.sleep(120)

In [None]:
# Iterate over 7-day blocks
for i in range(0, len(date_range), 10):
    start_block_date = date_range[i]
    print(start_block_date)
    end_block_date = date_range[min(i + 9, len(date_range) - 1)]  # Ensure the end date is within the range

    codes = ['NT','VIC','NSW','SA','WA','QLD','TAS','NZ']
    codes = ['VIC', "NSW"]
    all_races = []
    for code in codes:
        
        print(code)
        retries = 10  # Number of retries
        while retries > 0:
            try:
                races = topaz_api.get_races(from_date=start_block_date, to_date=end_block_date, owning_authority_code=code)
                all_races.append(races)
                break  # Break out of the loop if successful
            except requests.HTTPError as http_err:
                if http_err.response.status_code == 429:
                    retries -= 1
                    if retries > 0:
                        print(f"Rate limited. Retrying in 121 seconds...")
                        time.sleep(121)
                    else:
                        print("Max retries reached. Moving to the next block.")
                else:
                    print(f"Error fetching races for {code}: {http_err.response.status_code}")
                    retries -= 1
                    if retries > 0:
                        print(f"Retrying in 30 seconds...")
                        time.sleep(30)
                    else:
                        print("Max retries reached. Moving to the next block.")

    try:
        all_races_df = pd.concat(all_races,ignore_index=True).reset_index(drop=True)
    except ValueError:
        continue

    # Extract unique race IDs
    race_ids = list(all_races_df['raceId'].unique())

    for race_id in tqdm(race_ids, desc="Processing races", unit="race"):
        result_retries = 10

        while result_retries > 0:
            # Use tqdm to create a progress bar
            # Get race run data
            try:
                race_run = topaz_api.get_race_runs(race_id=race_id)
                race_result_json = topaz_api.get_race_result(race_id=race_id)
                file_path = code + '_DATA.csv'
                file_exists = os.path.isfile(file_path)
                header_param = not file_exists

                race_result = pd.DataFrame.from_dict([race_result_json])
                split_times_df = pd.DataFrame(race_result['splitTimes'].tolist(),index=race_result.index)

                splits_dict = split_times_df.T.stack().to_frame()
                splits_dict.reset_index(drop=True, inplace= True)
                splits_normalised = pd.json_normalize(splits_dict[0])

                if len(splits_normalised) == 0:
                    race_run.to_csv(code + '_DATA.csv', mode='a', header=header_param, index=False)
                    break

                first_split = splits_normalised[splits_normalised['splitTimeMarker'] == 1]
                first_split = first_split[['runId','position','time']]
                first_split = first_split.rename(columns={'position':'firstSplitPosition','time':'firstSplitTime'})
                second_split = splits_normalised[splits_normalised['splitTimeMarker'] == 2]
                second_split = second_split[['runId','position','time']]
                second_split = second_split.rename(columns={'position':'secondSplitPosition','time':'secondSplitTime'})

                split_times = splits_normalised[['runId']]
                split_times = pd.merge(split_times,first_split,how='left',on=['runId'])
                split_times = pd.merge(split_times,second_split,how='left',on=['runId'])

                race_run = pd.merge(race_run,split_times,how='left',on=['runId'])
                race_run.drop_duplicates(inplace=True)
                race_run.to_csv(code + '_DATA.csv', mode='a', header=header_param, index=False)
                break
            except requests.HTTPError as http_err:
                if http_err.response.status_code == 404:
                    file_path = code + '_DATA.csv'
                    file_exists = os.path.isfile(file_path)
                    header_param = not file_exists
                    race_run.to_csv(code + '_DATA.csv', mode='a', header=header_param, index=False)
                    break
            except Exception as e:
                print(race_id)
                result_retries -= 1
                if result_retries > 0:
                    time.sleep(15)
                else:
                    time.sleep(120)