In [16]:
from app.ml.clickhouse_pd import get_complete_race_df
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier, Pool, metrics, cv
from catboost.datasets import amazon

train_df, test_df = get_complete_race_df()





In [19]:
test_df.query("horse_id.isna()")

Unnamed: 0,horse_id,race_id,last_raced_date,last_raced_days_since,last_raced_track,last_raced_track_canonical,last_raced_track_state,last_raced_track_country,last_raced_track_name,last_raced_number,...,wind_direction,post_time,start_comments,timer,dead_heat,number_of_runners,final_time,final_millis,total_wps_pool,footnotes


In [20]:
train_df.fillna(-999, inplace=True)
test_df.fillna(-999, inplace=True)
drop_keys = [
    "winner",
    "finish_position",
    "wagering_position",
    "official_position",
    "footnotes",
    "final_time",
    "final_millis",
    "comments",
    'last_raced_date',
    'last_raced_track',
    'last_raced_track_canonical',
    'last_raced_track_state',
    'last_raced_track_country',
    'last_raced_track_name',
    'last_raced_number',
    'last_raced_position',
    'entry_program',
    'jockey_allowance',
    'claimed',
    'new_trainer_name',
    'new_owner_name',
    'position_dead_heat',
    'choice',
    'track',
    'track_state',
    'track_name',
    'breed',
    'type',
    'race_name',
    'grade',
    'black_type',
    'conditions',
    'min_claim',
    'max_claim',
    'restrictions',
    'min_age',
    'max_age',
    'sexes',
    'exact',
    'run_up',
    'temp_rail',
    'course',
    'track_condition',
    'scheduled_surface',
    'scheduled_course',
    'off_turf',
    'format',
    'track_record_holder',
    'track_record_time',
    'track_record_millis',
    'track_record_date',
    'purse',
    'purse_text',
    'available_money',
    'purse_enhancements',
    'value_of_race',
    'weather',
    'wind_speed',
    'wind_direction',
    'post_time',
    'start_comments',
    'timer',
    'dead_heat',
    'total_wps_pool',
    'state_bred',
    'breeder'
]

test_df = test_df.drop(
    drop_keys,
    axis=1,
)

x = train_df.drop(drop_keys, axis=1)
y = train_df.winner



In [15]:
test_df.query("code == -999")
# test_df.head()

Unnamed: 0,horse_id,race_id,last_raced_days_since,program,entry,horse,jockey_first,jockey_last,trainer_first,trainer_last,...,code,age_code,sexes_code,female_only,distance_text,distance_compact,feet,furlongs,surface,number_of_runners


In [21]:
categorical_features_indices = np.where(x.dtypes != float)[0]

x_train, x_validation, y_train, y_validation = train_test_split(x, y, train_size=0.75)

x_test = test_df

model = CatBoostClassifier(
  custom_loss=[metrics.Accuracy()],
  logging_level='Silent'
)

train_pool = Pool(x_train, y_train, cat_features=categorical_features_indices)


model.fit(
  train_pool,
  eval_set=(x_validation, y_validation),
  # logging_level='Verbose',
  plot=True
)



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7f9cc68f0f70>

In [22]:
cv_params = model.get_params()
cv_params.update({
  'loss_function': metrics.Logloss(),
})

cv_data = cv(
  Pool(x, y, cat_features=categorical_features_indices),
  cv_params,
  plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [None]:
import hyperopt

def hyperopt_objective(params):
    model = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=500,
        eval_metric=metrics.Accuracy(),
        verbose=False,
        loss_function=metrics.Logloss(),
    )
    
    cv_data = cv(
        Pool(x, y, cat_features=categorical_features_indices),
        model.get_params(),
        logging_level='Silent',
    )
    best_accuracy = np.max(cv_data['test-Accuracy-mean'])
    
    return 1 - best_accuracy # as hyperopt minimises

from numpy.random import default_rng

params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
}

trials = hyperopt.Trials()

best = hyperopt.fmin(
    hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=default_rng(123)
)

print(best)

 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 47/50 [36:39<02:21, 47.11s/trial, best loss: 0.07749133122267449]

In [25]:
print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))
from sklearn.metrics import accuracy_score


model = CatBoostClassifier(
    # l2_leaf_reg=int(best['l2_leaf_reg']),
    # learning_rate=best['learning_rate'],
    iterations=500,
    od_type='Iter',
    od_wait=40,
    eval_metric=metrics.Accuracy(),
    verbose=False,
    loss_function=metrics.Logloss(),
)

# cv_data = cv(Pool(x, y, cat_features=categorical_features_indices), model.get_params())

validate_pool = Pool(x_validation, y_validation, cat_features=categorical_features_indices)

model.fit(x, y, cat_features=categorical_features_indices, eval_set=validate_pool, plot=True)


print('Simple model tree count: {}'.format(model.tree_count_))
print('Simple model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, model.predict(x_validation))
))

Precise validation accuracy score: 0.9798167915696315


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Simple model tree count: 1
Simple model validation accuracy: 0.9748


In [26]:
feature_importances = model.get_feature_importance(train_pool)
feature_names = x_train.columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

odds: 99.53412516020354
where_bred: 0.46587483979645883
weight: 0.0
trainer_last: 0.0
trainer_first: 0.0
track_country: 0.0
track_canonical: 0.0
surface: 0.0
sire: 0.0
sexes_code: 0.0
sex: 0.0
race_id: 0.0
program: 0.0
pp: 0.0
owner: 0.0
number_of_runners: 0.0
number: 0.0
medication_equipment: 0.0
last_raced_days_since: 0.0
jockey_last: 0.0
jockey_first: 0.0
horse_id: 0.0
horse: 0.0
furlongs: 0.0
female_only: 0.0
feet: 0.0
favorite: 0.0
entry: 0.0
dob: 0.0
distance_text: 0.0
distance_compact: 0.0
disqualified: 0.0
date: 0.0
dam: 0.0
color: 0.0
code: 0.0
claim_price: 0.0
age_code: 0.0


In [27]:
# print(train_df[['winner', 'horse']])

def get_race_predicted_top_3(race_id):
    race_horses = x_test.loc[x_test['race_id'] == race_id]
    race_results = model.predict_proba(race_horses)
        
    race_id_numpy = race_horses['race_id'].to_numpy()
    race_horses_numpy = race_horses['horse'].to_numpy()
              
    winner_probs = race_results[:,1]
    df = pd.DataFrame()
    df['horse'] = race_horses_numpy
    df['win_prob'] = winner_probs    

    return df.sort_values(by='win_prob', ascending=False).head(n=3)
    
    
race_ids = set(list(x_test['race_id'].to_numpy()))

# print(race_ids)

import app.lib.clickhouse2pandas as ch2pd
from app.core.config import settings

connection_url = settings.CLICKHOUSE_URI
    
all_starters = ch2pd.select(connection_url, f"SELECT * FROM race_db.starters")
race_winners = all_starters.loc[all_starters['winner'] == True]

def get_top_3_by_odds(race_id):
    in_race = all_starters.loc[all_starters['race_id'] == race_id]
    return in_race.sort_values(by='odds', ascending=False).head(n=3)
    
def get_race_winner_actual(race_id):
    # print(race_winners.head(n=500))
    res = race_winners.loc[race_winners['race_id'] == race_id][['horse']].to_numpy()
    
    if res.size == 0:
        return None
    
    return res[0][0]

def get_stats_predicted():
    total_races = 0
    in_top_3 = 0
    in_top_1 = 0
    no_match = 0
    err = 0

    for race_id in race_ids:
        total_races += 1

        res = get_race_predicted_top_3(race_id)
        actual = get_race_winner_actual(race_id)

        if actual is None:
            err += 1
            continue

        try:
            found = res.loc[res['horse'] == actual]

            if len(found.index) == 0:
                no_match += 1
                continue

            # print(res[['horse']].to_numpy()[:,0])

            realind = res[['horse']].to_numpy()[:,0].tolist().index(actual)


            # print(len(index))   

            if realind == 0:
                in_top_1 += 1
            else:
                in_top_3 += 1
        except ValueError:
            no_match += 1

    print(total_races, in_top_3, in_top_1, no_match, err)
        
def get_stats_by_odds():
    total_races = 0
    in_top_3 = 0
    in_top_1 = 0
    no_match = 0
    err = 0

    for race_id in race_ids:
        total_races += 1

        res = get_top_3_by_odds(race_id)
        actual = get_race_winner_actual(race_id)

        if actual is None:
            err += 1
            continue

        try:
            found = res.loc[res['horse'] == actual]

            if len(found.index) == 0:
                no_match += 1
                continue

            # print(res[['horse']].to_numpy()[:,0])

            realind = res[['horse']].to_numpy()[:,0].tolist().index(actual)


            # print(len(index))   

            if realind == 0:
                in_top_1 += 1
            else:
                in_top_3 += 1
        except ValueError:
            no_match += 1

    print(total_races, in_top_3, in_top_1, no_match, err)
        

get_stats_by_odds()
get_stats_predicted()
    

# get_top_3_by_odds('faba4165-8f2d-47e8-9de7-74d04490b46f')[['horse', 'odds']]
    
# print(get_race_winner_actual('faba4165-8f2d-47e8-9de7-74d04490b46f'))

# get_race_predicted_top_3('faba4165-8f2d-47e8-9de7-74d04490b46f')


1884 256 63 1565 0
1884 5 1869 10 0


In [24]:
model.save_model("race_model2.dump")

In [86]:
submission = pd.DataFrame()
submission['horse'] = x_test['horse']
submission['winner'] = model.predict_proba(x_test)

print('submission', submission)

submission.to_csv('submission.csv', index=False)

ValueError: Wrong number of items passed 2, placement implies 1