In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px #graphing
import plotly.graph_objects as go #graphing
from plotly.subplots import make_subplots #graphing
import plotly.figure_factory as ff #graphing
import matplotlib.pyplot as plt #graphing
import seaborn as sns #graphing
import missingno as msno #describe data
import os

# Color scale shared by the plotly figures further down.
colors = ["#FFFFFF", "#6CD4FF", "#F7DF00", "#E60000"]

# Default size for matplotlib figures.
plt.rcParams.update({"figure.figsize": (12, 8)})

# List the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

## Goal of the Competition 🐴 

The goal of this competition is to analyze horse racing tactics, drafting strategies, and path efficiency. You will develop a model using never-before-released coordinate data along with basic race information.

Your work will help racing horse owners, trainers, and veterinarians better understand how equine performance and welfare fit together. With better data analysis, equine welfare could significantly improve.

## Context 🐴

Injury prevention is a critical component in modern athletics. Sports that involve animals, such as horse racing, are no different from human sports. Typically, efficiency in movement correlates with both improvements in performance and injury prevention.

A wealth of data is now collected, including measures for heart rate, EKG, longitudinal movement, dorsal/ventral movement, medial/lateral deviation, total power and total landing vibration. Your data science skills and analysis are needed to decipher what makes the most positive impact.

In this competition, you will create a model to interpret one aspect of this new data. You’ll be among the first to access X/Y coordinate mapping of horses during races. Using the data, you might analyze jockey decision making, compare race surfaces, or measure the relative importance of drafting. With considerable data, contestants can flex their creative problem-solving skills.

The New York Racing Association (NYRA) and the New York Thoroughbred Horsemen's Association (NYTHA) conduct world class thoroughbred racing at Aqueduct Racetrack, Belmont Park and Saratoga Race Course.

With your help, NYRA and NYTHA will better understand their vast data set, which could lead to new ways of racing and training in a highly traditional industry. With improved use of horse tracking data, you could help improve equine welfare, performance and rider decision making.

In [None]:
# Read the four competition CSV files; targets and paths are listed in the same order.
nyra_tracking, nyra_start, nyra_race, nyra_2019 = map(
    pd.read_csv,
    (
        "/kaggle/input/big-data-derby-2022/nyra_tracking_table.csv",
        "/kaggle/input/big-data-derby-2022/nyra_start_table.csv",
        "/kaggle/input/big-data-derby-2022/nyra_race_table.csv",
        "/kaggle/input/big-data-derby-2022/nyra_2019_complete.csv",
    ),
)

In [None]:
# Preview the first five rows of the tracking table.
nyra_tracking.head(5)

In [None]:
# Summary statistics of the tracking table, shaded with a purple gradient.
tracking_summary = nyra_tracking.describe()
tracking_summary.style.background_gradient(cmap = "Purples")

In [None]:
# Number of tracking rows per track_id value.
nyra_tracking["track_id"].value_counts()

In [None]:
# Number of tracking rows per race_number value.
nyra_tracking["race_number"].value_counts()

In [None]:
# Number of tracking rows per program_number value.
nyra_tracking["program_number"].value_counts()

In [None]:
# Parse race_date once, then store it together with its day-of-month and month number.
tracking_race_dates = pd.to_datetime(nyra_tracking["race_date"])
nyra_tracking["race_date"] = tracking_race_dates
nyra_tracking["day"] = tracking_race_dates.dt.day
nyra_tracking["month"] = tracking_race_dates.dt.month

# track_id = AQU

In [None]:
# Tracking rows for track_id "AQU" recorded in month 11 (November).
is_aqu = nyra_tracking["track_id"] == "AQU"
is_november = nyra_tracking["month"] == 11
aqu_november = nyra_tracking[is_aqu & is_november]

plt.style.use("dark_background")
# Explicit figure/axes so seaborn draws on a known target instead of the implicit current figure.
fig, ax = plt.subplots(figsize = (16, 8))
sns.scatterplot(data = aqu_november, x = "longitude", y = "latitude", hue = "race_number", palette = "Set2", ax = ax)

ax.set_title("track_id = AQU, Longitude and Latitude by Race Number")
# Re-creating the legend without arguments discards the hue title seaborn set, so pass it explicitly.
ax.legend(title = "race_number")
plt.show()

# track_id = BEL

In [None]:
# Tracking rows for track_id "BEL" recorded in month 5 (May).
is_bel = nyra_tracking["track_id"] == "BEL"
is_may = nyra_tracking["month"] == 5
bel_may = nyra_tracking[is_bel & is_may]

plt.style.use("dark_background")
# Explicit figure/axes so seaborn draws on a known target instead of the implicit current figure.
fig, ax = plt.subplots(figsize = (16, 8))
sns.scatterplot(data = bel_may, x = "longitude", y = "latitude", hue = "race_number", palette = "Set2", ax = ax)

ax.set_title("track_id = BEL, Longitude and Latitude by Race Number")
# Re-creating the legend without arguments discards the hue title seaborn set, so pass it explicitly.
ax.legend(title = "race_number")
plt.show()

# track_id = SAR

In [None]:
# Tracking rows for track_id "SAR" recorded in month 9 (September).
is_sar = nyra_tracking["track_id"] == "SAR"
is_september = nyra_tracking["month"] == 9
sar_september = nyra_tracking[is_sar & is_september]

plt.style.use("dark_background")
# Explicit figure/axes so seaborn draws on a known target instead of the implicit current figure.
fig, ax = plt.subplots(figsize = (16, 8))
sns.scatterplot(data = sar_september, x = "longitude", y = "latitude", hue = "race_number", palette = "Set2", ax = ax)

ax.set_title("track_id = SAR, Longitude and Latitude by Race Number")
# Re-creating the legend without arguments discards the hue title seaborn set, so pass it explicitly.
ax.legend(title = "race_number")
plt.show()

In [None]:
# Preview the first five rows of the start table.
nyra_start.head(5)

In [None]:
# Parse race_date once, then store it together with its day-of-month and month number.
start_race_dates = pd.to_datetime(nyra_start["race_date"])
nyra_start["race_date"] = start_race_dates
nyra_start["day"] = start_race_dates.dt.day
nyra_start["month"] = start_race_dates.dt.month

# Odds of winning race 🐴

In [None]:
# Odds against weight carried; marker color also encodes odds on the notebook color scale.
fig = px.scatter(
    nyra_start,
    x = "weight_carried",
    y = "odds",
    color = "odds",
    color_continuous_scale = colors,
    range_color = (0, 9999),
    hover_data = ["race_number", "jockey"],
    title = "Odds by Weight Carried",
)

# Fixed-size circular markers, dark template, larger font.
fig.update_traces(marker = {"size": 8, "symbol": "circle"})
fig.update_layout(template = "plotly_dark", font = {"family": "PT Sans", "size": 20})
fig.show()

In [None]:
# Day of month against weight carried; marker color encodes odds on the notebook color scale.
fig = px.scatter(
    nyra_start,
    x = "weight_carried",
    y = "day",
    color = "odds",
    color_continuous_scale = colors,
    range_color = (0, 9999),
    hover_data = ["race_number", "jockey"],
    title = "Odds by Day of Event and Weight Carried",
)

# Fixed-size square markers, dark template, larger font.
fig.update_traces(marker = {"size": 9, "symbol": "square"})
fig.update_layout(template = "plotly_dark", font = {"family": "PT Sans", "size": 20})
fig.show()

In [None]:
# Month number against weight carried; marker color encodes odds on the notebook color scale.
fig = px.scatter(
    nyra_start,
    x = "weight_carried",
    y = "month",
    color = "odds",
    color_continuous_scale = colors,
    range_color = (0, 9999),
    hover_data = ["race_number", "jockey"],
    title = "Odds by Month of Event and Weight Carried",
)

# Fixed-size circular markers, dark template, larger font.
fig.update_traces(marker = {"size": 11, "symbol": "circle"})
fig.update_layout(template = "plotly_dark", font = {"family": "PT Sans", "size": 20})
fig.show()

In [None]:
# Preview the first five rows of the race table.
nyra_race.head(5)

In [None]:
# Number of races per track_condition value.
nyra_race["track_condition"].value_counts()

In [None]:
# Run-up distance (y) against post time (x), colored by race number.
fig = px.scatter(
    nyra_race,
    x = "post_time",
    y = "run_up_distance",
    hover_data = ["race_number", "course_type"],
    color = "race_number",
    # Plotly Express only applies color_discrete_sequence to non-numeric color columns; a numeric
    # race_number would silently fall back to the default scale. Supplying the notebook palette
    # both ways means it is used whichever dtype the column has.
    color_discrete_sequence = colors,
    color_continuous_scale = colors,
    # Title follows the "Y by X" wording used by the other figures in this notebook.
    title = "Run Up Distance by Post Time",
)

# Small circular markers, dark template, larger font.
fig.update_traces(marker = dict(size = 4, symbol = "circle"))
fig.update_layout(template = "plotly_dark", font = dict(family = "PT Sans", size = 20))
fig.show()

**Thank you for viewing this notebook. Please feel free to provide any feedback.**