In [3]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from scipy.stats import linregress


# Locations of the two horse-racing CSV exports (file names indicate 2024-01-01 to 2024-01-31)
race_details_path = "Resources/race_details_20240101_20240131.csv"
race_results_path = "Resources/race_results_20240101_20240131.csv"

# Load each CSV into its own DataFrame, in the same order as the paths above
race_details, race_results = (
    pd.read_csv(csv_path) for csv_path in (race_details_path, race_results_path)
)

# Columns shared by both files that together identify a single race
merge_keys = ['race_date', 'race_city', 'race_no']

# Join the two frames on the race keys; with no `how` given, pandas performs an inner join
combined = race_details.merge(race_results, on=merge_keys)

# Persist the merged frame to CSV, leaving out the DataFrame index
combined.to_csv('path_to_combined_dataset.csv', index=False)

# Preview the first rows of the merged data
combined.head()

Unnamed: 0,race_date,race_city,race_no,race_type,race_sex_group,race_age_group,race_race_group,race_length,race_track_type,race_track_condition,...,horse_owner,horse_trainer,horse_race_degree,horse_win_value,horse_psf_rate,horse_psf_rank,Horse_starting_box_no,horse_margin,horse_late_start,horse_rate
0,2024-01-01,Bursa,1,Condition 2,Undefined,3 Years Old,English,1400,Dirt,Good Going,...,NİMET ARİF KURTEL,TAMER TURAN,1.31.26,1.5,49,1,1,9 Lengths,,42
1,2024-01-01,Bursa,1,Condition 2,Undefined,3 Years Old,English,1400,Dirt,Good Going,...,AHMET KURT,TUNCAY PELEN,1.33.05,3.75,20,2,4,Half Lengths,,43
2,2024-01-01,Bursa,1,Condition 2,Undefined,3 Years Old,English,1400,Dirt,Good Going,...,BÜLENT BAYBURAN,KAZIM ŞENGEL,1.33.17,28.7,3,6,3,3 Lengths,,33
3,2024-01-01,Bursa,1,Condition 2,Undefined,3 Years Old,English,1400,Dirt,Good Going,...,DİLEK ÖKTEN,ALİ RIZA AYDIN,1.33.65,14.85,3,5,6,4 Lengths,,39
4,2024-01-01,Bursa,1,Condition 2,Undefined,3 Years Old,English,1400,Dirt,Good Going,...,SEMİH KATI,MURAT TÜRKOĞLU,1.34.30,4.55,17,3,5,,,51


In [27]:
# Save the cleaned DataFrame to a new CSV file ('clean.csv'); index=False leaves the
# DataFrame index out of the file.
# NOTE(review): `clean` is assigned in the cell that runs `clean = combined.drop(...)`,
# which is placed further down in this notebook. This cell therefore only works when it
# is run after that one; a top-to-bottom run on a fresh kernel would raise NameError here.
# Consider moving this cell below the cleaning cell.
clean.to_csv('clean.csv', index=False)

In [14]:
# Data cleaning: build `clean` from the merged `combined` frame.
# `combined` itself is left untouched, so this cell can be re-run safely.

# Column to drop: horse_origin
clean = combined.drop(columns=['horse_origin'])

# Column to clean: horse_age -- remove every literal 'y' character.
# regex=False makes 'y' a plain substring rather than a pattern; the values remain strings.
# The column is read from `clean` itself. The earlier version read it from `df`, a name
# that is never defined in this notebook, so it raised NameError on a fresh kernel (or
# silently pulled values from whatever stale `df` happened to be in the kernel).
clean['horse_age'] = clean['horse_age'].str.replace('y', '', regex=False)
clean

Unnamed: 0,race_date,race_city,race_no,race_type,race_sex_group,race_age_group,race_race_group,race_length,race_track_type,race_track_condition,...,horse_owner,horse_trainer,horse_race_degree,horse_win_value,horse_psf_rate,horse_psf_rank,Horse_starting_box_no,horse_margin,horse_late_start,horse_rate
0,2024-01-01,Bursa,1,Condition 2,Undefined,3 Years Old,English,1400,Dirt,Good Going,...,NİMET ARİF KURTEL,TAMER TURAN,1.31.26,1.50,49,1,1,9 Lengths,,42
1,2024-01-01,Bursa,1,Condition 2,Undefined,3 Years Old,English,1400,Dirt,Good Going,...,AHMET KURT,TUNCAY PELEN,1.33.05,3.75,20,2,4,Half Lengths,,43
2,2024-01-01,Bursa,1,Condition 2,Undefined,3 Years Old,English,1400,Dirt,Good Going,...,BÜLENT BAYBURAN,KAZIM ŞENGEL,1.33.17,28.70,3,6,3,3 Lengths,,33
3,2024-01-01,Bursa,1,Condition 2,Undefined,3 Years Old,English,1400,Dirt,Good Going,...,DİLEK ÖKTEN,ALİ RIZA AYDIN,1.33.65,14.85,3,5,6,4 Lengths,,39
4,2024-01-01,Bursa,1,Condition 2,Undefined,3 Years Old,English,1400,Dirt,Good Going,...,SEMİH KATI,MURAT TÜRKOĞLU,1.34.30,4.55,17,3,5,,,51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4278,2024-01-31,Şanlıurfa,8,Handicap 14,Undefined,4 Years Old,Arabian,1700,Dirt,Good Going,...,MEHMET ŞAH ATAKAYA,YELİZ BAYSAL,2.17.56,20.50,3,11,7,,,42
4279,2024-01-31,Şanlıurfa,8,Handicap 14,Undefined,4 Years Old,Arabian,1700,Dirt,Good Going,...,MAHMUT TEKİN,MAHSUM BARCİN,2.17.92,34.90,2,14,8,,,32
4280,2024-01-31,Şanlıurfa,8,Handicap 14,Undefined,4 Years Old,Arabian,1700,Dirt,Good Going,...,ALİ POLAT,MURAT ALTIN,2.18.09,9.45,5,7,6,,,34
4281,2024-01-31,Şanlıurfa,8,Handicap 14,Undefined,4 Years Old,Arabian,1700,Dirt,Good Going,...,HÜSEYİN ÇELİK,MEHMET DEMİROL,2.20.75,24.35,2,15,9,,,42


In [26]:
# Report how many distinct (non-null) values each column of `clean` holds, to help decide
# which columns are worth analysing.

# One pass over the frame yields a Series indexed by column name
unique_counts = clean.nunique()

for column, unique_value_count in unique_counts.items():
    print(f"Unique value count in {column}: {unique_value_count}")

Unique value count in race_date: 31
Unique value count in race_city: 7
Unique value count in race_no: 9
Unique value count in race_type: 20
Unique value count in race_sex_group: 2
Unique value count in race_age_group: 3
Unique value count in race_race_group: 2
Unique value count in race_length: 11
Unique value count in race_track_type: 2
Unique value count in race_track_condition: 4
Unique value count in best_rating: 101
Unique value count in result: 19
Unique value count in horse_name: 2468
Unique value count in horse_accessories: 61
Unique value count in horse_age: 9
Unique value count in horse_sex: 2
Unique value count in horse_sire: 397
Unique value count in horse_dam: 2149
Unique value count in hors_broodmare_sire: 584
Unique value count in jockey_weight: 15
Unique value count in jockey_name: 189
Unique value count in horse_owner: 1422
Unique value count in horse_trainer: 476
Unique value count in horse_race_degree: 3169
Unique value count in horse_win_value: 1093
Unique value cou