In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

In [87]:
# import data
df = pd.read_csv('../Dog Racing Info/Runners_2015.csv')
df.shape

(703829, 91)

## 'Fin' is our target column

In [88]:
df = df[df['Fin'] != 0]   #0 is not a valid result so I'm remvoing it
#we only care about top 3, so we will change the classification to 1st, 2nd, 3rd, or 4th+
df['Fin'] = df['Fin'].apply(lambda x: 0 if x > 3 else x)

### Cleaning up Null values

In [89]:
#Drop columns that are unecessary or have too many nulls
df.drop(columns=['Dog', 'Wwt', 'By1', 'By2', 'Nrm', 'Ntm', 'Dhf', 'Swt', 'Tag', 'Ptk', 'Scr', 'Fby', 'Col', 'Rem',
                 'Ctk', 'Com', 'Ar1', 'Ar2', 'Ar3', 'Ar4'], inplace=True)
#probably find a way to do something with com column too
#probably can find a way to include Ar1-Ar4

In [90]:
df['Fav'].fillna(0, inplace=True)
df['Fav'] = df['Fav'].apply(lambda x: 1 if x == '*' else x)   #make Fav column binary

In [91]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [92]:
df.shape

(489354, 71)

### Cleaning up values and datatypes

In [93]:
df['Trk'] = df['Trk'].replace('`F', 'F')
df = df[(df['Trk']=='F') | (df['Trk']=='M') | (df['Trk']=='S')]   #only keep F, M and S values for Track conditions

In [94]:
fix_grd_values_dict = {'C!':'C', 'E!':'E', 'SC!':'SC', 'scl':'Scl'}
df['Grd'] = df['Grd'].apply(lambda x: fix_grd_values_dict[x] if x in fix_grd_values_dict.keys() else x)   #clean up 

In [95]:
df['Odd'] = df['Odd'].str.replace('\*.', '', regex=True)
df = df[(df['Odd'] != '-----') & (df['Odd'] != '-1.00')]
df['Odd'] = df['Odd'].str.replace('\.$', '', regex=True)
df['Odd'] = df['Odd'].str.replace('\.\.', '.', regex=True)
#remove remaining 6 messed up strings
removed_odds = [x for x in pd.DataFrame(df['Odd'].value_counts()).index if x.count('.') > 1]  #if it has more than 1 decimal its incorrect
df = df[~df['Odd'].isin(removed_odds)]
df['Odd'] = df['Odd'].astype(float)
df['Odd'] = df['Odd'].round(2)

In [96]:
df['Mln'] = df['Mln'].apply(lambda x: x + '-1' if x.count('-') < 1 else x)
half_values_dict = {'5/2-1':'2.5-1', '7/2-1':'3.5-1', '9/2-1':'4.5-1'}
df['Mln'] = df['Mln'].apply(lambda x: half_values_dict[x] if x.count('/') == 1 else x)
df['Mln'] = df['Mln'].apply(lambda x: float(x.split('-')[0]) / float(x.split('-')[1]) + 1)

In [97]:
convert_dst_values_dict = {'5-16':'503', '3-16':'301', '3-8':'603', 'DC':'0', '7-16':'703', 'YARD':'0'}
df['Dst'] = df['Dst'].apply(lambda x: convert_dst_values_dict[x] if x in convert_dst_values_dict.keys() else x)
df['Dst'] = df['Dst'].astype(int)
df = df[df['Dst'] != 0]

In [98]:
df['Dat'] = pd.to_datetime(df['Dat'])

In [99]:
df['Twt'] = df['Twt'].str.replace('\.$', '', regex=True)
df['Twt'] = df['Twt'].str.replace('^0', '', regex=True)
df = df[~df['Twt'].isin(['', '557', '5½', '7'])]
convert_twt_values_dict = {'056':'56', '059':'59', '071':'71', '600':'60', '77.7':'77'}
df['Twt'] = df['Twt'].apply(lambda x: convert_twt_values_dict[x] if x in convert_twt_values_dict.keys() else x)
df['Twt'] = df['Twt'].apply(lambda x: float(x[:-1]) if '½' in x else float(x))

In [100]:
df['Gen'] = np.where(df['Gen'] == 'M', 1 ,0)

In [101]:
df['Wlp'] = pd.to_datetime(df['Wlp']) #subtract from date of race to create age at time of race

In [102]:
df = df[df['Hgd'] != '.']

In [103]:
df['Age'] = df['Dat'] - df['Wlp']
df.drop(columns=['Wlp'], inplace=True)
df['Age'] = df['Age'].dt.days

### EDA

In [104]:
#function to remove values from columns that have less than X instances
def remove_low_occurences(df, cols, limit):
    for col in cols:
        values_to_remove = pd.DataFrame(df[col].value_counts())[pd.DataFrame(df[col].value_counts())[col] < limit].index
        new_df = df[~df[col].isin(values_to_remove)]
    return new_df
df = remove_low_occurences(df, ['Grd'], 5000)

In [105]:
#remove Own, Dam, Ken, Trn
df.drop(columns=['Own', 'Dam', 'Ken', 'Trn'], inplace=True)

In [106]:
#Own has too many observations
#Sir was positive up to 50% from 45%, will keep that one
#Dam has too many observations
#Ken was way negative down to 18% from 45%
#Trn was way negative down to 19% from 45%

### Group DataFrame by RaceId

In [107]:
# create unique identifier for each race. I'm going to use dat plus rac columns
def date_race(col1, col2, date_col, col3):
    return str(col1) + str(col2) + date_col.strftime('%Y%m%d') + str(col3)
df['Rid'] = df.apply(lambda x: date_race(x.Sig, x.Tid, x.Dat, x.Rac), axis=1)

In [118]:
#get all Rid that have a value count of 8
rid_df = pd.DataFrame(df['Rid'].value_counts())
rid_vals = rid_df[rid_df['Rid'] == 8].index   #list of all Rid columns that have a value count of 8
df = df[df['Rid'].isin(rid_vals)]

In [121]:
def group_by_rid(x):
    non_groupby_cols = [col for col in df.columns if col != 'Rid']
    row_info = []
    for col in non_groupby_cols:
        row_info.append(','.join(x[col].astype(str).values))
    return ','.join(row_info)

In [126]:
df = df.groupby('Rid').apply(group_by_rid).to_frame('new_col')

KeyboardInterrupt: 

In [None]:
df = df.new_col.str.split(',',expand=True)

In [None]:
df.head()

In [125]:
#get ELO rating of some kind

In [None]:
#use tensor flow

### Save for modeling later

In [22]:
df.to_csv('../model_ready_2015.csv', index=False)

### Basic Modeling

In [82]:
X = pd.get_dummies(df.drop(columns='Fin'), drop_first=True)
y = df[['Fin']]

In [83]:
X.shape

(478034, 716)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=7, stratify=y)
ss = StandardScaler()
Xs_train = ss.fit_transform(X_train)
Xs_test = ss.transform(X_test)

In [25]:
lr = LogisticRegression()
lr.fit(Xs_train, y_train)

LogisticRegression()

In [26]:
lr.score(X_train, y_train), lr.score(X_test, y_test)

(0.5044961045714133, 0.5054493727817253)

In [27]:
# lr.score(X_train, y_train), lr.score(X_test, y_test) #with 'Own', 'Sir', 'Dam', 'Ken', 'Trn' removed
# (0.452612157294251, 0.45277532560445494)