In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

In [2]:
# import data
df = pd.read_csv('../Dog Racing Info/Runners_2015.csv')
df.shape

(703829, 91)

## 'Fin' is our target column

In [3]:
df = df[df['Fin'] != 0]   #0 is not a valid result so I'm remvoing it
#we only care about top 3, so we will change the classification to 1st, 2nd, 3rd, or 4th+
df['Fin'] = df['Fin'].apply(lambda x: 0 if x > 3 else x)

### Cleaning up Null values

In [4]:
#Drop columns that are unecessary or have too many nulls
df.drop(columns=['Dog', 'Wwt', 'By1', 'By2', 'Nrm', 'Ntm', 'Dhf', 'Swt', 'Tag', 'Ptk', 'Scr', 'Fby', 'Col', 'Rem',
                 'Ctk', 'Com', 'Ar1', 'Ar2', 'Ar3', 'Ar4'], inplace=True)
#probably find a way to do something with com column too
#probably can find a way to include Ar1-Ar4

In [5]:
df['Fav'].fillna(0, inplace=True)
df['Fav'] = df['Fav'].apply(lambda x: 1 if x == '*' else x)   #make Fav column binary

In [6]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [7]:
df.shape

(489354, 71)

### Cleaning up values and datatypes

In [8]:
df['Trk'] = df['Trk'].replace('`F', 'F')
df = df[(df['Trk']=='F') | (df['Trk']=='M') | (df['Trk']=='S')]   #only keep F, M and S values for Track conditions

In [9]:
fix_grd_values_dict = {'C!':'C', 'E!':'E', 'SC!':'SC', 'scl':'Scl'}
df['Grd'] = df['Grd'].apply(lambda x: fix_grd_values_dict[x] if x in fix_grd_values_dict.keys() else x)   #clean up 

In [10]:
df['Odd'] = df['Odd'].str.replace('\*.', '', regex=True)
df = df[(df['Odd'] != '-----') & (df['Odd'] != '-1.00')]
df['Odd'] = df['Odd'].str.replace('\.$', '', regex=True)
df['Odd'] = df['Odd'].str.replace('\.\.', '.', regex=True)
#remove remaining 6 messed up strings
removed_odds = [x for x in pd.DataFrame(df['Odd'].value_counts()).index if x.count('.') > 1]  #if it has more than 1 decimal its incorrect
df = df[~df['Odd'].isin(removed_odds)]
df['Odd'] = df['Odd'].astype(float)
df['Odd'] = df['Odd'].round(2)

In [11]:
df['Mln'] = df['Mln'].apply(lambda x: x + '-1' if x.count('-') < 1 else x)
half_values_dict = {'5/2-1':'2.5-1', '7/2-1':'3.5-1', '9/2-1':'4.5-1'}
df['Mln'] = df['Mln'].apply(lambda x: half_values_dict[x] if x.count('/') == 1 else x)
df['Mln'] = df['Mln'].apply(lambda x: float(x.split('-')[0]) / float(x.split('-')[1]) + 1)

In [12]:
convert_dst_values_dict = {'5-16':'503', '3-16':'301', '3-8':'603', 'DC':'0', '7-16':'703', 'YARD':'0'}
df['Dst'] = df['Dst'].apply(lambda x: convert_dst_values_dict[x] if x in convert_dst_values_dict.keys() else x)
df['Dst'] = df['Dst'].astype(int)
df = df[df['Dst'] != 0]

In [13]:
df['Dat'] = pd.to_datetime(df['Dat'])

In [14]:
df['Twt'] = df['Twt'].str.replace('\.$', '', regex=True)
df['Twt'] = df['Twt'].str.replace('^0', '', regex=True)
df = df[~df['Twt'].isin(['', '557', '5½', '7'])]
convert_twt_values_dict = {'056':'56', '059':'59', '071':'71', '600':'60', '77.7':'77'}
df['Twt'] = df['Twt'].apply(lambda x: convert_twt_values_dict[x] if x in convert_twt_values_dict.keys() else x)
df['Twt'] = df['Twt'].apply(lambda x: float(x[:-1]) if '½' in x else float(x))

In [15]:
df['Gen'] = np.where(df['Gen'] == 'M', 1 ,0)

In [16]:
df['Wlp'] = pd.to_datetime(df['Wlp']) #subtract from date of race to create age at time of race

In [17]:
df = df[df['Hgd'] != '.']

In [18]:
df['Age'] = df['Dat'] - df['Wlp']
df.drop(columns=['Wlp'], inplace=True)
df['Age'] = df['Age'].dt.days

### EDA

In [19]:
#function to remove values from columns that have less than X instances
def remove_low_occurences(df, cols, limit):
    for col in cols:
        values_to_remove = pd.DataFrame(df[col].value_counts())[pd.DataFrame(df[col].value_counts())[col] < limit].index
        new_df = df[~df[col].isin(values_to_remove)]
    return new_df
df = remove_low_occurences(df, ['Grd'], 5000)

In [20]:
#remove Own, Dam, Ken, Trn
df.drop(columns=['Own', 'Dam', 'Ken', 'Trn'], inplace=True)

In [21]:
#Own has too many observations
#Sir was positive up to 50% from 45%, will keep that one
#Dam has too many observations
#Ken was way negative down to 18% from 45%
#Trn was way negative down to 19% from 45%

### Group DataFrame by RaceId

In [22]:
# create unique identifier for each race. I'm going to use dat plus rac columns
def date_race(col1, col2, date_col, col3):
    return str(col1) + str(col2) + date_col.strftime('%Y%m%d') + str(col3)
df['Rid'] = df.apply(lambda x: date_race(x.Sig, x.Tid, x.Dat, x.Rac), axis=1)

In [23]:
#get all Rid that have a value count of 8
rid_df = pd.DataFrame(df['Rid'].value_counts())
rid_vals = rid_df[rid_df['Rid'] == 8].index   #list of all Rid columns that have a value count of 8
df = df[df['Rid'].isin(rid_vals)]

In [24]:
def group_by_rid(x):
    non_groupby_cols = [col for col in df.columns if col != 'Rid']
    row_info = []
    for col in non_groupby_cols:
        row_info.append(','.join(x[col].astype(str).values))
    return ','.join(row_info)

In [25]:
df = df.groupby('Rid').apply(group_by_rid).to_frame('new_col')

In [26]:
df = df.new_col.str.split(',',expand=True)

In [27]:
df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,526,527,528,529,530,531,532,533,534,535
Rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ABM201501128,BM,BM,BM,BM,BM,BM,BM,BM,2015-01-12,2015-01-12,...,0,0,986,802,863,986,1199,1229,1016,833
ABM201502233,BM,BM,BM,BM,BM,BM,BM,BM,2015-02-23,2015-02-23,...,0,0,663,783,814,724,693,1180,936,814
ABM201507015,BM,BM,BM,BM,BM,BM,BM,BM,2015-07-01,2015-07-01,...,0,0,942,852,972,791,821,1125,852,699
ABM2015090715,BM,BM,BM,BM,BM,BM,BM,BM,2015-09-07,2015-09-07,...,0,0,583,1010,1071,555,859,889,948,1529
ABM201509236,BM,BM,BM,BM,BM,BM,BM,BM,2015-09-23,2015-09-23,...,0,0,661,814,814,844,752,1117,571,814


In [28]:
# df.to_csv('combined_dogs_in_races_data.csv')

In [106]:
df = pd.read_csv('combined_dogs_in_races_data.csv')

In [107]:
indexes = []
for x in range (0, 535, 8):
    indexes.append([x for x in range(x, x+8)])

In [81]:
test_df = df[df.index==0]  #only want to iterate through the first row
for index, row in test_df.iterrows():
    for cols in indexes:
        if row[str(cols[0])] == row[str(cols[1])] == row[str(cols[2])] == row[str(cols[3])] == row[str(cols[4])] == row[str(cols[5])] == row[str(cols[6])] == row[str(cols[7])]:
            drop = [str(x) for x in cols][1:]
            df.drop(columns=drop, inplace=True)

In [121]:
for index, row in df.iterrows():
    for cols in indexes:
        for x, y in enumerate(range(248,256)):
            df.iloc[index][str(cols[x])] = str(row[str(cols[x])]) + ' ' + str(row[str(y)])
        
        if index == df.index[-1]:
            if row[str(cols[0])] == row[str(cols[1])] == row[str(cols[2])] == row[str(cols[3])] == row[str(cols[4])] == row[str(cols[5])] == row[str(cols[6])] == row[str(cols[7])]:
                drop = [str(x) for x in cols][1:]
                df.drop(columns=drop, inplace=True)

KeyboardInterrupt: 

In [115]:
df.head() 

Unnamed: 0,Rid,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511,512,513,514,515,516,517,518,519,520,521,522,523,524,525,526,527,528,529,530,531,532,533,534,535
0,ABM201501128,BM,BM,BM,BM,BM,BM,BM,BM,2015-01-12,2015-01-12,2015-01-12,2015-01-12,2015-01-12,2015-01-12,2015-01-12,2015-01-12,A,A,A,A,A,A,A,A,8,8,8,8,8,8,8,8,17.7,17.7,17.7,17.7,17.7,17.7,17.7,17.7,S,S,S,S,S,S,S,S,67.0,74.0,60.0,57.0,54.0,79.0,62.0,60.0,0,0,0,0,0,0,0,0,6,4,5,7,8,1,2,3,4,5,2,8,7,3,1,6,5,4,3,7,8,2,1,6,7,2,4,6,8,1,3,5,0,2,0,0,0,1,3,0,18.43,17.87,18.33,18.39,18.45,17.7,17.99,18.34,0,0,0,0,0,0,0,0,9.7,4.5,31.1,15.0,19.0,0.3,9.7,15.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,A,A,A,A,A,A,A,A,301,301,301,301,301,301,301,301,8,8,8,8,8,8,8,8,5.5,3.5,7.0,11.0,11.0,4.5,9.0,9.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,B's O'Grady,RA Blazer,Rhythmless,Kiowa WW Brother,Kiowa Mon Manny,CTW Plaza Rio,Kiowa Sweet Trey,Rico's Vintage,3,2,3,1,2,1,2,2,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,80,20,18,57,107,31,82,54,13,4,3,8,10,7,15,10,8,7,2,9,12,4,7,4,8,6,3,7,19,3,9,7,10,3,5,6,18,0,6,8,0.0,17.66,17.71,0.0,17.44,17.37,17.49,0.0,S,A,A,A,S,A,A,A,D,D,D,D,D,D,D,D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,607,596,601,598,558,475,559,601,307,300,305,300,300,299,301,299,312,288,296,300,288,316,304,292,0,0,0,0,0,0,0,0,1227,1184,1202,1199,1146,1091,1165,1193,5.9,5.3,4.5,6.1,5.3,4.9,6.5,5.9,8.0,8.0,5.9,7.9,8.0,8.0,7.6,8.0,R6,R5,R4,R6,R5,R5,R6,R6,E8,E8,E5,E7,E8,E8,E7,E8,78,72,74,75,72,79,76,73,71,71,73,71,71,71,71,71,6,4,5,7,8,1,2,3,342,333,339,334,333,333,335,332,6,4,5,7,8,5,3,6,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,986,802,863,986,1199,1229,1016,833
1,ABM201502233,BM,BM,BM,BM,BM,BM,BM,BM,2015-02-23,2015-02-23,2015-02-23,2015-02-23,2015-02-23,2015-02-23,2015-02-23,2015-02-23,A,A,A,A,A,A,A,A,3,3,3,3,3,3,3,3,39.69,39.69,39.69,39.69,39.69,39.69,39.69,39.69,F,F,F,F,F,F,F,F,72.0,70.0,74.0,60.0,67.0,73.0,50.0,61.0,0,0,0,0,0,0,0,0,4,5,6,7,8,3,1,2,8,5,7,4,1,3,2,6,6,5,4,3,1,8,2,7,5,6,2,3,1,8,4,7,3,0,2,0,1,0,0,0,40.11,40.19,39.7,40.12,39.69,40.4,40.14,40.41,0,0,0,0,0,0,0,0,35.5,12.6,8.9,11.1,1.9,1.1,3.5,35.5,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,A,A,A,A,A,A,A,A,603,603,603,603,603,603,603,603,8,8,8,8,8,8,8,8,9.0,11.0,11.0,7.0,3.5,4.5,5.5,9.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,Kiowa Sweet Trey,USS Cool Luke,Gable Dodge,Rhythmless,Soprano Drive,Kiowa Sweet Trey,Kelsos Fusileer,Flying Penske,10,11,12,11,10,12,11,10,3,1,2,1,5,4,1,1,1,3,0,1,2,3,4,0,1,2,1,0,2,1,0,2,0,0,2,5,1,1,5,1,25,32,47,33,24,102,61,42,3,5,12,5,9,11,7,6,3,5,9,5,7,17,12,2,3,1,7,6,5,15,5,3,4,2,4,3,2,21,15,5,31.44,18.49,17.97,18.17,0.0,31.02,39.23,19.03,S,A,S,S,A,S,S,A,M,E,M,M,M,M,M,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1801,1821,1691,1693,1694,1499,1695,1815,303,311,304,310,318,301,314,300,244,264,264,272,284,300,284,268,0,0,0,0,0,0,0,0,2348,2397,2260,2275,2297,2101,2294,2384,4.8,5.3,5.1,4.95,4.7,5.5,5.7,5.7,8.0,8.0,8.0,6.1,3.7,8.0,4.9,8.0,R5,R5,R5,R5,R5,R5,R6,R6,E8,E8,E8,E6,E3,E8,E4,E8,61,66,66,68,71,75,71,67,71,71,71,73,75,71,74,71,4,5,6,7,8,3,1,2,337,346,338,345,353,335,349,333,4,5,6,7,8,5,3,6,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,663,783,814,724,693,1180,936,814
2,ABM201507015,BM,BM,BM,BM,BM,BM,BM,BM,2015-07-01,2015-07-01,2015-07-01,2015-07-01,2015-07-01,2015-07-01,2015-07-01,2015-07-01,A,A,A,A,A,A,A,A,5,5,5,5,5,5,5,5,31.73,31.73,31.73,31.73,31.73,31.73,31.73,31.73,F,F,F,F,F,F,F,F,60.0,59.0,61.0,62.0,69.0,59.0,61.0,58.0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,1,3,4,6,7,8,5,2,1,5,3,4,6,8,7,2,2,5,3,4,6,8,7,1,2,0,0,3,0,0,0,1,31.75,31.95,31.89,31.81,32.01,32.15,32.16,31.73,0,0,0,0,0,0,0,0,13.4,4.7,13.4,4.7,4.7,1.7,13.4,3.1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,B,B,B,B,B,B,B,B,503,503,503,503,503,503,503,503,8,8,8,8,8,8,8,8,11.0,11.0,9.0,5.5,3.5,4.5,9.0,7.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,Fuzzys Cannon,Kiowa Mon Manny,Oshkosh Slammer,WW's Dog Gone,WW's Dog Gone,Craigie Whistler,BD's Grayson,Little Andy,39,9,17,17,12,39,32,7,4,3,3,2,3,6,4,2,4,0,2,3,2,9,4,2,6,1,1,3,0,8,3,1,3,1,0,2,1,4,6,1,10,4,13,20,0,5,16,22,2,0,0,2,0,0,1,4,3,0,3,3,0,0,2,5,3,0,0,3,0,0,2,1,2,0,2,0,0,1,3,0,0.0,31.53,18.08,31.16,0.0,17.61,18.0,0.0,A,A,B,B,A,A,A,B,E,D,D,D,D,D,D,D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1107,1209,897,1217,1217,1209,1108,1107,305,301,302,312,311,304,304,316,288,284,280,304,304,312,292,276,0,0,0,0,0,0,0,0,1700,1794,1479,1834,1833,1825,1705,1699,5.55,5.0,4.5,4.5,4.6,8.0,4.0,4.6,7.6,7.8,8.0,7.6,8.0,8.0,7.8,3.8,R6,R5,R4,R4,R5,R8,R4,R5,E7,E7,E8,E7,E8,E8,E7,E3,72,71,70,76,76,78,73,69,71,71,71,72,71,71,71,74,1,2,3,4,5,6,7,8,339,334,335,347,346,338,338,351,3,5,4,4,5,6,7,8,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,942,852,972,791,821,1125,852,699
3,ABM2015090715,BM,BM,BM,BM,BM,BM,BM,BM,2015-09-07,2015-09-07,2015-09-07,2015-09-07,2015-09-07,2015-09-07,2015-09-07,2015-09-07,A,A,A,A,A,A,A,A,15,15,15,15,15,15,15,15,40.36,40.36,40.36,40.36,40.36,40.36,40.36,40.36,F,F,F,F,F,F,F,F,61.0,60.0,79.0,81.0,56.0,69.0,63.0,53.0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,4,3,5,6,1,7,8,2,5,3,4,7,1,6,8,2,5,7,3,8,1,4,6,2,0,0,3,0,1,0,0,2,41.09,42.5,40.65,91.0,40.36,40.79,40.97,40.48,0,0,0,0,0,0,0,0,4.1,16.5,7.7,8.7,3.1,6.9,1.6,6.9,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,B,B,B,B,B,B,B,B,603,603,603,603,603,603,603,603,8,8,8,8,8,8,8,8,9.0,11.0,11.0,5.5,3.5,9.0,4.5,7.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,Hi Noon Hero,Flying Penske,Craigie Whistler,Gable Dodge,Kiowa WW Brother,WW's Dog Gone,Lonesome Cry,Rhythmless,22,62,32,13,28,30,26,30,3,7,4,4,3,5,6,4,4,4,6,1,8,4,7,5,4,5,4,1,7,5,3,5,1,10,5,0,4,4,6,7,0,42,7,0,27,0,4,17,0,6,0,0,2,0,0,1,0,2,0,0,4,0,0,0,0,3,1,0,4,0,0,0,0,5,1,0,7,0,0,1,39.91,19.03,18.74,17.96,18.23,0.0,30.81,18.0,S,A,B,B,B,A,S,S,M,M,D,M,D,D,D,D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1681,1390,987,1809,1818,1690,1691,1684,312,295,251,312,311,304,304,298,228,244,244,224,252,260,264,248,0,0,0,0,0,0,0,0,2221,1929,1483,2345,2381,2255,2260,2230,4.6,5.7,5.2854,4.2,4.6,4.6,5.7,2.1,5.0,7.8,8.0,7.2,8.0,8.0,8.0,8.0,R5,R6,R5,R4,R5,R5,R6,R2,E5,E7,E8,E7,E8,E8,E8,E8,57,61,61,56,63,65,66,62,73,71,71,71,71,71,71,71,1,2,3,4,5,6,7,8,347,327,279,347,346,338,338,331,3,4,5,4,5,6,7,6,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,583,1010,1071,555,859,889,948,1529
4,ABM201509236,BM,BM,BM,BM,BM,BM,BM,BM,2015-09-23,2015-09-23,2015-09-23,2015-09-23,2015-09-23,2015-09-23,2015-09-23,2015-09-23,A,A,A,A,A,A,A,A,6,6,6,6,6,6,6,6,31.28,31.28,31.28,31.28,31.28,31.28,31.28,31.28,F,F,F,F,F,F,F,F,72.0,55.0,60.0,63.0,64.0,55.0,79.0,57.0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,5,8,2,6,1,4,3,7,7,6,2,5,1,4,3,8,6,4,3,7,1,2,5,8,0,3,0,0,2,1,0,0,31.91,31.54,31.78,32.06,31.37,31.28,32.22,32.07,0,0,0,0,0,0,0,0,2.3,6.3,9.3,3.9,5.5,3.6,5.5,11.4,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,A,A,A,A,A,A,A,A,503,503,503,503,503,503,503,503,8,8,8,8,8,8,8,8,3.5,4.5,9.0,7.0,11.0,5.5,11.0,9.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,Djays Octane,Atascocita Bowl,Afleet Alex,Gable Dodge,Dragon Fire,Kelsos Fusileer,BD's Herby,Elway Drive,12,63,44,41,33,33,9,13,4,13,4,9,7,5,4,3,3,7,9,3,4,5,0,2,1,12,10,6,4,4,2,2,2,7,10,3,4,7,0,1,0,3,9,5,0,14,0,6,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,1,0,2,0,0,31.34,0.0,0.0,18.33,31.12,30.42,0.0,0.0,S,A,A,A,A,A,S,A,D,M,D,D,M,D,M,D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,827,828,1111,1219,1218,1219,1109,827,304,304,312,313,333,312,317,304,312,328,324,324,308,324,296,312,0,0,0,0,0,0,0,0,1444,1461,1747,1857,1859,1855,1722,1444,4.5,4.7,4.9,3.91857,5.0,5.1,4.6,4.5,8.0,8.0,5.5,7.6,1.0,8.0,3.5,8.0,R4,R5,R5,R4,R5,R5,R5,R4,E8,E8,E5,E7,E1,E8,E3,E8,78,82,81,81,77,81,74,78,71,71,73,72,77,71,75,71,1,2,3,4,5,6,7,8,338,338,346,348,370,346,352,338,3,4,3,4,5,5,6,5,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,661,814,814,844,752,1117,571,814


In [93]:
pd.set_option('display.max_rows', 1000)

In [94]:
print(df.iloc[0])

Rid        ABM201501128
0                    BM
8            2015-01-12
16                    A
24                    8
32                 17.7
40                    S
48                 67.0
49                 74.0
50                 60.0
51                 57.0
52                 54.0
53                 79.0
54                 62.0
55                 60.0
56                    0
64                    6
65                    4
66                    5
67                    7
68                    8
69                    1
70                    2
71                    3
72                    4
73                    5
74                    2
75                    8
76                    7
77                    3
78                    1
79                    6
80                    5
81                    4
82                    3
83                    7
84                    8
85                    2
86                    1
87                    6
88                    7
89              

In [None]:
#rows 248 to 255 are dog names

In [None]:
#create database of dogs and their info and ELO Score of some kind
#create df of race info and which dogs race 

In [125]:
#get ELO rating of some kind

In [None]:
#use tensor flow

### Save for modeling later

In [22]:
df.to_csv('../model_ready_2015.csv', index=False)

### Basic Modeling

In [82]:
X = pd.get_dummies(df.drop(columns='Fin'), drop_first=True)
y = df[['Fin']]

In [83]:
X.shape

(478034, 716)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=7, stratify=y)
ss = StandardScaler()
Xs_train = ss.fit_transform(X_train)
Xs_test = ss.transform(X_test)

In [25]:
lr = LogisticRegression()
lr.fit(Xs_train, y_train)

LogisticRegression()

In [26]:
lr.score(X_train, y_train), lr.score(X_test, y_test)

(0.5044961045714133, 0.5054493727817253)

In [27]:
# lr.score(X_train, y_train), lr.score(X_test, y_test) #with 'Own', 'Sir', 'Dam', 'Ken', 'Trn' removed
# (0.452612157294251, 0.45277532560445494)