In [98]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
%matplotlib inline

In [115]:
data = pd.read_csv('../ult_sign_scrape/race_master/master_database.csv')
fe2 = pd.read_csv('../ult_sign_scrape/race_master/master_database_fe2.csv')

In [119]:
clean = data.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
fe2_clean = fe2.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
fe2.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,age,age_rank,city,gender,participant_id,runner_rank,state,status,time,Age_Rank,Gender_Rank,Total_races
0,0,0,0,26,,,M,7148,88.39,,1,,0.7174,0.6806,23.0
1,1,1,1,33,,,M,221721,90.0,,1,,0.8132,0.6987,9.0
2,2,2,2,43,,,M,20020,83.63,,1,,0.8272,0.7145,17.0
3,3,3,3,36,,,M,25441,73.22,,1,,0.8995,0.8957,17.0
4,4,4,4,33,,,M,22562,87.77,,1,,0.6807,0.6522,8.0


In [114]:
clean.status.value_counts()

1    5579
2    3526
3     469
Name: status, dtype: int64

In [102]:
clean.describe()

Unnamed: 0,age,participant_id,runner_rank,status,Age_Rank,Gender_Rank,Total_races
count,9574.0,9574.0,9574.0,9574.0,9574.0,9574.0,9574.0
mean,42.769375,278617.4,69.932166,1.466263,0.728514,0.665302,11.602883
std,9.827515,295267.1,12.998598,0.588958,0.207281,0.174149,19.769396
min,0.0,489.0,0.0,1.0,0.0,0.0,1.0
25%,35.0,24215.25,63.01,1.0,0.6729,0.600425,1.0
50%,42.0,184900.0,69.84,1.0,0.76145,0.67855,2.0
75%,49.0,445702.2,77.46,2.0,0.8489,0.7642,15.0
max,79.0,1180368.0,100.0,3.0,1.0,1.0,247.0


Interesting observation - Very little spread between 25-75% in age of entrants (35-49). Max is 79 and min is ~12 (some cleaning of 0/1 ages may be necessary). 39-44 is the most common ages with nearly 400 in each category. Mid-life crisis???


In [103]:
clean.corr()

Unnamed: 0,age,participant_id,runner_rank,status,Age_Rank,Gender_Rank,Total_races
age,1.0,-0.279364,-0.217579,0.070143,-0.01394,-0.009565,-0.00413
participant_id,-0.279364,1.0,-0.188576,-0.009449,-0.016588,-0.022745,-0.001725
runner_rank,-0.217579,-0.188576,1.0,-0.189949,0.018092,0.01447,-0.001125
status,0.070143,-0.009449,-0.189949,1.0,0.010826,0.013578,0.007525
Age_Rank,-0.01394,-0.016588,0.018092,0.010826,1.0,0.7724,0.180406
Gender_Rank,-0.009565,-0.022745,0.01447,0.013578,0.7724,1.0,0.162115
Total_races,-0.00413,-0.001725,-0.001125,0.007525,0.180406,0.162115,1.0


Interesting observation - Appears that runner rank and age have the strongest relationships to the status column. Gender rank and age rank are similar in their relationship. Total races appears to have the weakest relationship for these features.

In [104]:
gender_dummies = pd.get_dummies(clean.gender, prefix='gender')
clean = clean.join(gender_dummies)
clean.head() 
clean.pop('gender')

0       M
1       M
2       M
3       M
4       M
5       F
6       M
7       M
8       M
9       M
10      M
11      M
12      F
13      M
14      M
15      M
16      M
17      M
18      F
19      M
20      M
21      M
22      M
23      M
24      M
25      M
26      M
27      M
28      F
29      M
       ..
9544    M
9545    M
9546    M
9547    M
9548    M
9549    M
9550    M
9551    M
9552    M
9553    F
9554    M
9555    M
9556    M
9557    F
9558    M
9559    M
9560    F
9561    F
9562    M
9563    M
9564    M
9565    M
9566    M
9567    F
9568    M
9569    F
9570    M
9571    F
9572    M
9573    M
Name: gender, dtype: object

In [105]:
clean.corr()

Unnamed: 0,age,participant_id,runner_rank,status,Age_Rank,Gender_Rank,Total_races,gender_F,gender_M
age,1.0,-0.279364,-0.217579,0.070143,-0.01394,-0.009565,-0.00413,-0.048496,0.048496
participant_id,-0.279364,1.0,-0.188576,-0.009449,-0.016588,-0.022745,-0.001725,-0.027851,0.027851
runner_rank,-0.217579,-0.188576,1.0,-0.189949,0.018092,0.01447,-0.001125,0.191374,-0.191374
status,0.070143,-0.009449,-0.189949,1.0,0.010826,0.013578,0.007525,-0.002032,0.002032
Age_Rank,-0.01394,-0.016588,0.018092,0.010826,1.0,0.7724,0.180406,-0.006912,0.006912
Gender_Rank,-0.009565,-0.022745,0.01447,0.013578,0.7724,1.0,0.162115,-0.011807,0.011807
Total_races,-0.00413,-0.001725,-0.001125,0.007525,0.180406,0.162115,1.0,0.007073,-0.007073
gender_F,-0.048496,-0.027851,0.191374,-0.002032,-0.006912,-0.011807,0.007073,1.0,-1.0
gender_M,0.048496,0.027851,-0.191374,0.002032,0.006912,0.011807,-0.007073,-1.0,1.0


Once adding gender dummies, it appears that gender in and of itself may not be a great predictor of race status. Appears to be ~75% male and 25% female particitation in these races.

In [106]:
y = clean.pop('status')
clean.pop('participant_id')
X = clean

X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.3)

In [107]:
model = LR()
model.fit_transform(X_train, y_train)



array([[ 1.,  0.],
       [ 0.,  1.],
       [ 0.,  1.],
       ..., 
       [ 0.,  1.],
       [ 0.,  1.],
       [ 1.,  0.]])

In [108]:
predicted = model.predict_proba(X_test)

In [109]:
log_loss(y_test, predicted)

0.80008236700101576

In [110]:
model2 = RFC()
model2.fit(X_train, y_train)
predicted2 = model2.predict_proba(X_test)
log_loss(y_test, predicted2)

2.2519645050116615

In [111]:
model3 = GBC(loss='deviance', learning_rate=0.01, \
        n_estimators=4700, subsample=0.75, criterion='friedman_mse')
model3.fit_transform(X_train, y_train)



array([[ 80.19  ,   0.7965,   0.7065],
       [  0.    ,   0.663 ,   0.598 ],
       [ 78.95  ,   0.7693,   0.6814],
       ..., 
       [ 87.09  ,   0.8765,   0.6224],
       [ 59.49  ,   0.7184,   0.6224],
       [ 71.58  ,   0.6326,   0.6148]])

In [93]:
predictions = model3.predict_proba(X_test)

In [94]:
log_loss(y_test, predictions)

0.79572770724168207