In [221]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss, auc, roc_curve
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.multiclass import OneVsOneClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
%matplotlib inline

In [167]:
data = pd.read_csv('../ult_sign_scrape/race_master/master_database.csv')
fe2 = pd.read_csv('../ult_sign_scrape/race_master/master_database_fe2.csv')

In [168]:
clean = data.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
fe2_clean = fe2.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
fe2_clean.head()

Unnamed: 0,age,city,gender,participant_id,runner_rank,state,status,Age_Rank,Gender_Rank,Total_races
0,26,Boulder,M,7148,88.39,CO,1,0.7174,0.6806,23
1,33,Salt Lake City,M,221721,90.0,UT,1,0.8132,0.6987,9
2,43,Pocatello,M,20020,83.63,ID,1,0.8272,0.7145,17
3,36,Azumino City,M,25441,73.22,JPN,1,0.8995,0.8957,17
4,33,Salt Lake City,M,22562,87.77,UT,1,0.6807,0.6522,8


In [169]:
fe2_clean.head()

Unnamed: 0,age,city,gender,participant_id,runner_rank,state,status,Age_Rank,Gender_Rank,Total_races
0,26,Boulder,M,7148,88.39,CO,1,0.7174,0.6806,23
1,33,Salt Lake City,M,221721,90.0,UT,1,0.8132,0.6987,9
2,43,Pocatello,M,20020,83.63,ID,1,0.8272,0.7145,17
3,36,Azumino City,M,25441,73.22,JPN,1,0.8995,0.8957,17
4,33,Salt Lake City,M,22562,87.77,UT,1,0.6807,0.6522,8


In [170]:
clean.describe()

Unnamed: 0,age,participant_id,runner_rank,status,Age_Rank,Gender_Rank,Total_races
count,9574.0,9574.0,9574.0,9574.0,9574.0,9574.0,9574.0
mean,42.769375,278617.4,69.932166,1.466263,0.728514,0.665302,11.602883
std,9.827515,295267.1,12.998598,0.588958,0.207281,0.174149,19.769396
min,0.0,489.0,0.0,1.0,0.0,0.0,1.0
25%,35.0,24215.25,63.01,1.0,0.6729,0.600425,1.0
50%,42.0,184900.0,69.84,1.0,0.76145,0.67855,2.0
75%,49.0,445702.2,77.46,2.0,0.8489,0.7642,15.0
max,79.0,1180368.0,100.0,3.0,1.0,1.0,247.0


Interesting observation - Very little spread between 25-75% in age of entrants (35-49). Max is 79 and min is ~12 (some cleaning of 0/1 ages may be necessary). 39-44 is the most common ages with nearly 400 in each category. Mid-life crisis???


In [171]:
clean.corr()

Unnamed: 0,age,participant_id,runner_rank,status,Age_Rank,Gender_Rank,Total_races
age,1.0,-0.279364,-0.217579,0.070143,-0.01394,-0.009565,-0.00413
participant_id,-0.279364,1.0,-0.188576,-0.009449,-0.016588,-0.022745,-0.001725
runner_rank,-0.217579,-0.188576,1.0,-0.189949,0.018092,0.01447,-0.001125
status,0.070143,-0.009449,-0.189949,1.0,0.010826,0.013578,0.007525
Age_Rank,-0.01394,-0.016588,0.018092,0.010826,1.0,0.7724,0.180406
Gender_Rank,-0.009565,-0.022745,0.01447,0.013578,0.7724,1.0,0.162115
Total_races,-0.00413,-0.001725,-0.001125,0.007525,0.180406,0.162115,1.0


Interesting observation - Appears that runner rank and age have the strongest relationships to the status column. Gender rank and age rank are similar in their relationship. Total races appears to have the weakest relationship for these features.

In [172]:
gender_dummies_fe = pd.get_dummies(fe2_clean.gender, prefix='gender')
gender_dummies = pd.get_dummies(clean.gender, prefix='gender')
state_dummies = pd.get_dummies(fe2_clean.state, prefix='Home_State')
clean = clean.join(gender_dummies)
fe2_clean = fe2_clean.join(gender_dummies_fe)
fe2_clean = fe2_clean.join(state_dummies)

In [173]:
clean.corr()

Unnamed: 0,age,participant_id,runner_rank,status,Age_Rank,Gender_Rank,Total_races,gender_F,gender_M
age,1.0,-0.279364,-0.217579,0.070143,-0.01394,-0.009565,-0.00413,-0.048496,0.048496
participant_id,-0.279364,1.0,-0.188576,-0.009449,-0.016588,-0.022745,-0.001725,-0.027851,0.027851
runner_rank,-0.217579,-0.188576,1.0,-0.189949,0.018092,0.01447,-0.001125,0.191374,-0.191374
status,0.070143,-0.009449,-0.189949,1.0,0.010826,0.013578,0.007525,-0.002032,0.002032
Age_Rank,-0.01394,-0.016588,0.018092,0.010826,1.0,0.7724,0.180406,-0.006912,0.006912
Gender_Rank,-0.009565,-0.022745,0.01447,0.013578,0.7724,1.0,0.162115,-0.011807,0.011807
Total_races,-0.00413,-0.001725,-0.001125,0.007525,0.180406,0.162115,1.0,0.007073,-0.007073
gender_F,-0.048496,-0.027851,0.191374,-0.002032,-0.006912,-0.011807,0.007073,1.0,-1.0
gender_M,0.048496,0.027851,-0.191374,0.002032,0.006912,0.011807,-0.007073,-1.0,1.0


In [174]:
fe2_clean.corr()

Unnamed: 0,age,participant_id,runner_rank,status,Age_Rank,Gender_Rank,Total_races,gender_F,gender_M,Home_State_AB,...,Home_State_Ut,Home_State_VA,Home_State_VT,Home_State_WA,Home_State_WI,Home_State_WV,Home_State_WY,Home_State_ZAF,Home_State_ZWE,Home_State_ca
age,1.000000,-0.279364,-0.217579,0.070143,-0.013940,-0.009565,-0.004130,-0.048496,0.048496,-0.007183,...,-0.003920,-0.014665,-0.008604,0.013574,-0.032538,-0.015205,0.005579,-0.005358,0.002320,-0.007041
participant_id,-0.279364,1.000000,-0.188576,-0.009449,-0.016588,-0.022745,-0.001725,-0.027851,0.027851,0.030315,...,-0.006289,-0.009304,-0.017705,-0.059753,0.004351,0.024748,-0.037672,0.016430,0.006622,-0.008977
runner_rank,-0.217579,-0.188576,1.000000,-0.189949,0.018092,0.014470,-0.001125,0.191374,-0.191374,0.021805,...,-0.002023,0.022798,0.045628,0.025627,0.001577,0.001785,0.019536,-0.003206,0.006029,-0.001000
status,0.070143,-0.009449,-0.189949,1.000000,0.010826,0.013578,0.007525,-0.002032,0.002032,-0.012540,...,-0.008092,0.026535,-0.008115,-0.022994,-0.004248,0.009796,-0.017666,0.001435,-0.008092,-0.008092
Age_Rank,-0.013940,-0.016588,0.018092,0.010826,1.000000,0.772400,0.180406,-0.006912,0.006912,-0.002395,...,0.006153,-0.018787,0.006656,0.007143,-0.008803,-0.004870,0.002918,-0.014490,-0.004202,0.013387
Gender_Rank,-0.009565,-0.022745,0.014470,0.013578,0.772400,1.000000,0.162115,-0.011807,0.011807,-0.000003,...,0.008281,-0.012461,0.001125,0.003838,-0.005963,-0.013327,0.004734,-0.001148,-0.005846,0.018417
Total_races,-0.004130,-0.001725,-0.001125,0.007525,0.180406,0.162115,1.000000,0.007073,-0.007073,-0.003619,...,-0.004448,-0.009058,-0.001663,0.011377,-0.004583,0.035150,0.017323,0.010848,0.019852,-0.005482
gender_F,-0.048496,-0.027851,0.191374,-0.002032,-0.006912,-0.011807,0.007073,1.000000,-1.000000,0.038631,...,-0.006010,-0.017809,0.026299,0.033099,0.007987,0.008281,0.015221,-0.014726,-0.006010,-0.006010
gender_M,0.048496,0.027851,-0.191374,0.002032,0.006912,0.011807,-0.007073,-1.000000,1.000000,-0.038631,...,0.006010,0.017809,-0.026299,-0.033099,-0.007987,-0.008281,-0.015221,0.014726,0.006010,0.006010
Home_State_AB,-0.007183,0.030315,0.021805,-0.012540,-0.002395,-0.000003,-0.003619,0.038631,-0.038631,1.000000,...,-0.000967,-0.009918,-0.003992,-0.016092,-0.006282,-0.003210,-0.006282,-0.002370,-0.000967,-0.000967


Once adding gender dummies, it appears that gender in and of itself may not be a great predictor of race status. Appears to be ~75% male and 25% female particitation in these races.

In [178]:
y = clean.pop('status')
clean.pop('participant_id')
clean.pop('gender')
X = clean

X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.3)

In [179]:
model = LR()
model.fit_transform(X_train, y_train)



array([[ 41.  ,  74.33],
       [ 35.  ,  69.83],
       [ 45.  ,  83.34],
       ..., 
       [ 45.  ,  75.3 ],
       [ 31.  ,  88.92],
       [ 50.  ,  69.04]])

In [180]:
predicted = model.predict_proba(X_test)

In [181]:
log_loss(y_test, predicted)

0.82292842582147796

In [200]:
model2 = RFC(n_estimators=1000, criterion='entropy', random_state=1, n_jobs=2)
model2.fit(X_train, y_train)
predicted2 = model2.predict_proba(X_test)
log_loss(y_test, predicted2)

0.8182420015665961

In [183]:
model3 = GBC(loss='deviance', learning_rate=0.01, \
        n_estimators=4700, subsample=0.75, criterion='friedman_mse')
model3.fit_transform(X_train, y_train)



array([[  5.47698000e+05,   7.43300000e+01,   8.48300000e-01,
          7.03600000e-01],
       [  3.11094000e+05,   6.98300000e+01,   7.61700000e-01,
          6.02200000e-01],
       [  9.80090000e+04,   8.33400000e+01,   8.97200000e-01,
          8.68900000e-01],
       ..., 
       [  5.87170000e+05,   7.53000000e+01,   6.56000000e-01,
          6.56000000e-01],
       [  1.32450000e+04,   8.89200000e+01,   7.76100000e-01,
          7.03400000e-01],
       [  9.67000000e+03,   6.90400000e+01,   8.02300000e-01,
          6.65200000e-01]])

In [191]:
predictions = model3.predict_proba(X_test)

In [229]:
log_loss(y_test, predictions)
#fpr, tpr, thresholds = roc_curve(y_test, predictions)
#auc(fpr, tpr)

0.81944473233090942

In [202]:
#fpr, tpr, thresholds = roc_curve(y_test, predictions)
#auc(fpr, tpr)


In [212]:
neighbor = knn(n_neighbors=1000, p=2, metric='minkowski')
neighbor.fit(X_train, y_train)
knn_predict = neighbor.predict_proba(X_test)
log_loss(y_test, knn_predict)

0.83537782751891698

In [216]:
test = OneVsOneClassifier(RFC(n_estimators=1000, criterion='entropy', random_state=1, n_jobs=2)).fit(X_train, y_train).predict(X_test)

In [222]:
confusion_matrix(y_test, test)

array([[1410,  260,    0],
       [ 691,  363,    0],
       [ 112,   37,    0]])