In [295]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss, auc, roc_curve
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.multiclass import OneVsOneClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
%matplotlib inline

In [296]:
data = pd.read_csv('../ult_sign_scrape/race_master/master_database.csv')
fe2 = pd.read_csv('../ult_sign_scrape/race_master/master_database_fe2.csv')

In [297]:
clean = data.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
fe2_clean = fe2.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
fe2_clean.head()

Unnamed: 0,age,city,gender,participant_id,runner_rank,state,status,Age_Rank,Gender_Rank,Total_races
0,26,Boulder,M,7148,88.39,CO,1,0.7174,0.6806,23
1,33,Salt Lake City,M,221721,90.0,UT,1,0.8132,0.6987,9
2,43,Pocatello,M,20020,83.63,ID,1,0.8272,0.7145,17
3,36,Azumino City,M,25441,73.22,JPN,1,0.8995,0.8957,17
4,33,Salt Lake City,M,22562,87.77,UT,1,0.6807,0.6522,8


In [298]:
fe2_clean.head()

Unnamed: 0,age,city,gender,participant_id,runner_rank,state,status,Age_Rank,Gender_Rank,Total_races
0,26,Boulder,M,7148,88.39,CO,1,0.7174,0.6806,23
1,33,Salt Lake City,M,221721,90.0,UT,1,0.8132,0.6987,9
2,43,Pocatello,M,20020,83.63,ID,1,0.8272,0.7145,17
3,36,Azumino City,M,25441,73.22,JPN,1,0.8995,0.8957,17
4,33,Salt Lake City,M,22562,87.77,UT,1,0.6807,0.6522,8


In [299]:
clean.describe()

Unnamed: 0,age,participant_id,runner_rank,status,Age_Rank,Gender_Rank,Total_races
count,9574.0,9574.0,9574.0,9574.0,9574.0,9574.0,9574.0
mean,42.769375,278617.4,69.932166,1.466263,0.728514,0.665302,11.602883
std,9.827515,295267.1,12.998598,0.588958,0.207281,0.174149,19.769396
min,0.0,489.0,0.0,1.0,0.0,0.0,1.0
25%,35.0,24215.25,63.01,1.0,0.6729,0.600425,1.0
50%,42.0,184900.0,69.84,1.0,0.76145,0.67855,2.0
75%,49.0,445702.2,77.46,2.0,0.8489,0.7642,15.0
max,79.0,1180368.0,100.0,3.0,1.0,1.0,247.0


Interesting observation - Very little spread between 25-75% in age of entrants (35-49). Max is 79 and min is ~12 (some cleaning of 0/1 ages may be necessary). 39-44 is the most common ages with nearly 400 in each category. Mid-life crisis???


In [300]:
clean.corr()

Unnamed: 0,age,participant_id,runner_rank,status,Age_Rank,Gender_Rank,Total_races
age,1.0,-0.279364,-0.217579,0.070143,-0.01394,-0.009565,-0.00413
participant_id,-0.279364,1.0,-0.188576,-0.009449,-0.016588,-0.022745,-0.001725
runner_rank,-0.217579,-0.188576,1.0,-0.189949,0.018092,0.01447,-0.001125
status,0.070143,-0.009449,-0.189949,1.0,0.010826,0.013578,0.007525
Age_Rank,-0.01394,-0.016588,0.018092,0.010826,1.0,0.7724,0.180406
Gender_Rank,-0.009565,-0.022745,0.01447,0.013578,0.7724,1.0,0.162115
Total_races,-0.00413,-0.001725,-0.001125,0.007525,0.180406,0.162115,1.0


Interesting observation - Appears that runner rank and age have the strongest relationships to the status column. Gender rank and age rank are similar in their relationship. Total races appears to have the weakest relationship for these features.

In [301]:
gender_dummies_fe = pd.get_dummies(fe2_clean.gender, prefix='gender')
gender_dummies = pd.get_dummies(clean.gender, prefix='gender')
state_dummies = pd.get_dummies(fe2_clean.state, prefix='Home_State')
clean = clean.join(gender_dummies)
fe2_clean = fe2_clean.join(gender_dummies_fe)
fe2_clean = fe2_clean.join(state_dummies)

In [302]:
def coding(col, codeDict):
    colCoded = pd.Series(col, copy=True)
    for key, value in codeDict.items():
        colCoded.replace(key, value, inplace=True)
    return colCoded
 
#Coding LoanStatus as Y=1, N=0:
print 'Before Coding:'
print pd.value_counts(clean["status"])
clean["status_coded"] = coding(clean["status"], {'1':0,'2':0, '3':1})
print '\nAfter Coding:'
print pd.value_counts(clean["status_coded"])

Before Coding:
1    5579
2    3526
3     469
Name: status, dtype: int64

After Coding:
0    9105
1     469
Name: status_coded, dtype: int64


In [275]:
clean.head()

Unnamed: 0,age,gender,participant_id,runner_rank,status,Age_Rank,Gender_Rank,Total_races,gender_F,gender_M,new_status,status_coded
0,26,M,7148,88.39,1,0.7174,0.6806,23,0.0,1.0,0,0
1,33,M,221721,90.0,1,0.8132,0.6987,9,0.0,1.0,0,0
2,43,M,20020,83.63,1,0.8272,0.7145,17,0.0,1.0,0,0
3,36,M,25441,73.22,1,0.8995,0.8957,17,0.0,1.0,0,0
4,33,M,22562,87.77,1,0.6807,0.6522,8,0.0,1.0,0,0


In [317]:
X

Unnamed: 0,age,runner_rank,status,Age_Rank,Gender_Rank,Total_races,gender_F,gender_M
0,26,88.39,1,0.7174,0.6806,23,0.0,1.0
1,33,90.00,1,0.8132,0.6987,9,0.0,1.0
2,43,83.63,1,0.8272,0.7145,17,0.0,1.0
3,36,73.22,1,0.8995,0.8957,17,0.0,1.0
4,33,87.77,1,0.6807,0.6522,8,0.0,1.0
5,40,97.10,1,0.7093,0.6828,29,1.0,0.0
6,43,92.88,1,0.7353,0.6750,3,0.0,1.0
7,42,86.89,1,0.7909,0.7061,3,0.0,1.0
8,24,81.55,1,0.7040,0.6567,1,0.0,1.0
9,40,78.75,1,0.7575,0.7261,1,0.0,1.0


Once adding gender dummies, it appears that gender in and of itself may not be a great predictor of race status. Appears to be ~75% male and 25% female particitation in these races.

In [303]:
y = clean.pop('status_coded')
clean.pop('participant_id')
clean.pop('gender')
X = clean

X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.3)

In [304]:
model = LR()
model.fit_transform(X_train, y_train)



array([[ 3.,  0.,  1.],
       [ 2.,  1.,  0.],
       [ 1.,  0.,  1.],
       ..., 
       [ 1.,  0.,  1.],
       [ 2.,  0.,  1.],
       [ 1.,  0.,  1.]])

In [305]:
predicted = model.predict_proba(X_test)

In [306]:
log_loss(y_test, predicted)

0.00858366098108603

In [307]:
model2 = RFC(n_estimators=1000, criterion='entropy', random_state=1, n_jobs=2)
model2.fit(X_train, y_train)
predicted2 = model2.predict_proba(X_test)
log_loss(y_test, predicted2)

0.0035777936935930269

In [311]:
model3 = GBC()
model3.fit_transform(X_train, y_train)
predictions = model3.predict_proba(X_test)
log_loss(y_test, predictions)



0.00012132002698957209

In [313]:
fpr, tpr, thresholds = roc_curve(y_test, predictions)
#auc(fpr, tpr)

ValueError: bad input shape (2873, 2)

In [283]:

#fpr, tpr, thresholds = roc_curve(y_test, predictions)
#auc(fpr, tpr)

0.6046302345764859

ValueError: bad input shape (2873, 3)

In [285]:
neighbor = knn(n_neighbors=100, p=2, metric='minkowski')
neighbor.fit(X_train, y_train)
knn_predict = neighbor.predict_proba(X_test)
log_loss(y_test, knn_predict)

0.81956283718593126

In [248]:
target_names = ['Finished', 'DNF', 'DNS']
print classification_report(y_test, test, target_names=target_names)
print classification_report(y_test, predicted2, target_names=target_names)

             precision    recall  f1-score   support

   Finished       0.58      1.00      0.74      1670
        DNF       0.75      0.01      0.02      1054
        DNS       0.15      0.01      0.02       149

avg / total       0.62      0.58      0.44      2873



ValueError: Mix type of y not allowed, got types set(['continuous-multioutput', 'multiclass'])

In [314]:
predictions

array([[  9.99932250e-01,   6.77500730e-05],
       [  9.99932250e-01,   6.77500730e-05],
       [  9.99932250e-01,   6.77500730e-05],
       ..., 
       [  9.99932250e-01,   6.77500730e-05],
       [  9.99932250e-01,   6.77500730e-05],
       [  9.99932250e-01,   6.77500730e-05]])