# Imports

In [1]:
import pandas as pd
#import numpy as np
#import matplotlib.pyplot as plt
#import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
#from sklearn.linear_model import LogisticRegression
#from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
#from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

#from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

%matplotlib inline

# Data Prep, train test split

In [2]:
df = pd.read_csv('../data/athletes_clean.csv')

In [3]:
df.shape

(11251, 12)

In [4]:
df.columns

Index(['id', 'name', 'nationality', 'sex', 'dob', 'height', 'weight', 'sport',
       'gold', 'silver', 'bronze', 'age'],
      dtype='object')

In [5]:
# setting up X and y

features_df = df.drop(columns = ['id', 'name', 'dob', 'sport'])
cols_to_dummy = ['nationality', 'sex']
features_df = pd.get_dummies(features_df, columns = cols_to_dummy, drop_first = True)
X = features_df

y = df['sport']

In [6]:
X.shape

(11251, 213)

In [7]:
y.value_counts(normalize = True)

athletics            0.210026
aquatics             0.128433
football             0.054306
rowing               0.048618
cycling              0.046663
hockey               0.038397
judo                 0.034841
shooting             0.034664
volleyball           0.034130
sailing              0.033686
handball             0.032264
wrestling            0.031375
canoe                0.029420
gymnastics           0.028797
rugby sevens         0.026664
basketball           0.025598
weightlifting        0.022931
fencing              0.021865
equestrian           0.019732
tennis               0.017421
badminton            0.015288
table tennis         0.015288
taekwondo            0.011377
archery              0.011377
golf                 0.010666
triathlon            0.009777
modern pentathlon    0.006399
Name: sport, dtype: float64

In [10]:
# Create training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y,
                                                   stratify = y,
                                                   random_state = 5)

# Baseline 

We'll just go ahead and say our baseline model is guessing that an athlete's sport is the one with the most athletes in our dataset, which is track. This would give us a baseline accuracy of 21%

# KNN

In [10]:
# instantiate knn model

knn = KNeighborsClassifier(n_neighbors = 29)
cross_val_score(knn, X_train, y_train, cv = 5).mean()

0.2516006146809867

# Decision Tree

In [13]:
dt = DecisionTreeClassifier(random_state = 5)

In [14]:
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=5, splitter='best')

In [15]:
# Evaluate model.

print(f'Score on training set: {dt.score(X_train, y_train)}')
print(f'Score on testing set: {dt.score(X_test, y_test)}')

Score on training set: 0.9977482815833136
Score on testing set: 0.27657305367934587


In [17]:
params = {
    'max_depth' : [2,3,5,7],
    'min_samples_split' : [5, 10, 15, 20],
    'min_samples_leaf' : [2, 3, 4, 5, 6],
    'ccp_alpha' : [0, 0.001, 0.01, 1, 10]
}

gs = GridSearchCV( DecisionTreeClassifier(random_state = 5),
                    params, 
                    cv = 5)



In [18]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=5, splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'ccp_alpha': [0, 0.001, 0.01, 1, 10],
                         'max_depth': [2, 3, 5, 7],
            

In [19]:
gs.best_score_

0.2548004393789137

In [20]:
gs.best_params_

{'ccp_alpha': 0,
 'max_depth': 7,
 'min_samples_leaf': 2,
 'min_samples_split': 15}

# Random Forest and Extra Trees

In [8]:
rf = RandomForestClassifier()
et = ExtraTreesClassifier()

In [11]:
cross_val_score(rf, X_train, y_train, cv=5).mean()

0.3967777709105313

In [12]:
cross_val_score(et, X_train, y_train, cv=5).mean()

0.38930861030967223

In [13]:
X_train.shape

(8438, 213)

In [14]:
rf = RandomForestClassifier(random_state = 5)
params = {
    'n_estimators' : [50, 100, 150],
    'max_features' : [None, 'auto'],
    'max_depth' : [None, 2, 4, 6, 10]
}
gs = GridSearchCV(rf, param_grid = params, cv = 5)
gs.fit(X_train, y_train)
print(gs.best_score_) # cross val score
gs.best_params_

0.4013984835246954


{'max_depth': None, 'max_features': 'auto', 'n_estimators': 100}

In [15]:
gs_model = gs.best_estimator_

In [16]:
gs_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=5, verbose=0,
                       warm_start=False)

In [17]:
gs_model.score(X_train, y_train)

0.9977482815833136

In [18]:
gs_model.score(X_test, y_test)

0.4088162104514753

Hey not bad! Almost twice as good as randomly guessing track.