In [1]:
import pandas as pd
import numpy as np

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from matplotlib import pyplot as plt
import seaborn as sns

import io
from scipy import misc

%matplotlib inline

In [2]:
# Load the raw dataset from the CSV file in the working directory.
source_csv = 'athlete_events.csv'
data = pd.read_csv(source_csv)

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,ID,Age,Height,Weight,Year
count,271116.0,261642.0,210945.0,208241.0,271116.0
mean,68248.954396,25.556898,175.33897,70.702393,1978.37848
std,39022.286345,6.393561,10.518462,14.34802,29.877632
min,1.0,10.0,127.0,25.0,1896.0
25%,34643.0,21.0,168.0,60.0,1960.0
50%,68205.0,24.0,175.0,70.0,1988.0
75%,102097.25,28.0,183.0,79.0,2002.0
max,135571.0,97.0,226.0,214.0,2016.0


# Convert the Medal column to a binary label: 1 for any medal, 0 for no medal

In [4]:
# Collapse the three medal types into a single positive label (1).
is_medalist = data['Medal'].isin(['Gold', 'Bronze', 'Silver'])
data.loc[is_medalist, 'Medal'] = 1

In [5]:
# Rows with a missing Medal value become the negative label (0).
no_medal = data['Medal'].isna()
data.loc[no_medal, 'Medal'] = 0

In [6]:
# Preview the first rows. Indexing with `data.notnull()` only re-masks cells
# that are already missing, so it showed the same rows while copying the
# entire frame; a plain head() is equivalent and cheaper.
data.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,0
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,0
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,0
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,1
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,0


In [7]:
# Remove the columns that are not used as model inputs or as the target.
unused_columns = ['ID', 'Name', 'Games', 'Year', 'Season', 'NOC', 'City']
data = data.drop(columns=unused_columns)

In [8]:
# Preview only: the result of dropna() is not assigned, so `data` itself still
# contains rows with missing values after this cell. Show a bounded sample
# instead of rendering the whole filtered frame.
data.dropna().head(10)

Unnamed: 0,Sex,Age,Height,Weight,Team,Sport,Event,Medal
0,M,24.0,180.0,80.0,China,Basketball,Basketball Men's Basketball,0
1,M,23.0,170.0,60.0,China,Judo,Judo Men's Extra-Lightweight,0
4,F,21.0,185.0,82.0,Netherlands,Speed Skating,Speed Skating Women's 500 metres,0
5,F,21.0,185.0,82.0,Netherlands,Speed Skating,"Speed Skating Women's 1,000 metres",0
6,F,25.0,185.0,82.0,Netherlands,Speed Skating,Speed Skating Women's 500 metres,0
7,F,25.0,185.0,82.0,Netherlands,Speed Skating,"Speed Skating Women's 1,000 metres",0
8,F,27.0,185.0,82.0,Netherlands,Speed Skating,Speed Skating Women's 500 metres,0
9,F,27.0,185.0,82.0,Netherlands,Speed Skating,"Speed Skating Women's 1,000 metres",0
10,M,31.0,188.0,75.0,United States,Cross Country Skiing,Cross Country Skiing Men's 10 kilometres,0
11,M,31.0,188.0,75.0,United States,Cross Country Skiing,Cross Country Skiing Men's 50 kilometres,0


In [9]:
# Non-null count per column among the rows labelled as medal winners.
print(data.loc[data['Medal'] == 1].count())

Sex       39783
Age       39051
Height    31072
Weight    30456
Team      39783
Sport     39783
Event     39783
Medal     39783
dtype: int64


In [10]:
# Keep only rows where Weight, Height and Age are all present.
data = data.dropna(subset=['Weight', 'Height', 'Age'])

In [11]:
# Non-null count per column among the rows labelled as non-medal.
print(data.loc[data['Medal'] == 0].count())

Sex       175984
Age       175984
Height    175984
Weight    175984
Team      175984
Sport     175984
Event     175984
Medal     175984
dtype: int64


In [12]:
# Separate the two classes so the majority class can be down-sampled.
medal_label = data['Medal']
dataPositive, dataNegative = data[medal_label == 1], data[medal_label == 0]

In [13]:
# Down-sample the non-medal rows to the size of the medal class so the two
# classes are balanced. The size is derived from dataPositive instead of the
# hard-coded 30181 (presumably that count for this dataset; confirm), and
# random_state makes the draw repeatable across runs.
dataNegative = dataNegative.sample(n=len(dataPositive), random_state=42)

In [28]:
# Sanity check: non-null count per column of the down-sampled negative class.
print(dataNegative.count())

Sex       30181
Age       30181
Height    30181
Weight    30181
Team      30181
Sport     30181
Event     30181
Medal     30181
dtype: int64


In [15]:
# Stack both classes into one frame and renumber the rows from 0.
balanced_parts = [dataPositive, dataNegative]
finalData = pd.concat(balanced_parts, ignore_index=True)

In [27]:
# Non-null count per column of the negative rows in the combined frame.
print(finalData.loc[finalData['Medal'] == 0].count())

Sex       30181
Age       30181
Height    30181
Weight    30181
Team      30181
Sport     30181
Event     30181
Medal     30181
dtype: int64


In [26]:
# Preview the combined frame.
# NOTE(review): this cell's execution count (26) is higher than that of the
# shuffle (18) and label-encoding (19) cells that appear below it, and the
# recorded output shows integer-coded categories with a shuffled index. A fresh
# top-to-bottom run will instead show the un-encoded, un-shuffled rows here.
finalData.head()

Unnamed: 0,Sex,Age,Height,Weight,Team,Sport,Event,Medal
6948,1,19.0,176.0,76.0,143,3,41,1
52924,0,19.0,169.0,58.0,27,0,7,0
48305,1,22.0,175.0,69.0,162,22,278,0
1979,1,28.0,188.0,86.0,102,25,300,1
5199,0,24.0,159.0,42.0,135,3,88,1


In [18]:
# Shuffle all rows so the two classes are interleaved (frac=1 keeps every row);
# a fixed random_state makes the order repeatable across runs.
finalData = finalData.sample(frac=1, random_state=42)

In [29]:
# Write the balanced frame to finalData.csv with a header row and no index
# column. to_csv returns None when given a path, so its result is not bound.
# NOTE(review): the recorded execution count (29) shows this cell last ran
# after the label-encoding cell; in top-to-bottom order it runs before it, so
# the file then contains the un-encoded values. Confirm which one is intended.
finalData.to_csv('finalData.csv', index=False, header=True)


In [19]:
from sklearn import preprocessing

# Replace every object-dtype column with integer codes so the decision tree can
# consume it. The previous check `dtype == type(object)` compared against
# `type` (the metaclass) and only matched because NumPy coerces arbitrary
# classes to the object dtype; test for the object dtype explicitly instead.
encoders = {}  # column name -> fitted LabelEncoder, kept so codes can be mapped back
for column in finalData.columns:
    if pd.api.types.is_object_dtype(finalData[column]):
        le = preprocessing.LabelEncoder()
        finalData[column] = le.fit_transform(finalData[column])
        encoders[column] = le

# The data is now preprocessed; next, train a decision tree classifier

In [20]:
# Hold out 20% of the rows for evaluation; a fixed random_state keeps the
# partition (and therefore the reported accuracy) repeatable across runs.
train, test = train_test_split(finalData, test_size=0.2, random_state=42)

In [21]:
# Decision tree that only splits nodes holding at least 15 samples; the fixed
# random_state makes the fitted tree repeatable across runs.
c = DecisionTreeClassifier(min_samples_split=15, random_state=42)
# Columns used as model inputs (every remaining column except 'Medal').
features = ["Sex", "Age", "Height", "Weight", "Team", "Sport", "Event"]

In [22]:
# Split each partition into its input columns and the 'Medal' target.
X_train, y_train = train[features], train['Medal']
X_test, y_test = test[features], test['Medal']

In [23]:
# Fit the classifier on the training partition; `dt` holds what fit() returns.
dt = c.fit(X=X_train, y=y_train)

In [24]:
# Predict the label for every row of the held-out partition.
y_pred = c.predict(X=X_test)

In [25]:
from sklearn.metrics import accuracy_score

# Accuracy on the held-out rows, expressed as a percentage with one decimal.
score = 100 * accuracy_score(y_test, y_pred)
print(round(score, 1))

70.6
