In [1]:
# Import dependencies
import pandas as pd
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf

# Load the Olympic sample data from the local archive folder (the path is relative
# to the notebook's working directory) and preview the first five rows.
olympic_df = pd.read_csv(filepath_or_buffer="./archive/SampleData.csv")
olympic_df.head(n=5)

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1188,"Benjamin Alexandro ""Ben"" Agosto",M,28,178,75,United States-2,USA,2010 Winter,2010,Winter,Vancouver,Figure Skating,Figure Skating Mixed Ice Dancing,Gold
1,2773,"Michael Charles ""Mike"" Aljoe",M,23,185,100,United States-1,USA,1988 Winter,1988,Winter,Calgary,Bobsleigh,Bobsleigh Men's Two,Gold
2,7068,Jill Bakken (-Linder),F,25,167,65,United States-2,USA,2002 Winter,2002,Winter,Salt Lake City,Bobsleigh,Bobsleigh Women's Two,Gold
3,12170,Dain J. Blanton,M,28,190,92,United States-1,USA,2000 Summer,2000,Summer,Sydney,Beach Volleyball,Beach Volleyball Men's Beach Volleyball,Gold
4,6312,Emily Dianne Azevedo,F,26,172,80,United States-3,USA,2010 Winter,2010,Winter,Vancouver,Bobsleigh,Bobsleigh Women's Two,Gold


In [2]:
# Keep the target in its own single-column DataFrame so it is still available
# after the "Medal" column is dropped from olympic_df.
medal_won = olympic_df.loc[:, ["Medal"]]

medal_won.head(n=5)

Unnamed: 0,Medal
0,Gold
1,Gold
2,Gold
3,Gold
4,Gold


In [3]:
# Encode the medal tiers as integer class labels in a single pass.
# One mapping replaces the former chain of three single-value replace() calls;
# any value not listed in the mapping is left untouched.
medal_codes = {"Gold": 0, "Silver": 1, "Bronze": 2}

# infer_objects() makes the object -> integer dtype conversion explicit; newer pandas
# releases deprecate the automatic downcasting that replace() used to perform.
medal_wonz = medal_won.replace(medal_codes).infer_objects()

# Preview only the first rows instead of rendering the whole frame
medal_wonz.head(10)

Unnamed: 0,Medal
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [4]:
# Drop identifier / descriptive columns and the target so only candidate features remain.
# Rebinding the name (instead of inplace=True) avoids mutating the loaded frame in place,
# and `columns=` states the axis explicitly.
non_feature_cols = ["ID", "Name", "Team", "NOC", "Games", "City", "Event", "Medal"]
olympic_df = olympic_df.drop(columns=non_feature_cols)
olympic_df.head()

Unnamed: 0,Sex,Age,Height,Weight,Year,Season,Sport
0,M,28,178,75,2010,Winter,Figure Skating
1,M,23,185,100,1988,Winter,Bobsleigh
2,F,25,167,65,2002,Winter,Bobsleigh
3,M,28,190,92,2000,Summer,Beach Volleyball
4,F,26,172,80,2010,Winter,Bobsleigh


In [5]:
# Count distinct values per column to gauge how many columns one-hot encoding will add
olympic_df.nunique()

Sex        2
Age       18
Height    20
Weight    29
Year      17
Season     2
Sport      7
dtype: int64

In [6]:
# Collect the names of all object-dtype columns; these are the ones to one-hot encode
olympic_cat = [column for column, dtype in olympic_df.dtypes.items() if dtype == "object"]
olympic_cat

['Sex', 'Season', 'Sport']

In [7]:
# Create a OneHotEncoder instance that returns a dense array.
# The dense-output flag was renamed from `sparse` to `sparse_output` in newer
# scikit-learn releases, so try the current spelling first and fall back to the old one.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list.
# Reusing olympic_df's index keeps the encoded rows aligned with their source rows
# for the index-based merge that follows.
encode_df = pd.DataFrame(
    enc.fit_transform(olympic_df[olympic_cat]),
    index=olympic_df.index,
)

# Add the encoded variable names to the dataframe.
# `get_feature_names` was superseded by `get_feature_names_out`; support both.
if hasattr(enc, "get_feature_names_out"):
    encode_df.columns = enc.get_feature_names_out(olympic_cat)
else:
    encode_df.columns = enc.get_feature_names(olympic_cat)
encode_df.head()

Unnamed: 0,Sex_F,Sex_M,Season_Summer,Season_Winter,Sport_Beach Volleyball,Sport_Bobsleigh,Sport_Canoeing,Sport_Figure Skating,Sport_Luge,Sport_Table Tennis,Sport_Tennis
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Merge encoded values and numbered medals and drop original columns.
# All joins are on the row index. `drop(columns=...)` replaces the positional axis
# argument (`drop(olympic_cat, 1)`), which recent pandas releases no longer accept.
olympic_df = (
    olympic_df
    .merge(encode_df, left_index=True, right_index=True)
    .drop(columns=olympic_cat)
    .merge(medal_wonz, left_index=True, right_index=True)
)
olympic_df.head()

Unnamed: 0,Age,Height,Weight,Year,Sex_F,Sex_M,Season_Summer,Season_Winter,Sport_Beach Volleyball,Sport_Bobsleigh,Sport_Canoeing,Sport_Figure Skating,Sport_Luge,Sport_Table Tennis,Sport_Tennis,Medal
0,28,178,75,2010,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
1,23,185,100,1988,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
2,25,167,65,2002,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
3,28,190,92,2000,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,26,172,80,2010,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0


In [9]:
# Split preprocessed data into features and target arrays
y = olympic_df["Medal"].values
# `columns=` replaces the positional axis argument, which recent pandas releases reject
X = olympic_df.drop(columns=["Medal"]).values

# Split the preprocessed data into training and testing dataset
# (the fixed random_state keeps the split reproducible between runs)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
# Standardize the features: the scaler is fitted on the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the same fitted scaler to both the training and the testing split
X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))