In [1]:
# Import dependencies
import pandas as pd
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf

# Load the athlete/medal table from the CSV that sits next to the notebook
olympic_df = pd.read_csv(filepath_or_buffer="./cleandatabmi.csv")

# Preview the first five rows
olympic_df.head(5)

Unnamed: 0,Count,Name,Sex,Age,Height,Weight,Team,Games,Season,Event,Medal,BMI
0,1,Juhamatti Tapio Aaltonen,M,28,184,85,Finland,2014 Winter,Winter,Ice Hockey Men's Ice Hockey,Bronze,25
1,2,Paavo Johannes Aaltonen,M,28,175,64,Finland,1948 Summer,Summer,Gymnastics Men's Individual All-Around,Bronze,21
2,3,Paavo Johannes Aaltonen,M,28,175,64,Finland,1948 Summer,Summer,Gymnastics Men's Team All-Around,Gold,21
3,4,Paavo Johannes Aaltonen,M,28,175,64,Finland,1948 Summer,Summer,Gymnastics Men's Horse Vault,Gold,21
4,5,Paavo Johannes Aaltonen,M,28,175,64,Finland,1948 Summer,Summer,Gymnastics Men's Pommelled Horse,Gold,21


In [2]:
# Pull the target out into its own single-column DataFrame
medal_won = olympic_df.loc[:, ['Medal']]

# Preview the first five rows
medal_won.head(5)

Unnamed: 0,Medal
0,Bronze
1,Bronze
2,Gold
3,Gold
4,Gold


In [3]:
# Encode the medal names as integer class labels
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# Build a new frame rather than mutating medal_won; only the Medal column is replaced.
# The outputs further down show Bronze -> 0 and Gold -> 1 (Silver is presumably 2 — confirm via le.classes_).
medal_wonz = medal_won.assign(Medal=le.fit_transform(medal_won['Medal']))



In [5]:
# Strip everything that is not a numeric model input (identifiers, text columns and the target).
# NOTE: this mutates olympic_df in place, so running the cell twice raises a KeyError.
olympic_df.drop(
    columns=['Count', 'Name', "Sex", "Team", "Games", "Season", "Event", "Medal"],
    inplace=True,
)
olympic_df.head(5)

Unnamed: 0,Age,Height,Weight,BMI
0,28,184,85,25
1,28,175,64,21
2,28,175,64,21
3,28,175,64,21
4,28,175,64,21


In [6]:
# Count distinct values per column to judge which columns would need encoding
olympic_df.nunique()

Age        50
Height     86
Weight    129
BMI        37
dtype: int64

In [7]:
# Collect the names of all object-dtype (string) columns; these are the ones to one-hot encode
olympic_cat = [
    column for column, dtype in olympic_df.dtypes.items() if dtype == "object"
]
olympic_cat

[]

In [8]:
# Create a OneHotEncoder instance that returns a dense array.
# The constructor keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 (and `sparse` was later removed), so try the new spelling
# first and fall back for older installs.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

if olympic_cat:
    # Fit and transform the OneHotEncoder using the categorical variable list
    encoded_values = enc.fit_transform(olympic_df[olympic_cat])

    # get_feature_names() was replaced by get_feature_names_out() and removed in
    # scikit-learn 1.2; use whichever one this version provides.
    if hasattr(enc, "get_feature_names_out"):
        encoded_names = enc.get_feature_names_out(olympic_cat)
    else:
        encoded_names = enc.get_feature_names(olympic_cat)

    # Reuse olympic_df's index so the index-based merge that follows lines up row for row
    encode_df = pd.DataFrame(encoded_values, index=olympic_df.index, columns=encoded_names)
else:
    # No categorical columns are left to encode: skip fitting on an empty
    # selection and keep an empty, index-aligned frame so the merge is a no-op.
    encode_df = pd.DataFrame(index=olympic_df.index)


In [9]:
# Merge encoded values and numbered medals and drop original columns.
# Both merges join on the row index.
olympic_df = olympic_df.merge(encode_df, left_index=True, right_index=True)
# Name the axis by keyword: the positional form `drop(labels, 1)` was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
olympic_df = olympic_df.drop(columns=olympic_cat)

# Re-attach the integer-encoded Medal column as the target
olympic_df = olympic_df.merge(medal_wonz, left_index=True, right_index=True)
olympic_df.head()

Unnamed: 0,Age,Height,Weight,BMI,Medal
0,28,184,85,25,0
1,28,175,64,21,0
2,28,175,64,21,1
3,28,175,64,21,1
4,28,175,64,21,1


In [10]:
# Split preprocessed data into features and target arrays
y = olympic_df["Medal"].values
# Keep the feature DataFrame (Xy) so its column names can label the feature importances later.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
Xy = olympic_df.drop(columns=["Medal"])
X = Xy.values

# Split the preprocessed data into training and testing dataset (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)


In [11]:
# Standardise the features: the scaler is fitted on the training split only,
# then the same fitted transform is applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [15]:
# Define model
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  60
hidden_nodes_layer2 = 4

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="tanh"))

nn.add(tf.keras.layers.Dense(units=4, activation="tanh"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 60)                300       
_________________________________________________________________
dense_13 (Dense)             (None, 4)                 244       
_________________________________________________________________
dense_14 (Dense)             (None, 4)                 20        
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 5         
Total params: 569
Trainable params: 569
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Compile model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [17]:
# Fit the network on the scaled training split for 100 epochs; the call's return value is kept in fit_model
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [18]:
# Score the trained network on the held-out (scaled) test split
model_loss, model_accuracy = nn.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

236/236 - 0s - loss: 0.0502 - accuracy: 0.3369
Loss: 0.050221387296915054, Accuracy: 0.33686721324920654


# Random Forest


In [19]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [20]:
# Build the forest: 128 trees, seeded so results are repeatable
rf_model = RandomForestClassifier(random_state=7, n_estimators=128)

In [21]:
# Train the forest on the scaled training split (fit returns the estimator itself)
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [22]:
# Predict a medal class for every row of the scaled test split
predictions = rf_model.predict(X=X_test_scaled)

In [23]:
# Calculating the confusion matrix.
# Take the class list from the fitted model and pass it as `labels=` so the
# matrix always has one row/column per class, even if a class is missing from
# y_test or predictions, and the axis labels below always match its shape.
class_labels = rf_model.classes_
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Create a DataFrame from the confusion matrix (rows = actual, columns = predicted).
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,856,836,827
Actual 1,799,943,800
Actual 2,829,866,790


In [24]:
# Fraction of test rows whose predicted class equals the true class
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [25]:
# Report: confusion matrix (rich display), then accuracy and the per-class metrics
print("Confusion Matrix")
display(cm_df)
print(
    f"Accuracy Score : {acc_score}",
    "Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,856,836,827
Actual 1,799,943,800
Actual 2,829,866,790


Accuracy Score : 0.3430956798303737
Classification Report
              precision    recall  f1-score   support

           0       0.34      0.34      0.34      2519
           1       0.36      0.37      0.36      2542
           2       0.33      0.32      0.32      2485

    accuracy                           0.34      7546
   macro avg       0.34      0.34      0.34      7546
weighted avg       0.34      0.34      0.34      7546



In [26]:
# Read the per-feature importance scores off the fitted forest
# (one value per input column, in the same order as the training features)
importances = rf_model.feature_importances_
importances

array([0.39720664, 0.25407583, 0.26378261, 0.08493492])

In [27]:
# Pair each importance score with its column name and list them from most to least important
sorted(
    ((score, column) for score, column in zip(rf_model.feature_importances_, Xy.columns)),
    reverse=True,
)

[(0.39720663649503263, 'Age'),
 (0.2637826095112518, 'Weight'),
 (0.2540758326321868, 'Height'),
 (0.08493492136152862, 'BMI')]