In [1]:
# Import our dependencies
import pandas as pd
from path import Path
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import tensorflow as tf

In [2]:
# Import our clean dataset
file_path = Path("clean_data/combined_olympic_data.csv")
df = pd.read_csv(file_path)
df.head(10)

Unnamed: 0,country_name,population,gdp_per_capita,human_development_index,gender_inequality_index,corruption_perceptions_index,total
0,Afghanistan,33370794,613.856689,0.465,0.676,12,1
1,Albania,2889104,4578.631994,0.733,0.267,33,0
2,United Arab Emirates,9214175,43751.83889,0.835,0.247,70,0
3,Argentina,42669500,12334.79825,0.836,0.364,34,4
4,Armenia,2912403,3986.231624,0.733,0.299,37,2
5,Australia,23475686,62510.79117,0.935,0.123,80,38
6,Austria,8546356,51717.49594,0.885,0.083,72,17
7,Azerbaijan,9535079,7891.313147,0.751,0.33,29,9
8,Burundi,9844297,274.857948,0.4,0.483,20,0
9,Belgium,11209057,47700.54036,0.89,0.076,76,3


In [3]:
df = df.set_index("country_name",drop=True)
df.head()

Unnamed: 0_level_0,population,gdp_per_capita,human_development_index,gender_inequality_index,corruption_perceptions_index,total
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,33370794,613.856689,0.465,0.676,12,1
Albania,2889104,4578.631994,0.733,0.267,33,0
United Arab Emirates,9214175,43751.83889,0.835,0.247,70,0
Argentina,42669500,12334.79825,0.836,0.364,34,4
Armenia,2912403,3986.231624,0.733,0.299,37,2


In [4]:
new_df = df.rename(columns={"total": "count_of_medals"})
new_df.head()

Unnamed: 0_level_0,population,gdp_per_capita,human_development_index,gender_inequality_index,corruption_perceptions_index,count_of_medals
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,33370794,613.856689,0.465,0.676,12,1
Albania,2889104,4578.631994,0.733,0.267,33,0
United Arab Emirates,9214175,43751.83889,0.835,0.247,70,0
Argentina,42669500,12334.79825,0.836,0.364,34,4
Armenia,2912403,3986.231624,0.733,0.299,37,2


### Split our preprocessed data into our features and target arrays

In [5]:
# Create our target
y = new_df["count_of_medals"].values

In [6]:
# Create our features
X = new_df.drop(["count_of_medals"], axis=1).values

In [7]:
# Splitting data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=15)

In [8]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.

number_input_features = len(X_train[0])
hidden_nodes_layer1 =  80
hidden_nodes_layer2 = 30

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 80)                480       
_________________________________________________________________
dense_1 (Dense)              (None, 30)                2430      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 31        
Total params: 2,941
Trainable params: 2,941
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Import checkpoint dependencies
import os
from tensorflow.keras.callbacks import ModelCheckpoint

# Define the checkpoint path and filenames
os.makedirs("checkpoints/",exist_ok=True)
checkpoint_path = "checkpoints/weights.{epoch:02d}.hdf5"

In [11]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [12]:
# Create a callback that saves the model's weights every 5 epochs
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq=1000)

In [13]:
# Train the model
fit_model = nn.fit(X_train,y_train,epochs=25,callbacks=[cp_callback])

Train on 119 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 00009: saving model to checkpoints/weights.09.hdf5
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 00017: saving model to checkpoints/weights.17.hdf5
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [14]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

40/1 - 0s - loss: -3.3782e+01 - accuracy: 0.0750
Loss: -15.590364837646485, Accuracy: 0.07500000298023224


### Saving the model

In [15]:
# Export our model to HDF5 file
nn.save("DL_trained_model.h5")