In [2]:
# Import our dependencies
import pandas as pd
from path import Path
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
# Keras
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense
from tensorflow.keras.datasets import mnist

In [4]:
# Import our clean dataset
file_path = Path("clean_data/combined_olympic_data.csv")
df = pd.read_csv(file_path)
df.head(10)

Unnamed: 0,country_name,population,gdp_per_capita,human_development_index,gender_inequality_index,corruption_perceptions_index,total
0,Afghanistan,33370794,613.856689,0.465,0.676,12,1
1,Albania,2889104,4578.631994,0.733,0.267,33,0
2,United Arab Emirates,9214175,43751.83889,0.835,0.247,70,0
3,Argentina,42669500,12334.79825,0.836,0.364,34,4
4,Armenia,2912403,3986.231624,0.733,0.299,37,2
5,Australia,23475686,62510.79117,0.935,0.123,80,38
6,Austria,8546356,51717.49594,0.885,0.083,72,17
7,Azerbaijan,9535079,7891.313147,0.751,0.33,29,9
8,Burundi,9844297,274.857948,0.4,0.483,20,0
9,Belgium,11209057,47700.54036,0.89,0.076,76,3


In [3]:
df["total"].value_counts()

0      75
1      18
2       8
3       6
4       5
6       5
8       4
17      3
5       3
9       3
12      3
13      3
7       2
15      2
18      2
38      2
100     1
19      1
16      1
21      1
23      1
31      1
36      1
43      1
44      1
46      1
50      1
63      1
70      1
97      1
132     1
Name: total, dtype: int64

In [4]:
df = df.set_index("country_name",drop=True)
df.head()

Unnamed: 0_level_0,population,gdp_per_capita,human_development_index,gender_inequality_index,corruption_perceptions_index,total
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,33370794,613.856689,0.465,0.676,12,1
Albania,2889104,4578.631994,0.733,0.267,33,0
United Arab Emirates,9214175,43751.83889,0.835,0.247,70,0
Argentina,42669500,12334.79825,0.836,0.364,34,4
Armenia,2912403,3986.231624,0.733,0.299,37,2


In [5]:
new_df = df.rename(columns={"total": "count_of_medals"})
new_df.head()

Unnamed: 0_level_0,population,gdp_per_capita,human_development_index,gender_inequality_index,corruption_perceptions_index,count_of_medals
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,33370794,613.856689,0.465,0.676,12,1
Albania,2889104,4578.631994,0.733,0.267,33,0
United Arab Emirates,9214175,43751.83889,0.835,0.247,70,0
Argentina,42669500,12334.79825,0.836,0.364,34,4
Armenia,2912403,3986.231624,0.733,0.299,37,2


### Split our preprocessed data into our features and target arrays

In [6]:
# Create our target
y = new_df["count_of_medals"].values

In [7]:
# Optimizing and transforming features
# Transforming CPI to values between 0-1
new_df["corruption_perceptions_index"] = new_df["corruption_perceptions_index"] / 100
# Transforming GII to invert values. Higher values will now correspond to more gender equality.
new_df["gender_inequality_index"] = 1 - new_df["gender_inequality_index"]
# Replacing GDP per capita with total GDP
new_df["gdp_total"] = new_df["population"] * new_df["gdp_per_capita"]
# X = new_df.drop(["count_of_medals","population","gdp_per_capita", "human_development_index", "gender_inequality_index", "corruption_perceptions_index"], axis=1)

Unnamed: 0_level_0,human_development_index
country_name,Unnamed: 1_level_1
Afghanistan,0.465
Albania,0.733
United Arab Emirates,0.835
Argentina,0.836
Armenia,0.733


In [8]:
# Create our features
X = new_df.drop(["count_of_medals", "gdp_per_capita"], axis=1).values
X

In [10]:
# Create a StandardScaler instances
# scaler = MinMaxScaler()

# # Fit the StandardScaler
# X_scaler = scaler.fit(X_train)

# # Scale the data
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)



In [11]:
# Splitting data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=15)

In [12]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.

number_input_features = len(X_train[0])
hidden_nodes_layer1 = 2 * number_input_features
hidden_nodes_layer2 = 2 * number_input_features

nn = tf.keras.models.Sequential()
number_input_features 

1

In [22]:
# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# # Second hidden layer
#nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="linear"))
# After bucketing the medal count, you'd have to change this to have mulitiple outputs. 
# In other words, you'd need one node for each bucket. And then you'd change the activation to "softmax."
# You'd also have to one-hot encode your y values. 

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 2)                 4         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 6         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 3         
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 4         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 3         
Total params: 13
Trainable params: 13
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Import checkpoint dependencies
import os
from tensorflow.keras.callbacks import ModelCheckpoint

# Define the checkpoint path and filenames
os.makedirs("checkpoints/",exist_ok=True)
checkpoint_path = "checkpoints/weights.{epoch:02d}.hdf5"

In [24]:
# Compile the model
nn.compile(loss="mean_squared_error", optimizer="adam")

In [25]:
# Create a callback that saves the model's weights every 5 epochs
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq=1000)

In [26]:
# Train the model
fit_model = nn.fit(X_train,y_train,epochs=25,callbacks=[cp_callback])

Train on 119 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 00009: saving model to checkpoints/weights.09.hdf5
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 00017: saving model to checkpoints/weights.17.hdf5
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [27]:
# # Evaluate the model using the test data
# model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
# print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [28]:
y_train_pred = nn.predict(X_train)
y_test_pred = nn.predict(X_test)

In [29]:
from sklearn.metrics import r2_score
r2_score(y_train, y_train_pred)

-0.17097429222045113

In [30]:
r2_score(y_test, y_test_pred)

-0.18587038462969518

In [31]:
y_test

array([ 0, 13,  1,  0,  1,  0,  0,  4,  0, 15,  3, 12,  0,  0,  0,  5,  1,
        0,  6, 17,  6,  2,  0,  0,  0,  0, 18,  0,  0, 12,  0,  0,  0,  7,
        0,  0, 50, 38,  0, 97])

In [33]:
y_test_pred

array([[0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552],
       [0.09259552]], dtype=float32)

### Saving the model

In [None]:
# Export our model to HDF5 file
nn.save("DL_trained_model.h5")