In [67]:
# Import our dependencies
import pandas as pd
from path import Path
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
# Keras
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense

In [68]:
# Import our clean dataset
file_path = Path("../clean_data/combined_olympic_data.csv")
df = pd.read_csv(file_path)
df.head(10)

Unnamed: 0,country_name,population,gdp_per_capita,human_development_index,gender_inequality_index,corruption_perceptions_index,total
0,Afghanistan,33370794,613.856689,0.465,0.676,12,1
1,Albania,2889104,4578.631994,0.733,0.267,33,0
2,United Arab Emirates,9214175,43751.83889,0.835,0.247,70,0
3,Argentina,42669500,12334.79825,0.836,0.364,34,4
4,Armenia,2912403,3986.231624,0.733,0.299,37,2
5,Australia,23475686,62510.79117,0.935,0.123,80,38
6,Austria,8546356,51717.49594,0.885,0.083,72,17
7,Azerbaijan,9535079,7891.313147,0.751,0.33,29,9
8,Burundi,9844297,274.857948,0.4,0.483,20,0
9,Belgium,11209057,47700.54036,0.89,0.076,76,3


In [69]:
df["total"].value_counts()

0      75
1      18
2       8
3       6
4       5
6       5
8       4
17      3
5       3
9       3
12      3
13      3
7       2
15      2
18      2
38      2
100     1
19      1
16      1
21      1
23      1
31      1
36      1
43      1
44      1
46      1
50      1
63      1
70      1
97      1
132     1
Name: total, dtype: int64

In [70]:
bin_labels = ['0', '1-5', '6-10', '11-30', '31-50', '51+']
num_bins = len(bin_labels)

df['medal_grouping'] = pd.cut(df['total'],
                              bins=[-1, 0, 5, 10, 30, 50, 140],
                              labels=bin_labels)
df['numerical_medal_grouping'] = pd.cut(df['total'],
                              bins=[-1, 0, 5, 10, 30, 50, 140],
                              labels=range(num_bins))

In [71]:
df['medal_grouping'].value_counts()

0        75
1-5      40
11-30    17
6-10     14
31-50     8
51+       5
Name: medal_grouping, dtype: int64

In [72]:
new_df = df.rename(columns={"total": "count_of_medals"})
new_df.head()

Unnamed: 0,country_name,population,gdp_per_capita,human_development_index,gender_inequality_index,corruption_perceptions_index,count_of_medals,medal_grouping,numerical_medal_grouping
0,Afghanistan,33370794,613.856689,0.465,0.676,12,1,1-5,1
1,Albania,2889104,4578.631994,0.733,0.267,33,0,0,0
2,United Arab Emirates,9214175,43751.83889,0.835,0.247,70,0,0,0
3,Argentina,42669500,12334.79825,0.836,0.364,34,4,1-5,1
4,Armenia,2912403,3986.231624,0.733,0.299,37,2,1-5,1


### Split our preprocessed data into our features and target arrays

In [73]:
# Create our target
y = new_df["numerical_medal_grouping"].values
y[:20]

[1, 0, 0, 1, 1, ..., 1, 0, 3, 0, 3]
Length: 20
Categories (6, int64): [0 < 1 < 2 < 3 < 4 < 5]

In [74]:
# Optimizing and transforming features
# Transforming CPI to values between 0-1
new_df["corruption_perceptions_index"] = new_df["corruption_perceptions_index"] / 100
# Transforming GII to invert values. Higher values will now correspond to more gender equality.
new_df["gender_inequality_index"] = 1 - new_df["gender_inequality_index"]
# Replacing GDP per capita with total GDP
new_df["gdp_total"] = new_df["population"] * new_df["gdp_per_capita"]
new_df.head()

Unnamed: 0,country_name,population,gdp_per_capita,human_development_index,gender_inequality_index,corruption_perceptions_index,count_of_medals,medal_grouping,numerical_medal_grouping,gdp_total
0,Afghanistan,33370794,613.856689,0.465,0.324,0.12,1,1-5,1,20484890000.0
1,Albania,2889104,4578.631994,0.733,0.733,0.33,0,0,0,13228140000.0
2,United Arab Emirates,9214175,43751.83889,0.835,0.753,0.7,0,0,0,403137100000.0
3,Argentina,42669500,12334.79825,0.836,0.636,0.34,4,1-5,1,526319700000.0
4,Armenia,2912403,3986.231624,0.733,0.701,0.37,2,1-5,1,11609510000.0


In [75]:
# Create our features
X = new_df.drop(["country_name","count_of_medals", "medal_grouping", "numerical_medal_grouping"], axis=1).values
X

array([[3.33707940e+07, 6.13856689e+02, 4.65000000e-01, 3.24000000e-01,
        1.20000000e-01, 2.04848851e+10],
       [2.88910400e+06, 4.57863199e+03, 7.33000000e-01, 7.33000000e-01,
        3.30000000e-01, 1.32281440e+10],
       [9.21417500e+06, 4.37518389e+04, 8.35000000e-01, 7.53000000e-01,
        7.00000000e-01, 4.03137100e+11],
       [4.26695000e+07, 1.23347982e+04, 8.36000000e-01, 6.36000000e-01,
        3.40000000e-01, 5.26319674e+11],
       [2.91240300e+06, 3.98623162e+03, 7.33000000e-01, 7.01000000e-01,
        3.70000000e-01, 1.16095129e+10],
       [2.34756860e+07, 6.25107912e+04, 9.35000000e-01, 8.77000000e-01,
        8.00000000e-01, 1.46748371e+12],
       [8.54635600e+06, 5.17174959e+04, 8.85000000e-01, 9.17000000e-01,
        7.20000000e-01, 4.41996132e+11],
       [9.53507900e+06, 7.89131315e+03, 7.51000000e-01, 6.70000000e-01,
        2.90000000e-01, 7.52442943e+10],
       [9.84429700e+06, 2.74857948e+02, 4.00000000e-01, 5.17000000e-01,
        2.00000000e-01, 

In [76]:
# Splitting data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=15)
X_train.shape
X_test.shape
y_train.shape
y_test.shape
print (X_train.shape, X_test.shape, y_train.shape, y_test.shape, new_df.shape)

(119, 6) (40, 6) (119,) (40,) (159, 10)


In [77]:
#Create a StandardScaler instances
scaler = MinMaxScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [78]:
# Saving values for result comparison later on
y_test_values = y_test

In [79]:
# We need to convert our target labels (expected values) to categorical data
num_classes = num_bins
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)
# Original label of `1` is one-hot encoded as `0100000000`
y_train[0]

array([0., 1., 0., 0., 0., 0.], dtype=float32)

In [80]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.

number_input_features = len(X_train[0])
hidden_nodes_layer1 = 3 * number_input_features
hidden_nodes_layer2 = 2 * number_input_features
hidden_nodes_layer3 = 2 * number_input_features

nn = Sequential()
number_input_features

6

In [81]:
# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=num_classes, activation="softmax"))

# Check the structure of the model
nn.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 18)                126       
_________________________________________________________________
dense_9 (Dense)              (None, 12)                228       
_________________________________________________________________
dense_10 (Dense)             (None, 12)                156       
_________________________________________________________________
dense_11 (Dense)             (None, 6)                 78        
Total params: 588
Trainable params: 588
Non-trainable params: 0
_________________________________________________________________


In [82]:
# Compile the model
nn.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['accuracy'])

In [83]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=150, shuffle=True)

Train on 119 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150


Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


In [84]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

40/1 - 0s - loss: 1.0160 - accuracy: 0.5500
Loss: 1.1417712211608886, Accuracy: 0.550000011920929


In [85]:
# Make predictions with scaled test data
y_test_pred = nn.predict(X_test_scaled)

In [86]:
y_test

array([[1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1.

In [87]:
y_test_pred.round(1)

array([[0.8, 0.2, 0. , 0. , 0. , 0. ],
       [0.4, 0.4, 0.1, 0.1, 0. , 0. ],
       [0.5, 0.4, 0.1, 0.1, 0. , 0. ],
       [0.7, 0.2, 0. , 0. , 0. , 0. ],
       [0.2, 0.4, 0.2, 0.2, 0. , 0. ],
       [0.7, 0.2, 0. , 0. , 0. , 0. ],
       [0.8, 0.2, 0. , 0. , 0. , 0. ],
       [0.4, 0.4, 0.1, 0.1, 0. , 0. ],
       [0.5, 0.4, 0.1, 0.1, 0. , 0. ],
       [0.1, 0.2, 0.1, 0.2, 0.4, 0.1],
       [0.1, 0.3, 0.2, 0.3, 0.2, 0. ],
       [0.6, 0.3, 0.1, 0. , 0. , 0. ],
       [0.2, 0.4, 0.2, 0.2, 0. , 0. ],
       [0.5, 0.4, 0.1, 0.1, 0. , 0. ],
       [0.4, 0.4, 0.1, 0.1, 0. , 0. ],
       [0.4, 0.4, 0.1, 0.1, 0. , 0. ],
       [0.7, 0.3, 0. , 0. , 0. , 0. ],
       [0.2, 0.4, 0.2, 0.1, 0. , 0. ],
       [0.5, 0.3, 0.1, 0.1, 0. , 0. ],
       [0.1, 0.3, 0.2, 0.3, 0.2, 0. ],
       [0. , 0. , 0. , 0.1, 0.2, 0.7],
       [0.1, 0.3, 0.2, 0.3, 0.2, 0. ],
       [0.7, 0.2, 0. , 0. , 0. , 0. ],
       [0.2, 0.4, 0.2, 0.2, 0. , 0. ],
       [0.6, 0.3, 0.1, 0. , 0. , 0. ],
       [0.5, 0.4, 0.1, 0.

In [88]:
nn.predict_classes(X_test_scaled)

array([0, 0, 0, 0, 1, 0, 0, 1, 0, 4, 3, 0, 1, 0, 1, 0, 0, 1, 0, 1, 5, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 4, 4, 0, 4])

In [89]:
results_df = pd.DataFrame(y_test_values)
results_df['model_results'] = nn.predict_classes(X_test_scaled)

results_df.head(50)

Unnamed: 0,0,model_results
0,0,0
1,3,0
2,1,0
3,0,0
4,1,1
5,0,0
6,0,0
7,1,1
8,0,0
9,3,4


### Saving the model

In [90]:
# # Export our model to HDF5 file
# nn.save("nn_trained_categories_model_.h5")

In [91]:
results_df = results_df.rename(columns={0: "numerical_medal_grouping"})
results_df


Unnamed: 0,numerical_medal_grouping,model_results
0,0,0
1,3,0
2,1,0
3,0,0
4,1,1
5,0,0
6,0,0
7,1,1
8,0,0
9,3,4


In [92]:
new_df

Unnamed: 0,country_name,population,gdp_per_capita,human_development_index,gender_inequality_index,corruption_perceptions_index,count_of_medals,medal_grouping,numerical_medal_grouping,gdp_total
0,Afghanistan,33370794,613.856689,0.465,0.324,0.12,1,1-5,1,2.048489e+10
1,Albania,2889104,4578.631994,0.733,0.733,0.33,0,0,0,1.322814e+10
2,United Arab Emirates,9214175,43751.838890,0.835,0.753,0.70,0,0,0,4.031371e+11
3,Argentina,42669500,12334.798250,0.836,0.636,0.34,4,1-5,1,5.263197e+11
4,Armenia,2912403,3986.231624,0.733,0.701,0.37,2,1-5,1,1.160951e+10
...,...,...,...,...,...,...,...,...,...,...
154,Kosovo,1812771,4080.330717,0.733,0.733,0.33,0,0,0,7.396705e+09
155,Yemen,25823485,1673.146354,0.498,0.243,0.19,0,0,0,4.320647e+10
156,South Africa,54545991,6433.187277,0.666,0.600,0.44,6,6-10,2,3.509046e+11
157,Zambia,15399753,1763.057298,0.586,0.459,0.38,0,0,0,2.715065e+10


In [93]:
#join using the numerical_medal_grouping columns is to use the on parameter.
# new_results_df = new_df.join(results_df.set_index('numerical_medal_grouping'), on='numerical_medal_grouping')

pd.merge(new_df, results_df, on='numerical_medal_grouping', how='outer')
new_results_df

Unnamed: 0,country_name,population,gdp_per_capita,human_development_index,gender_inequality_index,corruption_perceptions_index,count_of_medals,medal_grouping,numerical_medal_grouping,gdp_total,model_results
0,Afghanistan,33370794,613.856689,0.465,0.324,0.12,1,1-5,1,2.048489e+10,0
0,Afghanistan,33370794,613.856689,0.465,0.324,0.12,1,1-5,1,2.048489e+10,1
0,Afghanistan,33370794,613.856689,0.465,0.324,0.12,1,1-5,1,2.048489e+10,1
0,Afghanistan,33370794,613.856689,0.465,0.324,0.12,1,1-5,1,2.048489e+10,3
0,Afghanistan,33370794,613.856689,0.465,0.324,0.12,1,1-5,1,2.048489e+10,0
...,...,...,...,...,...,...,...,...,...,...,...
158,Zimbabwe,13586681,1434.899340,0.509,0.465,0.21,0,0,0,1.949552e+10,0
158,Zimbabwe,13586681,1434.899340,0.509,0.465,0.21,0,0,0,1.949552e+10,0
158,Zimbabwe,13586681,1434.899340,0.509,0.465,0.21,0,0,0,1.949552e+10,0
158,Zimbabwe,13586681,1434.899340,0.509,0.465,0.21,0,0,0,1.949552e+10,0


In [94]:
# # new_df.reset_index(inplace=True)
# new_results_df = new_df.join(results_df).head(50)
# new_results_df

# # pd.concat([new_df, results_df], axis=1)

In [95]:
# new_results_df = new_results_df.dropna()

In [96]:
new_results_df.drop(["population","gdp_per_capita","human_development_index","gender_inequality_index","corruption_perceptions_index","gdp_total",], axis=1)

# new_results_df = new_results_df[["country_name",0,"model_results"]]

Unnamed: 0,country_name,count_of_medals,medal_grouping,numerical_medal_grouping,model_results
0,Afghanistan,1,1-5,1,0
0,Afghanistan,1,1-5,1,1
0,Afghanistan,1,1-5,1,1
0,Afghanistan,1,1-5,1,3
0,Afghanistan,1,1-5,1,0
...,...,...,...,...,...
158,Zimbabwe,0,0,0,0
158,Zimbabwe,0,0,0,0
158,Zimbabwe,0,0,0,0
158,Zimbabwe,0,0,0,0


In [61]:
new_results_df = new_results_df.set_index("country_name",drop=True)
new_results_df

Unnamed: 0_level_0,population,gdp_per_capita,human_development_index,gender_inequality_index,corruption_perceptions_index,count_of_medals,medal_grouping,numerical_medal_grouping,gdp_total,model_results
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Afghanistan,33370794,613.856689,0.465,0.324,0.12,1,1-5,1,2.048489e+10,0
Afghanistan,33370794,613.856689,0.465,0.324,0.12,1,1-5,1,2.048489e+10,1
Afghanistan,33370794,613.856689,0.465,0.324,0.12,1,1-5,1,2.048489e+10,1
Afghanistan,33370794,613.856689,0.465,0.324,0.12,1,1-5,1,2.048489e+10,3
Afghanistan,33370794,613.856689,0.465,0.324,0.12,1,1-5,1,2.048489e+10,0
...,...,...,...,...,...,...,...,...,...,...
Zimbabwe,13586681,1434.899340,0.509,0.465,0.21,0,0,0,1.949552e+10,0
Zimbabwe,13586681,1434.899340,0.509,0.465,0.21,0,0,0,1.949552e+10,0
Zimbabwe,13586681,1434.899340,0.509,0.465,0.21,0,0,0,1.949552e+10,0
Zimbabwe,13586681,1434.899340,0.509,0.465,0.21,0,0,0,1.949552e+10,0


In [29]:
# new_results_df.to_csv("nn_model_results.csv")