In [1]:
# Import our dependencies
import pandas as pd
from path import Path
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
# Keras
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense

In [2]:
# Import our clean dataset
file_path = Path("../clean_data/combined_olympic_data.csv")
df = pd.read_csv(file_path)
df.head(10)

Unnamed: 0,country_name,population,gdp_per_capita,human_development_index,gender_inequality_index,corruption_perceptions_index,total
0,Afghanistan,33370794,613.856689,0.465,0.676,12,1
1,Albania,2889104,4578.631994,0.733,0.267,33,0
2,United Arab Emirates,9214175,43751.83889,0.835,0.247,70,0
3,Argentina,42669500,12334.79825,0.836,0.364,34,4
4,Armenia,2912403,3986.231624,0.733,0.299,37,2
5,Australia,23475686,62510.79117,0.935,0.123,80,38
6,Austria,8546356,51717.49594,0.885,0.083,72,17
7,Azerbaijan,9535079,7891.313147,0.751,0.33,29,9
8,Burundi,9844297,274.857948,0.4,0.483,20,0
9,Belgium,11209057,47700.54036,0.89,0.076,76,3


In [3]:
df["total"].value_counts()

0      75
1      18
2       8
3       6
4       5
6       5
8       4
17      3
5       3
9       3
12      3
13      3
7       2
15      2
18      2
38      2
100     1
19      1
16      1
21      1
23      1
31      1
36      1
43      1
44      1
46      1
50      1
63      1
70      1
97      1
132     1
Name: total, dtype: int64

In [4]:
bin_labels = ['0', '1-5', '6-10', '11-30', '31-50', '51+']
num_bins = len(bin_labels)

df['medal_grouping'] = pd.cut(df['total'],
                              bins=[-1, 0, 5, 10, 30, 50, 140],
                              labels=bin_labels)
df['numerical_medal_grouping'] = pd.cut(df['total'],
                              bins=[-1, 0, 5, 10, 30, 50, 140],
                              labels=range(num_bins))

In [5]:
df['medal_grouping'].value_counts()

0        75
1-5      40
11-30    17
6-10     14
31-50     8
51+       5
Name: medal_grouping, dtype: int64

In [6]:
new_df = df.rename(columns={"total": "count_of_medals"})
new_df.head()

Unnamed: 0,country_name,population,gdp_per_capita,human_development_index,gender_inequality_index,corruption_perceptions_index,count_of_medals,medal_grouping,numerical_medal_grouping
0,Afghanistan,33370794,613.856689,0.465,0.676,12,1,1-5,1
1,Albania,2889104,4578.631994,0.733,0.267,33,0,0,0
2,United Arab Emirates,9214175,43751.83889,0.835,0.247,70,0,0,0
3,Argentina,42669500,12334.79825,0.836,0.364,34,4,1-5,1
4,Armenia,2912403,3986.231624,0.733,0.299,37,2,1-5,1


### Split our preprocessed data into our features and target arrays

In [7]:
# Create our target
y = new_df["numerical_medal_grouping"].values
y[:20]

[1, 0, 0, 1, 1, ..., 1, 0, 3, 0, 3]
Length: 20
Categories (6, int64): [0 < 1 < 2 < 3 < 4 < 5]

In [8]:
# Optimizing and transforming features
# Transforming CPI to values between 0-1
new_df["corruption_perceptions_index"] = new_df["corruption_perceptions_index"] / 100
# Transforming GII to invert values. Higher values will now correspond to more gender equality.
new_df["gender_inequality_index"] = 1 - new_df["gender_inequality_index"]
# Replacing GDP per capita with total GDP
new_df["gdp_total"] = new_df["population"] * new_df["gdp_per_capita"]
new_df.head()

Unnamed: 0,country_name,population,gdp_per_capita,human_development_index,gender_inequality_index,corruption_perceptions_index,count_of_medals,medal_grouping,numerical_medal_grouping,gdp_total
0,Afghanistan,33370794,613.856689,0.465,0.324,0.12,1,1-5,1,20484890000.0
1,Albania,2889104,4578.631994,0.733,0.733,0.33,0,0,0,13228140000.0
2,United Arab Emirates,9214175,43751.83889,0.835,0.753,0.7,0,0,0,403137100000.0
3,Argentina,42669500,12334.79825,0.836,0.636,0.34,4,1-5,1,526319700000.0
4,Armenia,2912403,3986.231624,0.733,0.701,0.37,2,1-5,1,11609510000.0


In [9]:
# Create our features
X = new_df.drop(["count_of_medals", "medal_grouping", "numerical_medal_grouping"], axis=1)
X

Unnamed: 0,country_name,population,gdp_per_capita,human_development_index,gender_inequality_index,corruption_perceptions_index,gdp_total
0,Afghanistan,33370794,613.856689,0.465,0.324,0.12,2.048489e+10
1,Albania,2889104,4578.631994,0.733,0.733,0.33,1.322814e+10
2,United Arab Emirates,9214175,43751.838890,0.835,0.753,0.70,4.031371e+11
3,Argentina,42669500,12334.798250,0.836,0.636,0.34,5.263197e+11
4,Armenia,2912403,3986.231624,0.733,0.701,0.37,1.160951e+10
...,...,...,...,...,...,...,...
154,Kosovo,1812771,4080.330717,0.733,0.733,0.33,7.396705e+09
155,Yemen,25823485,1673.146354,0.498,0.243,0.19,4.320647e+10
156,South Africa,54545991,6433.187277,0.666,0.600,0.44,3.509046e+11
157,Zambia,15399753,1763.057298,0.586,0.459,0.38,2.715065e+10


In [10]:
# Splitting data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=15)
X_train.shape
X_test.shape
y_train.shape
y_test.shape
print (X_train.shape, X_test.shape, y_train.shape, y_test.shape, new_df.shape)

(119, 7) (40, 7) (119,) (40,) (159, 10)


In [11]:
#Create a StandardScaler instances
scaler = MinMaxScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train.drop(["country_name"], axis = 1))

# Scale the data
X_train_scaled = X_scaler.transform(X_train.drop(["country_name"], axis = 1))
X_test_scaled = X_scaler.transform(X_test.drop(["country_name"], axis = 1))


In [12]:
X_test

Unnamed: 0,country_name,population,gdp_per_capita,human_development_index,gender_inequality_index,corruption_perceptions_index,gdp_total
157,Zambia,15399753,1763.057298,0.586,0.459,0.38,27150650000.0
63,Iran,77465753,5585.526758,0.766,0.486,0.27,432687000000.0
40,Algeria,38923687,5493.02559,0.736,0.571,0.36,213808800000.0
31,Congo Republic,4736974,2996.918026,0.591,0.406,0.23,14196320000.0
117,Portugal,10401062,22074.30076,0.83,0.897,0.63,229596200000.0
74,Cambodia,15274503,1093.496191,0.555,0.518,0.21,16702610000.0
130,Sao Tome and Principe,195727,1782.798059,0.555,0.473,0.42,348941700.0
142,Trinidad and Tobago,1362342,20270.85937,0.772,0.659,0.38,27615840000.0
111,Peru,30090359,6672.880255,0.734,0.611,0.38,200789400000.0
25,Switzerland,8188649,86605.56338,0.93,0.955,0.86,709182600000.0


In [13]:
# Saving values for result comparison later on
y_test_values = y_test

In [14]:
# We need to convert our target labels (expected values) to categorical data
num_classes = num_bins
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)
# Original label of `1` is one-hot encoded as `0100000000`
y_train[0]

array([0., 1., 0., 0., 0., 0.], dtype=float32)

In [15]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.

number_input_features = len(X_train.drop(["country_name"],axis=1).columns)
hidden_nodes_layer1 = 3 * number_input_features
hidden_nodes_layer2 = 2 * number_input_features
hidden_nodes_layer3 = 2 * number_input_features

nn = Sequential()
number_input_features

6

In [16]:
# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=num_classes, activation="softmax"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 18)                126       
_________________________________________________________________
dense_1 (Dense)              (None, 12)                228       
_________________________________________________________________
dense_2 (Dense)              (None, 12)                156       
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 78        
Total params: 588
Trainable params: 588
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Compile the model
nn.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['accuracy'])

In [18]:
# Train the model
X_train_scaled.reshape(-1,1)
y_train.reshape(-1,1)
fit_model = nn.fit(X_train_scaled, y_train, epochs=150, shuffle=True)

Train on 119 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150


Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


In [19]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

40/1 - 0s - loss: 1.0074 - accuracy: 0.6000
Loss: 1.1386943459510803, Accuracy: 0.6000000238418579


In [20]:
# Make predictions with scaled test data
y_test_pred = nn.predict(X_test_scaled)

In [21]:
y_test

array([[1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1.

In [22]:
y_test_pred.round(1)

array([[0.8, 0.2, 0. , 0. , 0. , 0. ],
       [0.4, 0.4, 0.1, 0.1, 0. , 0. ],
       [0.5, 0.4, 0.1, 0.1, 0. , 0. ],
       [0.8, 0.2, 0. , 0. , 0. , 0. ],
       [0.2, 0.4, 0.2, 0.2, 0. , 0. ],
       [0.7, 0.2, 0. , 0. , 0. , 0. ],
       [0.8, 0.1, 0. , 0. , 0. , 0. ],
       [0.3, 0.4, 0.1, 0.1, 0. , 0. ],
       [0.5, 0.4, 0.1, 0.1, 0. , 0. ],
       [0. , 0.1, 0.1, 0.1, 0.6, 0.1],
       [0.1, 0.3, 0.2, 0.3, 0.1, 0. ],
       [0.6, 0.3, 0.1, 0. , 0. , 0. ],
       [0.3, 0.4, 0.2, 0.2, 0. , 0. ],
       [0.5, 0.4, 0.1, 0.1, 0. , 0. ],
       [0.5, 0.4, 0.1, 0.1, 0. , 0. ],
       [0.5, 0.4, 0.1, 0.1, 0. , 0. ],
       [0.7, 0.2, 0. , 0. , 0. , 0. ],
       [0.2, 0.5, 0.1, 0.1, 0. , 0. ],
       [0.5, 0.3, 0.1, 0.1, 0. , 0. ],
       [0.1, 0.2, 0.2, 0.3, 0.1, 0. ],
       [0. , 0. , 0. , 0. , 0.2, 0.7],
       [0.1, 0.3, 0.2, 0.3, 0.1, 0. ],
       [0.8, 0.2, 0. , 0. , 0. , 0. ],
       [0.2, 0.5, 0.1, 0.1, 0. , 0. ],
       [0.5, 0.3, 0.1, 0.1, 0. , 0. ],
       [0.5, 0.4, 0.1, 0.

In [23]:
nn.predict_classes(X_test_scaled)

array([0, 1, 0, 0, 1, 0, 0, 1, 0, 4, 1, 0, 1, 0, 0, 0, 0, 1, 0, 3, 5, 3,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 4, 4, 0, 4])

In [24]:
results_df = pd.DataFrame(y_test_values)
y_pred = nn.predict_classes(X_test_scaled)
# results_df['model_results'] = nn.predict_classes(X_test_scaled)
# results_df = nn.predict_classes(X_test_scaled)
# results_df.head(50)

### Saving the model

In [25]:
# # Export our model to HDF5 file
# nn.save("nn_trained_categories_model_.h5")

In [26]:
# new_df.reset_index(inplace=True)
new_results_df["country_name"] =  X_test["country_name"]
new_results_df["predictions"] = y_pred
new_results_df

# new_results_df = new_df.join(results_df).head(50)
# pd.concat([new_df, results_df], axis=1)

NameError: name 'new_results_df' is not defined

In [27]:
new_results_df = new_results_df.dropna()

In [28]:
new_results_df.drop(["population","gdp_per_capita","human_development_index","gender_inequality_index","corruption_perceptions_index","gdp_total",], axis=1)

# new_results_df = new_results_df[["country_name",0,"model_results"]]

Unnamed: 0,country_name,count_of_medals,medal_grouping,numerical_medal_grouping,0,model_results
0,Afghanistan,1,1-5,1,0,0.0
1,Albania,0,0,0,3,1.0
2,United Arab Emirates,0,0,0,1,1.0
3,Argentina,4,1-5,1,0,0.0
4,Armenia,2,1-5,1,1,1.0
5,Australia,38,31-50,4,0,0.0
6,Austria,17,11-30,3,0,0.0
7,Azerbaijan,9,6-10,2,1,1.0
8,Burundi,0,0,0,0,1.0
9,Belgium,3,1-5,1,3,4.0


In [28]:
new_results_df.set_index("country_name",drop=True)

Unnamed: 0_level_0,0,model_results
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,0,0.0
Albania,3,0.0
United Arab Emirates,1,0.0
Argentina,0,0.0
Armenia,1,1.0
Australia,0,0.0
Austria,0,0.0
Azerbaijan,1,1.0
Burundi,0,0.0
Belgium,3,4.0


In [29]:
# new_results_df.to_csv("nn_model_results.csv")