# Data preparation

In [3]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px

# Athletes

In [4]:
# Load the athlete records, then report how many values are missing per column.
athletes_path = '../olympic_athletes.json'
dataset_athlete = pd.read_json(athletes_path)
dataset_athlete.head()
dataset_athlete.shape
dataset_athlete.isna().sum()

athlete_url                 0
athlete_full_name           0
games_participations        0
first_game                 22
athlete_year_birth       2456
athlete_medals          60552
bio                     53062
dtype: int64

# Hosts

In [5]:
# Load the host-city table and look at its layout.
hosts_path = '../olympic_hosts.xml'
dataset_hosts = pd.read_xml(hosts_path)
dataset_hosts.head()
dataset_hosts.shape
dataset_hosts.columns

# Remove the 'index' column, then confirm the remaining columns have no missing values.
dataset_hosts = dataset_hosts.drop(columns=['index'])
dataset_hosts.columns
dataset_hosts.isna().sum()

game_slug          0
game_end_date      0
game_start_date    0
game_location      0
game_name          0
game_season        0
game_year          0
dtype: int64

# Medals

In [6]:
# Load the medal table and take a first look at size, missing values and duplicates.
medals_path = '../olympic_medals.xlsx'
dataset_medals = pd.read_excel(medals_path)
dataset_medals.shape
dataset_medals.isna().sum()
dataset_medals.duplicated().sum()
dataset_medals.columns

# Use the same key name as the hosts table ('game_slug').
dataset_medals = dataset_medals.rename(columns={'slug_game': 'game_slug'})
dataset_medals.columns

# Inspect the fully duplicated rows and the events they belong to.
dataset_medals[dataset_medals.duplicated()]
is_polo = dataset_medals['discipline_title'] == 'Polo'
is_paris_1900 = dataset_medals['game_slug'] == 'paris-1900'
is_london_1908 = dataset_medals['game_slug'] == 'london-1908'
dataset_medals[is_paris_1900 & is_polo]
dataset_medals[is_london_1908 & is_polo]
is_hockey_men = (dataset_medals['discipline_title'] == 'Hockey') & (dataset_medals['event_title'] == 'hockey men')
dataset_medals[is_london_1908 & is_hockey_men]
dataset_medals['participant_title'].unique()

# Drop the columns that are removed before further processing and show the last rows.
unused_medal_columns = ['Unnamed: 0', 'participant_title', 'athlete_url']
dataset_medals = dataset_medals.drop(columns=unused_medal_columns)
dataset_medals.tail()

Unnamed: 0,discipline_title,game_slug,event_title,event_gender,medal_type,participant_type,athlete_full_name,country_name,country_code,country_3_letter_code
21692,Weightlifting,athens-1896,heavyweight - one hand lift men,Men,SILVER,Athlete,Viggo JENSEN,Denmark,DK,DEN
21693,Weightlifting,athens-1896,heavyweight - one hand lift men,Men,BRONZE,Athlete,Alexandros Nikolopoulos,Greece,GR,GRE
21694,Weightlifting,athens-1896,heavyweight - two hand lift men,Men,GOLD,Athlete,Viggo JENSEN,Denmark,DK,DEN
21695,Weightlifting,athens-1896,heavyweight - two hand lift men,Men,SILVER,Athlete,Launceston ELLIOT,Great Britain,GB,GBR
21696,Weightlifting,athens-1896,heavyweight - two hand lift men,Men,BRONZE,Athlete,Sotirios VERSIS,Greece,GR,GRE


# Results

In [7]:
# The results page holds a single table of interest: take the first one parsed from the HTML file.
results_tables = pd.read_html('../olympic_results.html')
dataset_results = results_tables[0]
dataset_results.head()
dataset_results.shape
dataset_results.isna().sum()
dataset_results.duplicated().sum()
dataset_results[dataset_results.duplicated()]
dataset_results['value_unit'].unique()
dataset_results['value_type'].unique()

# Drop the two unused columns and align the game key with the hosts table.
dataset_results = (
    dataset_results
    .drop(columns=['Unnamed: 0', 'athlete_url'])
    .rename(columns={'slug_game': 'game_slug'})
)
dataset_results.columns

Index(['discipline_title', 'event_title', 'game_slug', 'participant_type',
       'medal_type', 'athletes', 'rank_equal', 'rank_position', 'country_name',
       'country_code', 'country_3_letter_code', 'athlete_full_name',
       'value_unit', 'value_type'],
      dtype='object')

# Separate winter and summer games

In [8]:
# Attach season and year to every result row via the shared 'game_slug' key.
host_key_columns = ['game_slug', 'game_season', 'game_year']
jo_types = dataset_hosts.loc[:, host_key_columns]
merged_hosts_results = pd.merge(dataset_results, jo_types, on='game_slug')
merged_hosts_results.head()
merged_hosts_results.shape
merged_hosts_results['game_season'].unique()

array(['Winter', 'Summer'], dtype=object)

In [9]:
# Split the merged results by season; copies are taken so later edits do not touch the merged frame.
season = merged_hosts_results['game_season']
summer_games_results = merged_hosts_results[season.eq('Summer')].copy()
winter_games_results = merged_hosts_results[season.eq('Winter')].copy()

In [10]:
# 'game_season' is constant in the summer subset, so drop it.
# The frame is rebound and errors='ignore' is used instead of an in-place drop so that the cell
# can be re-run: the in-place version raised KeyError on a second execution because the
# column was already gone.
summer_games_results = summer_games_results.drop(columns=['game_season'], errors='ignore')
summer_games_results.head()
summer_games_results.shape
summer_games_results.isnull().sum()
summer_games_results.duplicated().sum()

# Rows that are exact copies of an earlier row.
# NOTE(review): these rows are only inspected here and never removed, so the per-country medal
# sums computed further down count any duplicated medal row more than once. Confirm whether
# drop_duplicates() should be applied before aggregating.
dopplers = summer_games_results.loc[summer_games_results.duplicated()]
dopplers

Unnamed: 0,discipline_title,event_title,game_slug,participant_type,medal_type,athletes,rank_equal,rank_position,country_name,country_code,country_3_letter_code,athlete_full_name,value_unit,value_type,game_year
154361,Water Polo,Water Polo Women,antwerp-1920,GameTeam,,,,1,Netherlands,NL,NED,,2,SCORE,1920
155336,Sailing,8m mixed,stockholm-1912,GameTeam,,,,5,Russian Federation,RU,RUS,,0,POINTS,1912
157014,Shooting,trap 125 targets men,stockholm-1912,Athlete,GOLD,,,1,United States of America,US,USA,Jay Graham,96,POINTS,1912
157015,Shooting,trap 125 targets men,stockholm-1912,Athlete,SILVER,,,2,Germany,DE,GER,Alfred GOELDEL,94,POINTS,1912
157016,Shooting,trap 125 targets men,stockholm-1912,Athlete,BRONZE,,,3,Russian Federation,RU,RUS,Harry Blaus,91,POINTS,1912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161736,Sailing,Â½-1 Ton Race Two Open,paris-1900,GameTeam,,"[('LETOT LETOT', 'https://olympics.com/en/athl...",,DNF,France,FR,FRA,,,IRM,1900
161737,Sailing,Â½-1 Ton Race Two Open,paris-1900,GameTeam,,"[('LETOT LETOT', 'https://olympics.com/en/athl...",,DNF,France,FR,FRA,,,IRM,1900
161738,Sailing,Â½-1 Ton Race Two Open,paris-1900,GameTeam,,"[('LETOT LETOT', 'https://olympics.com/en/athl...",,DNF,France,FR,FRA,,,IRM,1900
161739,Sailing,Â½-1 Ton Race Two Open,paris-1900,GameTeam,,"[('LETOT LETOT', 'https://olympics.com/en/athl...",,DNF,France,FR,FRA,,,IRM,1900


# Calculate the number of medals of each type for each Games, by country

In [11]:
summer_games_results['medal_type'].unique()

# Result rows without a medal have a missing medal_type; label them with the string 'None'
# and derive one 0/1 indicator column per medal colour plus one for "any medal".
summer_games_results['medal_type'] = summer_games_results['medal_type'].fillna('None')
medal_kind = summer_games_results['medal_type']
summer_games_results['total_medals'] = medal_kind.ne('None').astype('int64')
summer_games_results['gold_medals'] = medal_kind.eq('GOLD').astype('int64')
summer_games_results['silver_medals'] = medal_kind.eq('SILVER').astype('int64')
summer_games_results['bronze_medals'] = medal_kind.eq('BRONZE').astype('int64')
summer_games_results['country_name'].unique()

# Sum the indicators per (Games year, country); within a year, list the biggest medal winners first.
medal_count_columns = ['total_medals', 'gold_medals', 'silver_medals', 'bronze_medals']
historic_medalsbycountry = (
    summer_games_results
    .groupby(['game_year', 'country_name'])[medal_count_columns]
    .sum()
    .sort_values(by=['game_year', 'total_medals'], ascending=[True, False])
    .reset_index()
)
historic_medalsbycountry

Unnamed: 0,game_year,country_name,total_medals,gold_medals,silver_medals,bronze_medals
0,1896,Greece,47,10,18,19
1,1896,United States of America,20,11,7,2
2,1896,Germany,13,6,5,2
3,1896,France,11,5,4,2
4,1896,Great Britain,7,2,3,2
...,...,...,...,...,...,...
2884,2020,"Virgin Islands, British",0,0,0,0
2885,2020,"Virgin Islands, US",0,0,0,0
2886,2020,Yemen,0,0,0,0
2887,2020,Zambia,0,0,0,0


# Calculate the number of distinct disciplines and events for each Games, by country

In [12]:
def _distinct_count_by_country(results, value_column, count_name):
    """Count the distinct non-null values of `value_column` per (game_year, country_name).

    Returns a frame with the columns game_year, country_name and `count_name`,
    ordered by game_year and country_name.
    """
    group_keys = ['game_year', 'country_name']
    distinct_entries = results[group_keys + [value_column]].dropna().drop_duplicates()
    return distinct_entries.groupby(group_keys).size().reset_index(name=count_name)


# 'sports'   = number of different disciplines a country has result rows in at a given Games.
# 'epreuves' = number of different event titles a country has result rows in at a given Games.
historic_sportsbycountry = _distinct_count_by_country(summer_games_results, 'discipline_title', 'sports')
historic_epreuvesbycountry = _distinct_count_by_country(summer_games_results, 'event_title', 'epreuves')

# Add both counts to the medal tally, keeping the "year, then most medals first" ordering.
country_games_keys = ['game_year', 'country_name']
historic_olympic_data = (
    historic_medalsbycountry
    .merge(historic_sportsbycountry, on=country_games_keys)
    .merge(historic_epreuvesbycountry, on=country_games_keys)
    .sort_values(by=['game_year', 'total_medals'], ascending=[True, False])
)
historic_olympic_data

Unnamed: 0,game_year,country_name,total_medals,gold_medals,silver_medals,bronze_medals,sports,epreuves
0,1896,Greece,47,10,18,19,10,39
1,1896,United States of America,20,11,7,2,3,17
2,1896,Germany,13,6,5,2,7,27
3,1896,France,11,5,4,2,7,20
4,1896,Great Britain,7,2,3,2,9,20
...,...,...,...,...,...,...,...,...
2884,2020,"Virgin Islands, British",0,0,0,0,1,2
2885,2020,"Virgin Islands, US",0,0,0,0,1,1
2886,2020,Yemen,0,0,0,0,2,2
2887,2020,Zambia,0,0,0,0,3,5


In [13]:
# Per-country participation history, computed for every (country, Games) pair:
#   game_part        0-based position of this Games among the country's Summer Games rows
#   prec_game_*      medals the country won at its previous Games row (0 for its first one)
#
# This replaces a per-country loop that special-cased France, grew a frame with pd.concat on
# every iteration and silenced *all* warnings to hide SettingWithCopyWarning. The grouped
# cumcount/shift below yields the same columns without touching the global warning filters.
medal_count_columns = ['total_medals', 'gold_medals', 'silver_medals', 'bronze_medals']

# groupby sorts by its keys, so each country's rows come out in ascending game_year order,
# which is the order cumcount() and shift() rely on.
game_part = summer_games_results.groupby(['country_name', 'game_year']).agg(
    {column: 'sum' for column in medal_count_columns})
game_p = game_part.reset_index()

rows_by_country = game_p.groupby('country_name')
# The name game_p_france is kept for compatibility; the frame covers every country.
game_p_france = game_p.assign(
    game_part=rows_by_country.cumcount(),
    prec_game_medal=rows_by_country['total_medals'].shift(1, fill_value=0),
    prec_game_gold=rows_by_country['gold_medals'].shift(1, fill_value=0),
    prec_game_silver=rows_by_country['silver_medals'].shift(1, fill_value=0),
    prec_game_bronze=rows_by_country['bronze_medals'].shift(1, fill_value=0),
)

history_columns = ['game_part', 'prec_game_medal', 'prec_game_gold', 'prec_game_silver', 'prec_game_bronze']
game_p_france = game_p_france.sort_values(by=['game_year', 'total_medals'], ascending=[True, False])
game_p_france = game_p_france[['game_year', 'country_name'] + history_columns]

# Dropping any history columns left over from a previous run keeps the cell re-runnable;
# without it a second execution would create '_x' / '_y' suffixed duplicates.
historic_olympic_data = (
    historic_olympic_data
    .drop(columns=history_columns, errors='ignore')
    .merge(game_p_france, on=['game_year', 'country_name'])
    .sort_values(by=['game_year', 'total_medals'], ascending=[True, False])
)

# Save the data to ../csv/olympic_data_cleaned.csv

In [14]:
# Persist the cleaned feature table for the modelling section.
# DataFrame.to_csv does not create missing folders, so make sure the target directory exists.
cleaned_csv_path = Path('../csv/olympic_data_cleaned.csv')
cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)
historic_olympic_data.to_csv(cleaned_csv_path, index=False)

## Train Dense Neural Network

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from keras import models, layers
import joblib

In [21]:
# Seed Python, NumPy and TensorFlow so that weight initialisation and batch shuffling,
# and therefore the reported metrics, are the same on every Restart & Run All.
# NOTE(review): some GPU kernels can still be non-deterministic; confirm if exact
# reproducibility on GPU is required.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Load data
historic_olympic_data = pd.read_csv('../csv/olympic_data_cleaned.csv')

# Hold out the 2020 Games as the test set; train on every earlier Games
data_before_2020 = historic_olympic_data[historic_olympic_data['game_year'] < 2020]
data_2020 = historic_olympic_data[historic_olympic_data['game_year'] == 2020]

# Features and target variables
features = ['sports', 'epreuves', 'game_part', 'prec_game_medal', 'prec_game_gold', 'prec_game_silver', 'prec_game_bronze']
target = ['gold_medals', 'silver_medals', 'bronze_medals']

# Define the number of epochs and batch size
epochs = 130
batch_size = 32

# Split features and targets
X_train = data_before_2020[features]
y_train = data_before_2020[target]
X_test = data_2020[features]
y_test = data_2020[target]
countries_2020 = data_2020['country_name']

# Standardize the features (statistics are learned on the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build the dense neural network model.
# An explicit Input layer replaces the `input_shape=` argument on the first Dense layer,
# which recent Keras versions flag as deprecated.
model_dense = models.Sequential([
    layers.Input(shape=(X_train_scaled.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(3)  # Output layer for gold, silver, and bronze medals
])

# Compile the model
model_dense.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Train the model; validation_split holds out the last 20% of the training rows
history_dense = model_dense.fit(X_train_scaled, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=1)

# Evaluate the model with Keras; kept under its own name so it is not overwritten
# by the scikit-learn MAE computed below
loss_dense, keras_mae_dense = model_dense.evaluate(X_test_scaled, y_test, verbose=0)

# Make predictions
y_pred_dense = model_dense.predict(X_test_scaled)

# Calculate additional metrics
mse_dense = mean_squared_error(y_test, y_pred_dense)
mae_dense = mean_absolute_error(y_test, y_pred_dense)
r2_dense = r2_score(y_test, y_pred_dense)

print(f'Dense Neural Network - Mean Squared Error: {mse_dense}')
print(f'Dense Neural Network - Mean Absolute Error: {mae_dense}')
print(f'Dense Neural Network - R-squared: {r2_dense}')

# Combine results into a DataFrame for easier comparison
results_dense = pd.DataFrame({
    'Country': countries_2020,
    'Actual Gold': y_test['gold_medals'],
    'Actual Silver': y_test['silver_medals'],
    'Actual Bronze': y_test['bronze_medals'],
    'Predicted Gold': y_pred_dense[:, 0],
    'Predicted Silver': y_pred_dense[:, 1],
    'Predicted Bronze': y_pred_dense[:, 2]
})

# Display the results
print(results_dense)

# Save the scaler (create the models folder first so the dump cannot fail on a fresh checkout)
Path('../models').mkdir(parents=True, exist_ok=True)
joblib.dump(scaler, '../models/olympic_medals_scaler.pkl')

Epoch 1/130
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 33.1937 - mae: 1.8276 - val_loss: 3.8904 - val_mae: 0.9763
Epoch 2/130
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 10.4909 - mae: 1.2558 - val_loss: 2.2660 - val_mae: 0.6369
Epoch 3/130
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 12.0635 - mae: 1.1473 - val_loss: 2.0698 - val_mae: 0.6237
Epoch 4/130
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 10.5840 - mae: 1.1972 - val_loss: 2.1113 - val_mae: 0.6287
Epoch 5/130
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 13.1083 - mae: 1.2050 - val_loss: 2.2454 - val_mae: 0.6549
Epoch 6/130
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 13.0420 - mae: 1.1783 - val_loss: 2.3296 - val_mae: 0.6588
Epoch 7/130
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - l

['../models/olympic_medals_scaler.pkl']

In [19]:
# Save the trained DNN twice: once as an HDF5 file (.h5) and once in the .keras format.
for model_path in ('../models/olympic_medals_dnn.h5', '../models/olympic_medals_dnn.keras'):
    model_dense.save(model_path)



['../models/olympic_medals_scaler.pkl']