In [1]:
# Import our dependencies
import pandas as pd
from path import Path
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import joblib

In [2]:
# Import our clean dataset
file_path = Path("clean_data/combined_olympic_data.csv")
df = pd.read_csv(file_path)
df.head(10)

Unnamed: 0,country_name,population,gdp_per_capita,human_development_index,gender_inequality_index,corruption_perceptions_index,total
0,Afghanistan,33370794,613.856689,0.465,0.676,12,1
1,Albania,2889104,4578.631994,0.733,0.267,33,0
2,United Arab Emirates,9214175,43751.83889,0.835,0.247,70,0
3,Argentina,42669500,12334.79825,0.836,0.364,34,4
4,Armenia,2912403,3986.231624,0.733,0.299,37,2
5,Australia,23475686,62510.79117,0.935,0.123,80,38
6,Austria,8546356,51717.49594,0.885,0.083,72,17
7,Azerbaijan,9535079,7891.313147,0.751,0.33,29,9
8,Burundi,9844297,274.857948,0.4,0.483,20,0
9,Belgium,11209057,47700.54036,0.89,0.076,76,3


In [3]:
df = df.set_index("country_name",drop=True)
df.head()

Unnamed: 0_level_0,population,gdp_per_capita,human_development_index,gender_inequality_index,corruption_perceptions_index,total
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,33370794,613.856689,0.465,0.676,12,1
Albania,2889104,4578.631994,0.733,0.267,33,0
United Arab Emirates,9214175,43751.83889,0.835,0.247,70,0
Argentina,42669500,12334.79825,0.836,0.364,34,4
Armenia,2912403,3986.231624,0.733,0.299,37,2


In [4]:
new_df = df.rename(columns={"total": "count_of_medals"})
new_df.head()

Unnamed: 0_level_0,population,gdp_per_capita,human_development_index,gender_inequality_index,corruption_perceptions_index,count_of_medals
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,33370794,613.856689,0.465,0.676,12,1
Albania,2889104,4578.631994,0.733,0.267,33,0
United Arab Emirates,9214175,43751.83889,0.835,0.247,70,0
Argentina,42669500,12334.79825,0.836,0.364,34,4
Armenia,2912403,3986.231624,0.733,0.299,37,2


### Split our preprocessed data into our features and target arrays

In [5]:
# Create our target
y = new_df["count_of_medals"].values

In [6]:
# Create our features
X = new_df.drop(["count_of_medals"], axis=1).values

In [7]:
# Splitting data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=15)

In [8]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# Training Random Forest Regression model 
rf_model = RandomForestRegressor(n_estimators=128, random_state=15)
# Fitting the model
rf_model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=128, random_state=15)

In [10]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test)
predictions

array([1.40625000e-01, 7.25000000e+00, 4.93750000e+00, 1.56250000e-02,
       7.44531250e+00, 2.50000000e-01, 7.81250000e-03, 1.16406250e+00,
       1.82031250e+00, 2.31015625e+01, 1.48046875e+01, 5.54687500e-01,
       2.60156250e+00, 4.10937500e+00, 5.51562500e+00, 8.90625000e-01,
       3.98437500e-01, 1.61718750e+00, 5.57031250e+00, 7.82031250e+00,
       7.96796875e+01, 1.44453125e+01, 5.15625000e-01, 7.31250000e+00,
       5.85156250e+00, 7.25000000e+00, 3.89843750e+00, 4.17187500e+00,
       5.39843750e+00, 6.39843750e+00, 7.81250000e-03, 4.39843750e+00,
       2.34375000e-02, 5.57031250e+00, 3.90625000e-02, 1.25000000e-01,
       3.48750000e+01, 3.79531250e+01, 2.53125000e+00, 1.30781250e+01])

In [11]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances


array([0.60158498, 0.03197819, 0.17599934, 0.11536234, 0.07507516])

In [12]:
# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
y_pred

array([ 2.3359375,  2.3359375,  2.515625 ,  2.3359375, 11.2578125,
        2.3359375,  2.3359375,  9.765625 ,  2.953125 , 15.9921875,
       15.9921875,  2.515625 ,  9.765625 ,  2.3359375, 11.0546875,
        9.765625 ,  2.3359375, 12.640625 ,  2.4765625, 15.9921875,
        2.3359375, 15.9921875,  2.3359375, 15.9921875,  2.3359375,
        6.09375  , 11.53125  ,  2.3359375,  3.0078125,  9.765625 ,
        2.3359375,  2.3359375,  2.3359375, 10.0390625,  2.3359375,
        2.3359375, 15.9921875, 15.9921875,  2.328125 ,  9.765625 ])

In [13]:
print(f" Random forest predictive accuracy: {rf_model.score(X_test_scaled, y_test):.3f}")

 Random forest predictive accuracy: 0.130


### Saving the model

In [14]:
joblib.dump(rf_model,"trained_rf_model.joblib")

['trained_rf_model.joblib']