In [1]:
# Import Dependencies
import pandas as pd

In [2]:
# Import datasets
athlete_counts_df = pd.read_csv('CSV_In_Progress/competitor_count.csv')
population_df = pd.read_csv('Clean_Country_population.csv')
gdp_df = pd.read_csv('CSV_In_Progress/clean_gdp_final.csv')
medal_count_df = pd.read_csv('Olympic_Medal_Counts.csv')

In [3]:
# CLean athlete counts df
athlete_counts_df = athlete_counts_df.rename(columns={'CompetitorID' : 'athlete_count'})
athlete_counts_df.head()

Unnamed: 0.1,Unnamed: 0,NOC,Year,Season,Host,athlete_count
0,0,AFG,1936,Summer,0,4
1,1,AFG,1948,Summer,0,2
2,2,AFG,1956,Summer,0,1
3,3,AFG,1960,Summer,0,13
4,4,AFG,1964,Summer,0,8


In [4]:
# Dataframe with Olympic years after 1959
olympic_years_df = athlete_counts_df[['Year', 'Season']]
olympic_years_df = olympic_years_df.drop_duplicates()
olympic_years_df = olympic_years_df[olympic_years_df['Year'] > 1959]
olympic_years_df

Unnamed: 0,Year,Season
3,1960,Summer
4,1964,Summer
5,1968,Summer
6,1972,Summer
7,1980,Summer
8,1988,Summer
9,1996,Summer
10,2004,Summer
11,2008,Summer
12,2012,Summer


In [5]:
# Clean population dataframe
population_df = population_df.drop(['Primary Key', 'Country Name'], axis=1)
population_df.head()

Unnamed: 0,Country Code,Year,Population
0,ARU,1960,54208
1,AFG,1960,8996967
2,ANG,1960,5454938
3,ALB,1960,1608800
4,AND,1960,13410


In [6]:
# Clean gdp dataset
gdp_df = gdp_df.drop(['Primary Key', 'Entity'], axis=1)
gdp_df.head()

Unnamed: 0,Code,Year,GDP per capita
0,AFG,1950,1156.0
1,AFG,1951,1170.0
2,AFG,1952,1189.0
3,AFG,1953,1240.0
4,AFG,1954,1245.0


In [7]:
# Clean Medal Count dataset
medal_count_df = medal_count_df.drop(['Primary Key', 'Nation'], axis=1)
medal_count_df.head()

Unnamed: 0,Year,Season,NOC Code,Gold,Silver,Bronze,Total
0,2008,Summer,AFG,0,0,1,1
1,2012,Summer,AFG,0,0,1,1
2,1984,Summer,ALG,0,0,2,2
3,1992,Summer,ALG,1,0,1,2
4,1996,Summer,ALG,2,0,1,3


In [8]:
# Merge athlete counts and population dataframes
count_and_pop_df = pd.merge(athlete_counts_df, population_df, how='left', left_on=['NOC', 'Year'], right_on=['Country Code', 'Year'])
count_and_pop_df = count_and_pop_df.drop(['Country Code'], axis=1)
count_and_pop_df

Unnamed: 0.1,Unnamed: 0,NOC,Year,Season,Host,athlete_count,Population
0,0,AFG,1936,Summer,0,4,
1,1,AFG,1948,Summer,0,2,
2,2,AFG,1956,Summer,0,1,
3,3,AFG,1960,Summer,0,13,8996967.0
4,4,AFG,1964,Summer,0,8,9744772.0
...,...,...,...,...,...,...,...
3868,3832,ZIM,2004,Summer,0,13,12019911.0
3869,3833,ZIM,2008,Summer,0,16,12379553.0
3870,3834,ZIM,2012,Summer,0,9,13115149.0
3871,3835,ZIM,2014,Winter,0,2,13586710.0


In [9]:
# Merge count_and_pop_df with gdp
count_and_gdp_df = pd.merge(count_and_pop_df, gdp_df, how='left', left_on=['NOC', 'Year'], right_on=['Code', 'Year'])
count_and_gdp_df = count_and_gdp_df.drop(['Code'], axis=1)
count_and_gdp_df

Unnamed: 0.1,Unnamed: 0,NOC,Year,Season,Host,athlete_count,Population,GDP per capita
0,0,AFG,1936,Summer,0,4,,
1,1,AFG,1948,Summer,0,2,,
2,2,AFG,1956,Summer,0,1,,1278.00
3,3,AFG,1960,Summer,0,13,8996967.0,1326.00
4,4,AFG,1964,Summer,0,8,9744772.0,1291.00
...,...,...,...,...,...,...,...,...
3868,3832,ZIM,2004,Summer,0,13,12019911.0,1604.50
3869,3833,ZIM,2008,Summer,0,16,12379553.0,1197.53
3870,3834,ZIM,2012,Summer,0,9,13115149.0,1604.00
3871,3835,ZIM,2014,Winter,0,2,13586710.0,1594.00


In [10]:
# reset dataframe with olympic years
count_and_gdp_year_df = pd.merge(olympic_years_df, count_and_gdp_df, how="left", on="Year")
count_and_gdp_year_df

Unnamed: 0.1,Year,Season_x,Unnamed: 0,NOC,Season_y,Host,athlete_count,Population,GDP per capita
0,1960,Summer,3,AFG,Summer,0,13,8996967.0,1326.0
1,1960,Summer,15,AHO,Summer,0,5,,
2,1960,Summer,112,ARG,Summer,0,78,20481781.0,8861.0
3,1960,Summer,113,ARG,Winter,0,14,20481781.0,8861.0
4,1960,Summer,186,AUS,Summer,0,201,10276477.0,14013.0
...,...,...,...,...,...,...,...,...,...
4593,1972,Winter,3730,VEN,Summer,0,27,12097696.0,15135.0
4594,1972,Winter,3768,VNM,Summer,0,2,,
4595,1972,Winter,3799,YUG,Summer,0,162,,
4596,1972,Winter,3800,YUG,Winter,0,12,,


In [11]:
# Generate list of years with the olympics
olympic_years = athlete_counts_df['Year'].to_list()
olympic_year_list = []
[olympic_year_list.append(x) for x in olympic_years if x not in olympic_year_list]
olympic_year_list

[1936,
 1948,
 1956,
 1960,
 1964,
 1968,
 1972,
 1980,
 1988,
 1996,
 2004,
 2008,
 2012,
 2016,
 1952,
 1976,
 1984,
 1992,
 2000,
 2006,
 2010,
 2014,
 1994,
 1998,
 2002,
 1908,
 1912,
 1900,
 1920,
 1924,
 1928,
 1932,
 1896,
 1904,
 1906]

In [12]:
# Remove years without the olympics and drop NaN NOC and Country Code
count_and_gdp_filter_df = count_and_gdp_df[count_and_gdp_df['Year'].isin(olympic_year_list)]
count_and_gdp_filter_df

Unnamed: 0.1,Unnamed: 0,NOC,Year,Season,Host,athlete_count,Population,GDP per capita
0,0,AFG,1936,Summer,0,4,,
1,1,AFG,1948,Summer,0,2,,
2,2,AFG,1956,Summer,0,1,,1278.00
3,3,AFG,1960,Summer,0,13,8996967.0,1326.00
4,4,AFG,1964,Summer,0,8,9744772.0,1291.00
...,...,...,...,...,...,...,...,...
3868,3832,ZIM,2004,Summer,0,13,12019911.0,1604.50
3869,3833,ZIM,2008,Summer,0,16,12379553.0,1197.53
3870,3834,ZIM,2012,Summer,0,9,13115149.0,1604.00
3871,3835,ZIM,2014,Winter,0,2,13586710.0,1594.00


In [19]:
# Merge count_and_gdp_df with medal count
count_and_medal_df = pd.merge(count_and_gdp_filter_df, medal_count_df, how='left', left_on=['NOC', 'Year', 'Season'], right_on=['NOC Code', 'Year', 'Season'])
count_and_medal_df = count_and_medal_df.drop(['NOC Code', 'Unnamed: 0'], axis=1)
count_and_medal_df

Unnamed: 0,NOC,Year,Season,Host,athlete_count,Population,GDP per capita,Gold,Silver,Bronze,Total
0,AFG,1936,Summer,0,4,,,,,,
1,AFG,1948,Summer,0,2,,,,,,
2,AFG,1956,Summer,0,1,,1278.00,,,,
3,AFG,1960,Summer,0,13,8996967.0,1326.00,,,,
4,AFG,1964,Summer,0,8,9744772.0,1291.00,,,,
...,...,...,...,...,...,...,...,...,...,...,...
3868,ZIM,2004,Summer,0,13,12019911.0,1604.50,1.0,1.0,1.0,3.0
3869,ZIM,2008,Summer,0,16,12379553.0,1197.53,1.0,3.0,0.0,4.0
3870,ZIM,2012,Summer,0,9,13115149.0,1604.00,,,,
3871,ZIM,2014,Winter,0,2,13586710.0,1594.00,,,,


In [20]:
# Filter out years before 1960 and fill NaN
ML_df = count_and_medal_df[count_and_medal_df['Year'] > 1959]
ML_df[['Gold', 'Silver', 'Bronze', 'Total']] = ML_df[['Gold', 'Silver', 'Bronze', 'Total']].fillna(value=0)
ML_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,NOC,Year,Season,Host,athlete_count,Population,GDP per capita,Gold,Silver,Bronze,Total
3,AFG,1960,Summer,0,13,8996967.0,1326.0,0.0,0.0,0.0,0.0
4,AFG,1964,Summer,0,8,9744772.0,1291.0,0.0,0.0,0.0,0.0
5,AFG,1968,Summer,0,5,10637064.0,1290.0,0.0,0.0,0.0,0.0
6,AFG,1972,Summer,0,8,11791222.0,1007.0,0.0,0.0,0.0,0.0
7,AFG,1980,Summer,0,11,13356500.0,1019.0,0.0,0.0,0.0,0.0


In [24]:
ML_df = ML_df.dropna()

In [25]:
ML_df = ML_df.reset_index()

In [26]:
ML_df.describe()

Unnamed: 0,level_0,index,Year,Host,athlete_count,Population,GDP per capita,Gold,Silver,Bronze,Total
count,2503.0,2503.0,2503.0,2503.0,2503.0,2503.0,2503.0,2503.0,2503.0,2503.0,2503.0
mean,1549.97483,1911.259289,1993.105074,0.010787,55.694367,41996630.0,13538.390635,1.43348,1.463044,1.617259,4.513783
std,882.637787,1087.664699,16.27594,0.10332,83.199012,142554600.0,13634.703571,4.585395,3.920591,4.010345,12.027335
min,0.0,3.0,1960.0,0.0,1.0,63261.0,0.0,0.0,0.0,0.0,0.0
25%,781.5,936.5,1980.0,0.0,7.0,3783898.0,3250.32,0.0,0.0,0.0,0.0
50%,1545.0,1931.0,1996.0,0.0,20.0,9527985.0,9199.0,0.0,0.0,0.0,0.0
75%,2285.5,2810.5,2008.0,0.0,67.0,29093930.0,20032.72,1.0,1.0,1.0,3.0
max,3147.0,3872.0,2016.0,1.0,555.0,1387790000.0,156299.0,83.0,61.0,38.0,174.0


In [27]:
ML_df.to_csv('CSV_In_Progress/MLR_dataset.csv')