In [3]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession

print("Connecting to Hadoop...")
spark = SparkSession.builder \
    .appName("OlympicsExtraction") \
    .master("local[*]") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000") \
    .config("spark.hadoop.dfs.client.use.datanode.hostname", "true") \
    .getOrCreate()

# Read the data from Hadoop
print("Reading files...")
athletes = spark.read.csv("/user/olympics/athletes.csv", header=True, inferSchema=True)
results = spark.read.csv("/user/olympics/results.csv", header=True, inferSchema=True)

# Convert to Pandas
pdf_athletes = athletes.toPandas()
pdf_results = results.toPandas()

# Display the first 5 rows
print("Athletes Data:")
display(pdf_athletes.head())  

print("Results Data:")
display(pdf_results.head())

Connecting to Hadoop...


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/02/19 18:35:12 WARN Utils: Your hostname, Jonathans-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 10.17.213.118 instead (on interface en0)
26/02/19 18:35:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/19 18:35:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/02/19 18:35:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Reading files...


                                                                                

Athletes Data:


Unnamed: 0,Roles,Sex,Full name,Used name,Born,Died,NOC,athlete_id,Measurements,Affiliations,Nick/petnames,Title(s),Other names,Nationality,Original name,Name order
0,Competed in Olympic Games,Male,"""François Joseph Marie Antoine """"Jean-François...",Jean-François•Blanchy,"12 December 1886 in Bordeaux, Gironde (FRA)","2 October 1960 in Saint-Jean-de-Luz, Pyrénées-...",France,1,,,,,,,,
1,Competed in Olympic Games,Male,Arnaud Benjamin•Boetsch,Arnaud•Boetsch,"1 April 1969 in Meulan, Yvelines (FRA)",,France,2,183 cm / 76 kg,"Racing Club de France, Paris (FRA)",,,,,,
2,Competed in Olympic Games • Administrator,Male,Jean Laurent Robert•Borotra,Jean•Borotra,"13 August 1898 in Biarritz, Pyrénées-Atlantiqu...","17 July 1994 in Arbonne, Pyrénées-Atlantiques ...",France,3,183 cm / 76 kg,"TCP, Paris (FRA)",Le Basque Bondissant (The Bounding Basque),,,,,
3,Competed in Olympic Games,Male,Jacques Marie Stanislas Jean•Brugnon,Jacques•Brugnon,"11 May 1895 in Paris VIIIe, Paris (FRA)","20 March 1978 in Monaco, Monaco (MON)",France,4,168 cm / 64 kg,"Sporting club de Paris, Paris (FRA)",Toto,,,,,
4,Competed in Olympic Games,Male,Henry Albert•Canet,Albert•Canet,"17 April 1878 in Wandsworth, England (GBR)","25 July 1930 in Paris VIIe, Paris (FRA)",France,5,,"TCP, Paris (FRA)",,,,,,


Results Data:


Unnamed: 0,Games,Event,Team,Pos,Medal,As,athlete_id,NOC,Discipline,Nationality,Unnamed: 7
0,1912 Summer Olympics,"Singles, Men (Olympic)",,=17,,Jean-François Blanchy,1,FRA,Tennis,,
1,1912 Summer Olympics,"Doubles, Men (Olympic)",Jean Montariol,DNS,,Jean-François Blanchy,1,FRA,Tennis,,
2,1920 Summer Olympics,"Singles, Men (Olympic)",,=32,,Jean-François Blanchy,1,FRA,Tennis,,
3,1920 Summer Olympics,"Doubles, Mixed (Olympic)",Jeanne Vaussard,=8,,Jean-François Blanchy,1,FRA,Tennis,,
4,1920 Summer Olympics,"Doubles, Men (Olympic)",Jacques Brugnon,4,,Jean-François Blanchy,1,FRA,Tennis,,


## Cleaning Tasks

- Get rid of bullet points in Used Names
- Split height/weight
- Parse out dates from Born & Died column
- Parse out city, region, and country from Born column
- Get rid of extra column

Get rid of dot

In [3]:
df_athletes = pdf_athletes.copy()

count = df_athletes['Used name'].str.contains('•', na=False).sum()
print(count)

df_athletes['Name'] = df_athletes['Used name'].str.replace('•', ' ')

145115


Use a small data set to clean before applying to athlete dataset

In [43]:
small_data = pd.DataFrame(
    ['180 cm', '183 cm / 67 kg', '/ 65 cm', '68 kg /'], columns=['Measurements']
)
small_data['height_cm'] = small_data['Measurements'].str.extract(r'(\d+)\s*cm')
small_data['weight_kg'] = small_data['Measurements'].str.extract(r'(\d+)\s*kg')


26/02/19 15:11:02 WARN TransportChannelHandler: Exception in connection from 10.0.0.238/10.0.0.238:56818
io.netty.channel.unix.Errors$NativeIoException: readAddress(..) failed with error(-60): Operation timed out


In [23]:
small_data['height_cm'] = pd.to_numeric(small_data['height_cm'])
small_data['weight_kg'] = pd.to_numeric(small_data['weight_kg'])

Split height & weight into differnet columns

In [5]:
df_athletes['height_cm'] = df_athletes['Measurements'].str.extract(r'(\d+)\s*cm')
df_athletes['weight_kg'] = df_athletes['Measurements'].str.extract(r'(\d+)\s*kg')

In [6]:
df_athletes['height_cm'] = pd.to_numeric(df_athletes['height_cm'])
df_athletes['weight_kg'] = pd.to_numeric(df_athletes['weight_kg'])

Seperating date from location

In [7]:
date_pattern = r'(\d+ \w+ \d{4})'
df_athletes['born_date'] = df_athletes['Born'].str.extract(date_pattern)
df_athletes['death_date'] = df_athletes['Died'].str.extract(date_pattern)

Check if all rows follow this pattern
- keep as NaN if it doesnt follow

In [8]:
df_athletes[~df_athletes['Born'].str.match(date_pattern, na=False)]

Unnamed: 0,Roles,Sex,Full name,Used name,Born,Died,NOC,athlete_id,Measurements,Affiliations,...,Title(s),Other names,Nationality,Original name,Name order,Name,height_cm,weight_kg,born_date,death_date
12,Competed in Olympic Games,Male,J.•Defert,J.•Defert,,,France,13,,"Racing Club de France, Paris (FRA)",...,,,,,,J. Defert,,,,
13,Competed in Olympic Games,Male,Étienne•Durand,Étienne•Durand,,,France,14,,"TCP, Paris (FRA)",...,,,,,,Étienne Durand,,,,
27,Competed in Olympic Games • Competed in Olympi...,Male,Guy•Lejeune,"Guy, Baron•Lejeune",,,France,28,,,...,Baron,,,,,"Guy, Baron Lejeune",,,,
28,Competed in Olympic Games,Male,Albert•Lippmann,Albert•Lippmann,,,France,29,,"unattached, (MIX)",...,,,,,,Albert Lippmann,,,,
91,Competed in Olympic Games,Male,Lionel Hunter•Escombe,Lionel•Escombe,"1876 in Natal, KwaZulu-Natal (RSA)","15 October 1914 in Brentford, England (GBR)",Great Britain,92,,,...,,,,,,Lionel Escombe,,,,15 October 1914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143795,Competed in Olympic Games,Male,"""Jackson Hobson """"Jack""""•Leverett","III""",Jack•Leverett,"18 January 2000 in Bainbridge, Georgia (USA)",,United States,147451,173 cm,...,,,,,,"III""",,,,18 January 2000
143845,Competed in Olympic Games,Male,"""Clarence """"CJ""""•Cummings","Jr.""",CJ•Cummings,"6 June 2000 in Beaufort, South Carolina (USA)",,United States,147501,163 cm / 73 kg,...,,,,,,"Jr.""",,,,6 June 2000
144129,Competed in Olympic Games,Male,René•Van Damme,René•Van Damme,,,Belgium,147802,,,...,,,,,,René Van Damme,,,,
144132,Competed in Olympic Games,Male,Raphael•de Ligne,Raphael•de Ligne,,,Belgium,147805,,,...,,,,,,Raphael de Ligne,,,,


Add a year column because now all rows follow the same date pattern

In [9]:
df_athletes['Born_year'] = df_athletes['Born'].str.extract(r'(\d{4})')
df_athletes['Death_year'] = df_athletes['Died'].str.extract(r'(\d{4})')

Change born & death date into timedate objects

In [10]:
date_columns = ['born_date', 'death_date']
df_athletes[date_columns] = df_athletes[date_columns].apply(pd.to_datetime, errors='coerce')

Parse out dates from 'born' and 'died' columns

In [None]:
# working with a small data set first
birth_data = pd.DataFrame([
    '1876 in Natal, '
    'KwaZulu-Natal (RSA)', 
    '12 December 1886 in Bordeaux, Gironde (FRA)', 
    'in Niort, Deux-Sèvres (FRA)'], 
    columns=['Born']
)

location_pattern = r'in (.*)'
birth_data['birth_loc'] = birth_data['Born'].str.extract(location_pattern)


In [13]:
birth_data

Unnamed: 0,Born,birth_loc
0,"1876 in Natal, KwaZulu-Natal (RSA)","Natal, KwaZulu-Natal (RSA)"
1,"12 December 1886 in Bordeaux, Gironde (FRA)","Bordeaux, Gironde (FRA)"
2,"in Niort, Deux-Sèvres (FRA)","Niort, Deux-Sèvres (FRA)"


Now apply to the athletes dataframe

In [14]:
location_pattern = r'in (.*)'
df_athletes['Birth_location'] = df_athletes['Born'].str.extract(location_pattern)

In [15]:
seperation_pattern = r'([\w\s]+), ([\w\s]+) \((\w+)\)'
df_athletes[['City', 'Region', 'Country']] = df_athletes['Birth_location'].str.extract(seperation_pattern)

Convert columns into its logical datatype

In [16]:
athlete_cols_int = ['athlete_id', 'Born_year', 'Death_year', 'height_cm', 'weight_kg']
df_athletes[athlete_cols_int] = df_athletes[athlete_cols_int].apply(pd.to_numeric, errors='coerce').astype('Int64')

athlete_cols_str = ['Name', 'Birth_location', 'City', 'Region', 'Country', 'NOC']
df_athletes[athlete_cols_str] = df_athletes[athlete_cols_str].astype("string")
df_athletes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145500 entries, 0 to 145499
Data columns (total 27 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   Roles           145500 non-null  object        
 1   Sex             145500 non-null  object        
 2   Full name       145500 non-null  object        
 3   Used name       145500 non-null  object        
 4   Born            143772 non-null  object        
 5   Died            34224 non-null   object        
 6   NOC             145320 non-null  string        
 7   athlete_id      145115 non-null  Int64         
 8   Measurements    107934 non-null  object        
 9   Affiliations    95796 non-null   object        
 10  Nick/petnames   9522 non-null    object        
 11  Title(s)        433 non-null     object        
 12  Other names     7166 non-null    object        
 13  Nationality     8262 non-null    object        
 14  Original name   30728 non-null   obj

In [17]:
columns_keep = ['athlete_id', 'Name', 'Sex', 'NOC', 'height_cm', 'weight_kg', 'born_date', 'death_date', 'Born_year', 'Death_year', 'City', 'Region', 'Country']

df_athletes_clean = df_athletes[columns_keep]

In [18]:
df_athletes_clean

Unnamed: 0,athlete_id,Name,Sex,NOC,height_cm,weight_kg,born_date,death_date,Born_year,Death_year,City,Region,Country
0,1,Jean-François Blanchy,Male,France,,,1886-12-12,1960-10-02,1886,1960,Bordeaux,Gironde,FRA
1,2,Arnaud Boetsch,Male,France,183,76,1969-04-01,NaT,1969,,Meulan,Yvelines,FRA
2,3,Jean Borotra,Male,France,183,76,1898-08-13,1994-07-17,1898,1994,,,
3,4,Jacques Brugnon,Male,France,168,64,1895-05-11,1978-03-20,1895,1978,Paris VIIIe,Paris,FRA
4,5,Albert Canet,Male,France,,,1878-04-17,1930-07-25,1878,1930,Wandsworth,England,GBR
...,...,...,...,...,...,...,...,...,...,...,...,...,...
145495,149222,Polina Luchnikova,Female,ROC,167,61,2002-01-30,NaT,2002,,Serov,Sverdlovsk,RUS
145496,149223,Valeriya Merkusheva,Female,ROC,168,65,1999-09-20,NaT,1999,,,,
145497,149224,Yuliya Smirnova,Female,ROC,163,55,1998-05-08,NaT,1998,,Kotlas,Arkhangelsk,RUS
145498,149225,André Foussard,Male,France,166,,1899-05-19,1986-03-18,1899,1986,,,


In [19]:
df_athletes_clean.to_csv('../data/processed/athletes.csv', index=False)

### Clean Results csv

- Remove = and DNS into NA
- Add a seasons and year column
- Parse Male events from Female events

In [4]:
df_results = pdf_results.copy()


In [5]:
# Replcae the = with ' '
df_results['Pos_clean'] = df_results['Pos'].str.replace('=', ' ')

# Convert into a numeric object to turn all DNF and DNS into NAN
df_results['Pos_clean'] = pd.to_numeric(df_results['Pos_clean'], errors='coerce')

Find games that dont match the pattern
- One game spans 2 years
- Some games dont have a season

In [6]:
games_pattern = r'(\d{4} \w+ \w+)'
df_results[~df_results['Games'].str.contains(games_pattern, na=False)]

  df_results[~df_results['Games'].str.contains(games_pattern, na=False)]


Unnamed: 0,Games,Event,Team,Pos,Medal,As,athlete_id,NOC,Discipline,Nationality,Unnamed: 7,Pos_clean
121776,1888-89 Zappas Olympic Games,"Rope Climbing, Men ()",,1,,Sotirios Versis,55911,GRE,Artistic Gymnastics (Gymnastics),,,1.0


In [7]:
# Use the start year of the event
year = r'(\d{4})'
df_results['Games_Year'] = df_results['Games'].str.extract(year)

# Extract the season then set NA to the games without a season
season_pattern = r'\d{4} (\w+)'
df_results['Season'] = df_results['Games'].str.extract(season_pattern)

season_list = ['Winter', 'Summer', 'Fall', 'Spring']
df_results.loc[~df_results['Season'].isin(season_list), 'Season'] = pd.NA

Gender

In [11]:
# Filter out males and females
genders = r'\b(Men|Women)\b'
df_results['Gender'] = df_results['Event'].str.extract(genders)


In [12]:
discipline_pattern = r'\s\(.*\)'
df_results['Discipline_clean'] = df_results['Discipline'].str.replace(discipline_pattern, ' ', regex=True)

In [28]:
df_results['Name'] = df_results['As'].str.replace('-', ' ')

In [None]:
event_pattern = r'(.*), '

df_results['Event_clean'] = df_results['Event'].str.extract(event_pattern)

26/02/19 20:12:38 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1029017 ms exceeds timeout 120000 ms
26/02/19 20:12:38 WARN SparkContext: Killing executors is not supported by current scheduler.
26/02/19 20:28:16 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:359)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:81)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:674)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1324)
	at 

Convert medals into integers

In [30]:
# remove whitespace and change into lowercase to normalize
df_results['Medal'] = df_results['Medal'].str.strip().str.lower()

medal_map = {
    'gold': 3,
    'silver': 2,
    'bronze': 1,
}

df_results['Points'] = df_results['Medal'].map(medal_map).fillna(0)

Medalist or non-medlaist

In [31]:
df_results['Preformance_Result'] = df_results['Points'].apply(lambda x: 'Medalist' if x > 0 else 'non-Medalist')


Remove redundant columns & assign appropriate datatype

In [32]:
results_columns_keep = ['athlete_id', 'Name', 'Gender', 'Discipline_clean', 'Event_clean', 'Medal', 'Points', 'Preformance_Result', 'Pos_clean', 'Games_Year', 'Season']

df_results_clean = df_results[results_columns_keep]

df_results_clean

Unnamed: 0,athlete_id,Name,Gender,Discipline_clean,Event_clean,Medal,Points,Preformance_Result,Pos_clean,Games_Year,Season
0,1,Jean François Blanchy,Men,Tennis,Singles,,0.0,non-Medalist,17.0,1912,Summer
1,1,Jean François Blanchy,Men,Tennis,Doubles,,0.0,non-Medalist,,1912,Summer
2,1,Jean François Blanchy,Men,Tennis,Singles,,0.0,non-Medalist,32.0,1920,Summer
3,1,Jean François Blanchy,,Tennis,Doubles,,0.0,non-Medalist,8.0,1920,Summer
4,1,Jean François Blanchy,Men,Tennis,Doubles,,0.0,non-Medalist,4.0,1920,Summer
...,...,...,...,...,...,...,...,...,...,...,...
308403,148983,Marián Skupek,Men,Luge,Singles,,0.0,non-Medalist,26.0,2022,Winter
308404,148984,Elsa Fermbäck,Women,Alpine Skiing,Slalom,,0.0,non-Medalist,28.0,2022,Winter
308405,148985,Hilma Lövblom,,Alpine Skiing,Team,,0.0,non-Medalist,13.0,2022,Winter
308406,148985,Hilma Lövblom,Women,Alpine Skiing,Giant Slalom,,0.0,non-Medalist,,2022,Winter


In [33]:
results_cols_str = ['Name', 'Gender', 'Discipline_clean', 'Event_clean', 'Medal', 'Season']
df_results_clean[results_cols_str] = df_results_clean[results_cols_str].astype('string')

results_cols_int = ['athlete_id', 'Points', 'Pos_clean', 'Games_Year']
df_results_clean[results_cols_int] = df_results_clean[results_cols_int].apply(pd.to_numeric, errors='coerce').astype('Int64')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_results_clean[results_cols_str] = df_results_clean[results_cols_str].astype('string')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_results_clean[results_cols_int] = df_results_clean[results_cols_int].apply(pd.to_numeric, errors='coerce').astype('Int64')


In [34]:
df_results_clean.to_csv('../data/processed/results.csv')

Group into age groups
- Merge df_athletes and df_results together

In [35]:

columns = ['athlete_id','height_cm', 'weight_kg', 'Born_year', 'Death_year', 'Country']


# convert athlete_is to the same type as athlete table
df_results_clean['athlete_id'] = df_results_clean['athlete_id'].astype('Int64')
df_merge = df_results_clean.merge(df_athletes_clean[columns], on='athlete_id', how='left')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_results_clean['athlete_id'] = df_results_clean['athlete_id'].astype('Int64')


Find the age groups of athletes

In [None]:
df_merge['Age'] = df_merge['Games_Year'] - df_merge['Born_year']
df_merge['Age'] = pd.to_numeric(df_merge['Age'], errors='coerce')

# Youngest age 11 and oldest 73
df_merge['Age'].min()
df_merge['Age'].max()

# Any Na
df_merge[df_merge['Age'].isna()]

# Group them accordingly
age_bin = [13, 20, 30, 40, 50, 60, 70, 80]

age_groups = np.array(['11-12', '13-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79'])

index = np.digitize(df_merge['Age'].fillna(-1), age_bin)

df_merge['Age_group'] = age_groups[index]

# For the NA ages change the gorup to unknown
df_merge.loc[df_merge['Age'].isna(), 'Age_group'] = 'Unknown'

df_merge['athlete_id'].is


In [38]:
df_merge[df_merge['Age_group'] == 'Unknown']

Unnamed: 0,athlete_id,Name,Gender,Discipline_clean,Event_clean,Medal,Points,Preformance_Result,Pos_clean,Games_Year,Season,height_cm,weight_kg,Born_year,Death_year,Country,Age,Age_group
50,13,J. Defert,Men,Tennis,Singles,,0,non-Medalist,8,1896,Summer,,,,,,,Unknown
51,13,J. Defert,Men,Tennis,Doubles,,0,non-Medalist,,1896,Summer,,,,,,,Unknown
52,14,Étienne Durand,Men,Tennis,Singles,,0,non-Medalist,8,1900,Summer,,,,,,,Unknown
53,14,Étienne Durand,Men,Tennis,Doubles,,0,non-Medalist,5,1900,Summer,,,,,,,Unknown
54,14,Étienne Durand,Men,Tennis,"Singles, Handicap",,0,non-Medalist,9,1900,Summer,,,,,,,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306050,147451,Jack Leverett,Men,Shooting,"Rapid-Fire Pistol, 25 metres",,0,non-Medalist,25,2020,Summer,,,,,,,Unknown
306120,147501,CJ Cummings,Men,Weightlifting,Middleweight,,0,non-Medalist,9,2020,Summer,,,,,,,Unknown
306437,147802,René Van Damme,Men,Rowing,Coxed Pairs,,0,non-Medalist,,1920,Summer,,,,,,,Unknown
306445,147805,Raphael de Ligne,Men,Rowing,Coxed Fours,,0,non-Medalist,,1920,Summer,,,,,,,Unknown


Calculate BMI

In [39]:
df_merge['BMI'] = (df_merge['weight_kg'] / (df_merge['height_cm'] / 100)**2)

Find the points for each age group for each Olympic year
- Store it in a seperate df because if we have 1000 athletes in a certain age group for a year it will repeate it 1000 times

In [40]:
# Store in seperate df
df_year_total_points = df_merge.groupby(['Games_Year', 'Age_group'])['Points'].sum().reset_index()

df_year_total_points.to_csv('../data/processed/year_total_points.csv')

Find podium appearances percentage of the age groups and their disicipline
- Because the age group 20-29 has more participants they will always have more points so here calculate podium appearance percentage

In [41]:
# Count by medal since points has values of 0
df_podium_appearance_age = df_merge.groupby(['Games_Year', 'Age_group', 'Discipline_clean']).agg({
    'athlete_id' : 'count',
    'Medal' : 'count'
}).reset_index()

df_podium_appearance_age['Appearance_%'] = ((df_podium_appearance_age['Medal'] / df_podium_appearance_age['athlete_id']) * 100).round(2)

# Rename for clarity
df_podium_appearance_age = df_podium_appearance_age.rename(columns={
    'Games_Year' : 'Year',
    'Age_group' : 'Age_Group',
    'athlete_id' : 'Total_Athletes',
    'Medal' :'Podium_Appearances',
})
df_podium_appearance_age
df_podium_appearance_age.to_csv('../data/processed/podium_appearances_age.csv')

The mean & std of medalist vs non-medalist

In [42]:
df_physical_preformance = df_merge.groupby(['Discipline_clean', 'Preformance_Result']).agg({
    'height_cm': ['mean', 'std'],
    'weight_kg': ['mean', 'std'],
}).reset_index()

df_physical_preformance.to_csv('../data/processed/physical_preformance.csv')
df_physical_preformance.head()


Unnamed: 0_level_0,Discipline_clean,Preformance_Result,height_cm,height_cm,weight_kg,weight_kg
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,mean,std
0,3-on-3 Ice Hockey,Medalist,175.846154,7.625026,69.076923,9.962108
1,3-on-3 Ice Hockey,non-Medalist,173.1875,10.546524,66.5625,10.449681
2,3x3 Basketball,Medalist,187.0,7.81025,90.5,20.506097
3,3x3 Basketball,non-Medalist,186.416667,9.671498,81.8,9.418068
4,Alpine Skiing,Medalist,173.99345,7.892741,73.21542,11.883194
