In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder \
    .appName("OlympicsExtraction") \
    .master("local[4]") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000") \
    .config("spark.driver.memory", "2g") \
    .config("spark.network.timeout", "800s") \
    .config("spark.executor.heartbeatInterval", "100s") \
    .config("spark.sql.repl.eagerEval.enabled", True) \
    .config("spark.sql.repl.eagerEval.maxNumRows", 5) \
    .getOrCreate()

athletes = spark.read.csv("/data/raw/athletes.csv", header=True, inferSchema=True)
results = spark.read.csv("/data/raw/results.csv", header=True, inferSchema=True)

athletes


Roles,Sex,Full name,Used name,Born,Died,NOC,athlete_id,Measurements,Affiliations,Nick/petnames,Title(s),Other names,Nationality,Original name,Name order
Competed in Olymp...,Male,"""François Joseph ...",Jean-François•Bla...,12 December 1886 ...,2 October 1960 in...,France,1,,,,,,,,
Competed in Olymp...,Male,Arnaud Benjamin•B...,Arnaud•Boetsch,1 April 1969 in M...,,France,2,183 cm / 76 kg,Racing Club de Fr...,,,,,,
Competed in Olymp...,Male,Jean Laurent Robe...,Jean•Borotra,13 August 1898 in...,17 July 1994 in A...,France,3,183 cm / 76 kg,"TCP, Paris (FRA)",Le Basque Bondiss...,,,,,
Competed in Olymp...,Male,Jacques Marie Sta...,Jacques•Brugnon,11 May 1895 in Pa...,20 March 1978 in ...,France,4,168 cm / 64 kg,Sporting club de ...,Toto,,,,,
Competed in Olymp...,Male,Henry Albert•Canet,Albert•Canet,17 April 1878 in ...,25 July 1930 in P...,France,5,,"TCP, Paris (FRA)",,,,,,


## Now use pyspark

- Name, height, weight

In [18]:
df_athletes = athletes

df_athletes = (
    df_athletes
    .withColumn('Name', F.regexp_replace(F.col('Used name'), '•', ' '))
    .withColumn('Height_cm', F.regexp_extract(F.col('Measurements'), r'(\d+)\scm', 1).cast('int'))
    .withColumn('Weight_kg', F.regexp_extract(F.col('Measurements'), r'(\d+)\skg', 1).cast('int'))
    
    # Extracting years as integers
    .withColumn('Born_year', F.regexp_extract(F.col('Born'), r'(\d{4})', 1).cast('int'))
    .withColumn('Death_year', F.regexp_extract(F.col('Died'), r'(\d{4})', 1).cast('int'))
    
    # Fixed date pattern for "12 July 1995"
    .withColumn('Born_date', F.regexp_extract(F.col('Born'), r'(\d+\s\w+\s\d{4})', 1))
    .withColumn('Death_date', F.regexp_extract(F.col('Died'), r'(\d+\s\w+\s\d{4})', 1))
    # Fixed typo: Birth_location
    .withColumn('Birth_location', F.regexp_extract(F.col('Born'), r'in\s(.*)', 1))
)

location_map = {
    'City': r'^([\w\s]+),',
    'Region': r',\s([\w\s]+)\s\(',
    'Country': r'\((\w+)\)',
}

for col, pattern in location_map.items():
    df_athletes = df_athletes.withColumn(
        col, F.nullif(F.regexp_extract('Birth_location', pattern, 1), F.lit(""))
    )
    
df_athletes

Roles,Sex,Full name,Used name,Born,Died,NOC,athlete_id,Measurements,Affiliations,Nick/petnames,Title(s),Other names,Nationality,Original name,Name order,Name,Height_cm,Weight_kg,Born_year,Death_year,Born_date,Death_date,Birth_location,City,Region,Country
Competed in Olymp...,Male,"""François Joseph ...",Jean-François•Bla...,12 December 1886 ...,2 October 1960 in...,France,1,,,,,,,,,Jean-François Bla...,,,1886,1960.0,12 December 1886,2 October 1960,"Bordeaux, Gironde...",Bordeaux,Gironde,FRA
Competed in Olymp...,Male,Arnaud Benjamin•B...,Arnaud•Boetsch,1 April 1969 in M...,,France,2,183 cm / 76 kg,Racing Club de Fr...,,,,,,,Arnaud Boetsch,183.0,76.0,1969,,1 April 1969,,"Meulan, Yvelines ...",Meulan,Yvelines,FRA
Competed in Olymp...,Male,Jean Laurent Robe...,Jean•Borotra,13 August 1898 in...,17 July 1994 in A...,France,3,183 cm / 76 kg,"TCP, Paris (FRA)",Le Basque Bondiss...,,,,,,Jean Borotra,183.0,76.0,1898,1994.0,13 August 1898,17 July 1994,"Biarritz, Pyrénée...",Biarritz,,FRA
Competed in Olymp...,Male,Jacques Marie Sta...,Jacques•Brugnon,11 May 1895 in Pa...,20 March 1978 in ...,France,4,168 cm / 64 kg,Sporting club de ...,Toto,,,,,,Jacques Brugnon,168.0,64.0,1895,1978.0,11 May 1895,20 March 1978,"Paris VIIIe, Pari...",Paris VIIIe,Paris,FRA
Competed in Olymp...,Male,Henry Albert•Canet,Albert•Canet,17 April 1878 in ...,25 July 1930 in P...,France,5,,"TCP, Paris (FRA)",,,,,,,Albert Canet,,,1878,1930.0,17 April 1878,25 July 1930,"Wandsworth, Engla...",Wandsworth,England,GBR


Convert to dates to to_date dt

In [21]:
df_athletes = (
    df_athletes
    .withColumn('Born_date', F.to_date(F.col('Born_date'), 'd-MMMM-yyyy'))
    .withColumn('Death_date', F.to_date(F.col('Death_date'), 'd-MMMM-yyyy'))
)

Age

In [25]:
df_athletes = df_athletes.withColumn(
    'Age',
    (F.col('Death_year') - F.col('Born_year')).cast('int')
)

Drop columns

In [11]:
df_athletes = df_athletes.drop(
    'Roles', 
    'Full name', 
    'Used name', 
    'Born', 
    'Died', 
    'Measurements', 
    'Affiliations', 
    'Nick/petnames', 
    'Title(s)', 
    'Other names', 
    'Nationality', 
    'Original name', 
    'Name order',
    'Birth_location'
)

df_athletes

Sex,NOC,athlete_id,Name,Height_cm,Weight_kg,Born_year,Death_year,Born_date,Death_date,City,Region,Country,Age
Male,France,1,Jean-François Bla...,,,1886,1960.0,,,Bordeaux,Gironde,FRA,74.0
Male,France,2,Arnaud Boetsch,183.0,76.0,1969,,,,Meulan,Yvelines,FRA,
Male,France,3,Jean Borotra,183.0,76.0,1898,1994.0,,,Biarritz,,FRA,96.0
Male,France,4,Jacques Brugnon,168.0,64.0,1895,1978.0,,,Paris VIIIe,Paris,FRA,83.0
Male,France,5,Albert Canet,,,1878,1930.0,,,Wandsworth,England,GBR,52.0


Will use athlete_id ot merge. Check if all values are integers

In [31]:
# Filter all non integer values
df_malformed_id = df_athletes.filter(F.col('athlete_id').cast('int').isNull())

df_athletes = df_athletes.filter(F.col('athlete_id').cast('int').isNotNull())
# Check
df_malformed_id

Roles,Sex,Full name,Used name,Born,Died,NOC,athlete_id,Measurements,Affiliations,Nick/petnames,Title(s),Other names,Nationality,Original name,Name order,Name,Height_cm,Weight_kg,Born_year,Death_year,Born_date,Death_date,Birth_location,City,Region,Country,Age
Competed in Olymp...,Male,"""James Peter """"Ji...","III""",Jimmy•Montgomery,26 December 1934 ...,13 February 2015 ...,Canada,1215,,,,,,,,"III""",,,,1934.0,,,,,,,
Competed in Olymp...,Male,"""Johannes Jacobus...","Jr.""","Jo•van Gastel, Jr.",5 January 1887 in...,5 March 1969 in T...,Netherlands,2031,,"Nooit Volleerd, T...",,,,,,"Jr.""",,,,1887.0,,,,,,,
Competed in Olymp...,Male,"""Jack Leonard """"J...","Jr.""",Jay•Barrs,17 July 1962 in J...,,United States,2223,182 cm / 70 kg,,,,,,,"Jr.""",,,,1962.0,,,,,,,
Competed in Olymp...,Female,"""Christine Marie ...",-Mill,"-Norman)""",Chris•Evert,21 December 1954 ...,,United States,2725,167 cm / 57 kg,,The Ice Maiden,,,,-Mill,,,,,,,,,,,
Competed in Olymp...,Female,"""Pamela Howard """"...","-Lazenby)""",Pam•Shriver,4 July 1962 in Ba...,,United States,2740,183 cm / 70 kg,,,,,,,"-Lazenby)""",,,,1962.0,,,,,,,


## Uplaod to hadoop
- Save the malformed IDs to analyze later

In [32]:
df_athletes.write.mode('overwrite').parquet("hdfs:///data/clean/athletes")
df_malformed_id.write.mode('overwrite').parquet("hdfs:///data/quarantine")


In [33]:
df_athletes_clean = spark.read.parquet("hdfs:///data/clean/athletes")
df_athletes_clean

Roles,Sex,Full name,Used name,Born,Died,NOC,athlete_id,Measurements,Affiliations,Nick/petnames,Title(s),Other names,Nationality,Original name,Name order,Name,Height_cm,Weight_kg,Born_year,Death_year,Born_date,Death_date,Birth_location,City,Region,Country,Age
Competed in Olymp...,Male,Bernhard•Aschauer,Bernhard•Aschauer,28 May 1945 in Kö...,,West Germany,83715,173 cm / 75 kg,"WSV Königssee, Sc...",,,,,,,Bernhard Aschauer,173.0,75.0,1945,,,,"Königssee, Schöna...",,Bayern,GER,
Competed in Olymp...,Male,Franz•Aschenwald,Franz•Aschenwald,11 January 1913 i...,31 January 1945 i...,Austria,83716,,"Innsbrucker SV, I...",,,,,,,Franz Aschenwald,,,1913,1945.0,,,"Mayrhofen, Tirol ...",Mayrhofen,Tirol,AUT,32.0
Competed in Olymp...,Male,Hansjörg•Aschenwald,Hansjörg•Aschenwald,28 June 1965 in S...,,Austria,83717,,Sportclub Mayrhofen,,,,,,,Hansjörg Aschenwald,,,1965,,,,"Schwaz, Tirol (AUT)",Schwaz,Tirol,AUT,
Competed in Olymp...,Male,Wilhelm•Aschwanden,Wilhelm•Aschwanden,18 December 1969 ...,,Switzerland,83718,181 cm / 82 kg,"SC Marbach, Marba...",,,,,,,Wilhelm Aschwanden,181.0,82.0,1969,,,,Langnau im Emment...,Langnau im Emmental,Bern,SUI,
Competed in Olymp...,Male,Jónas Þórarinn•Ás...,Jónas•Ásgeirsson,25 August 1920 in...,14 June 1996 in K...,Iceland,83719,,Skíðafélag Sigluf...,,,,,,,Jónas Ásgeirsson,,,1920,1996.0,,,"Húsavík, Norðurla...",,,ISL,76.0


## Results

In [41]:
df_results = results

df_results = (
    df_results
    .withColumn('Position', F.trim(F.regexp_replace('Pos', '=', ' ')).cast('int'))
    .withColumn('Games_year', F.regexp_extract('Games', r'(\d{4})', 1).cast('int'))
    .withColumn('Season', F.nullif(F.regexp_extract('Games', r'\b(Summer|Winter|Fall|Spring)\b', 1), F.lit('')))
    .withColumn('Gender', F.nullif(F.regexp_extract('Event', r'\b(Men|Women)\b', 1), F.lit("")))
    .withColumn('Discipline_clean', F.regexp_replace('Discipline', r'\s(\(.*\))', ' '))
    .withColumn('Name', F.regexp_replace('As', '-', ' '))
    .withColumn('Event_clean', F.regexp_extract('Event', r'(.*), ', 1))
)

cols = ['Season', 'Discipline_clean', 'Event_clean']

for c in cols:
    df_results = df_results.withColumn(
        c,
        F.when(F.col(c) == "", None).otherwise(F.col(c))
    ) 

In [42]:
df_results = df_results.withColumn('Medal', F.lower(F.trim(F.col('Medal'))))

df_results = df_results.withColumn(
    'Points',
    F.when(F.col('Medal') == 'gold', 3)
     .when(F.col('Medal') == 'silver', 2)
     .when(F.col('Medal') == 'bronze', 1)
     .otherwise(0).cast('bigint') 
).withColumn(
    'Preformance_result',
    F.when(F.col('Points') > 0, 'Medalist').otherwise('non-medalist')
)

df_results

Games,Event,Team,Pos,Medal,As,athlete_id,NOC,Discipline,Nationality,Unnamed: 7,Position,Games_year,Season,Gender,Discipline_clean,Name,Event_clean,Points,Preformance_result
1912 Summer Olympics,"Singles, Men (Oly...",,=17,,Jean-François Bla...,1,FRA,Tennis,,,17.0,1912,Summer,Men,Tennis,Jean François Bla...,Singles,0,non-medalist
1912 Summer Olympics,"Doubles, Men (Oly...",Jean Montariol,DNS,,Jean-François Bla...,1,FRA,Tennis,,,,1912,Summer,Men,Tennis,Jean François Bla...,Doubles,0,non-medalist
1920 Summer Olympics,"Singles, Men (Oly...",,=32,,Jean-François Bla...,1,FRA,Tennis,,,32.0,1920,Summer,Men,Tennis,Jean François Bla...,Singles,0,non-medalist
1920 Summer Olympics,"Doubles, Mixed (O...",Jeanne Vaussard,=8,,Jean-François Bla...,1,FRA,Tennis,,,8.0,1920,Summer,,Tennis,Jean François Bla...,Doubles,0,non-medalist
1920 Summer Olympics,"Doubles, Men (Oly...",Jacques Brugnon,4,,Jean-François Bla...,1,FRA,Tennis,,,4.0,1920,Summer,Men,Tennis,Jean François Bla...,Doubles,0,non-medalist


In [43]:
df_results = df_results.select(
    'athlete_id', 'Name', 'Gender', 'Discipline_clean', 'Event_clean', 'Medal', 'Points', 'Preformance_Result', 'Position', 'Games_Year', 'Season'
)

In [44]:
df_results.write.mode('overwrite').parquet("hdfs:///data/clean/results")

In [45]:
df_results_clean = spark.read.parquet("hdfs:///data/clean/results")

In [46]:
df_results_clean

athlete_id,Name,Gender,Discipline_clean,Event_clean,Medal,Points,Preformance_Result,Position,Games_Year,Season
39966,Jan Beneš,Men,Rowing,Eights,,0,non-medalist,12.0,1992,Summer
39967,Jindřich Blažek,Men,Rowing,Coxless Fours,,0,non-medalist,4.0,1960,Summer
39968,Petr Blecha,Men,Rowing,Eights,,0,non-medalist,12.0,1992,Summer
39969,Karel Brandstätter,Men,Rowing,Eights,,0,non-medalist,,1936,Summer
39970,Ferdinand Brožek,Men,Rowing,Eights,,0,non-medalist,,1920,Summer


## Merge

In [47]:
columns = ['athlete_id','height_cm', 'weight_kg', 'Born_year', 'Death_year', 'Country']

df_merge = df_results_clean.join(df_athletes_clean.select(columns), on='athlete_id', how='left')

In [49]:
from pyspark.ml.feature import Bucketizer

df_merge = df_merge.withColumn(
    'Age',
    (F.col('Death_year') - F.col('Born_year')).cast('int')
)

splits = [-float("inf"), 13, 20, 30, 40, 50, 60, 70, 80, float("inf")]
labels = {0.0: "11-12", 
          1.0: "13-19", 
          2.0: "20-29", 
          3.0: "30-39", 
          4.0: "40-49", 
          5.0: "50-59", 
          6.0: "60-69", 
          7.0: "70-79", 
          8.0: "80+"
          }

# Apply bucketizer
bucketizer = Bucketizer(splits=splits, inputCol="Age", outputCol="Age_idx", handleInvalid="keep")
df_merge = bucketizer.transform(df_merge)

map = F.create_map([F.lit(x) for i in labels.items() for x in i])

df_merge = df_merge.withColumn("Age_group", map[F.col("Age_idx")])

# Set null to unknown
df_merge = df_merge.withColumn(
    "Age_group", 
    F.when(F.col("Age").isNull(), "Unknown").otherwise(F.col("Age_group"))
)

Calculate BMI

In [51]:
df_merge = df_merge.withColumn(
    'BMI',
    F.round((F.col('Weight_kg') / (F.col('Height_cm') / 100)*2).cast('double'), 2)
)

Olympic Games Year Total Points

In [59]:
df_year_points = df_merge.groupBy('Games_year', 'Age_group').agg(
    F.sum('Points').cast('int').alias('Total_points')
)

In [None]:
df_year_points.write.mode('overwrite').parquet("hdfs:///data/clean/total_year_points")
df_year_points

Games_year,Age_group,Total_points
1988,30-39,29
2004,60-69,3
1976,70-79,143
2016,Unknown,4134
1906,60-69,147


Find podium appearances percentage of the age groups and their disicipline
- Because the age group 20-29 has more participants they will always have more points so here calculate podium appearance percentage

In [61]:
df_podium_appearance_age = df_merge.groupBy('Games_year', 'Age_group', 'Discipline_clean').agg(
    F.count('athlete_id').alias('Total_athletes'),
    F.count('Medal').alias('Podium_appearance')
)

df_podium_appearance_age = df_podium_appearance_age.withColumn(
    '%',
    F.round((F.col('Podium_appearance') / F.col('Total_athletes'))*100, 2)
)
df_podium_appearance_age

Games_year,Age_group,Discipline_clean,Total_athletes,Podium_appearance,%
1988,Unknown,Rowing,594,157,26.43
2004,40-49,Shooting,2,0,0.0
1992,50-59,Volleyball,4,1,25.0
1964,40-49,Water Polo,2,1,50.0
1984,40-49,Weightlifting,3,1,33.33


In [62]:
df_podium_appearance_age.write.mode('overwrite').parquet("hdfs:///data/clean/podium_appearance_age_%")

The mean & std of medalist vs non-medalist in their discipline

In [63]:
df_physical_preformance = df_merge.groupBy('Games_year', 'Discipline_clean', 'Preformance_result').agg(
    F.round(F.mean('Height_cm'), 2).alias('Height_mean'),
    F.round(F.stddev('Height_cm'), 2).alias('Height_std'),
    F.round(F.mean('Weight_kg'), 2).alias('Weight_mean'),
    F.round(F.stddev('Weight_kg'), 2).alias('Weight_std')
)

df_physical_preformance

Games_year,Discipline_clean,Preformance_result,Height_mean,Height_std,Weight_mean,Weight_std
1948,Diving,non-medalist,169.33,5.94,61.0,9.81
1920,Weightlifting,Medalist,167.67,2.08,85.75,26.9
1996,Sailing,Medalist,179.15,8.9,78.21,16.76
1992,Luge,non-medalist,177.88,7.34,79.3,10.27
2002,Figure Skating,Medalist,168.28,9.26,59.94,12.59


In [64]:
df_physical_preformance.write.mode('overwrite').parquet("hdfs:///data/clean/physical_preformance_athlete")