In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder \
    .appName("OlympicsExtraction") \
    .master("local[4]") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000") \
    .config("spark.driver.memory", "2g") \
    .config("spark.network.timeout", "800s") \
    .config("spark.executor.heartbeatInterval", "100s") \
    .config("spark.sql.repl.eagerEval.enabled", True) \
    .config("spark.sql.repl.eagerEval.maxNumRows", 5) \
    .getOrCreate()

athletes = spark.read.csv("/data/raw/athletes.csv", header=True, inferSchema=True)
results = spark.read.csv("/data/raw/results.csv", header=True, inferSchema=True)

athletes


26/02/21 23:31:42 WARN FileSystem: Failed to initialize filesystem hdfs://namenode:9000: java.lang.IllegalArgumentException: java.net.UnknownHostException: namenode
26/02/21 23:31:42 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: /data/raw/athletes.csv.
java.lang.IllegalArgumentException: java.net.UnknownHostException: namenode
	at org.apache.hadoop.security.SecurityUtil.buildTokenService(SecurityUtil.java:479)
	at org.apache.hadoop.hdfs.NameNodeProxiesClient.createProxyWithClientProtocol(NameNodeProxiesClient.java:134)
	at org.apache.hadoop.hdfs.DFSClient.<init>(DFSClient.java:370)
	at org.apache.hadoop.hdfs.DFSClient.<init>(DFSClient.java:309)
	at org.apache.hadoop.hdfs.DistributedFileSystem.initDFSClient(DistributedFileSystem.java:205)
	at org.apache.hadoop.hdfs.DistributedFileSystem.initialize(DistributedFileSystem.java:190)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3615)
	at org.apache.hadoop.fs

IllegalArgumentException: java.net.UnknownHostException: namenode

## Now use pyspark

- Name, height, weight

In [111]:
df_athletes = athletes

df_athletes = (
    df_athletes
    .withColumn('Name', F.regexp_replace('Used name', '•', ' '))
    .withColumn('Height_cm', F.regexp_extract('Measurements', r'(\d+)\scm', 1).try_cast('int'))
    .withColumn('Weight_kg', F.regexp_extract('Measurements', r'(\d+)\skg', 1).try_cast('int'))
    
    # Extracting years as integers
    .withColumn('Born_year', F.regexp_extract('Born', r'(\d{4})', 1).try_cast('int'))
    .withColumn('Death_year', F.regexp_extract('Died', r'(\d{4})', 1).try_cast('int'))
    
    # Fixed date pattern for "12 July 1995"
    .withColumn('Born_date', F.regexp_extract('Born', r'(\d+\s\w+\s\d{4})', 1))
    .withColumn('Death_date', F.regexp_extract('Died', r'(\d+\s\w+\s\d{4})', 1))
    # Fixed typo: Birth_location
    .withColumn('Birth_location', F.regexp_extract('Born', r'in\s(.*)', 1))
)

location_map = {
    'City': r'^([\w\s]+),',
    'Region': r',\s([\w\s]+)\s\(',
    'Country': r'\((\w+)\)',
}

for col, pattern in location_map.items():
    df_athletes = df_athletes.withColumn(
        col, F.nullif(F.regexp_extract('Birth_location', pattern, 1), F.lit(""))
    )
    
df_athletes

Roles,Sex,Full name,Used name,Born,Died,NOC,athlete_id,Measurements,Affiliations,Nick/petnames,Title(s),Other names,Nationality,Original name,Name order,Name,Height_cm,Weight_kg,Born_year,Death_year,Born_date,Death_date,Birth_location,City,Region,Country
Competed in Olymp...,Male,"""François Joseph ...",Jean-François•Bla...,12 December 1886 ...,2 October 1960 in...,France,1,,,,,,,,,Jean-François Bla...,,,1886,1960.0,12 December 1886,2 October 1960,"Bordeaux, Gironde...",Bordeaux,Gironde,FRA
Competed in Olymp...,Male,Arnaud Benjamin•B...,Arnaud•Boetsch,1 April 1969 in M...,,France,2,183 cm / 76 kg,Racing Club de Fr...,,,,,,,Arnaud Boetsch,183.0,76.0,1969,,1 April 1969,,"Meulan, Yvelines ...",Meulan,Yvelines,FRA
Competed in Olymp...,Male,Jean Laurent Robe...,Jean•Borotra,13 August 1898 in...,17 July 1994 in A...,France,3,183 cm / 76 kg,"TCP, Paris (FRA)",Le Basque Bondiss...,,,,,,Jean Borotra,183.0,76.0,1898,1994.0,13 August 1898,17 July 1994,"Biarritz, Pyrénée...",Biarritz,,FRA
Competed in Olymp...,Male,Jacques Marie Sta...,Jacques•Brugnon,11 May 1895 in Pa...,20 March 1978 in ...,France,4,168 cm / 64 kg,Sporting club de ...,Toto,,,,,,Jacques Brugnon,168.0,64.0,1895,1978.0,11 May 1895,20 March 1978,"Paris VIIIe, Pari...",Paris VIIIe,Paris,FRA
Competed in Olymp...,Male,Henry Albert•Canet,Albert•Canet,17 April 1878 in ...,25 July 1930 in P...,France,5,,"TCP, Paris (FRA)",,,,,,,Albert Canet,,,1878,1930.0,17 April 1878,25 July 1930,"Wandsworth, Engla...",Wandsworth,England,GBR


Convert to dates to to_date dt

In [112]:
df_athletes = (
    df_athletes
    .withColumn('Born_date', F.try_to_date(F.col('Born_date'), 'd-MMMM-yyyy'))
    .withColumn('Death_date', F.try_to_date(F.col('Death_date'), 'd-MMMM-yyyy'))
)

Age

In [113]:
df_athletes = df_athletes.withColumn(
    'Age',
    F.col('Death_year') - F.col('Born_year')
)

Drop columns

In [114]:
df_athletes = df_athletes.drop(
    'Roles', 
    'Full name', 
    'Used name', 
    'Born', 
    'Died', 
    'Measurements', 
    'Affiliations', 
    'Nick/petnames', 
    'Title(s)', 
    'Other names', 
    'Nationality', 
    'Original name', 
    'Name order',
    'Birth_location'
)

df_athletes

Sex,NOC,athlete_id,Name,Height_cm,Weight_kg,Born_year,Death_year,Born_date,Death_date,City,Region,Country,Age
Male,France,1,Jean-François Bla...,,,1886,1960.0,,,Bordeaux,Gironde,FRA,74.0
Male,France,2,Arnaud Boetsch,183.0,76.0,1969,,,,Meulan,Yvelines,FRA,
Male,France,3,Jean Borotra,183.0,76.0,1898,1994.0,,,Biarritz,,FRA,96.0
Male,France,4,Jacques Brugnon,168.0,64.0,1895,1978.0,,,Paris VIIIe,Paris,FRA,83.0
Male,France,5,Albert Canet,,,1878,1930.0,,,Wandsworth,England,GBR,52.0


Will use athlete_id ot merge. Check if all values are integers

In [115]:
# Filter all non integer values
df_malformed_id = df_athletes.filter(F.col('athlete_id').try_cast('int').isNull())

df_athletes = df_athletes.filter(F.col('athlete_id').try_cast('int').isNotNull())
# Check
df_malformed_id

Sex,NOC,athlete_id,Name,Height_cm,Weight_kg,Born_year,Death_year,Born_date,Death_date,City,Region,Country,Age
Male,13 February 2015 ...,Canada,"III""",,,,1934.0,,,,,,
Male,5 March 1969 in T...,Netherlands,"Jr.""",,,,1887.0,,,,,,
Male,,United States,"Jr.""",,,,1962.0,,,,,,
Female,21 December 1954 ...,,-Mill,,,,,,,,,,
Female,,United States,"-Lazenby)""",,,,1962.0,,,,,,


## Uplaod to hadoop
- Save the malformed IDs to analyze later

In [116]:
df_athletes.write.mode('overwrite').parquet("hdfs:///data/clean/athletes")
df_malformed_id.write.mode('overwrite').parquet("hdfs:///data/quarantine")


                                                                                

In [117]:
df_athletes_clean = spark.read.parquet("hdfs:///data/clean/athletes")
df_athletes_clean

Sex,NOC,athlete_id,Name,Height_cm,Weight_kg,Born_year,Death_year,Born_date,Death_date,City,Region,Country,Age
Male,Switzerland,87293,Henry Höhnes,,,1889,,,,,,,
Male,Czechoslovakia,87294,Rudolf Höhnl,158.0,71.0,1946,,,,Lomazice,,CZE,
Male,Austria,87295,Gregor Höll,165.0,,1911,1999.0,,,,Salzburg,AUT,88.0
Male,Austria,87296,Rudolf Höll,,,1911,1984.0,,,Kraubath an der Mur,Steiermark,AUT,73.0
Male,West Germany,87297,Stefan Hölzlwimmer,174.0,86.0,1951,,,,Salzberg,Bayern,GER,


## Results

In [118]:
df_results = results

df_results = (
    df_results
    .withColumn('Position', F.trim(F.regexp_replace('Pos', '=', ' ')).try_cast('int'))
    .withColumn('Games_year', F.regexp_extract('Games', r'(\d{4})', 1).try_cast('int'))
    .withColumn('Season', F.nullif(F.regexp_extract('Games', r'\b(Summer|Winter|Fall|Spring)\b', 1), F.lit('')))
    .withColumn('Gender', F.nullif(F.regexp_extract('Event', r'\b(Men|Women)\b', 1), F.lit("")))
    .withColumn('Discipline_clean', F.regexp_replace('Discipline', r'\s\(.*\)', 1))
    .withColumn('Name', F.regexp_replace('As', '-', ' '))
    .withColumn('Event_clean', F.regexp_extract('Event', r'(.*), ', 1))
)

cols = ['Season', 'Discipline_clean', 'Event_clean']

for c in cols:
    df_results = df_results.withColumn(
        c,
        F.when(F.col(c) == "", None).otherwise(F.col(c))
    ) 

In [119]:
df_results = df_results.withColumn('Medal', F.lower(F.trim(F.col('Medal'))))

df_results = df_results.withColumn(
    'Points',
    F.when(F.col('Medal') == 'gold', 3)
     .when(F.col('Medal') == 'silver', 2)
     .when(F.col('Medal') == 'bronze', 1)
     .otherwise(0).cast('bigint') 
).withColumn(
    'Preformance_result',
    F.when(F.col('Points') > 0, 'Medalist').otherwise('non-medalist')
)

df_results

Games,Event,Team,Pos,Medal,As,athlete_id,NOC,Discipline,Nationality,Unnamed: 7,Position,Games_year,Season,Gender,Discipline_clean,Name,Event_clean,Points,Preformance_result
1912 Summer Olympics,"Singles, Men (Oly...",,=17,,Jean-François Bla...,1,FRA,Tennis,,,17.0,1912,Summer,Men,Tennis,Jean François Bla...,Singles,0,non-medalist
1912 Summer Olympics,"Doubles, Men (Oly...",Jean Montariol,DNS,,Jean-François Bla...,1,FRA,Tennis,,,,1912,Summer,Men,Tennis,Jean François Bla...,Doubles,0,non-medalist
1920 Summer Olympics,"Singles, Men (Oly...",,=32,,Jean-François Bla...,1,FRA,Tennis,,,32.0,1920,Summer,Men,Tennis,Jean François Bla...,Singles,0,non-medalist
1920 Summer Olympics,"Doubles, Mixed (O...",Jeanne Vaussard,=8,,Jean-François Bla...,1,FRA,Tennis,,,8.0,1920,Summer,,Tennis,Jean François Bla...,Doubles,0,non-medalist
1920 Summer Olympics,"Doubles, Men (Oly...",Jacques Brugnon,4,,Jean-François Bla...,1,FRA,Tennis,,,4.0,1920,Summer,Men,Tennis,Jean François Bla...,Doubles,0,non-medalist


In [120]:
df_results = df_results.select(
    'athlete_id', 'Name', 'Gender', 'Discipline_clean', 'Event_clean', 'Medal', 'Points', 'Preformance_Result', 'Position', 'Games_Year', 'Season'
)

In [121]:
df_results.write.mode('overwrite').parquet("hdfs:///data/clean/results")

26/02/21 18:30:09 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

In [122]:
df_results_clean = spark.read.parquet("hdfs:///data/clean/results")

In [123]:
df_results_clean

athlete_id,Name,Gender,Discipline_clean,Event_clean,Medal,Points,Preformance_Result,Position,Games_Year,Season
120662,BJ Lawrence,Men,Athletics,100 metres,,0,non-medalist,,2016,Summer
120663,Robert Lindstedt,,Tennis,Doubles,,0,non-medalist,9.0,2012,Summer
120663,Robert Lindstedt,Men,Tennis,Doubles,,0,non-medalist,9.0,2012,Summer
120664,Mariya Baklakova,Women,Swimming1,4 × 200 metres Fr...,,0,non-medalist,,2012,Summer
120665,Mariya Gromova,Women,Swimming1,4 × 100 metres Me...,,0,non-medalist,,2012,Summer


## Merge

In [124]:
columns = ['athlete_id','height_cm', 'weight_kg', 'Born_year', 'Death_year', 'Country']

df_merge = df_results_clean.join(df_athletes_clean.select(columns), on='athlete_id', how='left')

In [125]:
from pyspark.ml.feature import Bucketizer

df_merge = df_merge.withColumn(
    'Age',
    (F.col('Death_year') - F.col('Born_year')).try_cast('int')
)

splits = [-float("inf"), 13, 20, 30, 40, 50, 60, 70, 80, float("inf")]
labels = {0.0: "11-12", 
          1.0: "13-19", 
          2.0: "20-29", 
          3.0: "30-39", 
          4.0: "40-49", 
          5.0: "50-59", 
          6.0: "60-69", 
          7.0: "70-79", 
          8.0: "80+"
          }

# Apply bucketizer
bucketizer = Bucketizer(splits=splits, inputCol="Age", outputCol="Age_idx", handleInvalid="keep")
df_merge = bucketizer.transform(df_merge)

map = F.create_map([F.lit(x) for i in labels.items() for x in i])

df_merge = df_merge.withColumn("Age_group", map[F.col("Age_idx")])

# Set null to unknown
df_merge = df_merge.withColumn(
    "Age_group", 
    F.when(F.col("Age").isNull(), "Unknown").otherwise(F.col("Age_group"))
)

Calculate BMI

In [136]:
df_merge = df_merge.withColumn(
    'BMI',
    F.round((F.col('Weight_kg') / (F.col('Height_cm') / 100)*2).try_cast('double'), 2)
)

26/02/21 18:54:34 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:359)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:132)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:131)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:707)
	at org.apache.spark.storage.BlockManagerMasterE

Olympic Games Year Total Points

In [129]:
df_year_points = df_merge.groupBy('Games_year', 'Age_group').agg(
    F.sum('Points').alias('Total_points')
)

In [130]:
df_year_points.write.mode('overwrite').parquet("hdfs:///data/clean/total_year_points")
df_year_points

Games_year,Age_group,Total_points
2016,Unknown,4134
1906,60-69,147
1900,Unknown,220
1924,30-39,31
2010,20-29,3


Find podium appearances percentage of the age groups and their disicipline
- Because the age group 20-29 has more participants they will always have more points so here calculate podium appearance percentage

In [135]:
df_podium_appearance_age = df_merge.groupBy('Games_year', 'Age_group', 'Discipline_clean').agg(
    F.count('athlete_id').alias('Total_athletes'),
    F.count('Medal').alias('Podium_appearance')
)

df_podium_appearance_age = df_podium_appearance_age.withColumn(
    '%',
    F.round((F.col('Podium_appearance') / F.col('Total_athletes'))*100, 2)
)
df_podium_appearance_age

                                                                                

Games_year,Age_group,Discipline_clean,Total_athletes,Podium_appearance,%
1988,Unknown,Rowing,594,157,26.43
1996,Unknown,Football1,385,97,25.19
2020,Unknown,Softball1,90,45,50.0
1906,60-69,Diving1,22,3,13.64
2020,Unknown,Wrestling,271,69,25.46


In [144]:
df_podium_appearance_age.write.mode('overwrite').parquet("hdfs:///data/clean/podium_appearance_age_%")

                                                                                

The mean & std of medalist vs non-medalist in their discipline

In [143]:
df_physical_preformance = df_merge.groupBy('Games_year', 'Discipline_clean', 'Preformance_result').agg(
    F.round(F.mean('Height_cm'), 2).alias('Height_mean'),
    F.round(F.stddev('Height_cm'), 2).alias('Height_std'),
    F.round(F.mean('Weight_kg'), 2).alias('Weight_mean'),
    F.round(F.stddev('Weight_kg'), 2).alias('Weight_std')
)

df_physical_preformance

Games_year,Discipline_clean,Preformance_result,Height_mean,Height_std,Weight_mean,Weight_std
2020,Marathon Swimming1,non-medalist,174.17,9.79,65.23,10.77
2018,Short Track Speed...,non-medalist,170.97,7.97,64.55,8.7
2020,Biathlon,non-medalist,170.21,8.93,57.55,8.2
2018,Diving1,non-medalist,174.0,0.0,63.0,0.0
2020,Freestyle Skiing1,non-medalist,174.87,7.21,66.46,8.95


26/02/21 19:06:14 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:359)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:132)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:131)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:707)
	at org.apache.spark.storage.BlockManagerMasterE

In [145]:
df_physical_preformance.write.mode('overwrite').parquet("hdfs:///data/clean/physical_preformance_athlete")

                                                                                

26/02/21 19:12:55 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:359)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:81)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:674)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1324)
	at org.apache.spark.executor.Executor.$anonfun$heartbeater$1(Executor.scala:322)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1941