In [122]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder \
    .appName("OlympicsExtraction") \
    .master("local[*]") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000") \
    .config("spark.hadoop.dfs.client.use.datanode.hostname", "true") \
    .config("spark.sql.repl.eagerEval.enabled", True) \
    .config("spark.sql.repl.eagerEval.maxNumRows", 5) \
    .getOrCreate()

athletes = spark.read.csv("/data/raw/athletes.csv", header=True, inferSchema=True)
results = spark.read.csv("/data/raw/results.csv", header=True, inferSchema=True)

athletes


                                                                                

Roles,Sex,Full name,Used name,Born,Died,NOC,athlete_id,Measurements,Affiliations,Nick/petnames,Title(s),Other names,Nationality,Original name,Name order
Competed in Olymp...,Male,"""François Joseph ...",Jean-François•Bla...,12 December 1886 ...,2 October 1960 in...,France,1,,,,,,,,
Competed in Olymp...,Male,Arnaud Benjamin•B...,Arnaud•Boetsch,1 April 1969 in M...,,France,2,183 cm / 76 kg,Racing Club de Fr...,,,,,,
Competed in Olymp...,Male,Jean Laurent Robe...,Jean•Borotra,13 August 1898 in...,17 July 1994 in A...,France,3,183 cm / 76 kg,"TCP, Paris (FRA)",Le Basque Bondiss...,,,,,
Competed in Olymp...,Male,Jacques Marie Sta...,Jacques•Brugnon,11 May 1895 in Pa...,20 March 1978 in ...,France,4,168 cm / 64 kg,Sporting club de ...,Toto,,,,,
Competed in Olymp...,Male,Henry Albert•Canet,Albert•Canet,17 April 1878 in ...,25 July 1930 in P...,France,5,,"TCP, Paris (FRA)",,,,,,


## Now use pyspark

- Name, height, weight

In [127]:
df_athletes = athletes

df_athletes = (
    df_athletes
    .withColumn('Name', F.regexp_replace('Used name', '•', ' '))
    .withColumn('Height_cm', F.regexp_extract('Measurements', r'(\d+)\scm', 1).try_cast('int'))
    .withColumn('Weight_kg', F.regexp_extract('Measurements', r'(\d+)\skg', 1).try_cast('int'))
    
    # Extracting years as integers
    .withColumn('Born_year', F.regexp_extract('Born', r'(\d{4})', 1).try_cast('int'))
    .withColumn('Death_year', F.regexp_extract('Died', r'(\d{4})', 1).try_cast('int'))
    
    # Fixed date pattern for "12 July 1995"
    .withColumn('Born_date', F.regexp_extract('Born', r'(\d+\s\w+\s\d{4})', 1))
    .withColumn('Death_date', F.regexp_extract('Died', r'(\d+\s\w+\s\d{4})', 1))
    # Fixed typo: Birth_location
    .withColumn('Birth_location', F.regexp_extract('Born', r'in\s(.*)', 1))
    
    # Splitting location into City, Region, and Country
    .withColumn('City', F.regexp_extract('Birth_location', r'^([\w\s]+),', 1))
    .withColumn('Region', F.regexp_extract('Birth_location', r',\s([\w\s]+)\s\(', 1))
    .withColumn('Country', F.regexp_extract('Birth_location', r'\((\w+)\)', 1))
)
df_athletes

Roles,Sex,Full name,Used name,Born,Died,NOC,athlete_id,Measurements,Affiliations,Nick/petnames,Title(s),Other names,Nationality,Original name,Name order,Name,Height_cm,Weight_kg,Born_year,Death_year,Born_date,Death_date,Birth_location,City,Region,Country
Competed in Olymp...,Male,"""François Joseph ...",Jean-François•Bla...,12 December 1886 ...,2 October 1960 in...,France,1,,,,,,,,,Jean-François Bla...,,,1886,1960.0,12 December 1886,2 October 1960,"Bordeaux, Gironde...",Bordeaux,Gironde,FRA
Competed in Olymp...,Male,Arnaud Benjamin•B...,Arnaud•Boetsch,1 April 1969 in M...,,France,2,183 cm / 76 kg,Racing Club de Fr...,,,,,,,Arnaud Boetsch,183.0,76.0,1969,,1 April 1969,,"Meulan, Yvelines ...",Meulan,Yvelines,FRA
Competed in Olymp...,Male,Jean Laurent Robe...,Jean•Borotra,13 August 1898 in...,17 July 1994 in A...,France,3,183 cm / 76 kg,"TCP, Paris (FRA)",Le Basque Bondiss...,,,,,,Jean Borotra,183.0,76.0,1898,1994.0,13 August 1898,17 July 1994,"Biarritz, Pyrénée...",Biarritz,,FRA
Competed in Olymp...,Male,Jacques Marie Sta...,Jacques•Brugnon,11 May 1895 in Pa...,20 March 1978 in ...,France,4,168 cm / 64 kg,Sporting club de ...,Toto,,,,,,Jacques Brugnon,168.0,64.0,1895,1978.0,11 May 1895,20 March 1978,"Paris VIIIe, Pari...",Paris VIIIe,Paris,FRA
Competed in Olymp...,Male,Henry Albert•Canet,Albert•Canet,17 April 1878 in ...,25 July 1930 in P...,France,5,,"TCP, Paris (FRA)",,,,,,,Albert Canet,,,1878,1930.0,17 April 1878,25 July 1930,"Wandsworth, Engla...",Wandsworth,England,GBR


Convert to dates to to_date dt

In [76]:
df_athletes = (
    df_athletes
    .withColumn('Born_date', F.try_to_date(F.col('Born_date'), 'd-MMMM-yyyy'))
    .withColumn('Death_date', F.try_to_date(F.col('Death_date'), 'd-MMMM-yyyy'))
)

Age

In [77]:
df_athletes = df_athletes.withColumn(
    'Age',
    F.col('Death_year') - F.col('Born_year')
)
df_athletes

Roles,Sex,Full name,Used name,Born,Died,NOC,athlete_id,Measurements,Affiliations,Nick/petnames,Title(s),Other names,Nationality,Original name,Name order,Name,Height_cm,Weight_kg,Born_year,Death_year,Born_date,Death_date,Birth_location,City,Region,Country,Age
Competed in Olymp...,Male,"""François Joseph ...",Jean-François•Bla...,12 December 1886 ...,2 October 1960 in...,France,1,,,,,,,,,Jean-François Bla...,,,1886,1960.0,,,"Bordeaux, Gironde...",Bordeaux,Gironde,FRA,74.0
Competed in Olymp...,Male,Arnaud Benjamin•B...,Arnaud•Boetsch,1 April 1969 in M...,,France,2,183 cm / 76 kg,Racing Club de Fr...,,,,,,,Arnaud Boetsch,183.0,76.0,1969,,,,"Meulan, Yvelines ...",Meulan,Yvelines,FRA,
Competed in Olymp...,Male,Jean Laurent Robe...,Jean•Borotra,13 August 1898 in...,17 July 1994 in A...,France,3,183 cm / 76 kg,"TCP, Paris (FRA)",Le Basque Bondiss...,,,,,,Jean Borotra,183.0,76.0,1898,1994.0,,,"Biarritz, Pyrénée...",Biarritz,,FRA,96.0
Competed in Olymp...,Male,Jacques Marie Sta...,Jacques•Brugnon,11 May 1895 in Pa...,20 March 1978 in ...,France,4,168 cm / 64 kg,Sporting club de ...,Toto,,,,,,Jacques Brugnon,168.0,64.0,1895,1978.0,,,"Paris VIIIe, Pari...",Paris VIIIe,Paris,FRA,83.0
Competed in Olymp...,Male,Henry Albert•Canet,Albert•Canet,17 April 1878 in ...,25 July 1930 in P...,France,5,,"TCP, Paris (FRA)",,,,,,,Albert Canet,,,1878,1930.0,,,"Wandsworth, Engla...",Wandsworth,England,GBR,52.0


Drop columns

In [78]:
df_athletes = df_athletes.drop(
    'Roles', 
    'Full name', 
    'Used name', 
    'Born', 
    'Died', 
    'Measurements', 
    'Affiliations', 
    'Nick/petnames', 
    'Title(s)', 
    'Other names', 
    'Nationality', 
    'Original name', 
    'Name order'
)

df_athletes

Sex,NOC,athlete_id,Name,Height_cm,Weight_kg,Born_year,Death_year,Born_date,Death_date,Birth_location,City,Region,Country,Age
Male,France,1,Jean-François Bla...,,,1886,1960.0,,,"Bordeaux, Gironde...",Bordeaux,Gironde,FRA,74.0
Male,France,2,Arnaud Boetsch,183.0,76.0,1969,,,,"Meulan, Yvelines ...",Meulan,Yvelines,FRA,
Male,France,3,Jean Borotra,183.0,76.0,1898,1994.0,,,"Biarritz, Pyrénée...",Biarritz,,FRA,96.0
Male,France,4,Jacques Brugnon,168.0,64.0,1895,1978.0,,,"Paris VIIIe, Pari...",Paris VIIIe,Paris,FRA,83.0
Male,France,5,Albert Canet,,,1878,1930.0,,,"Wandsworth, Engla...",Wandsworth,England,GBR,52.0


Uplaod to hadoop

In [80]:
df_athletes.write.mode('overwrite').parquet("hdfs:/data/clean/athletes")

df_cleaned = spark.read.parquet("hdfs:///data/clean/athletes")
df_cleaned.show(5)

                                                                                

+------+----+----------+-----------------+---------+---------+---------+----------+---------+----------+--------------------+----+-------------------+-------+----+
|   Sex| NOC|athlete_id|             Name|Height_cm|Weight_kg|Born_year|Death_year|Born_date|Death_date|      Birth_location|City|             Region|Country| Age|
+------+----+----------+-----------------+---------+---------+---------+----------+---------+----------+--------------------+----+-------------------+-------+----+
|Female|Cuba|    109163|   Nancy Carrillo|      190|       74|     1986|      NULL|     NULL|      NULL|La Habana (Havana...|    |Ciudad de La Habana| Havana|NULL|
|Female|Cuba|    109164|Maybelis Martínez|      178|       78|     1977|      NULL|     NULL|      NULL|                    |    |                   |       |NULL|
|Female|Cuba|    109165|       Liana Mesa|      182|       73|     1977|      NULL|     NULL|      NULL|Camagüey, Camagüe...|    |                   |    CUB|NULL|
|Female|Cuba|   

## Results

In [128]:
df_results = results

df_results = (
    df_results
    .withColumn('Position', F.trim(F.regexp_replace('Pos', '=', ' ')).try_cast('int'))
    .withColumn('Games_year', F.regexp_extract('Games', r'(\d{4})', 1).try_cast('int'))
    .withColumn('Season', F.nullif(F.regexp_extract('Games', r'\b(Summer|Winter|Fall|Spring)\b', 1), F.lit('')))
    .withColumn('Gender', F.nullif(F.regexp_extract('Event', r'\b(Men|Women)\b', 1), F.lit("")))
    .withColumn('Discipline_clean', F.regexp_replace('Discipline', r'\s\(.*\)', 1))
    .withColumn('Name', F.regexp_replace('As', '-', ' '))
    .withColumn('Event_clean', F.regexp_extract('Event', r'(.*), ', 1))
    
)

In [129]:
df_results = df_results.withColumn('Medal', F.lower(F.trim(F.col('Medal'))))

df_results = df_results.withColumn(
    'Points',
    F.when(F.col('Medal') == 'gold', 3)
     .when(F.col('Medal') == 'silver', 2)
     .when(F.col('Medal') == 'bronze', 1)
     .otherwise(0).cast('bigint') 
).withColumn(
    'Preformance_result',
    F.when(F.col('Points') > 0, 'Medalist').otherwise('non-medalist')
)


In [130]:
df_results = df_results.select(
    'athlete_id', 'Name', 'Gender', 'Discipline_clean', 'Event_clean', 'Medal', 'Points', 'Preformance_Result', 'Position', 'Games_Year', 'Season'
)

In [131]:
df_results.write.mode('overwrite').parquet("hdfs:/data/clean/results")

26/02/19 19:37:39 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

26/02/19 20:46:14 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 2014441 ms exceeds timeout 120000 ms
26/02/19 20:46:14 WARN SparkContext: Killing executors is not supported by current scheduler.
26/02/19 20:46:16 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:359)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:132)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$

## Merge

In [None]:
columns = ['athlete_id','height_cm', 'weight_kg', 'Born_year', 'Death_year', 'Country']