In [0]:
# Print the built-in help text for the dbutils.secrets utility.
dbutils.secrets.help()

In [0]:
# List the keys available in the 'DB-Scope' secret scope
# (the secrets themselves were created in Azure Key Vault).

dbutils.secrets.list('DB-Scope')

[SecretMetadata(key='Application-Client-ID'),
 SecretMetadata(key='Directory-Tenant-ID'),
 SecretMetadata(key='Object-ID'),
 SecretMetadata(key='secret-key')]

In [0]:
# Unmount the container only when it is currently mounted, so this cell is
# safe to execute under "Run All": an unconditional unmount raises when the
# mount point does not exist.
if any(mount.mountPoint == "/mnt/olympicdata" for mount in dbutils.fs.mounts()):
    dbutils.fs.unmount("/mnt/olympicdata")

/mnt/olympicdata has been unmounted.


True

In [0]:
# Basic connection configuration for Databricks to ADLS.
# Every credential is read from the 'DB-Scope' secret scope, so nothing
# sensitive is hard-coded in the notebook.

Application_Client_ID = dbutils.secrets.get('DB-Scope', 'Application-Client-ID')
Directory_Tenant_ID = dbutils.secrets.get('DB-Scope', 'Directory-Tenant-ID')
# NOTE(review): Object_ID is fetched but not used by the mount configuration
# below; kept so the name stays available to any other cell that reads it.
Object_ID = dbutils.secrets.get('DB-Scope', 'Object-ID')
secret_key = dbutils.secrets.get('DB-Scope', 'secret-key')

# OAuth client-credentials settings handed to the ABFS driver at mount time.
configs = {"fs.azure.account.auth.type": "OAuth",
"fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
"fs.azure.account.oauth2.client.id": Application_Client_ID,
"fs.azure.account.oauth2.client.secret": secret_key,
"fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{Directory_Tenant_ID}/oauth2/token"}

# Mount only when nothing is mounted at the target yet: dbutils.fs.mount
# raises if the mount point is already in use, which would make this cell
# fail whenever the notebook is re-run on a cluster that still has the mount.
if not any(mount.mountPoint == "/mnt/olympicdata" for mount in dbutils.fs.mounts()):
    dbutils.fs.mount(
        source="abfss://olympicdata@tokyoolympicnirmal.dfs.core.windows.net/", # container@storageAccountName
        mount_point="/mnt/olympicdata",
        extra_configs=configs
    )

True

In [0]:
# List the contents of the mounted container root (Python form of `%fs ls`).
display(dbutils.fs.ls("/mnt/olympicdata"))

path,name,size,modificationTime
dbfs:/mnt/olympicdata/raw-data/,raw-data/,0,1732430731000
dbfs:/mnt/olympicdata/transformed-data/,transformed-data/,0,1732430743000


In [0]:
def _load_csv(path, infer_schema=True):
    """Read a CSV file that has a header row into a Spark DataFrame.

    When ``infer_schema`` is false the ``inferSchema`` option is not set at
    all, so the reader falls back to its default behaviour.
    """
    reader = spark.read.format("csv").option("header", "true")
    if infer_schema:
        reader = reader.option("inferSchema", "true")
    return reader.load(path)


athletes = _load_csv("/mnt/olympicdata/raw-data/athletes.csv")
coaches = _load_csv("/mnt/olympicdata/raw-data/coaches.csv")
# entriesGender is read without schema inference; its columns load as strings.
entriesGender = _load_csv("/mnt/olympicdata/raw-data/entriesGender.csv", infer_schema=False)
medals = _load_csv("/mnt/olympicdata/raw-data/medals.csv")
teams = _load_csv("/mnt/olympicdata/raw-data/teams.csv")

In [0]:
# Preview the first 10 rows of the medals table.
medals.limit(10).display()

Rank,TeamCountry,Gold,Silver,Bronze,Total,Rank by Total
1,United States of America,39,41,33,113,1
2,People's Republic of China,38,32,18,88,2
3,Japan,27,14,17,58,5
4,Great Britain,22,21,22,65,4
5,ROC,20,28,23,71,3
6,Australia,17,7,22,46,6
7,Netherlands,10,12,14,36,9
8,France,10,12,11,33,10
9,Germany,10,11,16,37,8
10,Italy,10,10,20,40,7


In [0]:
# Print the column names and types of medals (read with inferSchema enabled).
medals.printSchema()

root
 |-- Rank: integer (nullable = true)
 |-- TeamCountry: string (nullable = true)
 |-- Gold: integer (nullable = true)
 |-- Silver: integer (nullable = true)
 |-- Bronze: integer (nullable = true)
 |-- Total: integer (nullable = true)
 |-- Rank by Total: integer (nullable = true)



In [0]:
# Look up the rank recorded for India: filter to the matching row(s) first,
# then keep only the Rank column.
india_rows = medals.filter(medals["TeamCountry"] == "India")
res = india_rows.select("Rank")
res.show()

+----+
|Rank|
+----+
|  48|
+----+



In [0]:
# Preview the first 10 rows of the athletes table.
athletes.limit(10).display()

PersonName,Country,Discipline
AALERUD Katrine,Norway,Cycling Road
ABAD Nestor,Spain,Artistic Gymnastics
ABAGNALE Giovanni,Italy,Rowing
ABALDE Alberto,Spain,Basketball
ABALDE Tamara,Spain,Basketball
ABALO Luc,France,Handball
ABAROA Cesar,Chile,Rowing
ABASS Abobakr,Sudan,Swimming
ABBASALI Hamideh,Islamic Republic of Iran,Karate
ABBASOV Islam,Azerbaijan,Wrestling


In [0]:
# Show the distinct countries present in athletes (show() prints only the first 20 rows).
athletes.select("Country").distinct().show()

+--------------------+
|             Country|
+--------------------+
|Islamic Republic ...|
|            Malaysia|
|           Singapore|
|             Germany|
|            Maldives|
|               Sudan|
|              France|
|             Belgium|
|               Qatar|
|               Chile|
|               Italy|
|              Norway|
|               Spain|
|United States of ...|
|           Indonesia|
|          Azerbaijan|
|        Saudi Arabia|
|          Uzbekistan|
|           Australia|
|               Egypt|
+--------------------+
only showing top 20 rows



In [0]:
# Preview entriesGender (show() prints only the first 20 rows).
entriesGender.show()

+--------------------+------+----+-----+
|          Discipline|Female|Male|Total|
+--------------------+------+----+-----+
|      3x3 Basketball|    32|  32|   64|
|             Archery|    64|  64|  128|
| Artistic Gymnastics|    98|  98|  196|
|   Artistic Swimming|   105|   0|  105|
|           Athletics|   969|1072| 2041|
|           Badminton|    86|  87|  173|
|   Baseball/Softball|    90| 144|  234|
|          Basketball|   144| 144|  288|
|    Beach Volleyball|    48|  48|   96|
|              Boxing|   102| 187|  289|
|        Canoe Slalom|    41|  41|   82|
|        Canoe Sprint|   123| 126|  249|
|Cycling BMX Frees...|    10|   9|   19|
|  Cycling BMX Racing|    24|  24|   48|
|Cycling Mountain ...|    38|  38|   76|
|        Cycling Road|    70| 131|  201|
|       Cycling Track|    90|  99|  189|
|              Diving|    72|  71|  143|
|          Equestrian|    73| 125|  198|
|             Fencing|   107| 108|  215|
+--------------------+------+----+-----+
only showing top

In [0]:
# Inspect the column types: entriesGender was read without inferSchema,
# so the count columns are reported as strings here.
entriesGender.printSchema()

root
 |-- Discipline: string (nullable = true)
 |-- Female: string (nullable = true)
 |-- Male: string (nullable = true)
 |-- Total: string (nullable = true)



Cast the entriesGender count columns (Female, Male, Total) from string to integer

In [0]:
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

# Convert each count column to an integer column, in the order
# Female, Male, Total, replacing the column in place by name.
for count_column in ("Female", "Male", "Total"):
    entriesGender = entriesGender.withColumn(
        count_column, col(count_column).cast(IntegerType())
    )

entriesGender.printSchema()

root
 |-- Discipline: string (nullable = true)
 |-- Female: integer (nullable = true)
 |-- Male: integer (nullable = true)
 |-- Total: integer (nullable = true)



Find the top countries with the highest number of gold medals

In [0]:
from pyspark.sql.functions import desc

# Keep only the country and its gold count, then rank countries from the
# most gold medals to the fewest.
gold_by_country = medals.select("TeamCountry", "Gold")
top_countries_gold_medals = gold_by_country.sort(desc("Gold"))
top_countries_gold_medals.show()

+--------------------+----+
|         TeamCountry|Gold|
+--------------------+----+
|United States of ...|  39|
|People's Republic...|  38|
|               Japan|  27|
|       Great Britain|  22|
|                 ROC|  20|
|           Australia|  17|
|         Netherlands|  10|
|              France|  10|
|             Germany|  10|
|               Italy|  10|
|              Canada|   7|
|              Brazil|   7|
|         New Zealand|   7|
|                Cuba|   7|
|             Hungary|   6|
|   Republic of Korea|   6|
|              Poland|   4|
|      Czech Republic|   4|
|               Kenya|   4|
|              Norway|   4|
+--------------------+----+
only showing top 20 rows



Calculate each gender's share of the total entries for each discipline

In [0]:
from pyspark.sql.functions import avg, col
# Import Spark's round under an alias: a bare `import round` from
# pyspark.sql.functions rebinds Python's builtin round() for every cell that
# runs afterwards in this notebook.
from pyspark.sql.functions import round as spark_round

# Avg_Male / Avg_Female hold each gender's fraction of the discipline's
# Total entries, rounded to 2 decimal places.
entriesGender_avg = entriesGender.withColumn(
    'Avg_Male',
    spark_round(col("Male") / col("Total"), 2)
).withColumn(
    'Avg_Female',
    spark_round(col("Female") / col("Total"), 2)
)
entriesGender_avg.show()

+--------------------+------+----+-----+--------+----------+
|          Discipline|Female|Male|Total|Avg_Male|Avg_Female|
+--------------------+------+----+-----+--------+----------+
|      3x3 Basketball|    32|  32|   64|     0.5|       0.5|
|             Archery|    64|  64|  128|     0.5|       0.5|
| Artistic Gymnastics|    98|  98|  196|     0.5|       0.5|
|   Artistic Swimming|   105|   0|  105|     0.0|       1.0|
|           Athletics|   969|1072| 2041|    0.53|      0.47|
|           Badminton|    86|  87|  173|     0.5|       0.5|
|   Baseball/Softball|    90| 144|  234|    0.62|      0.38|
|          Basketball|   144| 144|  288|     0.5|       0.5|
|    Beach Volleyball|    48|  48|   96|     0.5|       0.5|
|              Boxing|   102| 187|  289|    0.65|      0.35|
|        Canoe Slalom|    41|  41|   82|     0.5|       0.5|
|        Canoe Sprint|   123| 126|  249|    0.51|      0.49|
|Cycling BMX Frees...|    10|   9|   19|    0.47|      0.53|
|  Cycling BMX Racing|  

Write the transformed data back into the data lake

In [0]:
# Each DataFrame paired with its output location, in the order they are written.
transformed_outputs = [
    (athletes, "dbfs:/mnt/olympicdata/transformed-data/athletes.csv"),
    (coaches, "dbfs:/mnt/olympicdata/transformed-data/coaches.csv"),
    (teams, "dbfs:/mnt/olympicdata/transformed-data/teams.csv"),
    (entriesGender_avg, "dbfs:/mnt/olympicdata/transformed-data/entriesGender_avg.csv"),
    (medals, "dbfs:/mnt/olympicdata/transformed-data/medals.csv"),
]

# Collapse each DataFrame to a single partition and write it as CSV with a
# header row, overwriting whatever already exists at the target path.
for frame, target_path in transformed_outputs:
    single_partition = frame.repartition(1)
    single_partition.write.mode("overwrite").option("header", "true").csv(target_path)