In [None]:
from pyspark.sql.functions import count,col,when,isnan,row_number,date_format,from_utc_timestamp
from pyspark.sql.types import TimestampType
from pyspark.sql.window import Window

In [None]:
# Silver-layer root (medallion architecture) and the listing of its entries.
input_path = '/mnt/tokyo_olympic/Silver-Layer/'
file_names = dbutils.fs.ls(input_path)

# dfs      -> dataframes read from the silver layer
# Gold_dfs -> transformed dataframes that will be written to the gold layer
dfs, Gold_dfs = dict(), dict()

In [None]:
# Read every silver-layer entry into `dfs`, keyed by the entry name with its last
# 8 characters removed. The same trimmed name is used to build the CSV glob to load.
# NOTE(review): the 8-character trim presumably strips a '_silver/'-style suffix from
# the listed folder name — confirm against the actual silver-layer layout.
for silver_entry in file_names:
    stem = silver_entry.name.split('.')[0]
    table_name = stem[:-8]
    csv_glob = silver_entry.path + table_name + '.csv/*'
    csv_reader = spark.read.format('csv').option("header","true").option("InferSchema","true")
    dfs[table_name] = csv_reader.load(csv_glob)

In [None]:
# Report which silver-layer tables ended up in `dfs`.
silver_keys_msg = f"The files imported from silver layer are : {dfs.keys()}"
print(silver_keys_msg)

The files imported from silver layer are : dict_keys(['athlete_event', 'athletes', 'coaches', 'countries_medal_fact', 'countries', 'discipline_gender_fact', 'discipline', 'event', 'medals_fact', 'teams'])


In [None]:
# Expose each silver dataframe to Spark SQL as a temp view named after its `dfs` key,
# so the gold-layer queries below can reference the tables by name.
for view_name, silver_df in dfs.items():
    silver_df.createOrReplaceTempView(view_name)

In [None]:
# Athletes gold dataframe: keeps the athlete attributes and replaces the Country and
# Discipline columns with Country_SID / Discipline_SID looked up from the `countries`
# and `discipline` views. Left joins keep every athlete row even without a match.
athletes_gold_sql = "Select a.Athlete_SID,a.Athlete_Name,a.Age,a.Gender,b.Country_SID,c.Discipline_SID from \
                           athletes a left join countries b on a.Country = b.Country\
                            left join discipline c on a.Discipline = c.Discipline"
Gold_dfs['athletes'] = spark.sql(athletes_gold_sql)

In [None]:
# Countries gold dataframe: only the Country_SID and Country columns are carried over.
countries_gold_columns = ["Country_SID", "Country"]
Gold_dfs['countries'] = dfs['countries'].select(*countries_gold_columns)

In [None]:
# Coaches gold dataframe: coach id and name plus Country_SID / Discipline_SID looked up
# from the `countries` and `discipline` views. Left joins keep every coach row.
coaches_gold_sql = "Select a.Coach_SID,a.Coach_Name, b.Country_SID,c.Discipline_SID from \
                           coaches a left join countries b on a.Country = b.Country\
                            left join discipline c on a.Discipline = c.Discipline"
Gold_dfs['coaches'] = spark.sql(coaches_gold_sql)

In [None]:
# Discipline gender fact: the silver dataframe is promoted to the gold layer unchanged.
discipline_gender_fact_silver = dfs['discipline_gender_fact']
Gold_dfs['discipline_gender_fact'] = discipline_gender_fact_silver

In [None]:
# Discipline: the silver dataframe is promoted to the gold layer unchanged.
discipline_silver = dfs['discipline']
Gold_dfs['discipline'] = discipline_silver

In [None]:
# Event: the silver dataframe is promoted to the gold layer unchanged.
event_silver = dfs['event']
Gold_dfs['event'] = event_silver

In [None]:
# Countries medal fact gold dataframe: Country_SID from the `countries` view followed by
# every column of `countries_medal_fact` except Country. The left join keeps every fact row.
countries_medal_fact_gold_sql = "Select b.Country_SID,a.* except(Country) from \
                                     countries_medal_fact a left join countries b on a.Country = b.Country"
Gold_dfs['countries_medal_fact'] = spark.sql(countries_medal_fact_gold_sql)

In [None]:
# Medals fact gold dataframe: medal counts per row, with the athlete and event replaced by
# Athlete_SID / Event_SID. The athlete is matched on name, country and discipline together;
# the event is matched on Event. Rows are ordered by Medals_Fact_SID.
# NOTE(review): these are plain (inner) joins, so medal rows without a matching athlete or
# event are dropped, whereas the other gold tables use left joins — confirm this is intended.
medals_fact_gold_sql = "Select a.Medals_Fact_SID,a.Medal_Date, b.Athlete_SID,c.Event_SID,\
                              a.Gold_Medals, a.Silver_Medals, a.Bronze_Medals from medals_fact a\
                              join athletes b on (a.Athlete_Name = b.Athlete_Name and a.Country = b.Country and a.Discipline = b.Discipline)\
                              join event c on a.Event = c.Event\
                              order by a.Medals_Fact_SID"
Gold_dfs['medals_fact'] = spark.sql(medals_fact_gold_sql)

In [None]:
# Teams gold dataframe: team id and name plus Discipline_SID, Country_SID and Event_SID
# looked up from the `discipline`, `countries` and `event` views. Left joins keep every
# team row; rows are ordered by Team_SID.
teams_gold_sql = "Select a.Team_SID,a.Team_Name, b.Discipline_SID,c.Country_SID,d.Event_SID \
                       from teams a left join discipline b on a.Discipline = b.Discipline\
                                    left join countries c on a.Country = c.Country\
                                    left join event d on a.Event = d.Event\
                                    order by a.Team_SID"
Gold_dfs['teams'] = spark.sql(teams_gold_sql)

In [None]:
# Report which transformed tables are about to be written to the gold layer.
gold_keys_msg = f"Loading transformed Dataframes in Gold Layer : {Gold_dfs.keys()}"
print(gold_keys_msg)

Loading transformed Dataframes in Gold Layer : dict_keys(['athletes', 'countries', 'coaches', 'discipline_gender_fact', 'discipline', 'event', 'countries_medal_fact', 'medals_fact', 'teams'])


In [None]:
# Loading dataframes into Gold Layer.
# Spark writes a directory of part files, so each dataframe is first written as a single
# partition into a temporary folder; its one part file is then copied to
# <Gold-Layer>/<table>/<table>.parquet and the temporary folder is removed.
for df_name, df_value in Gold_dfs.items():
    temp_output_path = '/mnt/tokyo_olympic/Gold-Layer/' + df_name + '/' + df_name + '_gold/'
    output_path = '/mnt/tokyo_olympic/Gold-Layer/' + df_name + '/' + df_name + '.parquet'

    # Write Parquet (not CSV) so the part file matches both the '.parquet' lookup below
    # and the '.parquet' name of the final file.
    df_value.repartition(1).write.mode("overwrite").parquet(temp_output_path)

    # Collect the part file(s) for THIS table only, so a file name left over from a
    # previous iteration can never be reused by mistake.
    part_files = [
        entry.name
        for entry in dbutils.fs.ls(temp_output_path)
        if entry.name.endswith('.parquet')
    ]
    if len(part_files) != 1:
        raise RuntimeError(
            f"Expected exactly one .parquet part file in {temp_output_path} "
            f"for '{df_name}', found {len(part_files)}"
        )

    dbutils.fs.cp(temp_output_path + part_files[0], output_path)
    dbutils.fs.rm(temp_output_path, recurse=True)
    

