# Project 2, Team 3: Silver to Gold Layer


## Import the necessary libraries/functions

In [0]:

import pyspark.sql.functions as F
from pyspark.sql.functions import col, count, countDistinct, split, reverse
from pyspark.sql.types import StructField, StructType, StringType, LongType, IntegerType, DateType, FloatType, BooleanType


## Configure access to our Azure Storage account container:

In [0]:

# Identifiers for the Azure Storage account and the app registration used to reach it.
storage_acct_name = '20230821desa'
contname = 'team3-project2'  # container inside the storage account
tenant_id = '33da9f3f-4c1a-4640-8ce1-3f63024aea1d'  # aka: directory id
client_id = 'de4ff859-02b1-4e2f-9d16-b578fa03df4f'  # aka: app id
# The client secret is fetched from a Databricks secret scope instead of being written in the notebook.
service_credential = dbutils.secrets.get(scope="databricks-app-kv", key="databricks-application")


## Configure Spark to access Azure Storage securely using OAuth-based authentication:

In [0]:

# Every OAuth setting is scoped to this storage account's DFS endpoint,
# so the endpoint host is appended to each configuration key below.
account_host = f"{storage_acct_name}.dfs.core.windows.net"

# Settings are applied in the order listed (dicts preserve insertion order).
oauth_settings = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": client_id,
    "fs.azure.account.oauth2.client.secret": service_credential,
    "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}

for setting_key, setting_value in oauth_settings.items():
    spark.conf.set(f"{setting_key}.{account_host}", setting_value)

## Read the Silver layer tables from the container

In [0]:
# All inputs are parquet folders under SilverLayer/ in the same container, one folder per table.
silver_root = f"abfss://{contname}@{storage_acct_name}.dfs.core.windows.net/SilverLayer"

Event_Table = spark.read.parquet(f"{silver_root}/Event_Table/")
repo_df = spark.read.parquet(f"{silver_root}/repo_df/")
actor_df = spark.read.parquet(f"{silver_root}/actor_df/")
organization_df = spark.read.parquet(f"{silver_root}/organization_df/")
PushEvent_df = spark.read.parquet(f"{silver_root}/PushEvent_df/")
ReleaseEvent_df = spark.read.parquet(f"{silver_root}/ReleaseEvent_df/")
CommitCommentEvent_df = spark.read.parquet(f"{silver_root}/CommitCommentEvent_df/")

## Prepare the tables for the Gold layer


## Event Table

In [0]:
# Gold Event_Table: remove the repo_name column, then collapse exact duplicate rows.
Event_Table = (
    Event_Table
    .drop("repo_name")
    .dropDuplicates()
)
# Preview (optional): Event_Table.limit(5).display()

event_id,created_at,actor_id,org_id,type,repo_id
19960749167,2022-01-28T08:09:51.000+0000,6414648,0,IssueCommentEvent,28530868
19557023836,2022-01-03T11:03:24.000+0000,65212910,0,PushEvent,222505696
19659257050,2022-01-10T15:29:00.000+0000,96717210,0,PushEvent,442083252
19939165300,2022-01-27T06:04:58.000+0000,25180681,0,PushEvent,267364785
19765632123,2022-01-17T01:37:00.000+0000,41898282,0,PushEvent,446196561



## repo_df

In [0]:
# Gold repo_df: remove the repo_url column, then collapse exact duplicate rows.
repo_df = (
    repo_df
    .drop("repo_url")
    .dropDuplicates()
)
# Preview (optional): repo_df.limit(5).display()

repo_id,repo_name
363663608,visualpython/visualpython
451378741,nostradamuss21/fantasy_football
443718594,matsuyuki-a/vercel-trial
447687837,viddex/Spotify-AD-Blocker
108427186,sed-inf-u-szeged/OpenStaticAnalyzer



## actor_df

In [0]:
# Gold actor_df: keep only the actor_id and login columns, then collapse exact duplicate rows.
actor_df = (
    actor_df
    .select("actor_id", "login")
    .dropDuplicates()
)
# Preview (optional): actor_df.limit(5).display()

actor_id,login
19560074,QualitySoftwareDeveloper
81461416,stcdan2x
8092460,davidbritch
97944394,Ngima-Muraguri
67475495,jsru-1



## organization_df

In [0]:
# Gold organization_df: keep only the org_id and login columns, then collapse exact duplicate rows.
organization_df = (
    organization_df
    .select("org_id", "login")
    .dropDuplicates()
)
# Preview (optional): organization_df.limit(5).display()

org_id,login
3380462,prometheus
44949219,MT-CTF
25274995,next-step
16104327,thought-machine
22757041,webflextech



## PushEvent_df

In [0]:
# Gold PushEvent_df: remove the push_id_PK column and shorten `ref` to the text after its last "/".
ref_segments = F.split(F.col("ref"), "/")
# Reversing the split array puts the final segment at index 0.
last_ref_segment = F.reverse(ref_segments).getItem(0)
# NOTE(review): a ref whose branch name itself contains "/" keeps only its final
# segment after this step — confirm that is the intended Gold value.
PushEvent_df = (
    PushEvent_df
    .drop("push_id_PK")
    .withColumn("ref", last_ref_segment)
    .dropDuplicates()
)
# Preview (optional): PushEvent_df.limit(5).display()

event_id,push_id,distinct_size,ref,commits_message
19794776056,8868914806,2,next,6.5.0-alpha.20 next.json version file
19937012458,8941250464,1,self-hosted-runner,#self-hosted-runner fix .
19649037603,8792293414,1,master,ipipip
19994826914,8972063211,1,main,Create 空
19546717121,8735984884,1,main,upload file a275ae7117c38183b2162aef8e8bd4cfacd2d44395cdeb1d5b59834afa9ab3a80c788e678f2a94166dfdcaebfd088b10video_732_0_552406.ts



## ReleaseEvent_df

In [0]:
# Gold ReleaseEvent_df: remove the action column, then collapse exact duplicate rows.
ReleaseEvent_df = (
    ReleaseEvent_df
    .drop("action")
    .dropDuplicates()
)
# Preview (optional): ReleaseEvent_df.limit(5).display()

event_id,release_id
19658282851,47236153
19545885268,56257423
19546052018,56258282
19546854032,56261343
19547870688,56266097



## CommitCommentEvent_df

In [0]:
# Gold CommitCommentEvent_df: remove the commit_id_PK and body columns, then collapse exact duplicate rows.
CommitCommentEvent_df = (
    CommitCommentEvent_df
    .drop("commit_id_PK", "body")
    .dropDuplicates()
)
# Preview (optional): CommitCommentEvent_df.limit(5).display()

event_id,commit_id,commit_created_at,user_id
19619478792,ab9d11e7e9aacbf62be48d35732e00c7eb4b1d76,2022-01-07T03:14:09.000+0000,35613825
19980567097,8c2cf14407da95c0243908b820669d590ebdad4d,2022-01-29T20:57:31.000+0000,35613825
19578481104,c8a6a428d4e77ea412ca7e4c738c37aac90738c6,2022-01-04T17:59:22.000+0000,28784688
19749402047,3e79b592cdee7975f38e87e3ac7d1c32e6748f64,2022-01-15T00:04:30.000+0000,40862150
19789059769,4fb6ebdb164faffded22e04a3f4c3a1175a33266,2022-01-18T10:13:53.000+0000,35613825



# Write to Gold Layer


## Table names in string format

In [0]:
# Folder name used for each table under GoldLayer/. The write loop pairs these
# by position with the dataframe list, so the order here must match it.
table_string_names = [
    "Event_Table",
    "repo_df",
    "actor_df",
    "organization_df",
    "PushEvent_df",
    "ReleaseEvent_df",
    "CommitCommentEvent_df",
]


## List containing our dataframes

In [0]:
# Prepared dataframes, listed in the same order as table_string_names
# (the write loop matches the two lists by position).
table_names = [
    Event_Table,
    repo_df,
    actor_df,
    organization_df,
    PushEvent_df,
    ReleaseEvent_df,
    CommitCommentEvent_df,
]

# Number of output partitions per table, again matched by position.
# Authors' note: sizes were chosen by checking the gold layer file size for one
# partition and multiplying by the number of partitions from the silver layer.
partition_size = [21, 2, 1, 1, 38, 1, 1]


## Loop through each table and write to gold layer

In [0]:
# Write every prepared dataframe to its own folder under GoldLayer/ as parquet.
#
# table_names, table_string_names and partition_size are parallel lists that are
# matched by position. Verify they line up BEFORE the first write: each save uses
# mode("overwrite"), so failing part-way through would leave the Gold layer with
# some tables replaced and others not.
if not (len(table_names) == len(table_string_names) == len(partition_size)):
    raise ValueError(
        "table_names, table_string_names and partition_size must have the same length; "
        f"got {len(table_names)}, {len(table_string_names)} and {len(partition_size)}"
    )

gold_root = f"abfss://{contname}@{storage_acct_name}.dfs.core.windows.net/GoldLayer"

for gold_df, gold_folder, num_partitions in zip(table_names, table_string_names, partition_size):
    (
        gold_df
        .repartition(num_partitions)  # controls how many parquet part-files are produced
        .write
        .format("parquet")
        .mode("overwrite")  # replaces any existing contents of the target folder
        .save(f"{gold_root}/{gold_folder}")
    )
    


## Check that the write succeeded

In [0]:
# Run after the write step: list the top level of the container so the
# GoldLayer/ folder and its modification time can be checked.
container_root = f"abfss://{contname}@{storage_acct_name}.dfs.core.windows.net/"
display(dbutils.fs.ls(container_root))

path,name,size,modificationTime
abfss://team3-project2@20230821desa.dfs.core.windows.net/BronzeLayer/,BronzeLayer/,0,1695766198000
abfss://team3-project2@20230821desa.dfs.core.windows.net/GoldLayer/,GoldLayer/,0,1696552313000
abfss://team3-project2@20230821desa.dfs.core.windows.net/SilverLayer/,SilverLayer/,0,1696540462000
