Copyright (c) Microsoft Corporation.

Licensed under the MIT License.

In [7]:
# Notebook parameters. All values are strings; the empty ones must be filled in
# before the notebook is run.
# Storage account name used to build the abfss:// base path.
data_lake_account_name = '' # Synapse Workspace ADLS
# File system (container) name used to build the abfss:// base path.
file_system_name = ''
# Last path segment of both the raw and processed folders; also stored in the
# LoadDateRange column of every processed row.
subfolder_name = ''
# Path segment directly under "o365data/" and "o365data_processed/".
folder_name = 'events'
# Path segment after folder_name; also stored in the UserGroup column.
user_group_name = ''
# NOTE(review): not referenced anywhere in the visible cells, and the string
# 'false' is truthy in Python -- confirm how (or whether) this is consumed.
initialLoad = 'false'

In [17]:
# Root abfss:// URL of the data lake file system; ends with "/" so that
# relative folder paths can be appended directly.
base_path = f"abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/"

In [18]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
def get_location_flag(Location_DisplayName):
    """Classify a meeting location string as 'Remote' or 'InPerson'.

    The match is a case-sensitive substring search against lower-case
    markers, so callers are expected to pass an already lower-cased string.

    Args:
        Location_DisplayName: Location display name, or None when the event
            has no location.

    Returns:
        'Remote' if the string contains any remote-meeting marker (tool name,
        dial-in keyword, or phone-number fragment); otherwise 'InPerson'.
        None and the empty string yield 'InPerson'.
    """
    # A missing location arrives as None; the substring test below would
    # raise TypeError on it, so fall back to the default flag instead.
    if not Location_DisplayName:
        return 'InPerson'

    remote_markers = (
        'teams', 'zoom', 'webex', 'loopup',
        'dial', 'conference', '+1', 'meeting',
        'cell', '1-8', 'code', '888',
        '800', 'call', '#', 'google meet',
    )
    if any(marker in Location_DisplayName for marker in remote_markers):
        return 'Remote'
    return 'InPerson'
    
def load_events_json_file(eventsPath):
    """Read one events JSON file, flatten it per attendee, and append it as parquet.

    Each event is exploded into one row per entry of its ``Attendees`` array,
    nested fields are projected into flat columns, a ``location_flag`` is
    derived from the lower-cased location display name, and the result is
    appended under
    ``o365data_processed/<folder_name>/<user_group_name>/<subfolder_name>``.

    Args:
        eventsPath: Path of the JSON file (or folder) to load.

    Side effects:
        Appends parquet files to the processed folder. Relies on the
        notebook-level names ``spark``, ``base_path``, ``folder_name``,
        ``user_group_name`` and ``subfolder_name``.
    """
    events_df = spark.read.load(eventsPath, format='json')

    # Wrap the plain-Python classifier so it can be applied to a column.
    location_flag_udf = udf(get_location_flag, returnType=StringType())

    # Attendees_cnt must be taken before the explode, while Attendees is
    # still the full array for the event.
    df = events_df.select('*', size('Attendees').alias('Attendees_cnt')) \
                .select(explode(col("Attendees")).alias("Attendees"),"Id","puser","ICalUid","Subject","Recurrence","IsCancelled","Start","End","CreatedDateTime","Organizer","Attendees_cnt","Location") \
                .select(["Id","puser","ICalUid","Subject","Attendees_cnt",
                            # Recurrence becomes a boolean: True when a recurrence is present.
                            when(col("Recurrence").isNull(),False).otherwise(True).alias("Recurrence"),
                            "IsCancelled",
                            col("Start.DateTime").alias("Start"),col("End.DateTime").alias("End"),
                            col("CreatedDateTime").alias("CreatedDateTime"),
                            col("Organizer.EmailAddress.Address").alias("Organizer"),
                            col("Attendees.EmailAddress.Address").alias("Attendee"),
                            col("Attendees.Status.Response").alias("Attendee_Response"),
                            col("Attendees.Type").alias("Attendee_Type"),
                            col("Location.Address.Type").alias("Location_Address_Type"),
                            col("Location.DisplayName").alias("Location_DisplayName")]) \
                .withColumn('location_flag', location_flag_udf(lower(col("Location_DisplayName")))) \
                .withColumn("LoadDateRange", lit(subfolder_name)) \
                .withColumn("UserGroup", lit(user_group_name))
    try:
        # Normalise timestamps/dates, lower-case the addresses and derive the
        # e-mail domains (the part after the last '@').
        # NOTE(review): End_Date and UserGroup are computed but are not in the
        # final select list, so they are dropped -- confirm this is intended.
        df = df.withColumn('Start', to_timestamp('Start')) \
                .withColumn('Start_Date', to_date('Start')) \
                .withColumn('End', to_timestamp('End')) \
                .withColumn('End_Date', to_date('End')) \
                .withColumn('Created_Date', to_date('CreatedDateTime')) \
                .withColumn('Organizer', lower(col('Organizer'))) \
                .withColumn('Attendee', lower(col('Attendee'))) \
                .withColumn('Organizer_Domain', reverse(split(lower(col('Organizer')),'@'))[0]) \
                .withColumn('Attendee_Domain', reverse(split(lower(col('Attendee')),'@'))[0]) \
                .select('Id','puser','ICalUid','Subject','Recurrence','IsCancelled', \
                        'Start','Start_Date','End','CreatedDateTime','Created_Date','Organizer','Attendee','Attendee_Response', \
                        'Attendee_Type','Organizer_Domain','Attendee_Domain','Attendees_cnt','location_flag','LoadDateRange')
    except Exception as exc:
        # Stay best-effort (the un-normalised frame is still written below),
        # but report the failure instead of hiding it. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt/SystemExit
        # propagate.
        print(f"WARNING: column normalisation failed for {eventsPath}; "
              f"writing un-normalised rows: {exc!r}")

    processed_path = base_path +"o365data_processed/" + folder_name + "/" + user_group_name + "/" + subfolder_name
    df.write.format("parquet").mode("append").option("overwriteSchema", "true").save(processed_path)
  
def get_event_subfolder_files(folder):
    """Walk ``folder`` depth-first and load every file found.

    Entries named 'metadata' are skipped (files and directories alike).
    Directories are descended into recursively; every other entry is passed
    to ``load_events_json_file``.

    Args:
        folder: Path of the directory to list with ``mssparkutils.fs.ls``.
    """
    for entry in mssparkutils.fs.ls(folder):
        if entry.name != 'metadata':
            # Recurse into directories, load plain files.
            handle = get_event_subfolder_files if entry.isDir else load_events_json_file
            handle(entry.path)

In [19]:
# Root folder of the raw event extracts for this user group and load range.
eventsPath = f"{base_path}o365data/{folder_name}/{user_group_name}/{subfolder_name}"

# Process every file below the root (skipping 'metadata' entries).
get_event_subfolder_files(eventsPath)

In [20]:
# Read back the processed per-attendee rows for this user group / load range.
processed_path = base_path + "o365data_processed/" + folder_name + "/" + user_group_name + "/" + subfolder_name
df_events = spark.read.format("parquet").load(processed_path)
df_events = df_events.select('Id','Organizer','Attendee','Start_Date','CreatedDateTime','Created_Date','Organizer_Domain','Attendee_Domain','Attendee_Response')

# Original direction: organizer -> attendee.
df_events = df_events.withColumn('IsReversed_Row', lit(0))

# Mirrored direction: attendee -> organizer. The swap is done in a single
# select because chained withColumn calls would overwrite Organizer before it
# is copied into Attendee, leaving both columns equal to the attendee.
# Column order matches df_events exactly because union() is positional.
df_events_copy = df_events.select(
    'Id',
    col('Attendee').alias('Organizer'),
    col('Organizer').alias('Attendee'),
    'Start_Date',
    'CreatedDateTime',
    'Created_Date',
    col('Attendee_Domain').alias('Organizer_Domain'),
    col('Organizer_Domain').alias('Attendee_Domain'),
    'Attendee_Response',
    lit(1).alias('IsReversed_Row'),
)
df_events = df_events.union(df_events_copy)

# Append both directions to the "eventsdata" table.
df_events.write.mode("append").saveAsTable("eventsdata")