In [1]:
# # #Initializing the parameters
# # Read about how a Parameter cell should be used for definign and initializing
# # parameters in Synapse

# Storage Account Name
StorageAccountName = ""
# Main container/directory on the storage account
VivaInsightsDataFileSystem = ""

PipelineId = ""
MeetingQueryDatasetFolder = ""
SecondaryEmployeeId = ""

# Database connection information
SQLServerEndpoint = ""
DBName = ""
DBUser = ""
DBPass = ""
DBPort = ""


StatementMeta(SparkPool2, 42, 1, Finished, Available)

In [2]:
from pyspark.sql.types import DateType
from pyspark.sql.functions import *


print("PipelineId is: ", PipelineId)


StatementMeta(SparkPool2, 42, 2, Finished, Available)

PipelineId is:  6f714f55-1b3c-4a32-ad47-deb4a3741f3f

In [3]:

#Reading meeting csv file from storage account
inputFilePath = 'abfss://{0}@{1}.dfs.core.windows.net/{2}/raw/{3}/*.txt'.format(VivaInsightsDataFileSystem, StorageAccountName, PipelineId, MeetingQueryDatasetFolder)
meetingDf = spark.read.csv(inputFilePath, header = 'true', inferSchema= 'true')

# Cleaning MeetingId column, removing the datetime portion 
meetingDf = meetingDf.withColumn("MeetingId", split(col("MeetingId"), ":").getItem(0))


# Dataframe prep

meetingDf = meetingDf.withColumn("StartDate",meetingDf['StartDate'].cast(DateType()))
meetingDf = meetingDf.withColumn("EndDate",meetingDf['EndDate'].cast(DateType()))

meetingDf = meetingDf.withColumn("StartTimestampUTC", to_timestamp(concat_ws(" ", meetingDf.StartDate, meetingDf.StartTimeUTC)))
meetingDf = meetingDf.withColumn("EndTimestampUTC", to_timestamp(concat_ws(" ",meetingDf.EndDate,  meetingDf.EndTimeUTC)))


meetingDf = meetingDf.withColumnRenamed("Organizer_"+SecondaryEmployeeId,"Organizer_EmployeeId")


meetingDf.createOrReplaceTempView('meetingDf')



StatementMeta(SparkPool2, 42, 3, Finished, Available)

In [4]:
# # Checking the Database for the last existing record
jdbcHostname = SQLServerEndpoint
jdbcDatabase = DBName
jdbcPort = DBPort

jdbcUrl = "jdbc:sqlserver://{0}:{1};database={2}".format(jdbcHostname, jdbcPort, jdbcDatabase)
connectionProperties = {
   "user" : DBUser,
   "password" : DBPass,
   "driver" : "com.microsoft.sqlserver.jdbc.SQLServerDriver"
}
pushdown_query = "(Select max(StartTimestampUTC) as temp from viva_insights_meeting) tempTbl"
latestExistingDate = spark.read.jdbc(url=jdbcUrl, table=pushdown_query, properties=connectionProperties).first().temp
print("Latest existing meeting date in DB is", latestExistingDate)



StatementMeta(SparkPool2, 42, 4, Finished, Available)

Latest existing meeting date in DB is 2021-11-13 20:58:42

In [5]:
# Preparing dataframe for upsert/insert into database
# # Record selection
if (latestExistingDate == None):
    outputStatus = "FullUpload"
    outputDf = meetingDf
else:
    outputStatus = "PartialUpload"
    latestExistingDate = str(latestExistingDate.date())
    outputDf = meetingDf[meetingDf.StartTimestampUTC > latestExistingDate]

# Attribute selection
columns = ["MeetingId","StartTimestampUTC","EndTimestampUTC", "Organizer_PersonId","Organizer_EmployeeId", "Organizer_Organization", "Organizer_LevelDesignation"
, "Organizer_IsInternal","Attendees","Attendees_with_conflicting_meetings", "Invitees", "Emails_sent_during_meetings"
, "Instant_messages_sent_during_meetings" , "Attendees_multitasking", "Attendee_meeting_hours", "Redundant_attendees"
, "Total_meeting_cost", "Total_redundant_hours" , "IsCancelled", "DurationHours", "IsRecurring","Subject" ,"TotalAccept"
, "TotalNoResponse", "TotalDecline" , "TotalNoEmailsDuringMeeting", "TotalNoDoubleBooked", "TotalNoAttendees"
, "MeetingResources", "BusinessProcesses"]

outputDf = outputDf.select([col for col in columns])

# display(outputDf)
print("OutputStatus is: ", outputStatus)
print("Number of records inserted is: ", outputDf.count())


StatementMeta(SparkPool2, 42, 5, Finished, Available)

OutputStatus is:  PartialUpload
Number of records inserted is:  9

In [6]:
# Insert/Upsert into database
mode = "append"
outputDf.write.jdbc(url=jdbcUrl, table="dbo.viva_insights_meeting", mode=mode, properties=connectionProperties)

StatementMeta(SparkPool2, 42, 6, Finished, Available)