In [None]:
# Notebook parameters.
# Every value below is an empty-string default. Read about how a parameters
# cell should be used for defining and initializing parameters in Synapse.

# Storage Account Name
StorageAccountName = ""
# Main container/directory on the storage account
VivaInsightsDataFileSystem = ""

# Folder segments of the input path read by this notebook:
#   <PipelineId>/raw/<PersonQueryDatasetFolder>/*.txt
PipelineId = ""
PersonQueryDatasetFolder = ""
# Name of the input column that gets renamed to "EmployeeId" after loading
SecondaryEmployeeId = ""

# Database connection information (used to build the SQL Server JDBC URL and
# the user/password connection properties)
SQLServerEndpoint = ""
DBName = ""
DBUser = ""
DBPass = ""
DBPort = ""




In [None]:
import sys
import json


from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.functions import explode
from pyspark.sql.functions import *


# Constants: path templates for accessing the storage account for read and write.
# inputFilePath placeholders, in order: file system (container), storage
# account name, pipeline id, dataset folder. It matches every *.txt file there.
inputFilePath = "abfss://{}@{}.dfs.core.windows.net/{}/raw/{}/*.txt"

# Storage account host template; the placeholder is the storage account name.
storageAccount = "{}.dfs.core.windows.net"
# Output URL template with three placeholders.
# NOTE(review): not referenced by any of the cells shown here — confirm it is still needed.
outputFilePath = "https://{}.dfs.core.windows.net/{}/{}"


# Setting parameters
# The file system used for extraction is the Viva Insights container parameter.
extractionFS = VivaInsightsDataFileSystem


# Echo the pipeline id so it shows up in the notebook run output.
print("PipelineId is: ", PipelineId)

In [None]:
from pyspark.sql.types import DateType

# Load the person-query export from the storage account.
# The files are read as CSV with a header row and an inferred schema.
personSourcePath = inputFilePath.format(
    extractionFS, StorageAccountName, PipelineId, PersonQueryDatasetFolder
)
personRawDf = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(personSourcePath)
)

# Dataframe prep: cast Date to a proper date type and expose the configured
# secondary id column under the name "EmployeeId".
personDf = (
    personRawDf
    .withColumn("Date", personRawDf["Date"].cast(DateType()))
    .withColumnRenamed(SecondaryEmployeeId, "EmployeeId")
)

# Register the frame so it can also be queried through Spark SQL.
personDf.createOrReplaceTempView('personDf')

In [None]:
# Look up the most recent person record already stored in the database.

# Connection settings are taken directly from the notebook parameters.
jdbcHostname, jdbcPort, jdbcDatabase = SQLServerEndpoint, DBPort, DBName
username, password = DBUser, DBPass

jdbcUrl = "jdbc:sqlserver://{0}:{1};database={2}".format(jdbcHostname, jdbcPort, jdbcDatabase)
connectionProperties = dict(
    user=username,
    password=password,
    driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
)

# The aggregate is wrapped as a derived table so it can be passed where the
# JDBC reader expects a table name; the max is computed by the database.
pushdown_query = "(Select max(Date) as temp from viva_insights_person) tempTbl"
latestDateRow = spark.read.jdbc(
    url=jdbcUrl,
    table=pushdown_query,
    properties=connectionProperties,
).first()

# `temp` holds max(Date); the next cell treats None as "no existing data".
latestExistingDate = latestDateRow.temp
print("Latest existing person date in DB is", latestExistingDate)


In [None]:
# Preparing dataframe for insert into database

# Record selection:
#   - no date in the database yet -> upload every record ("FullUpload")
#   - otherwise                   -> upload only records strictly newer than
#                                    the latest stored date ("PartialUpload")
# `is None` is the correct identity test for a missing value.
if latestExistingDate is None:
    outputStatus = "FullUpload"
    outputDf = personDf
else:
    outputStatus = "PartialUpload"
    outputDf = personDf.filter(personDf["Date"] > latestExistingDate)


# Attribute selection: the columns written to the database, in this order.
columns = [
    "PersonId", "EmployeeId", "Date", "Organization", "LevelDesignation", "Workweek_span",
    "Meetings_with_skip_level", "Meeting_hours_with_skip_level",
    "Generated_workload_email_hours", "Generated_workload_email_recipients", "Generated_workload_instant_messages_hours",
    "Generated_workload_instant_messages_recipients", "Generated_reactions_to_posts", "Generated_replies_to_posts",
    "Generated_workload_call_hours", "Generated_workload_call_participants", "Generated_workload_calls_organized",
    "External_network_size", "Internal_network_size", "Networking_outside_company", "Networking_outside_organization",
    "Multitasking_hours", "After_hours_meeting_hours", "Open_1_hour_block", "Open_2_hour_blocks", "Total_focus_hours",
    "Low_quality_meeting_hours", "Meetings", "Meeting_hours", "Conflicting_meeting_hours", "Multitasking_meeting_hours",
    "Redundant_meeting_hours__lower_level_", "Redundant_meeting_hours__organizational_",
    "Time_in_self_organized_meetings", "Meeting_hours_during_working_hours", "Generated_workload_meeting_attendees",
    "Generated_workload_meeting_hours", "Generated_workload_meetings_organized", "Manager_coaching_hours_1_on_1",
    "Meetings_with_manager", "Meeting_hours_with_manager", "Meetings_with_manager_1_on_1",
    "Meeting_hours_with_manager_1_on_1", "After_hours_instant_messages", "Instant_messages_sent", "Instant_Message_hours",
    "Working_hours_instant_messages", "Emails_sent", "Email_hours", "Uninterrupted_focus_hours",
    "After_hours_collaboration_hours", "Collaboration_hours_external", "Collaboration_hours",
    "Working_hours_collaboration_hours", "After_hours_email_hours", "Working_hours_email_hours",
    "Channels_with_active_engagement", "Teams_with_active_engagement", "After_hours_channel_message_hours",
    "Channel_message_hours", "Channel_messages_sent", "Channel_reactions", "Channel_visits",
    "Working_hours_channel_message_hours", "After_hours_in_calls", "Total_calls", "Call_hours",
    "Working_hours_in_calls", "IsInternal", "IsActive", "WorkingStartTimeSetInOutlook", "WorkingEndTimeSetInOutlook",
    "WorkingDaysSetInOutlook",
]

# select() accepts the list of column names directly; no comprehension is
# needed (the old loop variable also shadowed the imported `col` function).
outputDf = outputDf.select(columns)

print("OutputStatus is: ", outputStatus)
# count() runs a Spark job; the figure is the number of rows selected for upload.
print("Number of records inserted is: ", outputDf.count())



In [None]:
# Insert into database
# The write uses mode "append", so rows are only added to
# dbo.viva_insights_person; existing rows are never updated (not an upsert).
# The connection is the parameterized jdbcUrl/connectionProperties built from
# the notebook parameters. The unused hard-coded server URL was removed.

mode = "append"
outputDf.write.jdbc(url=jdbcUrl, table="dbo.viva_insights_person", mode=mode, properties=connectionProperties)