In [42]:
# Default parameters; each value can be freely changed or overridden by a pipeline run.
# NOTE(review): "<your-storage>" looks like a placeholder for the storage account name - confirm
# it is replaced (or overridden) before running.

# Inputs: locations that the load cell reads with spark.read.json
userPath = "abfss://mgdc@<your-storage>.dfs.core.windows.net/user_2022-06-01_to_2022-10-31/"
calendarPath = "abfss://mgdc@<your-storage>.dfs.core.windows.net/calendar_2022-06-01_to_2022-10-31/"
emailPath = "abfss://mgdc@<your-storage>.dfs.core.windows.net/email_2022-06-01_to_2022-10-31/"
teamsChatPath = "abfss://mgdc@<your-storage>.dfs.core.windows.net/teamschat_2022-06-01_to_2022-10-31/"
callsPath = "abfss://mgdc@<your-storage>.dfs.core.windows.net/calls_2022-06-01_to_2022-10-31/"

# Outputs: CSV destinations for the users table and one table per interaction type
usersOutputPath = "abfss://output@<your-storage>.dfs.core.windows.net/users-DT_2022-06-01_to_2022-10-31.csv"
meetingsOutputPath = "abfss://output@<your-storage>.dfs.core.windows.net/meetings-DT_2022-06-01_to_2022-10-31.csv"
emailsOutputPath = "abfss://output@<your-storage>.dfs.core.windows.net/emails-DT_2022-06-01_to_2022-10-31.csv"
teamschatsOutputPath = "abfss://output@<your-storage>.dfs.core.windows.net/teamschats-DT_2022-06-01_to_2022-10-31.csv"
callsOutputPath = "abfss://output@<your-storage>.dfs.core.windows.net/calls-DT_2022-06-01_to_2022-10-31.csv"

# Outputs: CSV destinations for the exploded participant (edge) tables
meetingsParticipantsOutputPath = "abfss://output@<your-storage>.dfs.core.windows.net/meetingsParticipants-DT_2022-06-01_to_2022-10-31.csv"
emailsParticipantsOutputPath = "abfss://output@<your-storage>.dfs.core.windows.net/emailsParticipants-DT_2022-06-01_to_2022-10-31.csv"
teamschatsParticipantsOutputPath = "abfss://output@<your-storage>.dfs.core.windows.net/teamschatsParticipants-DT_2022-06-01_to_2022-10-31.csv"
callsParticipantsOutputPath = "abfss://output@<your-storage>.dfs.core.windows.net/callsParticipants-DT_2022-06-01_to_2022-10-31.csv"

# StartDate/EndDate for this run; written as a literal "Period" column on the users and
# interactions tables. It is a free-form string and is not derived from the paths above, so
# keep it in step with the date range embedded in them.
period = "2022-06-01 to 2022-10-31"

StatementMeta(onasynapsepool, 9, 12, Finished, Available)

In [43]:
# Load data
def _loadOptionalJson(path, datasetName, missingWhat):
    """Read an optional JSON dataset, returning None when it cannot be loaded.

    Args:
        path: Location handed to spark.read.json.
        datasetName: Label that starts the failure message (e.g. "Emails").
        missingWhat: What the run continues without; ends the failure message.

    Returns:
        The DataFrame produced by spark.read.json, or None if reading raised.
    """
    try:
        return spark.read.json(path)
    except Exception as error:
        # Best effort: report the problem and carry on without this dataset
        print(error)
        print(f"{datasetName} data not loaded, continuing without {missingWhat}")
        return None


# Each interaction dataset is optional. Its flag is derived from its own load result so that a
# flag can never be set for a different dataset than the one that was actually read.
emailsRaw = _loadOptionalJson(emailPath, "Emails", "emails")
areEmailsLoaded = emailsRaw is not None

meetingsRaw = _loadOptionalJson(calendarPath, "Calendar", "meetings")
areMeetingsLoaded = meetingsRaw is not None

# Previously this load set areTeamsChatsLoaded, so calls were never processed or written.
callsRaw = _loadOptionalJson(callsPath, "TeamsCalls", "calls")
areTeamsCallsLoaded = callsRaw is not None

teamsChatsRaw = _loadOptionalJson(teamsChatPath, "TeamsChats", "messages")
areTeamsChatsLoaded = teamsChatsRaw is not None

# Users are mandatory: every later cell filters interactions against them, so fail the run here.
try:
    usersRaw = spark.read.json(userPath)
except Exception as error:
    print(error)
    raise Exception("Users data not loaded, check the path and whether the extraction has data") from error

StatementMeta(onasynapsepool, 9, 13, Finished, Available)

Path does not exist: abfss://mgdc@onastoreqow3pn6tamstg.dfs.core.windows.net/asdfcalendar_2022-06-01_to_2022-10-31
Calendar data not loaded, continuing without meetings


In [44]:
# Drop duplicates
# Users are always present; they are keyed by "puser".
usersDedup = usersRaw.dropDuplicates(["puser"])

# Optional interaction datasets, handled in the order they were loaded. Each *Dedup frame is
# only defined when its dataset was loaded.
if areEmailsLoaded:
    emailsDedup = emailsRaw.dropDuplicates(["Id"])

if areMeetingsLoaded:
    meetingsDedup = meetingsRaw.dropDuplicates(["Id"])

if areTeamsCallsLoaded:
    callsDedup = callsRaw.dropDuplicates(["CommunicationID"])

if areTeamsChatsLoaded:
    teamsChatsDedup = teamsChatsRaw.dropDuplicates(["Id"])

StatementMeta(onasynapsepool, 9, 14, Finished, Available)

In [45]:
from pyspark.sql.functions import coalesce, col, explode, lit, size, array_union, array, split, minute
import pyspark.sql.functions as F
from pyspark.sql import types as t

StatementMeta(onasynapsepool, 9, 15, Finished, Available)

In [46]:
# Lookup frames of known users. The interaction cells inner-join against them so that only
# edges between users present in the users extraction are kept for emails, teamschat, calls,
# and meetings.
# distinct() keeps each key once: usersDedup is de-duplicated on "puser" only, so two user rows
# can share a mail address or a display name, and a repeated key would multiply every joined
# edge row.
usersEmailAddresses = usersDedup.selectExpr("lower(mail) as userID").distinct()
usersDisplayNames = usersDedup.selectExpr("lower(DisplayName) as name").distinct()

StatementMeta(onasynapsepool, 9, 16, Finished, Available)

In [47]:
if areEmailsLoaded:
    # Emails - one row per message; NumberOfRecipients is the size of the union of To and Cc
    emails = emailsDedup.select(
        col("id"),
        F.lower(col("Sender.EmailAddress.Address")).alias("sender"),
        col("createdDateTime"),
        col("importance"),
        col("receivedDateTime"),
        col("sentDateTime"),
        col("conversationId"),
        col("isRead"),
        col("isDraft"),
        size(array_union(col("ToRecipients"), col("ccRecipients"))).alias("NumberOfRecipients"),
    )

    # Emails - explode one sender -> N recipients into N (sender, participant) rows, keeping
    # only pairs where both addresses belong to known users and the sender is not the recipient
    emailsParticipants = (
        emailsDedup.select(
            col("id").alias("mailID"),
            F.lower(col("Sender.EmailAddress.Address")).alias("sender"),
            explode(array_union(col("ToRecipients"), col("ccRecipients"))).alias("Recipients"),
        )
        .join(usersEmailAddresses, col("userID") == col("sender"), "inner")
        .drop("userID")
        .join(usersEmailAddresses, col("userID") == F.lower(col("Recipients.EmailAddress.Address")), "inner")
        .drop("userID")
        .withColumn("participant", F.lower(col("Recipients.EmailAddress.Address")))
        .drop("Recipients")
        .where(col("sender") != col("participant"))
    )

StatementMeta(onasynapsepool, 9, 17, Finished, Available)

In [48]:
if areTeamsChatsLoaded:
    # TeamsChats - one row per message; TotalParticipants is the size of the union of the
    # recipients and the sender
    teamsChats = teamsChatsDedup.select(
        col("id"),
        col("ConversationId"),
        F.lower(col("Sender.EmailAddress.Address")).alias("sender"),
        col("ReceivedDateTime"),
        col("SentDateTime"),
        size(array_union(col("ToRecipients"), array(col("Sender")))).alias("TotalParticipants"),
    )

    # TeamsChats - explode one sender -> N recipients into N (sender, participant) rows, keeping
    # only pairs where both addresses belong to known users and the sender is not the recipient
    teamsChatsParticipants = (
        teamsChatsDedup.select(
            col("id").alias("chatID"),
            F.lower(col("Sender.EmailAddress.Address")).alias("sender"),
            explode(col("ToRecipients")).alias("Recipients"),
        )
        .join(usersEmailAddresses, col("userID") == col("sender"), "inner")
        .drop("userID")
        .join(usersEmailAddresses, col("userID") == F.lower(col("Recipients.EmailAddress.Address")), "inner")
        .drop("userID")
        .withColumn("participant", F.lower(col("Recipients.EmailAddress.Address")))
        .drop("Recipients")
        .where(col("sender") != col("participant"))
    )

StatementMeta(onasynapsepool, 9, 18, Finished, Available)

In [49]:
if areMeetingsLoaded:
    # Meeting length in whole minutes, computed from the elapsed seconds between start and end.
    # minute() only extracts the minute-of-hour component, so subtracting two minute() values
    # gave 0 for a 10:00-11:00 meeting and -30 for 10:30-11:00.
    # NOTE(review): assumes start.dateTime and end.dateTime are expressed in the same time
    # zone - confirm against the extraction schema.
    meetingStartSeconds = col("start.dateTime").cast("timestamp").cast("long")
    meetingEndSeconds = col("end.dateTime").cast("timestamp").cast("long")
    meetingDurationInMinutes = ((meetingEndSeconds - meetingStartSeconds) / 60).cast("int")

    # Meetings - one row per meeting
    meetings = meetingsDedup.select(
        col("id"),
        col("iCalUId"),
        F.lower(col("organizer.EmailAddress.Address")).alias("sender"),
        col("createdDateTime"),
        col("start.dateTime").alias("startDateTime"),
        col("end.dateTime").alias("endDateTime"),
        meetingDurationInMinutes.alias("DurationInMinutes"),
        col("importance"),
        col("isCancelled"),
        col("isOrganizer"),
        col("type"),
        size(col("attendees")).alias("TotalAttendees"),
    )

    # Meetings - explode one organizer -> N attendees into N (organizer, participant) rows,
    # keeping only pairs where both addresses belong to known users and differ from each other
    meetingsParticipants = (
        meetingsDedup.select(
            col("id").alias("meetingID"),
            F.lower(col("organizer.EmailAddress.Address")).alias("organizer"),
            explode(col("attendees")).alias("Attendee"),
        )
        .join(usersEmailAddresses, col("userID") == col("organizer"), "inner")
        .drop("userID")
        .join(usersEmailAddresses, col("userID") == F.lower(col("Attendee.EmailAddress.Address")), "inner")
        .drop("userID")
        .withColumn("participant", F.lower(col("Attendee.EmailAddress.Address")))
        # The response and type values are split on a single quote and the second piece is kept
        .withColumn("participantResponse", split(F.lower(col("Attendee.Status.response")), "'")[1])
        .withColumn("participantType", split(F.lower(col("Attendee.type")), "'")[1])
        .drop("Attendee")
        .where(col("organizer") != col("participant"))
    )

StatementMeta(onasynapsepool, 9, 19, Finished, Available)

In [50]:
if areTeamsCallsLoaded:
    # Call length in whole minutes, computed from the elapsed seconds between StartTime and
    # EndTime. minute() only extracts the minute-of-hour component, so subtracting two minute()
    # values gave 0 for a 10:00-11:00 call and -30 for 10:30-11:00.
    callStartSeconds = col("StartTime").cast("timestamp").cast("long")
    callEndSeconds = col("EndTime").cast("timestamp").cast("long")
    callDurationInMinutes = ((callEndSeconds - callStartSeconds) / 60).cast("int")

    # Calls - one row per call
    calls = callsDedup.select(
        col("Id"),
        col("CommunicationId"),
        col("ICalUId"),
        col("CommunicationType"),
        col("CommunicationSubType"),
        F.lower(col("Organizer.DisplayName")).alias("Organizer"),
        F.lower(col("Organizer.userAADObjectId")).alias("OrganizerId"),
        col("CreatedDateTime"),
        col("EndTime"),
        col("StartTime"),
        callDurationInMinutes.alias("DurationInMinutes"),
        col("ThreadId"),
        size(col("Attendees")).alias("TotalAttendees"),
    )

    # Calls - explode one organizer -> N attendees into N (Organizer, Participant) rows.
    # Matching is done on lower-cased display names (not email addresses), keeping only pairs
    # where both names belong to known users and differ from each other.
    callsParticipants = (
        callsDedup.select(
            col("CommunicationId").alias("callID"),
            F.lower(col("Organizer.DisplayName")).alias("Organizer"),
            explode(col("Attendees")).alias("Attendee"),
        )
        .join(usersDisplayNames, col("name") == col("Organizer"), "inner")
        .drop("name")
        .join(usersDisplayNames, col("name") == F.lower(col("Attendee.DisplayName")), "inner")
        .drop("name")
        .withColumn("Participant", F.lower(col("Attendee.DisplayName")))
        .drop("Attendee")
        .where(col("Organizer") != col("Participant"))
    )

StatementMeta(onasynapsepool, 9, 20, Finished, Available)

In [51]:
# Select user properties for output
# EmailAddress is the lower-cased "mail" value.
usersDedup = usersDedup.withColumn("EmailAddress", F.lower(col("mail")))

# Project the user columns under their output names.
usersRenamed = usersDedup.select(
    col("id").alias("id"),
    col("displayName").alias("Name"),
    col("EmailAddress"),
    col("department").alias("Department"),
    col("jobTitle").alias("Title"),
    col("state").alias("StateOrProvince"),
    col("country").alias("Country"),
    col("preferredLanguage").alias("Languages"),
    col("ptenant").alias("TenantID"),
)

StatementMeta(onasynapsepool, 9, 21, Finished, Available)

In [52]:
# Output files
def _writeCsvWithHeader(dataFrame, outputPath):
    """Write dataFrame to outputPath as CSV with a header row, overwriting existing output.

    The frame is coalesced to a single partition before it is written.
    """
    dataFrame.coalesce(1).write.option("header", True).mode("overwrite").csv(outputPath)


# Users are always written; every main table is stamped with the run's Period value.
usersEnriched = usersRenamed.withColumn("Period", lit(period))
_writeCsvWithHeader(usersEnriched, usersOutputPath)

# Each interaction type is written only when its source data was loaded. The participant
# tables are written as they are, without the Period column.
if areEmailsLoaded:
    emailsEnriched = emails.withColumn("Period", lit(period))
    _writeCsvWithHeader(emailsEnriched, emailsOutputPath)
    _writeCsvWithHeader(emailsParticipants, emailsParticipantsOutputPath)

if areMeetingsLoaded:
    meetingsEnriched = meetings.withColumn("Period", lit(period))
    _writeCsvWithHeader(meetingsEnriched, meetingsOutputPath)
    _writeCsvWithHeader(meetingsParticipants, meetingsParticipantsOutputPath)

if areTeamsChatsLoaded:
    teamsChatsEnriched = teamsChats.withColumn("Period", lit(period))
    _writeCsvWithHeader(teamsChatsEnriched, teamschatsOutputPath)
    _writeCsvWithHeader(teamsChatsParticipants, teamschatsParticipantsOutputPath)

if areTeamsCallsLoaded:
    callsEnriched = calls.withColumn("Period", lit(period))
    _writeCsvWithHeader(callsEnriched, callsOutputPath)
    _writeCsvWithHeader(callsParticipants, callsParticipantsOutputPath)

StatementMeta(onasynapsepool, 9, 22, Finished, Available)