In [1]:
# Notebook parameters: every value below is a default that can be freely changed or overridden by a pipeline run

# Start/end date label for this run; it is written to the "Period" column of the users and interactions tables
period = "2022-06-01 to 2022-11-07"

# When True, user email addresses are replaced by their md5 hash
obfuscateEmails = True

# Input datasets
userPath = "abfss://mgdc@onastorage.dfs.core.windows.net/user_2022-06-01_to_2022-11-07/"
calendarPath = "abfss://mgdc@onastorage.dfs.core.windows.net/calendar_2022-06-01_to_2022-11-07/"
emailPath = "abfss://mgdc@onastorage.dfs.core.windows.net/email_2022-06-01_to_2022-11-07/"
teamsChatPath = "abfss://mgdc@onastorage.dfs.core.windows.net/teamschat_2022-06-01_to_2022-11-07/"

# Output location for the user-to-user edges
interactionsOutputPath = "abfss://output@onastorage.dfs.core.windows.net/interactions_2022-06-01_to_2022-11-07.csv"

# Output location for the user vertices
usersOutputPath = "abfss://output@onastorage.dfs.core.windows.net/users_2022-06-01_to_2022-11-07.csv"

StatementMeta(onasynapsepool, 99, 1, Finished, Available)

In [2]:
# Read each input dataset as JSON into its own DataFrame
# (order of the targets matches the order of the paths)
meetingsRaw, emailsRaw, teamschatsRaw, usersRaw = [
    spark.read.json(inputPath)
    for inputPath in (calendarPath, emailPath, teamsChatPath, userPath)
]

StatementMeta(onasynapsepool, 99, 2, Finished, Available)

In [3]:
# De-duplicate each dataset on its key column; for the interaction datasets
# keep only the columns that are used to build edges
idKey = ["Id"]
messageColumns = ["Sender", "ToRecipients"]

usersDedup = usersRaw.dropDuplicates(["puser"])
emailsDedup = emailsRaw.dropDuplicates(idKey).select(*messageColumns)
teamschatsDedup = teamschatsRaw.dropDuplicates(idKey).select(*messageColumns)
meetingsDedup = meetingsRaw.dropDuplicates(idKey).select("organizer", "attendees")

StatementMeta(onasynapsepool, 99, 3, Finished, Available)

In [4]:
from pyspark.sql.functions import coalesce, col, count, explode, format_number, isnull, lit, md5, rand, size, udf
import pyspark.sql.functions as F
from pyspark.sql import types as t

StatementMeta(onasynapsepool, 99, 4, Finished, Available)

In [5]:
# Lower-cased email address of every known user. The email, teamschat and meeting cells
# inner-join against this table to keep only edges whose endpoints are both known users.
#
# Users were de-duplicated on "puser", not on "mail", so the same address could appear on
# more than one row here. A repeated id would multiply every matching edge row in those
# inner joins and inflate the interaction counts, hence the distinct(). Null addresses can
# never match a join key, so they are removed up front.
usersEmailAddresses = usersDedup.selectExpr("lower(mail) as id") \
                                .where(col("id").isNotNull()) \
                                .distinct()

StatementMeta(onasynapsepool, 99, 5, Finished, Available)

In [6]:
# Build email edges: one (src, dst) row per sender/recipient pair.
# Only emails with 5 or fewer recipients are kept, both endpoints must be known users
# (inner joins against usersEmailAddresses), and self-addressed rows are removed.
emailRecipient = F.lower(col("exploded.EmailAddress.Address"))
emails = (
    emailsDedup
    .where(size(col("ToRecipients")) <= 5)
    .select(
        F.lower(col("Sender.EmailAddress.Address")).alias("sender"),
        explode(col("ToRecipients")).alias("exploded"),
    )
    .join(usersEmailAddresses, col("id") == col("sender"), "inner").drop("id")
    .join(usersEmailAddresses, col("id") == emailRecipient, "inner").drop("id")
    .select(col("sender").alias("src"), emailRecipient.alias("dst"))
    .where(col("src") != col("dst"))
)
if obfuscateEmails:
    # Replace both endpoints by their md5 hash, keeping the src/dst column names
    emails = emails.select(md5(col("src")).alias("src"), md5(col("dst")).alias("dst"))

StatementMeta(onasynapsepool, 99, 6, Finished, Available)

In [7]:
# Build teams chat edges: one (src, dst) row per sender/recipient pair.
# Only messages with 5 or fewer recipients are kept, both endpoints must be known users
# (inner joins against usersEmailAddresses), and self-addressed rows are removed.
chatRecipient = F.lower(col("exploded.EmailAddress.Address"))
teamschats = (
    teamschatsDedup
    .where(size(col("ToRecipients")) <= 5)
    .select(
        F.lower(col("Sender.EmailAddress.Address")).alias("sender"),
        explode(col("ToRecipients")).alias("exploded"),
    )
    .join(usersEmailAddresses, col("id") == col("sender"), "inner").drop("id")
    .join(usersEmailAddresses, col("id") == chatRecipient, "inner").drop("id")
    .select(col("sender").alias("src"), chatRecipient.alias("dst"))
    .where(col("src") != col("dst"))
)
if obfuscateEmails:
    # Replace both endpoints by their md5 hash, keeping the src/dst column names
    teamschats = teamschats.select(md5(col("src")).alias("src"), md5(col("dst")).alias("dst"))

StatementMeta(onasynapsepool, 99, 7, Finished, Available)

In [8]:
# Build meeting edges: one (src, dst) row per organizer/attendee pair.
# Only meetings with 5 or fewer attendees are kept, both endpoints must be known users
# (inner joins against usersEmailAddresses), and rows where the organizer is the attendee
# are removed.
attendeeAddress = F.lower(col("exploded.EmailAddress.Address"))
meetings = (
    meetingsDedup
    .where(size(col("attendees")) <= 5)
    .select(
        F.lower(col("organizer.emailAddress.address")).alias("sender"),
        explode(col("attendees")).alias("exploded"),
    )
    .join(usersEmailAddresses, col("id") == col("sender"), "inner").drop("id")
    .join(usersEmailAddresses, col("id") == attendeeAddress, "inner").drop("id")
    .select(col("sender").alias("src"), attendeeAddress.alias("dst"))
    .where(col("src") != col("dst"))
)
if obfuscateEmails:
    # Replace both endpoints by their md5 hash, keeping the src/dst column names
    meetings = meetings.select(md5(col("src")).alias("src"), md5(col("dst")).alias("dst"))

StatementMeta(onasynapsepool, 99, 8, Finished, Available)

In [9]:
# Count interactions per (src, dst) pair for each channel. The key columns get a
# channel-specific suffix (1 = email, 2 = teams chat, 3 = meeting) so they remain
# distinguishable after the joins below.
emailEdges = emails.groupBy("src", "dst").count().select(col("src").alias("src1"), col("dst").alias("dst1"), col("count").alias("numEmail"))
teamsChatEdges = teamschats.groupBy("src", "dst").count().select(col("src").alias("src2"), col("dst").alias("dst2"), col("count").alias("numTeamsChat"))
meetingEdges = meetings.groupBy("src", "dst").count().select(col("src").alias("src3"), col("dst").alias("dst3"), col("count").alias("numMeeting"))

# Full outer join the three channels so that a pair seen in any channel yields exactly one row.
# After the first full join, src1/dst1 are null for pairs that only appear in teams chat, and a
# null key never matches in a join. The meeting join therefore compares against
# coalesce(src1, src2) / coalesce(dst1, dst2); keying on src1/dst1 alone would leave a pair that
# has chats and meetings but no emails split across two rows.
allEdges = emailEdges.alias("e").join(teamsChatEdges.alias("t"), (col("src1") == col("src2")) & (col("dst1") == col("dst2")), "full") \
                             .join(meetingEdges.alias("m"),
                                   (coalesce(col("src1"), col("src2")) == col("src3")) & (coalesce(col("dst1"), col("dst2")) == col("dst3")),
                                   "full")

StatementMeta(onasynapsepool, 99, 9, Finished, Available)

In [10]:
# Collapse the per-channel key columns left by the full joins into a single Source/Target pair,
# expose the per-channel counts, and compute the combined interaction weight.
teamsChatToEmailRatio = 8 # interaction ratio for teamschat to email: this many chat messages count as one interaction
edgesRenamed = allEdges.select(
    coalesce( *[col(c) for c in ["src1", "src2", "src3"]]).alias("Source"),
    coalesce( *[col(c) for c in ["dst1", "dst2", "dst3"]]).alias("Target"),
    col("numEmail").alias("InteractionsEmail"),
    # Each count goes to the column named after its own channel
    col("numMeeting").alias("InteractionsMeetings"),
    col("numTeamsChat").alias("InteractionsTeamsChat")
).fillna(0) \
 .withColumn("Interactions", (col("InteractionsEmail") + col("InteractionsMeetings") + F.round(col("InteractionsTeamsChat") / float(teamsChatToEmailRatio))).cast('int')) \
 .withColumn("Period", lit(period))
# fillna(0) above turns the missing channels of a pair into zero counts before they are summed.
# Write the edge table as a single CSV part file with a header row.
edgesRenamed.coalesce(1).write.option("header", True).mode("overwrite").csv(interactionsOutputPath)

StatementMeta(onasynapsepool, 99, 10, Finished, Available)

In [11]:
# Per-user activity counts. Each count is the number of edge rows (one per src/dst pair
# produced by the explode step) in which the user appears as src or as dst.
eventsOrganized = meetings.groupBy("src").count().select(col("src"), col("count").alias("NumberOfEventsOrganized"))
eventsAttended = meetings.groupBy("dst").count().select(col("dst"), col("count").alias("NumberOfEventsAttended"))
emailsSent = emails.groupBy("src").count().select(col("src"), col("count").alias("NumberOfEmailsSent"))
emailsReceived = emails.groupBy("dst").count().select(col("dst"), col("count").alias("NumberOfEmailsReceived"))
teamsChatsSent = teamschats.groupBy("src").count().select(col("src"), col("count").alias("NumberOfChatsSent"))
teamsChatsReceived = teamschats.groupBy("dst").count().select(col("dst"), col("count").alias("NumberOfChatsReceived"))

StatementMeta(onasynapsepool, 99, 11, Finished, Available)

In [12]:
# Build the user vertex table: add the lower-cased (optionally md5-hashed) email address,
# rename the profile columns for output, then attach the per-user activity counts.
emailAddressExpr = F.lower(col("mail"))
if obfuscateEmails:
    emailAddressExpr = md5(emailAddressExpr)
usersDedup = usersDedup.withColumn("EmailAddress", emailAddressExpr)

usersRenamed = usersDedup.select(
    col("EmailAddress"),
    col("department").alias("Department"),
    col("jobTitle").alias("Title"),
    col("state").alias("StateOrProvince"),
    col("country").alias("Country"),
    col("preferredLanguage").alias("Languages"),
    col("ptenant").alias("TenantID"),
)

# Left-join each count table on its key column (src or dst) and drop the key afterwards;
# the join order fixes the order of the count columns in the output.
usersJoined = usersRenamed
for countsTable, keyColumn in [
    (eventsOrganized, "src"),
    (eventsAttended, "dst"),
    (emailsSent, "src"),
    (emailsReceived, "dst"),
    (teamsChatsSent, "src"),
    (teamsChatsReceived, "dst"),
]:
    usersJoined = usersJoined.join(countsTable, col(keyColumn) == col("EmailAddress"), "left").drop(keyColumn)

# Users missing from a count table get 0 for that count
usersJoined = usersJoined.fillna(0)
numUsers = usersJoined.count()

StatementMeta(onasynapsepool, 99, 12, Finished, Available)

In [13]:
# Out-degree / in-degree: number of edge rows with at least one interaction,
# counted per Source and per Target respectively
connectedEdges = edgesRenamed.where(col("Interactions") > 0)
outDegreeEdges = connectedEdges.groupBy("Source").count().withColumnRenamed("count", "Out-DegreeIndex")
inDegreeEdges = connectedEdges.groupBy("Target").count().withColumnRenamed("count", "In-DegreeIndex")

StatementMeta(onasynapsepool, 99, 13, Finished, Available)

In [14]:
# Build a directed networkx graph with one edge per (Source, Target) row.
# The edge rows are collected to the driver, so the whole edge list must fit in driver memory.
import networkx as nx
edges = edgesRenamed.select(col("Source").alias("src"), col("Target").alias("dst"))
edgesList = [(row["src"], row["dst"]) for row in edges.collect()]
graph = nx.DiGraph()
graph.add_edges_from(edgesList)

StatementMeta(onasynapsepool, 99, 14, Finished, Available)

In [15]:
# Influence Index: PageRank score of every node in the graph.
# weight=None is passed, so no edge attribute is used as a weight.
graphPageRank = nx.pagerank(
    graph,
    alpha=0.85,
    max_iter=100,
    tol=0.001,
    weight=None,
)

StatementMeta(onasynapsepool, 99, 15, Finished, Available)

In [16]:
# Spark UDF that looks up a node's page rank so it can be added as a DataFrame column
def getPageRank(x):
    """Return the page rank stored for `x` in graphPageRank, or None when there is no entry."""
    return graphPageRank.get(x, None)


influenceIndexUdf = udf(getPageRank, returnType=t.FloatType())

StatementMeta(onasynapsepool, 99, 16, Finished, Available)

In [17]:
# Calculate Betweenness Index
# Commented out since the complexity is O(EV) where E = edges, V = vertices
# This will be slow for larger graphs, roughly above 10K users
# graphBetweenness = nx.betweenness_centrality(graph)

StatementMeta(onasynapsepool, 99, 17, Finished, Available)

In [18]:
# Define udf for adding betweenness to dataframe
# def getBetweeness(x):
#     return graphBetweenness.get(x)
# betweenessIndexUdf = udf(getBetweeness, t.FloatType())

StatementMeta(onasynapsepool, 99, 18, Finished, Available)

In [19]:
# Community detection for the Community Bridging Index
from networkx.algorithms import community

# Partition the graph into communities and give every node the index of its community
# (stored as a string). seed=None is passed, so the partition is not pinned to a fixed seed.
communities = community.asyn_lpa_communities(graph, weight=None, seed=None)
labelsMap = {}
label = 0
for members in communities:
    # Every member of this community gets the same label
    labelsMap.update(dict.fromkeys(members, str(label)))
    label += 1

StatementMeta(onasynapsepool, 99, 19, Finished, Available)

In [20]:
# Spark UDF that maps a user to the label of the community the user was assigned to
def getLabel(x):
    """Return the community label stored for `x` in labelsMap, or None when there is no entry."""
    return labelsMap.get(x, None)


labelUdf = udf(getLabel, returnType=t.StringType())

StatementMeta(onasynapsepool, 99, 20, Finished, Available)

In [21]:
# Counts how many communities C a user is connected to with an out edge, normalized by num of communities
# For all users, compute C / (num of Communities)
# 1 = they are connected to all communities
# Users without any out edge have no row here

# enrich edges by mapping target dst node to community
edgesLabelled = edges.withColumn("Community", labelUdf(col("dst"))).drop("dst").distinct()

# labelsMap holds one entry per node (node -> community label), so its length is the number
# of nodes. The number of communities is the number of distinct labels.
# max(..., 1) only guards the divisor when the graph is empty.
numCommunities = max(len(set(labelsMap.values())), 1)

# group on src and count how many distinct community labelled targets each src has
communityBridging = edgesLabelled.groupBy("src").count() \
                                 .withColumn("CommunityBridgeIndex", col("count") / float(numCommunities)).drop("count")

StatementMeta(onasynapsepool, 99, 21, Finished, Available)

In [22]:
# Attach every index to the user table and write the result

# Degree counts first; users without edges get 0 before the two counts are combined
degreeTotal = col("In-DegreeIndex") + col("Out-DegreeIndex")
usersWithDegrees = (
    usersJoined
    .join(outDegreeEdges, col("Source") == col("EmailAddress"), "left").drop("Source")
    .join(inDegreeEdges, col("Target") == col("EmailAddress"), "left").drop("Target")
    .fillna(0)
    .withColumn("DegreeIndex", degreeTotal / (2 * numUsers))
)

# Then community label, community bridging and influence; remaining numeric nulls become 0
usersEnriched = (
    usersWithDegrees
    .withColumn("Community", labelUdf(col("EmailAddress")))
    .join(communityBridging, col("src") == col("EmailAddress"), "left").drop("src")
    .withColumn("InfluenceIndex", influenceIndexUdf(col("EmailAddress")))
    .fillna(0)
    .withColumn("Period", lit(period))
)

# Write the user table as a single CSV part file with a header row, replacing any previous output
usersEnriched.coalesce(1).write.csv(usersOutputPath, mode="overwrite", header=True)

StatementMeta(onasynapsepool, 99, 22, Finished, Available)