In [35]:
# CONFIGURATIONS
# Every tunable value for this notebook lives in this cell; later cells read
# these names as module-level globals.

# Inputs: storage locations of the email and Teams chat extracts
emailPath = "abfss://mgdc@your_storage_account.dfs.core.windows.net/email_2022-06-01_to_2022-07-01/"
teamsChatPath = "abfss://mgdc@your_storage_account.dfs.core.windows.net/teamschat_2022-06-01_to_2022-07-01/"

# Limit (for performance / throttling).
# Caps the number of email rows kept during cleaning and also determines the
# partition count (limit / 5) used before calling the text analytics service.
limit = 10000

# Date Format: strptime pattern used to parse createdDateTime values
dtformat = '%Y-%m-%dT%H:%M:%S.%f' # <<< for email and teamschat datasets

# Output Format: can be "csv" or "parquet"
#outputFormat = "csv"
outputFormat = "parquet"

# Output Paths
# NOTE(review): the path ends in ".csv" even though outputFormat is "parquet";
# the writer uses the path as-is, so confirm the suffix is intentional.
outputPath = "abfss://output@your_storage_account.dfs.core.windows.net/entity_sentiment_analysis.csv"

# StartDate/EndDate for this run that is denormalized to users and interactions tables
# NOTE(review): not referenced by the cells visible in this notebook.
period = "2022-06-01 to 2022-07-01"

# Whether or not to md5 hash the input user emails
# NOTE(review): not referenced by the cells visible in this notebook.
obfuscateEmails = True

# Whether the input MGDC data is parquet (True) or json (False)
isParquetInput = False

# Leiden max cluster size, the maximum possible size for a detected community
# NOTE(review): not referenced by the cells visible in this notebook.
leidenMaxClusterSize = 1000

# The ignore list: entity names that are filtered out of the mined results
ignore_list = [
    'Secret Product',
    'Secret Sauce'
]


StatementMeta(esasynapsepool, 5, 36, Finished, Available)

In [36]:
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential

from pyspark.sql.functions import coalesce, col, count, explode, lit, md5, size, udf, max, countDistinct, dense_rank, monotonically_increasing_id
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import array, when
from pyspark.sql.types import ArrayType, StringType, StructField, StructType, IntegerType, FloatType, DoubleType, LongType
from pyspark.sql import types as t
from pyspark.sql import SparkSession, Row
from pyspark.ml import PipelineModel

from requests import Request

from synapse.ml.cognitive import *
from synapse.ml import *
from synapse.ml.core.platform import *

from datetime import datetime

import re
import os
import typing

StatementMeta(esasynapsepool, 5, 37, Finished, Available)

In [37]:
# CONNECTION TO Azure Cognitive Services

# A general Cognitive Services key for Text Analytics
key = "your_language_key"
loc = "your_language_location"
endpoint = "your_language_endpoint"


def authenticate_client():
    """Build a TextAnalyticsClient from the module-level `key` and `endpoint`.

    Returns:
        A TextAnalyticsClient authenticated with an AzureKeyCredential.
    """
    return TextAnalyticsClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(key),
    )


# Shared client used by the opinion-mining cells further down.
client = authenticate_client()

StatementMeta(esasynapsepool, 5, 38, Finished, Available)

In [None]:
# LOAD DATA

# Column names each dataset must expose to the cleaning cells that follow.
# The Teams chat cleaning cell selects "CreatedDateTime", "Sender" and "Body",
# so the Teams chat projection and its empty fallback use those names too.
EMAIL_COLUMNS = ("createdDateTime", "sender", "uniqueBody")
TEAMS_CHAT_COLUMNS = ("CreatedDateTime", "Sender", "Body")


def _load_mgdc_dataset(path, columns, failure_message):
    """Read one input dataset, falling back to an empty frame on failure.

    Args:
        path: Storage path of the dataset.
        columns: Column names projected from parquet input and used (as
            nullable strings) for the empty fallback frame.
        failure_message: Text printed after the error when loading fails.

    Returns:
        A `(dataframe, loaded)` tuple; `loaded` is False when the fallback
        empty frame was returned.
    """
    try:
        if isParquetInput:
            frame = spark.read.parquet(path).select(*columns)
        else:
            frame = spark.read.json(path)
        return frame, True
    except Exception as error:
        # Deliberate best-effort: one missing dataset must not stop the run.
        print(error)
        print(failure_message)
        empty_schema = StructType(
            [StructField(name, StringType(), True) for name in columns]
        )
        return spark.createDataFrame([], empty_schema), False


# EMAILS
emailsRaw, areEmailsLoaded = _load_mgdc_dataset(
    emailPath,
    EMAIL_COLUMNS,
    "Emails data not loaded, continuing with empty emails",
)

# TEAMS CHATS
teamschatsRaw, areTeamsChatsLoaded = _load_mgdc_dataset(
    teamsChatPath,
    TEAMS_CHAT_COLUMNS,
    "TeamsChats data not loaded, continuing with empty teams chats",
)

if not areEmailsLoaded and not areTeamsChatsLoaded:
    raise Exception("No Emails or TeamsChats data loaded, unable to continue. Check the file paths.")


In [39]:
# EMAIL CLEANING AND PREPPING

# Keep the three columns of interest, drop exact duplicates, rename the body
# column, and cap the row count at the configured limit.
emailsRaw = (
    emailsRaw
    .select("createdDateTime", "sender", "uniqueBody")
    .dropDuplicates()
    .withColumnRenamed("uniqueBody", "body")
    .limit(limit)
)

# Downstream parsing expects every column to be a plain string.
for column_name in ("createdDateTime", "sender", "body"):
    emailsRaw = emailsRaw.withColumn(column_name, emailsRaw[column_name].cast(StringType()))


StatementMeta(esasynapsepool, 5, 40, Finished, Available)

In [40]:
# TEAMS CHATS CLEANING AND PREPPING

# Source column name -> name shared with the email frame.
teams_column_renames = (
    ("CreatedDateTime", "createdDateTime"),
    ("Sender", "sender"),
    ("Body", "body"),
)

teamschatsRaw = teamschatsRaw.select("CreatedDateTime", "Sender", "Body").dropDuplicates()

# Cast every column to string first ...
for source_name, _ in teams_column_renames:
    teamschatsRaw = teamschatsRaw.withColumn(source_name, teamschatsRaw[source_name].cast(StringType()))

# ... then align the column names with the email frame.
for source_name, target_name in teams_column_renames:
    teamschatsRaw = teamschatsRaw.withColumnRenamed(source_name, target_name)



StatementMeta(esasynapsepool, 5, 41, Finished, Available)

In [41]:
# EMAIL PARSER (UNIQUE BODY)

# given raw html email body (unique body), returns only text
def parse_email(raw_email):
    """Strip HTML tags from a message body and return only its text.

    Characters between "<" and ">" (inclusive) are discarded. The remaining
    text is whitespace-normalized and any word containing "HTML" is removed.

    Args:
        raw_email: Raw (possibly HTML) body text, or None.

    Returns:
        The tag-free text; an empty string when `raw_email` is None.
    """
    if raw_email is None:
        return ""

    kept_chars = []

    # Start outside of any tag so that text preceding the first tag (and
    # bodies that contain no markup at all) is preserved.
    outside_tag = True
    for char in raw_email:
        if char == "<":
            outside_tag = False

        if outside_tag:
            kept_chars.append(char)

        # Checked after the append so the closing ">" itself is not kept.
        if char == ">":
            outside_tag = True

    message = "".join(kept_chars)

    message_with_HTML_removed = " ".join(word for word in message.split() if "HTML" not in word)

    return message_with_HTML_removed


# ***used in conjunction with a spark map function
# given a row in an email dataframe, returns the row with the body converted to only text
def parse_bodies_html_to_text(row):
    """Return a copy of `row` whose "body" field holds tag-free text.

    Intended for use with an RDD `map`: the "body" value is passed through
    `parse_email` and all other fields are carried over unchanged.
    """
    text_body = parse_email(row["body"])

    fields = row.asDict()
    fields["body"] = text_body

    return Row(**fields)
    
# Pass the source schema explicitly: toDF() without a schema has to infer it
# from the data and fails on an empty RDD, which is exactly what the loader
# produces when one of the two datasets could not be read.
emails = emailsRaw.rdd.map(parse_bodies_html_to_text).toDF(emailsRaw.schema)
teamschats = teamschatsRaw.rdd.map(parse_bodies_html_to_text).toDF(teamschatsRaw.schema)

StatementMeta(esasynapsepool, 5, 42, Finished, Available)

In [42]:
# MERGE EMAIL AND TEAMS CHATS DATAFRAMES

# DataFrame.union() resolves columns by position, not by name, so both frames
# must expose their columns in the same order.
emails_and_teamschats = emails.union(teamschats)

StatementMeta(esasynapsepool, 5, 43, Finished, Available)

In [43]:
# DATE PARSER

def parse_date(date_string, date_format):
    """Convert a timestamp string to a 'YYYY-MM-DD' date string.

    Args:
        date_string: Timestamp text to parse; may be None.
        date_format: `datetime.strptime` pattern describing `date_string`.

    Returns:
        The date formatted as '%Y-%m-%d'. If the text does not match
        `date_format`, the stripped input is returned unchanged. None (or any
        non-string value) is returned as-is instead of raising, so a single
        missing timestamp cannot fail the whole Spark task.
    """
    if not isinstance(date_string, str):
        return date_string

    date_string = date_string.strip()

    try:
        date_obj = datetime.strptime(date_string, date_format)
    except ValueError:
        return date_string

    return date_obj.strftime('%Y-%m-%d')

StatementMeta(esasynapsepool, 5, 44, Finished, Available)

In [44]:
# MINE OPINIONS

def _normalize_entity_text(text):
    """Strip punctuation and title-case `text` so entity names compare consistently."""
    return re.sub(r'[^\w\s]', '', text).title()


def mine_opinions(client, documents, batch_size=5, ignore=None):
    """Extract (entity, sentiment) pairs for each document.

    Every document is sent to the sentiment (with opinion mining) and entity
    recognition endpoints. Each recognized entity starts with the document
    level sentiment; when an opinion target in the document refers to the
    same entity, the target's own sentiment replaces it.

    Args:
        client: Object exposing `analyze_sentiment(docs, show_opinion_mining=True)`
            and `recognize_entities(docs)`, e.g. a TextAnalyticsClient.
        documents: List of document strings (None entries are treated as empty).
        batch_size: Maximum number of documents per service request. The
            default of 5 follows the service's documented per-request cap for
            entity recognition (verify against the current limits page).
        ignore: Entity names to exclude. Defaults to the notebook-level
            `ignore_list`.

    Returns:
        A list with one entry per input document, each a list of
        `(entity_name, sentiment)` tuples. Documents the service reports as
        errors yield an empty list.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    ignored = ignore_list if ignore is None else ignore
    # Accept ignore entries both verbatim and in normalized form so that
    # casing/punctuation differences do not let an entity slip through.
    ignored_names = set(ignored) | {_normalize_entity_text(name) for name in ignored}

    categories_to_remove = {"Person", "Address", "PhoneNumber", "Email", "URL", "IP", "DateTime", "Quantity"}
    min_confidence = 0.6

    # limit all documents to the first 5000 text elements to prevent throttling
    documents = [(doc or "")[:5000] for doc in documents]

    entity_sentiment_tuples = []

    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]

        sentiment_result = client.analyze_sentiment(batch, show_opinion_mining=True)
        entity_result = client.recognize_entities(batch)

        for sentiment_doc, entity_doc in zip(sentiment_result, entity_result):

            # Per-document failures (e.g. empty text) come back as error
            # objects without sentiment/entities; keep alignment with an
            # empty result instead of raising.
            if getattr(sentiment_doc, "is_error", False) or getattr(entity_doc, "is_error", False):
                entity_sentiment_tuples.append([])
                continue

            entities = []
            seen = set()
            for entity in entity_doc.entities:
                if entity.category in categories_to_remove:
                    continue
                if not entity.confidence_score > min_confidence:
                    continue

                name = _normalize_entity_text(entity.text)
                pair = (name, sentiment_doc.sentiment)

                # `seen` fixes an issue where entities are duplicated based on
                # their subcategories; ignored names are filtered here too.
                if name in ignored_names or pair in seen:
                    continue

                seen.add(pair)
                entities.append(pair)

            for sentence in sentiment_doc.sentences:
                for mined_opinion in sentence.mined_opinions or []:
                    target = mined_opinion.target
                    # Normalize the target the same way as entity names,
                    # otherwise e.g. "pizza" never matches "Pizza".
                    target_name = _normalize_entity_text(target.text)

                    for index, (name, _) in enumerate(entities):
                        if name == target_name:
                            entities[index] = (name, target.sentiment)

            entity_sentiment_tuples.append(entities)

    return entity_sentiment_tuples

StatementMeta(esasynapsepool, 5, 45, Finished, Available)

In [45]:
# DATAFRAME CREATION

def add_entities_and_sentiment_columns_to_dataframe(df):
    """Mine entities and sentiments for every message and explode them to rows.

    Args:
        df: DataFrame with string columns "sender", "body" and
            "createdDateTime".

    Returns:
        A DataFrame with columns "Sender" (numeric id standing in for the
        original sender value), "createdDateTime", "Topic" and "Sentiment",
        one row per (message, entity) pair. Messages with no entities
        produce no rows.
    """
    # Replace each distinct sender with a generated numeric id.
    distinct_sender = df.select("sender").distinct().orderBy("sender")
    sender_numbers = distinct_sender.withColumn("sender_id", monotonically_increasing_id())
    df = df.join(sender_numbers, "sender")

    # Aim for roughly five documents per partition; never ask for zero
    # partitions when `limit` is smaller than 5.
    num_partitions = int(limit / 5)
    if num_partitions < 1:
        num_partitions = 1
    df = df.repartition(num_partitions)

    # Explicit schema: inferring it from the data fails when the RDD is empty
    # or when the sampled rows all carry an empty entity list.
    result_schema = StructType([
        StructField("sender", LongType(), True),
        StructField("body", StringType(), True),
        StructField("createdDateTime", StringType(), True),
        StructField("entities_and_sentiments", ArrayType(StructType([
            StructField("_1", StringType(), True),
            StructField("_2", StringType(), True),
        ])), True),
    ])

    def gather_entities_and_sentiments(client, rows):
        """Run opinion mining over one partition and attach the results to its rows."""
        copy_rows = list(rows)

        # Nothing to send for an empty partition.
        if not copy_rows:
            return []

        list_of_lists_of_entity_sentiment_tuples = mine_opinions(client, [row["body"] for row in copy_rows])

        updated_rows = []
        for i, row in enumerate(copy_rows):
            updated_rows.append(Row(
                sender = row.sender_id,
                body = row.body,
                createdDateTime = row.createdDateTime,
                entities_and_sentiments = list_of_lists_of_entity_sentiment_tuples[i],
            ))

        return updated_rows

    df = df.rdd.mapPartitions(lambda rows: gather_entities_and_sentiments(client, rows)).toDF(result_schema)

    # Keep only messages for which at least one entity was found.
    df = df.filter(size(col("entities_and_sentiments")) > 0)
    df = df.withColumnRenamed("sender", "Sender")
    df = df.select("Sender", "createdDateTime", explode("entities_and_sentiments").alias("entity_and_sentiment"))

    df = df.withColumn("Topic", col("entity_and_sentiment")["_1"])
    df = df.withColumn("Sentiment", col("entity_and_sentiment")["_2"])

    df = df.drop("entity_and_sentiment")

    return df

StatementMeta(esasynapsepool, 5, 46, Finished, Available)

In [46]:
# OUTPUT DATAFRAME

def create_output_dataframe_with_esa_schematics(df):
    """Shape the mined rows into the output layout.

    Adds a constant "Mentions" count, one 0/1 indicator column per sentiment
    label, and a "Date" column derived from "createdDateTime" via
    `parse_date` with the configured `dtformat`. The "Sentiment" and
    "createdDateTime" columns are dropped.
    """
    shaped = df.withColumn("Mentions", lit(1))

    # One indicator column per label, compared against its lower-case value.
    for label in ("Positive", "Neutral", "Mixed", "Negative"):
        shaped = shaped.withColumn(label, when(col("sentiment") == label.lower(), 1).otherwise(0))

    shaped = shaped.drop("Sentiment")

    # The format travels as a temporary literal column so the UDF receives it
    # as its second argument.
    to_date_string = udf(parse_date, StringType())
    shaped = (
        shaped
        .withColumn("Date Format", lit(dtformat))
        .withColumn("Date", to_date_string(col("createdDateTime"), col("Date Format")))
        .drop("createdDateTime")
        .drop("Date Format")
    )

    return shaped

StatementMeta(esasynapsepool, 5, 47, Finished, Available)

In [47]:
# OUTPUT CSV / PARQUET

# Fail fast on a mistyped format instead of running the whole pipeline,
# writing nothing, and still reporting success.
if outputFormat not in ("csv", "parquet"):
    raise ValueError("Unsupported outputFormat: " + repr(outputFormat) + " (expected 'csv' or 'parquet')")

print("Getting input dataframe...")
df = emails_and_teamschats

print("Finding entities and sentiments...")
df = add_entities_and_sentiment_columns_to_dataframe(df)

print("Creating output dataframe...")
df = create_output_dataframe_with_esa_schematics(df)

print("Coalescing dataframe...")
# DataFrames are immutable, so the result must be assigned for the single
# output file to take effect. repartition(1) is used rather than coalesce(1):
# coalesce would be folded into the upstream stage and run the service calls
# in a single task, while repartition adds a shuffle and keeps them parallel.
df = df.repartition(1)

if outputFormat == "csv":
    print("Saving DataFrame as CSV...")
    df.write.format("csv").option("header", "true").mode("overwrite").save(outputPath)
else:
    print("Saving DataFrame as Parquet...")
    df.write.mode("overwrite").parquet(outputPath)

print("Done!")

StatementMeta(esasynapsepool, 5, 48, Finished, Available)

Getting input dataframe...


SynapseWidget(Synapse.DataFrame, e2402bf6-1de5-410b-a5ca-795bbe57de0a)

Finding entities and sentiments...
Creating output dataframe...


SynapseWidget(Synapse.DataFrame, af60e325-96bb-405b-a671-d1d1dedd9b88)

Coalescing dataframe...
Saving DataFrame as Parquet...
Done!
