Copyright (c) Microsoft Corporation.

Licensed under the MIT License.

In [None]:
from pyspark.sql.functions import col, split,regexp_replace,reverse,initcap, lit, to_date, lower

In [None]:
# --- Notebook configuration: fill these in before running the cells below ---

# Storage account and file system (container) used to build the abfss:// path
# of the Enron CSV file.
data_lake_account_name = ''
file_system_name = ''
# name of the key vault that holds the text analytics secrets
keyvault_name = ''
# name of the linked service passed to TokenLibrary.getSecret for that key vault
KeyVaultLinkedServiceName = ''
# name of the secret in key vault for your text analytics key
text_analytics_key = ''
# Backward-compatible alias: this setting used to be spelled with a capital K,
# while the secret lookup further down reads the lowercase name.
text_analytics_Key = text_analytics_key
# name of the secret in key vault for your text analytics endpoint
text_analytics_endpoint = ''
# region your text analytics resource is in
# NOTE(review): the text analytics cell passes this value to
# TokenLibrary.getSecret as a secret name - confirm whether it should hold the
# region itself or the name of a secret containing the region.
text_analytics_region = ''

In [None]:
import uuid

# Read the labelled Enron e-mail CSV from the configured data lake location.
file_path = f"abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/enroncsvdata/enron_05_17_2015_with_labels_v2.csv"
df = spark.read.load(
    file_path,
    format='csv',
    header=True,
    quote='"',
    delimiter=',',
    escape='"',
    multiline=True,
)

df = df.select('Message-ID', 'Date', 'From', 'To', 'Subject', 'X-From', 'X-To', 'X-Folder', 'content', 'user')

# X-Folder: keep only the text after the last single quote.
df = df.withColumn('X-Folder', reverse(split(col('X-Folder'), "'")).getItem(0))

# From / To: keep the text between the first and second single quote
# (element 1 of the split), i.e. only the first quoted value.
df = df.withColumn('From', split(col('From'), "'").getItem(1))
df = df.withColumn('To', split(col('To'), "'").getItem(1))

df = df.withColumnRenamed('From', 'source') \
       .withColumnRenamed('To', 'target')

# Display name: part before '@' with dots turned into spaces, then title-cased.
# The pattern is a raw string so that the regex escape '\.' is not treated as
# an (invalid) Python string escape.
df = df.withColumn('sourcename', initcap(regexp_replace(split(col('source'), "@").getItem(0), r'\.', ' ')))
df = df.withColumn('targetname', initcap(regexp_replace(split(col('target'), "@").getItem(0), r'\.', ' ')))

# Domain: part after '@'.
df = df.withColumn('sourcedomain', split(col('source'), "@").getItem(1))
df = df.withColumn('targetdomain', split(col('target'), "@").getItem(1))

# Cap the working set first, then drop rows without a sender or recipient
# (so fewer than 10000 rows may remain).
df = df.limit(10000)

df = df.filter(df["source"].isNotNull())
df = df.filter(df["target"].isNotNull())

# map column names
# NOTE: each uuid4() below is evaluated once and wrapped in lit(), so every
# row gets the same puser / ParentFolderId / ConversationId value.
# 'Subject' is carried through unchanged by the final select.
df = df.withColumn('Id', col('Message-ID')) \
       .withColumn('puser', lit(str(uuid.uuid4()))) \
       .withColumn('ParentFolderId', lit(str(uuid.uuid4()))) \
       .withColumn('ConversationId', lit(str(uuid.uuid4()))) \
       .withColumn('UniqueBody', col('content')) \
       .withColumn('CreatedDateTime', col('Date')) \
       .withColumn('LastModifiedDateTime', col('Date')) \
       .withColumn('SentDateTime', col('Date')) \
       .withColumn('SentDate', to_date('Date')) \
       .withColumn('Sender', lower('source')) \
       .withColumn('Recipient', lower('target')) \
       .withColumn('RType', lit('To')) \
       .withColumn('Sender_Domain', col('sourcedomain')) \
       .withColumn('Recipient_Domain', col('targetdomain')) \
       .withColumn('ToRecipients_cnt', lit(1)) \
       .withColumn('CcRecipients_cnt', lit(0)) \
       .withColumn('BccRecipients_cnt', lit(0)) \
       .withColumn('Recipients_cnt', (col('ToRecipients_cnt') + col('CcRecipients_cnt') + col('BccRecipients_cnt'))) \
       .withColumn('LoadFolderPath', lit('test_path')) \
       .select('Id', 'puser', 'ParentFolderId', 'ConversationId', 'Subject', 'UniqueBody', 'CreatedDateTime', 'LastModifiedDateTime', 'SentDateTime',
               'SentDate', 'Sender', 'Recipient', 'RType', 'Sender_Domain', 'Recipient_Domain', 'ToRecipients_cnt', 'CcRecipients_cnt', 'BccRecipients_cnt', 'Recipients_cnt', 'LoadFolderPath')

# Persist the mapped messages; later cells query this table by name.
df.write.mode("overwrite").saveAsTable("messages_all_data")



In [None]:
# Derive one synthetic "event" row per message in messages_all_data: the
# sender becomes the organizer, the recipient the attendee, and the sent
# timestamp is used for both start and end.
# Attendee_Response is a typed NULL (cast to string) so the column has a
# concrete type when the table is saved.
df_events = spark.sql('''select Id, puser, Id as ICalUid, Subject, False as Recurrence, False as IsCancelled,
             SentDateTime as Start, SentDate as Start_Date, SentDateTime as End, Sender as Organizer, Recipient as Attendee, 
             cast(null as string) as Attendee_Response, 'Required' as Attendee_Type, Sender_Domain as Organizer_Domain, Recipient_Domain as Attendee_Domain,
             2 as Attendees_cnt, 'Remote' as location_flag from messages_all_data''')

df_events.write.mode("overwrite").saveAsTable("events_all_data")

In [None]:
# Stage 1: keep only messages sent by the 100 enron.com senders that reach
# the largest number of distinct recipient domains.
top_senders_sql = '''select * from messages_all_data where Sender in 
(
    select Sender from messages_all_data
    where Sender_domain = 'enron.com'
    group by Sender
    order by count(distinct Recipient_Domain) desc
    limit 100
)'''
spark.sql(top_senders_sql).write.mode("overwrite").saveAsTable("messagestempdata")

# Stage 2: from that subset, keep only messages addressed to the 100
# recipients that are written to from the largest number of distinct
# sender domains.
top_recipients_sql = '''select * from messagestempdata where Recipient in 
(
    select Recipient from messagestempdata
    group by Recipient
    order by count(distinct Sender_Domain) desc
    limit 100
)'''
df_messages = spark.sql(top_recipients_sql)

# Final filtered message set used by the remaining cells.
df_messages.write.mode("overwrite").saveAsTable("messagesdata")

In [None]:
from pyspark.sql.functions import monotonically_increasing_id, col

# Build synthetic events from the filtered messages: sender -> organizer,
# recipient -> attendee, sent timestamp -> start and end.
events_sql = '''select Id, puser, Id as ICalUid, Subject, False as Recurrence, False as IsCancelled,
             SentDateTime as Start, SentDate as Start_Date, SentDateTime as End, Sender as Organizer, Recipient as Attendee, 
             'Accepted' as Attendee_Response, 'Required' as Attendee_Type, Sender_Domain as Organizer_Domain, Recipient_Domain as Attendee_Domain,
             2 as Attendees_cnt, 'Remote' as location_flag from messagesdata'''

# Sub-sample: tag rows with a generated id, keep those whose id is divisible
# by 2 or by 3, then discard the helper column again.
keep_row = (col('idx') % 2 == 0) | (col('idx') % 3 == 0)
df_events = (
    spark.sql(events_sql)
    .withColumn("idx", monotonically_increasing_id())
    .filter(keep_row)
    .drop('idx')
)

df_events.write.mode("overwrite").saveAsTable("eventsdata")

In [None]:
# Distinct sender domains and distinct recipient domains, each saved to its
# own table.
df_source_domains = spark.sql('select distinct Sender_Domain from messagesdata')
df_target_domains = spark.sql('select distinct Recipient_Domain from messagesdata')

df_source_domains.write.mode("overwrite").saveAsTable("sender_domains")
df_target_domains.write.mode("overwrite").saveAsTable("recipient_domains")

# Every domain that appears on either side of a message, de-duplicated.
sql_str = '''select distinct domain from (
    select distinct Sender_Domain as domain from messagesdata 
    union all 
    select distinct Recipient_Domain as domain from messagesdata 
)'''
df_all_domains = spark.sql(sql_str)
(
    df_all_domains.write
    .mode("overwrite")
    .saveAsTable("all_domains")
)

In [None]:
# Employees: every address on either side of a message whose domain is
# enron.com. The name is the part before '@' with dots replaced by spaces.
sql_str = '''
select distinct employee_name,employee_email,employee_domain
from
(
    select distinct replace(split(sender, '@')[0],'.', ' ') as employee_name, Sender as employee_email, Sender_domain as employee_domain
    from messagesdata where Sender_Domain = 'enron.com'

    union all 

    select distinct replace(split(Recipient, '@')[0],'.', ' ') as employee_name, Recipient as employee_email, Recipient_domain as employee_domain
    from messagesdata where Recipient_domain = 'enron.com'
)
'''

df_employees = spark.sql(sql_str)
(
    df_employees.write
    .mode("overwrite")
    .saveAsTable("employees")
)

In [None]:
# Contacts: every address on either side of a message whose domain is NOT
# enron.com. The name is the part before '@' with dots replaced by spaces.
sql_str = '''
select distinct contact_name,contact_email,contact_domain
from
(
    select distinct replace(split(sender, '@')[0],'.', ' ') as contact_name, sender as contact_email, sender_domain as contact_domain
    from messagesdata where sender_domain != 'enron.com'

    union all 

    select distinct replace(split(Recipient, '@')[0],'.', ' ') as contact_name, Recipient as contact_email, Recipient_domain as contact_domain
    from messagesdata where Recipient_domain != 'enron.com'
)
'''
df_contacts = spark.sql(sql_str)
(
    df_contacts.write
    .mode("overwrite")
    .saveAsTable("contacts")
)

In [None]:
# Combined directory: rows from the employees and contacts tables under
# common column names, tagged with a contact_type of 'employee' or 'contact'.
sql_str = '''
select distinct name,email,domain,contact_type
from
(
    select distinct employee_name as name, employee_email as email, employee_domain as domain, 'employee' as contact_type
    from employees
    union all 
    select distinct contact_name as name, contact_email as email, contact_domain as domain, 'contact' as contact_type
    from contacts
)
'''
df_emp_contacts = spark.sql(sql_str)
(
    df_emp_contacts.write
    .mode("overwrite")
    .saveAsTable("emp_contacts")
)

In [None]:
# One row per (sender, recipient) pair with the number of messages between them.
links_sql = '''select sender, recipient, count(*) as emailcount from messagesdata group by sender, recipient'''
df_links = spark.sql(links_sql)

# Show the first two rows as a quick sanity check.
preview_rows = df_links.take(2)
display(preview_rows)

In [None]:
import networkx as nx

# Directed graph of who e-mails whom: one edge per (sender, recipient) pair,
# weighted by the reciprocal of the number of messages on that pair.
# NOTE(review): the weight shrinks as the e-mail count grows - confirm this is
# the intended meaning for the centrality measures computed from this graph.
G = nx.DiGraph()
G.add_weighted_edges_from(
    (link.sender, link.recipient, 1 / int(link.emailcount))
    for link in df_links.collect()
)


In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
import networkx as nx

def get_eigen_centrality(G):
    """Return a Spark DataFrame of eigenvector centrality per graph node.

    Centrality is computed by networkx using the 'weight' edge attribute.
    The result has columns 'Emp' (node) and 'EigenCentrality' (score
    multiplied by 10 and rounded to 3 decimals), sorted by score descending.
    """
    scores = nx.eigenvector_centrality(G, weight='weight')
    # `round` here is the Spark column function brought in by the star import.
    scaled_score = round(col('EigenCentrality') * lit(10), 3)
    return (
        spark.createDataFrame(scores.items())
        .toDF('Emp', 'EigenCentrality')
        .withColumn('EigenCentrality', scaled_score)
        .orderBy(col('EigenCentrality').desc())
    )


df_ec = get_eigen_centrality(G)
# display(df_ec)
(
    df_ec.write
    .mode("overwrite")
    .saveAsTable("eigen_centralities")
)

In [None]:
def get_degree_centrality(G):
    """Return a Spark DataFrame of degree centrality per graph node.

    The result has columns 'Emp' (node) and 'DegreeCentrality' (networkx
    score multiplied by 10 and rounded to 3 decimals), sorted by score
    descending.
    """
    scores = nx.degree_centrality(G)
    # `round` here is the Spark column function brought in by the star import.
    scaled_score = round(col('DegreeCentrality') * lit(10), 3)
    return (
        spark.createDataFrame(scores.items())
        .toDF('Emp', 'DegreeCentrality')
        .withColumn('DegreeCentrality', scaled_score)
        .orderBy(col('DegreeCentrality').desc())
    )


df_dc = get_degree_centrality(G)
(
    df_dc.write
    .mode("overwrite")
    .saveAsTable("degree_centralities")
)

In [None]:
# Resolve the Text Analytics settings from Key Vault; the configuration values
# are used as secret names for the lookup.
TEXT_ANALYTICS_KEY = TokenLibrary.getSecret(keyvault_name,text_analytics_key,KeyVaultLinkedServiceName)
TEXT_ANALYTICS_ENDPOINT = TokenLibrary.getSecret(keyvault_name,text_analytics_endpoint,KeyVaultLinkedServiceName)
# NOTE(review): the region is also looked up as a secret name - confirm that
# text_analytics_region holds a secret name rather than the region itself.
TEXT_ANALYTICS_REGION = TokenLibrary.getSecret(keyvault_name,text_analytics_region,KeyVaultLinkedServiceName)

# Sample of messages to run entity recognition on. Reads from messagesdata,
# the filtered message table this notebook saves earlier and which provides
# all four selected columns.
df_data = spark.sql('''select sender, recipient, subject,uniquebody from messagesdata limit 1000''')


from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient
def authenticate_client():
    """Create a Text Analytics client from the module-level key and endpoint.

    Returns: text analytics client
    """
    return TextAnalyticsClient(
        endpoint=TEXT_ANALYTICS_ENDPOINT,
        credential=AzureKeyCredential(TEXT_ANALYTICS_KEY),
    )


# Shared client used by the sentiment / entity / key-phrase helpers below.
text_analytics_client = authenticate_client()

def get_sentiment(inp_text):
    """Score the sentiment of one text with the shared Text Analytics client.

    Args:
        inp_text: the text to analyse; it is sent as a single-document batch.

    Returns:
        A ``(sentiment, overallscore)`` tuple, where ``sentiment`` is the
        label reported by the service and ``overallscore`` is the positive
        confidence plus half of the neutral confidence. If anything goes
        wrong, ``("Neutral", 0.5)`` is returned instead.
    """
    try:
        # The request itself is inside the try block so that a failed call
        # gets the same fallback as an unusable response, matching the other
        # helpers in this cell.
        response = text_analytics_client.analyze_sentiment(documents=[inp_text])[0]
        scores = response.confidence_scores
        overallscore = scores.positive + (0.5 * scores.neutral)
        return response.sentiment, overallscore
    except Exception as err:
        print("Encountered Sentiment exception. {}".format(err))
        return "Neutral", 0.5
def get_ner(inp_text):
    """Recognize named entities in one text with the shared client.

    Returns a list with one dict per entity (keys: text, category,
    subcategory, length, offset, confidence_score). If the request or the
    response handling raises, the error is printed and an empty list is
    returned.
    """
    entity_fields = ("text", "category", "subcategory", "length", "offset", "confidence_score")
    try:
        doc_result = text_analytics_client.recognize_entities(documents=[inp_text])[0]
        return [
            {field: getattr(entity, field) for field in entity_fields}
            for entity in doc_result.entities
        ]
    except Exception as err:
        print("Encountered NER exception. {}".format(err))
    return []
def get_key_phrases(inp_text):
    """Extract key phrases from one text with the shared Text Analytics client.

    Args:
        inp_text: the text to analyse; it is sent as a single-document batch.

    Returns:
        The key phrases reported for the document. If the service flags the
        document as an error, or the call raises, the problem is printed and
        an empty list is returned.
    """
    try:
        documents = [inp_text]
        response = text_analytics_client.extract_key_phrases(documents=documents)[0]
        if not response.is_error:
            return response.key_phrases
        print(response.id, response.error)
    except Exception as err:
        # The message names the operation that actually failed.
        print("Encountered Key Phrases exception. {}".format(err))
    return []

from pyspark.sql import Row

# UDF return type: language code -> list of entities, each entity being a
# string-to-string map.
schema = MapType(
    StringType(),
    ArrayType(MapType(StringType(), StringType())),
)


def get_ners(content):
    """Return the entities found in `content`, keyed under the language "en"."""
    return {"en": get_ner(content)}


# Spark UDF wrapper so entity recognition can be applied to a column.
get_ner_udf = udf(get_ners, returnType=schema)

# Run entity recognition on each message body and flatten the result into one
# row per (message, entity).
# The body column is 'uniquebody': that is the column df_data selects, and
# there is no 'content' column in it.
df_ner = (
    df_data.select(['sender', 'recipient', 'subject', 'uniquebody'])
    .withColumn('NER', get_ner_udf(col("uniquebody")))
    # Explode the map: one row per language key with its list of entities.
    .select(explode(col("NER")).alias("language", "ner1"), 'sender', 'recipient')
    # Explode the list: one row per entity.
    .select(explode(col("ner1")).alias("ner2"), 'sender', 'recipient', 'language')
    .select([
        'sender',
        'recipient',
        'language',
        col("ner2.text").alias("value"),
        col("ner2.subcategory").alias("subcategory"),
        col("ner2.offset").alias("offset"),
        col("ner2.confidence_score").alias("confidence_score"),
        col("ner2.category").alias("category"),
        col("ner2.length").alias("length"),
    ])
)

# Appends to the table, so earlier contents of messages_entities are kept.
df_ner.write.mode("append").saveAsTable("messages_entities")

In [None]:
#Spacy Text Analytics

# import subprocess
# import sys
# process = subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
# # print(process.stdout.decode('utf-8'))

# df_data = spark.sql('''select sender, recipient, subject,uniquebody from messagesdata''')
# # display(df_data.take(2))

# import spacy 
# nlp = spacy.load("en_core_web_sm")

# def get_ner(inp_text):
#     try:  
#         doc = nlp(inp_text)
#         result = []
#         for ent in doc.ents:
#             if ent.label_ in (['PERSON','ORG','GPE']):
#                 ent1 = {
#                     "text": ent.text,
#                     "category": ent.label_,
#                     "subcategory": ent.label_,
#                     "length": 0,
#                     "offset": 0,
#                     "confidence_score": 1.0
#                 }
#                 result.append(ent1)
#         return [{"text": x['text'], "category": x['category'], "subcategory": x['subcategory'], "length": x['length'], "offset": x['offset'], "confidence_score": x['confidence_score']} for x in result]
#         #return [{"text": x.text, "category": x.category, "subcategory": x.subcategory, "length": x.length, "offset": x.offset, "confidence_score": x.confidence_score} for x in result]
#     except Exception as err:
#         print("Encountered NER exception. {}".format(err))
#         return []


# from pyspark.sql.types import *
# from pyspark.sql.functions import *

# schema = MapType(StringType(),ArrayType(MapType(StringType(),StringType())))
        
# def get_ners(content):
#     named_entities = []
#     named_entities = get_ner(content) 
#     named_entity_obj = {"en": named_entities}
#     return(named_entity_obj)

# get_ner_udf = udf(lambda content: get_ners(content),returnType=schema)

# df_ner = df_data.select(['sender', 'recipient', 'subject','uniquebody']) \
#                 .withColumn('NER', get_ner_udf(col("uniquebody"))) \
#                 .select(explode(col("NER")).alias("language","ner1"),'sender','recipient') \
#                 .select(explode(col("ner1")).alias("ner2"),'sender','recipient','language') \
#                 .select(['sender','recipient','language',col("ner2.text").alias("value"),col("ner2.subcategory").alias("subcategory"), \
#                 col("ner2.offset").alias("offset"),col("ner2.confidence_score").alias("confidence_score"), col("ner2.category").alias("category"), \
#                 col("ner2.length").alias("length")])

# df_ner.write.mode("overwrite").saveAsTable("messages_entities")