In [2]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import re

In [3]:
initial_dataset = 'datasets/emails.csv'
cleaned_dataset = 'datasets/emails_cleaned.csv'
sample_dataset = 'datasets/sample_emails.csv' # this contains with hierarchial order


In [3]:

df = pd.read_csv(initial_dataset)
df.head(5)

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [4]:
print(df.iloc[22,1])


Message-ID: <26575732.1075855687756.JavaMail.evans@thyme>
Date: Mon, 2 Oct 2000 02:19:00 -0700 (PDT)
From: phillip.allen@enron.com
To: bs_stone@yahoo.com
Subject: Re: Original Sept check/closing
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: "BS Stone" <bs_stone@yahoo.com> @ ENRON
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Dec2000\Notes Folders\'sent mail
X-Origin: Allen-P
X-FileName: pallen.nsf

Brenda,

 Please use the second check as my October payment.  I have my copy of the 
original deal.  Do you want me to fax this to you?

Phillip


In [5]:
def info_part(i):
    """split infomation part out"""
    return i.split('\n\n', 1)[0]
def content_part(i):
    """split content part out"""
    return i.split('\n\n', 1)[1]
df['pre_info'] = df.message.map(info_part)
df['content'] = df.message.map(content_part)
df['test_true'] = True

words2split = ['Message-ID: ', 'Date: ', 'From: ', 'To: ', 'Subject: ', 'Cc: ', 'Mime-Version: ', 'Content-Type: ',
               'Content-Transfer-Encoding: ', 'Bcc: ', 'X-From: ', 'X-To: ', 'X-cc: ', 'X-bcc: ', 'X-Folder: ', 'X-Origin: ',
               'X-FileName: ']
features_naming = [i[:-2] for i in words2split]
split_condition = '|'.join(words2split)

In [6]:
# Some emails' subject confuse the string-spliting function, so I make a little change
def duplicated_info(i):
    return i.replace(' Date: ', ' Date- ').replace(' Subject: ', ' Subject2: ').replace(' To: ',
                    ' To- ').replace(' (Subject: ', ' (Subject- ')
df['pre_info'] = df['pre_info'].map(duplicated_info)

# let's check how many categories are there in these emails
def num_part(i):
    return len(re.split(split_condition, i))
df['num_info'] = df['pre_info'].map(num_part)

# around 20k emails do not have the 'To: ' category, so I add one
def add_to(i):
    return i.replace('\nSubject: ', '\nTo: \nSubject: ')
temp_condition = (df['num_info'] == 17) | (df['num_info'] == 15)
df.loc[temp_condition, 'pre_info'] = df.loc[temp_condition, 'pre_info'].map(add_to)


# similar way to deal with the "Cc:" and "Bcc:" categories
temp_condition = (df['num_info'] == 16) | (df['num_info'] == 15)
def add_bcc(i):
    return i.replace('\nX-From: ', '\nBcc: \nX-From: ')
df.loc[temp_condition, 'pre_info'] = df.loc[temp_condition, 'pre_info'].map(add_bcc)
def add_cc(i):
    return i.replace('\nMime-Version: ', '\nCc: \nMime-Version: ')
df.loc[temp_condition, 'pre_info'] = df.loc[temp_condition, 'pre_info'].map(add_cc)

In [7]:
df['num_info'] = df['pre_info'].map(num_part)
df['num_info'].value_counts()

num_info
18    517398
5          2
6          1
Name: count, dtype: int64

In [8]:
df_remove = df.loc[df['num_info'] != 18].copy()
df = df.loc[df['num_info'] == 18].copy()

In [9]:
global feature_idx
def info_split(i):
    ## split the i th part out and remove \n for the feature
    return re.split(split_condition, i)[feature_idx+1][:-2]
def info_split_last(i):
    ## no need to remove \n for last category -- X-FileName
    return re.split(split_condition, i)[feature_idx+1]
for feature_idx in range(len(words2split)):
    if feature_idx != len(words2split) - 1:
        df[features_naming[feature_idx]] = df['pre_info'].map(info_split)
    else:
        df[features_naming[feature_idx]] = df['pre_info'].map(info_split_last) 

In [10]:
df['Content-Transfer-Encoding'].value_counts()


Content-Transfer-Encoding
7bi                            494994
quoted-printabl                 22399
base6                               4
text/plain; charset=us-asci         1
Name: count, dtype: int64

In [11]:
df_remove2 = df.loc[df['Content-Transfer-Encoding'] == 'text/plain; charset=us-asci']
df = df.loc[df['Content-Transfer-Encoding'] != 'text/plain; charset=us-asci']

In [12]:
def split_other_content(i):
    """split other forms of contents out"""
    return i.split('-------------', 1)[0]
df["has_other_content"] = df["content"].str.contains("-------------")
df["if_forwarded"] = df["content"].str.contains("------------- Forwarded")
df['content'] = df.content.map(split_other_content)

In [13]:
df = df.drop(['pre_info','test_true', 'num_info'], axis = 1).set_index("file")
df.to_csv(cleaned_dataset, index = False)

In [52]:
df = pd.read_csv(cleaned_dataset)
df['Main_To'] = df['To'].astype(str).str.split(',').str[0].str.strip()

df.head(5)


Unnamed: 0,message,content,Message-ID,Date,From,To,Subject,Cc,Mime-Version,Content-Type,...,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,has_other_content,if_forwarded,Main_To
0,Message-ID: <18782981.1075855378110.JavaMail.e...,Here is our forecast\n\n,<18782981.1075855378110.JavaMail.evans@thyme,"Mon, 14 May 2001 16:39:00 -0700 (PDT",phillip.allen@enron.co,tim.belden@enron.co,,,1.0,text/plain; charset=us-asci,...,Phillip K Alle,Tim Belden <Tim Belden/Enron@EnronXGate,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-,pallen (Non-Privileged).pst,False,False,tim.belden@enron.co
1,Message-ID: <15464986.1075855378456.JavaMail.e...,Traveling to have a business meeting takes the...,<15464986.1075855378456.JavaMail.evans@thyme,"Fri, 4 May 2001 13:51:00 -0700 (PDT",phillip.allen@enron.co,john.lavorato@enron.co,Re,,1.0,text/plain; charset=us-asci,...,Phillip K Alle,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-,pallen (Non-Privileged).pst,False,False,john.lavorato@enron.co
2,Message-ID: <24216240.1075855687451.JavaMail.e...,test successful. way to go!!!,<24216240.1075855687451.JavaMail.evans@thyme,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT",phillip.allen@enron.co,leah.arsdall@enron.co,Re: tes,,1.0,text/plain; charset=us-asci,...,Phillip K Alle,Leah Van Arsdal,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mai,Allen-,pallen.nsf,False,False,leah.arsdall@enron.co
3,Message-ID: <13505866.1075863688222.JavaMail.e...,"Randy,\n\n Can you send me a schedule of the s...",<13505866.1075863688222.JavaMail.evans@thyme,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT",phillip.allen@enron.co,randall.gay@enron.co,,,1.0,text/plain; charset=us-asci,...,Phillip K Alle,Randall L Ga,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mai,Allen-,pallen.nsf,False,False,randall.gay@enron.co
4,Message-ID: <30922949.1075863688243.JavaMail.e...,Let's shoot for Tuesday at 11:45.,<30922949.1075863688243.JavaMail.evans@thyme,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT",phillip.allen@enron.co,greg.piper@enron.co,Re: Hell,,1.0,text/plain; charset=us-asci,...,Phillip K Alle,Greg Pipe,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mai,Allen-,pallen.nsf,False,False,greg.piper@enron.co


In [57]:
filtered_df = df[(df['has_other_content'] == False) & (df['if_forwarded'] == False)]
# remove the rows if there are less than 2 " " in the content
print(f"Total rows: {len(filtered_df)}")
for i in range(50):
    # print the content of the first 50 emails
    print(f"Email {i+1}:")
    print("--------------------------------------------------")
    print(filtered_df.iloc[i]['content'])
    print("--------------------------------------------------")


Total rows: 420037
Email 1:
--------------------------------------------------
Here is our forecast

 
--------------------------------------------------
Email 2:
--------------------------------------------------
Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.

As far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not.  Too often the presenter speaks and the others are quiet just waiting for their turn.   The meetings might be better if held in a round table discussion format.  

My suggestion for where to go is Austin.  Play golf and rent a ski boat and jet ski's.  Flying somewhere takes too much time.


In [58]:
filtered_df = filtered_df[filtered_df['content'].str.count(' ') > 2]
filtered_df = filtered_df[~filtered_df['content'].str.contains('From:', case=False, na=False) & 
                                    ~filtered_df['content'].str.contains('To:', case=False, na=False) &
                                    ~filtered_df['content'].str.contains('Message from:', case=False, na=False) ]
# remove the rows if the size of content is more than 1000
filtered_df = filtered_df[filtered_df['content'].str.len() < 1000]
# remove rows if the from email is also present in to email or to email is null
filtered_df = filtered_df[~filtered_df['To'].isnull()]

print(f"Total rows: {len(filtered_df)}")
for i in range(50):
    # print the content of the first 50 emails
    print(f"Email {i+1}:")
    print("--------------------------------------------------")
    print(filtered_df.iloc[i]['content'])
    print("--------------------------------------------------")


Total rows: 192690
Email 1:
--------------------------------------------------
Here is our forecast

 
--------------------------------------------------
Email 2:
--------------------------------------------------
Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.

As far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not.  Too often the presenter speaks and the others are quiet just waiting for their turn.   The meetings might be better if held in a round table discussion format.  

My suggestion for where to go is Austin.  Play golf and rent a ski boat and jet ski's.  Flying somewhere takes too much time.


In [59]:
sample_emails = filtered_df.sample(n=25000, random_state=42)  # Use random_state for reproducibility
sample_emails['Cc'] = sample_emails['Cc'].replace(["", None], "None")

# Save to CSV
sample_emails.to_csv(sample_dataset, index=False)

print("Sample emails saved to sample_emails.csv")


Sample emails saved to sample_emails.csv


In [60]:
sample_emails.head(50)


Unnamed: 0,message,content,Message-ID,Date,From,To,Subject,Cc,Mime-Version,Content-Type,...,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,has_other_content,if_forwarded,Main_To
86994,Message-ID: <31664984.1075852142386.JavaMail.e...,I believe all the bugs have been worked out of...,<31664984.1075852142386.JavaMail.evans@thyme,"Wed, 17 Oct 2001 10:19:20 -0700 (PDT",kate.symes@enron.co,shift.dl-portland@enron.co,Revised EOL Download - PLEASE US,,1.0,text/plain; charset=us-asci,...,"Symes, Kate </O=ENRON/OU=NA/CN=RECIPIENTS/CN=K...",DL-Portland Real Time Shift </O=ENRON/OU=NA/CN...,,,"\CDEAN2 (Non-Privileged)\Dean, Craig\Inbo",DEAN-,CDEAN2 (Non-Privileged).pst,False,False,shift.dl-portland@enron.co
144846,Message-ID: <5115047.1075855610475.JavaMail.ev...,Yahoo! News\tEdit Breaking News Alerts - Yahoo...,<5115047.1075855610475.JavaMail.evans@thyme,"Fri, 4 May 2001 23:13:00 -0700 (PDT",alerts-breakingnews@yahoo-inc.co,mike.grigsby@enron.co,Yahoo! Breaking New,,1.0,text/plain; charset=us-asci,...,"""Yahoo! Alerts - Breaking News"" <alerts-breaki...",mike.grigsby@enron.co,,,\Michael_Grigsby_Jun2001\Notes Folders\Notes inbo,Grigsby-,mgrigsb.nsf,False,False,mike.grigsby@enron.co
427691,Message-ID: <12042696.1075844566306.JavaMail.e...,Are you still working every other day? Please...,<12042696.1075844566306.JavaMail.evans@thyme,"Mon, 11 Sep 2000 06:22:00 -0700 (PDT",sara.shackleton@enron.co,michael.khajeh-noori@enron.co,SIT,,1.0,text/plain; charset=us-asci,...,Sara Shackleto,Michael Khajeh-Noor,,,\Sara_Shackleton_Dec2000_June2001_1\Notes Fold...,SHACKLETON-,sshackle.nsf,False,False,michael.khajeh-noori@enron.co
322555,Message-ID: <17502456.1075846133538.JavaMail.e...,Kay -\n\nI've reached a verbal agreement with ...,<17502456.1075846133538.JavaMail.evans@thyme,"Thu, 9 Nov 2000 10:05:00 -0800 (PST",mitch.robinson@enron.co,kay.mann@enron.co,SWPC Agreemen,,1.0,text/plain; charset=us-asci,...,Mitch Robinso,Kay Man,,,\Kay_Mann_June2001_4\Notes Folders\Westinghous,MANN-,kmann.nsf,False,False,kay.mann@enron.co
81652,Message-ID: <8313323.1075843861667.JavaMail.ev...,This copy should correct all of the the typos ...,<8313323.1075843861667.JavaMail.evans@thyme,"Wed, 7 Mar 2001 11:11:00 -0800 (PST",jeff.dasovich@enron.co,jeff.dasovich@enron.co,Corrected Version of Scenario,"harry.kingerski@enron.com, iris.waser@enron.co...",1.0,text/plain; charset=us-asci,...,Jeff Dasovic,Jeff Dasovic,"Harry Kingerski, Iris Waser, James D Steffes, ...",,\Jeff_Dasovich_June2001\Notes Folders\Sen,DASOVICH-,jdasovic.nsf,False,False,jeff.dasovich@enron.co
325128,Message-ID: <30868936.1075852949613.JavaMail.e...,The report named: NG - PROPT P/L <http://trv.c...,<30868936.1075852949613.JavaMail.evans@thyme,"Fri, 19 Oct 2001 14:46:23 -0700 (PDT",errol.mclaughlin@enron.co,"john.arnold@enron.com, bilal.bajwa@enron.com, ...",TRV Notification: (NG - PROPT P/L - 10/19/2001,,1.0,text/plain; charset=us-asci,...,"McLaughlin Jr., Errol </O=ENRON/OU=NA/CN=RECIP...","Arnold, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=...",,,\LMAY2 (Non-Privileged)\Inbo,May-,LMAY2 (Non-Privileged).pst,False,False,john.arnold@enron.com
216804,Message-ID: <23840170.1075856432272.JavaMail.e...,Hi everyone.\n\nPlease rsvp to me by tomorrow ...,<23840170.1075856432272.JavaMail.evans@thyme,"Thu, 3 May 2001 02:39:00 -0700 (PDT",shirley.crenshaw@enron.co,"vince.kaminski@enron.com, stinson.gibner@enron...",Research Get-Together at Sandeep Kohli's New Hom,,1.0,text/plain; charset=us-asci,...,Shirley Crensha,"Vince J Kaminski, Stinson Gibner, Pinnamaneni ...",,,\Vincent_Kaminski_Jun2001_2\Notes Folders\Disc...,Kaminski-,vkamins.nsf,False,False,vince.kaminski@enron.com
384495,Message-ID: <21558926.1075859514971.JavaMail.e...,I have been asked by numerous parties to put t...,<21558926.1075859514971.JavaMail.evans@thyme,"Thu, 20 Jan 2000 09:16:00 -0800 (PST",stuart.zisman@enron.co,"elizabeth.sager@enron.com, greg.johnston@enron...",Draft of Unit Contingent Term Sheet Outlin,,1.0,text/plain; charset=us-asci,...,Stuart Zisma,"Elizabeth Sager, Greg Johnston, Scott Healy, K...",,,\Elizabeth_Sager_Dec2000\Notes Folders\All doc...,Sager-,esager.nsf,False,False,elizabeth.sager@enron.com
468495,Message-ID: <11681742.1075841766782.JavaMail.e...,Deal # 460860 is entered as flowing every day ...,<11681742.1075841766782.JavaMail.evans@thyme,"Thu, 16 Nov 2000 05:49:00 -0800 (PST",sharen.cason@enron.co,"kate.symes@enron.com, carla.hoffman@enron.co",,,1.0,text/plain; charset=us-asci,...,Sharen Caso,"Kate Symes, Carla Hoffma",,,\kate symes 6-27-02\Notes Folders\Discussion t...,SYMES-,kate symes 6-27-02.nsf,False,False,kate.symes@enron.com
510349,Message-ID: <4890830.1075839946930.JavaMail.ev...,"We are 7 MW long all hours at Montana System, ...",<4890830.1075839946930.JavaMail.evans@thyme,"Fri, 1 Jun 2001 16:41:44 -0700 (PDT",cara.semperger@enron.co,shift.portland@enron.co,We have length at Colstrip all hours on Monday...,"diana.scholtes@enron.com, sean.crandall@enron.co",1.0,text/plain; charset=us-asci,...,"Semperger, Cara </O=ENRON/OU=NA/CN=RECIPIENTS/...",Portland Shift </O=ENRON/OU=NA/CN=RECIPIENTS/C...,"Scholtes, Diana </O=ENRON/OU=NA/CN=RECIPIENTS/...",,"\ExMerge - Williams III, Bill\Preschedul",WILLIAMS-W,,False,False,shift.portland@enron.co


In [61]:
for i in range(50):
    # print the content of the first 50 emails
    print(f"Email {i+1}:")
    print("--------------------------------------------------")
    print(sample_emails.iloc[i]['content'])
    print("--------------------------------------------------")

Email 1:
--------------------------------------------------
I believe all the bugs have been worked out of this one. It pulls in inter-desk swaps and includes all the features you guys have requested - sorts by delivery point or hour, able to refresh, etc.

I've saved the download as EOL Download (in P://Real Time/Increment/Windows 2000) and deleted the file called NEW EOL DOWNLOAD. Test is out and let me know what you think.

Thanks.

Kate Symes
Real Time Trading Support
Office/503-464-7744
Cell/503-819-2181
Fax/503-464-7996


--------------------------------------------------
Email 2:
--------------------------------------------------
Yahoo! News	Edit Breaking News Alerts - Yahoo!
	
	

	Breaking News Alert	 edit
DAMASCUS, Syria _ Pope John Paul II arrives in Syria. 	
	
	Click here!
	[IMAGE]
		


If you no longer wish to receive this alert, click here to unsubscribe.
If you have questions, send us feedback.

Copyright , 1994-2001 Yahoo! Inc. All rights reserved. Yahoo Privacy Policy
-

# Hierchial Order

In [62]:
import pandas as pd
import networkx as nx

# Load dataset
df = pd.read_csv(sample_dataset)

# Fill missing 'Cc' with empty string
df['Cc'] = df['Cc'].fillna("")

# Parse edges from 'From' -> 'To' and 'Cc' (but not interpreting direction as superiority)
edges = []
for _, row in df.iterrows():
    sender = row['From']
    to_list = str(row['To']).split(",") if pd.notna(row['To']) else []
    cc_list = str(row['Cc']).split(",") if pd.notna(row['Cc']) else []

    for recipient in to_list + cc_list:
        recipient = recipient.strip()
        if recipient:
            edges.append((sender, recipient))

# Build directed graph
G = nx.DiGraph()
G.add_edges_from(edges)

# Compute centrality metrics
degree_centrality = nx.degree_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)
closeness_centrality = nx.closeness_centrality(G)
pagerank = nx.pagerank(G)


In [63]:
metrics = {
    "degree": degree_centrality,
    "betweenness": betweenness_centrality,
    "closeness": closeness_centrality,
    "pagerank": pagerank
}

# Step 4: Function to infer hierarchy
def infer_hierarchy_label(sender, recipient):
    if sender not in G or recipient not in G:
        return "Unknown"
    
    sender_score = sum(metric.get(sender, 0) for metric in metrics.values())
    recipient_score = sum(metric.get(recipient, 0) for metric in metrics.values())

    if sender_score > recipient_score:
        return "Sender higher"
    elif sender_score < recipient_score:
        return "Recipient higher"
    else:
        return "Similar level"

# Step 5: Apply to each row
hierarchy_labels = []
for _, row in df.iterrows():
    sender = row['From']
    recipient = row['Main_To']

    if pd.isna(sender) or pd.isna(recipient):
        hierarchy_labels.append("Unknown")
    else:
        label = infer_hierarchy_label(sender.strip(), recipient.strip())
        hierarchy_labels.append(label)

# Step 6: Add result and save to file
df['Hierarchy_Label'] = hierarchy_labels


df = df[df['Hierarchy_Label'] != 'Unknown']

df.to_csv(sample_dataset, index=False)

print(f"Updated file saved to: {sample_dataset}")
df.head(5)

Updated file saved to: datasets/sample_emails.csv


Unnamed: 0,message,content,Message-ID,Date,From,To,Subject,Cc,Mime-Version,Content-Type,...,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,has_other_content,if_forwarded,Main_To,Hierarchy_Label
0,Message-ID: <31664984.1075852142386.JavaMail.e...,I believe all the bugs have been worked out of...,<31664984.1075852142386.JavaMail.evans@thyme,"Wed, 17 Oct 2001 10:19:20 -0700 (PDT",kate.symes@enron.co,shift.dl-portland@enron.co,Revised EOL Download - PLEASE US,,1.0,text/plain; charset=us-asci,...,DL-Portland Real Time Shift </O=ENRON/OU=NA/CN...,,,"\CDEAN2 (Non-Privileged)\Dean, Craig\Inbo",DEAN-,CDEAN2 (Non-Privileged).pst,False,False,shift.dl-portland@enron.co,Sender higher
1,Message-ID: <5115047.1075855610475.JavaMail.ev...,Yahoo! News\tEdit Breaking News Alerts - Yahoo...,<5115047.1075855610475.JavaMail.evans@thyme,"Fri, 4 May 2001 23:13:00 -0700 (PDT",alerts-breakingnews@yahoo-inc.co,mike.grigsby@enron.co,Yahoo! Breaking New,,1.0,text/plain; charset=us-asci,...,mike.grigsby@enron.co,,,\Michael_Grigsby_Jun2001\Notes Folders\Notes inbo,Grigsby-,mgrigsb.nsf,False,False,mike.grigsby@enron.co,Recipient higher
2,Message-ID: <12042696.1075844566306.JavaMail.e...,Are you still working every other day? Please...,<12042696.1075844566306.JavaMail.evans@thyme,"Mon, 11 Sep 2000 06:22:00 -0700 (PDT",sara.shackleton@enron.co,michael.khajeh-noori@enron.co,SIT,,1.0,text/plain; charset=us-asci,...,Michael Khajeh-Noor,,,\Sara_Shackleton_Dec2000_June2001_1\Notes Fold...,SHACKLETON-,sshackle.nsf,False,False,michael.khajeh-noori@enron.co,Sender higher
3,Message-ID: <17502456.1075846133538.JavaMail.e...,Kay -\n\nI've reached a verbal agreement with ...,<17502456.1075846133538.JavaMail.evans@thyme,"Thu, 9 Nov 2000 10:05:00 -0800 (PST",mitch.robinson@enron.co,kay.mann@enron.co,SWPC Agreemen,,1.0,text/plain; charset=us-asci,...,Kay Man,,,\Kay_Mann_June2001_4\Notes Folders\Westinghous,MANN-,kmann.nsf,False,False,kay.mann@enron.co,Recipient higher
4,Message-ID: <8313323.1075843861667.JavaMail.ev...,This copy should correct all of the the typos ...,<8313323.1075843861667.JavaMail.evans@thyme,"Wed, 7 Mar 2001 11:11:00 -0800 (PST",jeff.dasovich@enron.co,jeff.dasovich@enron.co,Corrected Version of Scenario,"harry.kingerski@enron.com, iris.waser@enron.co...",1.0,text/plain; charset=us-asci,...,Jeff Dasovic,"Harry Kingerski, Iris Waser, James D Steffes, ...",,\Jeff_Dasovich_June2001\Notes Folders\Sen,DASOVICH-,jdasovic.nsf,False,False,jeff.dasovich@enron.co,Similar level


In [4]:
import pandas as pd
import networkx as nx

# Load dataset
df = pd.read_csv(sample_dataset)

# Fill missing 'Cc' with empty string
df['Cc'] = df['Cc'].fillna("")

# Parse edges from 'From' -> 'To' and 'Cc'
edges = []
for _, row in df.iterrows():
    sender = row['From']
    to_list = str(row['To']).split(",") if pd.notna(row['To']) else []
    cc_list = str(row['Cc']).split(",") if pd.notna(row['Cc']) else []

    for recipient in to_list + cc_list:
        recipient = recipient.strip()
        if recipient:
            edges.append((sender, recipient))

# Build directed graph
G = nx.DiGraph()
G.add_edges_from(edges)

# Compute centrality metrics
degree = nx.degree_centrality(G)
betweenness = nx.betweenness_centrality(G)
closeness = nx.closeness_centrality(G)
pagerank = nx.pagerank(G)

# Normalize each metric using min-max scaling
def normalize_metric(metric_dict):
    values = list(metric_dict.values())
    min_val = min(values)
    max_val = max(values)
    range_val = max_val - min_val if max_val > min_val else 1e-9
    return {k: (v - min_val) / range_val for k, v in metric_dict.items()}

degree_norm = normalize_metric(degree)
betweenness_norm = normalize_metric(betweenness)
closeness_norm = normalize_metric(closeness)
pagerank_norm = normalize_metric(pagerank)

# Define weights for each centrality metric
weights = {
    "degree": 0.1,
    "betweenness": 0.20,
    "closeness": 0.20,
    "pagerank": 0.5
}

# Aggregate normalized and weighted centralities
combined_score = {}
for node in G.nodes():
    combined_score[node] = (
        weights["degree"] * degree_norm.get(node, 0) +
        weights["betweenness"] * betweenness_norm.get(node, 0) +
        weights["closeness"] * closeness_norm.get(node, 0) +
        weights["pagerank"] * pagerank_norm.get(node, 0)
    )

# Step 4: Function to infer hierarchy
def infer_hierarchy_label(sender, recipient):
    if sender not in G or recipient not in G:
        return "Unknown"
    
    sender_score = combined_score.get(sender, 0)
    recipient_score = combined_score.get(recipient, 0)

    if sender_score > recipient_score:
        return "Sender higher"
    elif sender_score < recipient_score:
        return "Recipient higher"
    else:
        return "Similar level"

# Step 5: Apply to each row
hierarchy_labels = []
for _, row in df.iterrows():
    sender = row['From']
    recipient = row['Main_To']

    if pd.isna(sender) or pd.isna(recipient):
        hierarchy_labels.append("Unknown")
    else:
        label = infer_hierarchy_label(sender.strip(), recipient.strip())
        hierarchy_labels.append(label)

# Step 6: Add result and save to file
df['Hierarchy_Label'] = hierarchy_labels

# Remove rows with 'Unknown' label
df = df[df['Hierarchy_Label'] != 'Unknown']

# Save result
df.to_csv(sample_dataset, index=False)
print(f"Updated file saved to: {sample_dataset}")

df.head(5)


Updated file saved to: datasets/sample_emails.csv


Unnamed: 0,message,content,Message-ID,Date,From,To,Subject,Cc,Mime-Version,Content-Type,...,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,has_other_content,if_forwarded,Main_To,Hierarchy_Label
0,Message-ID: <31664984.1075852142386.JavaMail.e...,I believe all the bugs have been worked out of...,<31664984.1075852142386.JavaMail.evans@thyme,"Wed, 17 Oct 2001 10:19:20 -0700 (PDT",kate.symes@enron.co,shift.dl-portland@enron.co,Revised EOL Download - PLEASE US,,1.0,text/plain; charset=us-asci,...,DL-Portland Real Time Shift </O=ENRON/OU=NA/CN...,,,"\CDEAN2 (Non-Privileged)\Dean, Craig\Inbo",DEAN-,CDEAN2 (Non-Privileged).pst,False,False,shift.dl-portland@enron.co,Sender higher
1,Message-ID: <5115047.1075855610475.JavaMail.ev...,Yahoo! News\tEdit Breaking News Alerts - Yahoo...,<5115047.1075855610475.JavaMail.evans@thyme,"Fri, 4 May 2001 23:13:00 -0700 (PDT",alerts-breakingnews@yahoo-inc.co,mike.grigsby@enron.co,Yahoo! Breaking New,,1.0,text/plain; charset=us-asci,...,mike.grigsby@enron.co,,,\Michael_Grigsby_Jun2001\Notes Folders\Notes inbo,Grigsby-,mgrigsb.nsf,False,False,mike.grigsby@enron.co,Recipient higher
2,Message-ID: <12042696.1075844566306.JavaMail.e...,Are you still working every other day? Please...,<12042696.1075844566306.JavaMail.evans@thyme,"Mon, 11 Sep 2000 06:22:00 -0700 (PDT",sara.shackleton@enron.co,michael.khajeh-noori@enron.co,SIT,,1.0,text/plain; charset=us-asci,...,Michael Khajeh-Noor,,,\Sara_Shackleton_Dec2000_June2001_1\Notes Fold...,SHACKLETON-,sshackle.nsf,False,False,michael.khajeh-noori@enron.co,Sender higher
3,Message-ID: <17502456.1075846133538.JavaMail.e...,Kay -\n\nI've reached a verbal agreement with ...,<17502456.1075846133538.JavaMail.evans@thyme,"Thu, 9 Nov 2000 10:05:00 -0800 (PST",mitch.robinson@enron.co,kay.mann@enron.co,SWPC Agreemen,,1.0,text/plain; charset=us-asci,...,Kay Man,,,\Kay_Mann_June2001_4\Notes Folders\Westinghous,MANN-,kmann.nsf,False,False,kay.mann@enron.co,Recipient higher
4,Message-ID: <8313323.1075843861667.JavaMail.ev...,This copy should correct all of the the typos ...,<8313323.1075843861667.JavaMail.evans@thyme,"Wed, 7 Mar 2001 11:11:00 -0800 (PST",jeff.dasovich@enron.co,jeff.dasovich@enron.co,Corrected Version of Scenario,"harry.kingerski@enron.com, iris.waser@enron.co...",1.0,text/plain; charset=us-asci,...,Jeff Dasovic,"Harry Kingerski, Iris Waser, James D Steffes, ...",,\Jeff_Dasovich_June2001\Notes Folders\Sen,DASOVICH-,jdasovic.nsf,False,False,jeff.dasovich@enron.co,Similar level


In [4]:
df = pd.read_csv(sample_dataset)


# print 10 each with different hierarchy labels and their correpsonding content
hierarchy_labels = df['Hierarchy_Label'].unique()
for label in hierarchy_labels:
    print(f"Label: {label}")
    print("--------------------------------------------------")
    sample = df[df['Hierarchy_Label'] == label].sample(n=30, random_state=42)
    for i, row in sample.iterrows():
        print(f"Email {i}:")
        print("--------------------------------------------------")
        print(row['content'])
        print("--------------------------------------------------")
    print("\n\n")

Label: Sender higher
--------------------------------------------------
Email 19879:
--------------------------------------------------
Hi Al,

My suggestion is that you capture the change order with change order #1 for 
PSCO's break out contract.  If it needs to be paid before year end (which it 
probably does) you can put a payment date of December 20th, and it will be 
paid out of TurboPark.  This approach has been discussed with Lee Johnson at 
GE, and he is fine with it.

Thanks,

Kay
--------------------------------------------------
Email 11766:
--------------------------------------------------
We would like to begin trading a new product on EOL.  The location - PG&E 
Topock -  has already been set up, but we would like to make a small change 
to the long description.  The line which reads  -  "The transaction is for 
delivery at PG&E, El Paso."  should be changed to "The transaction is for 
delivery at PG&E Topock"    Please let me know when we can review the change, 
as Mike 