# case study: Enron

In [None]:
from utils_WF import *
from utils_Enron import *

In [None]:
# Load the pre-processed Enron email table
# (obtained and processed from https://www.cs.cmu.edu/~enron/).
enron = pd.read_csv('Enron-data/emails_processed.csv')

# Parse the Date column into datetimes.
enron['Date'] = pd.to_datetime(enron['Date'])

# Drop self-to-self emails (sender and recipient are the same address).
is_self_email = enron['From'] == enron['To']
enron = enron[~is_self_email]
enron

In [None]:
# Every address that appears as a sender or a recipient, in sorted order.
# This list fixes the node ordering used for the adjacency matrices below.
employees = sorted(set(enron['From']).union(enron['To']))
employees

In [None]:
# Load the pickled `pos` object that is passed to the plotting helpers below
# (presumably the node layout -- TODO confirm). The context manager makes
# sure the file handle is closed as soon as the data has been read.
with open('Enron-data/pos.pkl', 'rb') as pos_file:
    pos = pickle.load(pos_file)

In [None]:
# Build one directed graph for the entire year: nodes are employees, and each
# email adds 1/12 to the weight of its (sender -> recipient) edge, so the
# final weight is the yearly email count divided by 12.
G_total = nx.DiGraph()

# add nodes (employees) to the graph
G_total.add_nodes_from(employees)

# `employees` is a list, so testing membership against it directly would be a
# linear scan for every email; a set gives constant-time lookups.
employee_set = set(employees)

# add edges (email exchanges) with frequency as edge weight for the entire year
# Iterating the two columns directly visits the rows in the same order as
# iterrows() without building a Series object per row.
for sender, recipient in zip(enron['From'], enron['To']):
    if sender in employee_set and recipient in employee_set:
        if G_total.has_edge(sender, recipient):
            G_total[sender][recipient]['weight'] += 1/12
        else:
            G_total.add_edge(sender, recipient, weight=1/12)


In [None]:
# Draw the full-year graph with the preloaded `pos` and the given title.
# save=True presumably writes the figure to disk -- the helper is defined
# outside this file view, TODO confirm.
plot_year(G_total, pos, 'Enron Email Network 2001', save=True)

In [None]:
# Size of the yearly graph as a (node count, directed edge count) tuple.
G_total.number_of_nodes(), G_total.number_of_edges()

In [None]:
# Per-month directed graphs and their dense adjacency matrices; entry i holds
# calendar month i + 1.
G_months = []
A_months = []

for month in range(1, 13):
    # Every monthly graph starts with the full node set of the yearly graph,
    # so all adjacency matrices have the same shape.
    G_month = nx.DiGraph()
    G_month.add_nodes_from(G_total.nodes())

    # Emails whose Date falls in this calendar month.
    enron_month = enron[enron['Date'].dt.month == month]

    for sender, recipient in zip(enron_month['From'], enron_month['To']):
        # Skip emails that involve someone outside the yearly graph.
        if sender not in G_total or recipient not in G_total:
            continue
        edge_data = G_month.get_edge_data(sender, recipient)
        if edge_data is None:
            # First email on this edge in this month.
            G_month.add_edge(sender, recipient, weight=1)
        else:
            # Raw email count: one more message on an existing edge.
            edge_data['weight'] += 1

    G_months.append(G_month)
    # Rows/columns follow the order of `employees`.
    A_months.append(nx.to_numpy_array(G_month, nodelist=employees))

In [None]:
# Plot the 12 unfiltered monthly graphs under the label 'Months_raw'.
# save=True presumably writes the figure to disk -- TODO confirm in the helper.
plot_months(G_months, pos, 'Months_raw', save=True)

In [None]:
# Dense adjacency matrix of the yearly graph; rows/columns follow `employees`.
A_total = nx.to_numpy_array(G_total, nodelist=employees)

# Persist the yearly matrix as a .npy file.
np.save('Enron-data/results/A_total.npy', A_total)

# Persist the list of 12 raw monthly adjacency matrices as a single pickle.
with open('Enron-data/results/A_months_raw.pkl', 'wb') as f:
    pickle.dump(A_months, f)

In [None]:
# Unroll every monthly adjacency matrix into a 1-D vector (one per month).
a_months = [monthly_adjacency.flatten() for monthly_adjacency in A_months]

# Stack the vectors as rows (months x matrix entries) and treat each column,
# i.e. each matrix entry, as a variable observed once per month.
covariance_matrix = np.cov(np.array(a_months), rowvar=False)

In [None]:
# Filter every monthly adjacency matrix with the covariance estimated from
# the between-month data, collecting both the filtered matrices and the
# graphs rebuilt from them.
A_months_filtered = []
G_months_filtered = []

num_days_each_month = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

# from_numpy_array labels nodes 0..n-1; this maps them back to employee names.
index_to_employee = dict(enumerate(employees))

for month, (G_month, A_month) in enumerate(zip(G_months, A_months), start=1):

    print(f"Processing month {month}...")

    # Between-month noise covariance: estimated from between-month data.
    covariance_matrix_month = covariance_matrix

    # Filter
    A_month_filtered = filter_small_cov(
        A_month,
        covariance_matrix_month,
        epsilon=1e-6,
        onlypos=True,
        noSL=True,
    )
    A_months_filtered.append(A_month_filtered)

    # Rebuild a directed graph from the filtered matrix and restore the names.
    G_month_filtered = nx.relabel_nodes(
        nx.from_numpy_array(A_month_filtered, create_using=nx.DiGraph),
        index_to_employee,
    )
    G_months_filtered.append(G_month_filtered)

In [None]:
# Plot a single filtered graph: G_month_filtered is the variable left over
# from the filtering loop, i.e. the graph of the last month processed (12),
# not the whole G_months_filtered list. save=False is passed, so presumably
# nothing is written to disk -- TODO confirm in the helper.
plot_year(G_month_filtered, pos, 'Months_filtered', save=False)

In [None]:
# Compare the yearly adjacency matrix with one filtered monthly matrix
# (presumably a mean squared error -- the helper is defined elsewhere).
# A_month_filtered is the leftover loop variable, so only the last month
# processed (12) is compared here.
MSE(A_total, A_month_filtered)

In [None]:
# Plot the 12 graphs filtered with the between-month covariance under the
# label 'Months_betw_new'.
plot_months(G_months_filtered, pos, 'Months_betw_new', save=True)

In [None]:
# Persist the filtered monthly adjacency matrices currently held in
# A_months_filtered as a single pickled list.
with open('Enron-data/results/A_months_betw_new.pkl', 'wb') as f:
    pickle.dump(A_months_filtered, f)

In [None]:
# Filter every monthly adjacency matrix again, this time with a naive
# diagonal covariance (the same variance for every matrix entry). Both result
# lists are reset, so they replace the results of any earlier filtering run.
A_months_filtered = []
G_months_filtered = []

# NOTE: not referenced anywhere in this cell; kept in case other cells use it.
num_days_each_month = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

for month in range(1, 13):

    print(f"Processing month {month}...")

    G_month = G_months[month-1]
    A_month = A_months[month-1]

    # Between-month noise covariance: using a naive diagonal ansatz.
    # The variance over all entries of the month's matrix is computed once and
    # repeated along the diagonal; evaluating np.var once per matrix entry
    # would redo a full pass over the matrix for every diagonal element.
    month_variance = np.var(A_month)
    covariance_matrix_month = np.diag(np.full(A_month.size, month_variance))

    # Filter
    A_month_filtered = filter_small_cov(A_month, covariance_matrix_month, epsilon = 1e-6, onlypos=True, noSL=True, renorm=True)
    A_months_filtered.append(A_month_filtered)

    # Rebuild a directed graph from the filtered matrix; from_numpy_array
    # labels nodes 0..n-1, so map them back to the employee names.
    G_month_filtered = nx.from_numpy_array(A_month_filtered, create_using=nx.DiGraph)
    G_month_filtered = nx.relabel_nodes(G_month_filtered, dict(enumerate(employees)))
    G_months_filtered.append(G_month_filtered)

In [None]:
# Plot the 12 graphs filtered with the diagonal covariance ansatz under the
# label 'Months_diag_new'.
plot_months(G_months_filtered, pos, 'Months_diag_new', save=True)

In [None]:
# Persist the filtered monthly adjacency matrices currently held in
# A_months_filtered as a single pickled list.
with open('Enron-data/results/A_months_diag_new.pkl', 'wb') as f:
    pickle.dump(A_months_filtered, f)