# Bonus

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

To start, we want to gather the sender and receiver of all emails.

Let's start by loading the emails file. We only need the email `Id` (which we rename to `EmailId`) and the `SenderPersonId`.

In [None]:
# Read only the id and sender columns, exposing the id under the name 'EmailId'.
emails = (
    pd.read_csv('hillary-clinton-emails/Emails.csv', usecols=['Id', 'SenderPersonId'])
    .rename(columns={'Id': 'EmailId'})
)
emails.head()

Now let's load the receivers of the messages.

In [None]:
# The first CSV column becomes the index; 'PersonId' is renamed so that it
# cannot be confused with the sender id once the two tables are joined.
receivers = (
    pd.read_csv('hillary-clinton-emails/EmailReceivers.csv', index_col=0)
    .rename(columns={'PersonId': 'ReceiverPersonId'})
)
receivers.head()

Now that we have loaded both files, we can join them on the `EmailId` to get the sender and the receiver of each email.

In [None]:
# Join receivers with senders on the shared EmailId and discard any row that
# still has a missing value afterwards.
data = receivers.merge(emails, on='EmailId').dropna()

# Number of (receiver, sender) rows that survived the join.
print(data.shape[0])
data.head()

In [None]:
# Keep only the two person-id columns; each row is one edge of the graph.
edge_columns = ['ReceiverPersonId', 'SenderPersonId']
edges = data.loc[:, edge_columns]
edges.head()

In [None]:
# networkx 2.0 renamed `from_pandas_dataframe` to `from_pandas_edgelist`;
# use whichever one the installed version provides.
_graph_from_edges = getattr(nx, 'from_pandas_edgelist', None) or nx.from_pandas_dataframe
G = _graph_from_edges(edges, 'ReceiverPersonId', 'SenderPersonId')

# We use the degree of the nodes to set their size.
# `dict(...)` is a no-op on the dict returned by networkx 1.x and turns the
# (node, degree) view returned by networkx 2.x into the mapping we need.
d = dict(nx.degree(G))
node_size = [(v + 2) * 8 for v in d.values()]

fig = plt.figure(figsize=(15, 13))
# `nodelist` and `node_size` are both derived from `d`, so they stay aligned.
nx.draw(
    G,
    nodelist=list(d.keys()),
    node_size=node_size,
    width=0.5,
    node_color='#2b8ceb',
    alpha=0.8,
)
plt.show()

## Communities
Now we want to see if there are communities in the graph.

In [None]:
import community

# Compute the best partition of the graph; `part` is then looked up per node.
part = community.best_partition(G)
# One entry per node, in the graph's own node iteration order
# (None for a node that is missing from `part`).
values = list(map(part.get, G.nodes()))

Let's plot the graph using the computed communities.

In [None]:
# Spring layout of the graph, coloured by the per-node `values` and reusing
# the degree-based `node_size` computed for the first plot.
fig = plt.figure(figsize=(15, 13))
nx.draw_spring(
    G,
    node_color=values,
    cmap=plt.get_cmap('Set1'),
    node_size=node_size,
    width=0.5,
    alpha=0.8,
    with_labels=False,
)
plt.show()

## Top 20 words by community

In [None]:
# We create a dataframe from the communities computed above.
groups = pd.DataFrame.from_dict(part, orient='index')
groups = groups.reset_index()
groups.rename(columns={0: 'group', 'index': 'PersonId'}, inplace=True)
groups.head()

In [None]:
# Reload the emails, this time with their raw text, dropping incomplete rows.
# The sender column is renamed to 'PersonId' so it can be joined with `groups`.
emails = (
    pd.read_csv('hillary-clinton-emails/Emails.csv', usecols=['Id', 'SenderPersonId', 'RawText'])
    .dropna()
    .rename(columns={'SenderPersonId': 'PersonId'})
)
# Attach the group of each sender to the emails they sent.
data = emails.merge(groups, on='PersonId').dropna()

In [None]:
def _join_raw_text(frame):
    """Concatenate a group's RawText values into one string without newlines."""
    joined = ' '.join(frame['RawText'])
    return joined.replace('\n', ' ')


# One row per group, holding all of that group's email text in 'emails'.
grouped_emails = data.groupby('group').apply(_join_raw_text).to_frame()
grouped_emails = grouped_emails.rename(columns={0: 'emails'})
grouped_emails.head()

In [None]:
from nltk.corpus import stopwords

def word_count(txt, n, stop_words=None):
    """Return the ``n`` most frequent non-stop-words of ``txt``.

    Parameters
    ----------
    txt : str
        Text to analyse; it is split on whitespace with ``str.split``.
    n : int
        Maximum number of words to return.
    stop_words : iterable of str, optional
        Words to ignore, compared case-insensitively. When omitted,
        ``stopwords.words("english")`` is used.

    Returns
    -------
    list of str
        At most ``n`` distinct tokens, most frequent first. Tokens with the
        same count keep their order of first appearance in ``txt``.
    """
    if stop_words is None:
        stop_words = stopwords.words("english")
    # A set gives constant-time lookups (the list was scanned for every
    # token), and lower-casing both sides makes the filter case-insensitive
    # so that capitalised tokens such as "The" are dropped as well.
    ignored = {word.lower() for word in stop_words}

    count = {}
    for word in txt.split():
        if word.lower() in ignored:
            continue
        # The first occurrence counts as 1 (it used to be recorded as 0).
        count[word] = count.get(word, 0) + 1
    # `sorted` is stable, so equally frequent words stay in insertion order.
    return sorted(count, key=count.get, reverse=True)[:n]

        
# For every group, keep the 20 most frequent words of its concatenated emails.
top_words_per_group = grouped_emails['emails'].apply(word_count, n=20)
grouped_emails['top_words'] = top_words_per_group
grouped_emails