# Downloading the Dataset

In [None]:
! pip install -q kaggle

In [None]:
from google.colab import files
# Bind the return value instead of leaving the call as the cell's last
# expression: files.upload() returns the uploaded file contents, so echoing it
# would write the Kaggle API token into the saved notebook output.
uploaded = files.upload()

In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/

In [None]:
# Restrict the credentials file to owner read/write only.
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the wcukierski/enron-email-dataset archive with the Kaggle CLI.
! kaggle datasets download -d wcukierski/enron-email-dataset

Downloading enron-email-dataset.zip to /content
 96% 342M/358M [00:04<00:00, 97.1MB/s]
100% 358M/358M [00:04<00:00, 90.8MB/s]


In [None]:
!unzip /content/enron-email-dataset.zip -d .

Archive:  /content/enron-email-dataset.zip
  inflating: ./emails.csv            


In [None]:
# Remove the downloaded archive and the kaggle.json copy from the working directory.
!rm -rf enron-email-dataset.zip kaggle.json

In [None]:
!ls

emails.csv  sample_data


# Preprocessing the data



In [None]:
import pandas as pd
import email

In [None]:
# Load the extracted CSV; later cells read the raw email text from its 'message' column.
dataset = pd.read_csv('emails.csv')

In [None]:
# Parse one raw message and list its header (name, value) pairs.
email.message_from_string(dataset.loc[15436, 'message']).items()

[('Message-ID', '<11146116.1075840320069.JavaMail.evans@thyme>'),
 ('Date', 'Mon, 4 Feb 2002 08:49:58 -0800 (PST)'),
 ('From', 'laura.vuittonet@enron.com'),
 ('To',
  'barry.tycholiz@enron.com, stephanie.miller@enron.com, \n\thouston <.ward@enron.com>, eric.bass@enron.com, \n\tdavid.baumbach@enron.com, gary.bryan@enron.com, jd.buss@enron.com, \n\tmorris.larubbio@enron.com, a..martin@enron.com, \n\tmichael.olsen@enron.com, brian.redmond@enron.com, \n\tcharles.weldon@enron.com'),
 ('Subject', 'FW: Fundamentals Presentation'),
 ('Mime-Version', '1.0'),
 ('Content-Type', 'text/plain; charset=us-ascii'),
 ('Content-Transfer-Encoding', '7bit'),
 ('X-From', 'Vuittonet, Laura </O=ENRON/OU=NA/CN=RECIPIENTS/CN=LVUITTON>'),
 ('X-To',
  'Tycholiz, Barry </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Btychol>, Miller, Stephanie </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Smiller2>, Ward, Kim S (Houston) </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Kward>, Bass, Eric </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Ebass>, Baumbach, David </O=ENRON/OU

In [None]:
# Parse the first raw message and show its body.
email.message_from_string(dataset.loc[0, 'message']).get_payload()

'Here is our forecast\n\n '

In [None]:
def get_field(field, messages):
    """Return the value of header ``field`` for every raw message in ``messages``.

    Each element of ``messages`` is parsed with ``email.message_from_string``.
    :return: list with one entry per message, in input order; a message
        without the header contributes ``None``.
    """
    return [email.message_from_string(raw).get(field) for raw in messages]

In [None]:
def get_payload(messages):
    """Return the payload (body) of every raw message in ``messages``.

    :return: list with one payload per message, in input order.
    """
    return [email.message_from_string(raw).get_payload() for raw in messages]

In [None]:
# Parse every raw message exactly once and pull the headers and the body out of
# the same parsed object, instead of re-parsing the whole corpus per column.
# NOTE(review): 'cc'/'bcc' are read from the X-cc/X-bcc headers while 'from'/'to'
# use From/To — confirm that this is intended.
header_by_column = {
    'from': 'From',
    'to': 'To',
    'cc': 'X-cc',
    'bcc': 'X-bcc',
    'date': 'Date',
    'subject': 'Subject',
}

records = []
for raw_message in dataset["message"]:
    parsed = email.message_from_string(raw_message)
    # One row per message: header values in column order, then the body.
    record = [parsed.get(header) for header in header_by_column.values()]
    record.append(parsed.get_payload())
    records.append(record)

df = pd.DataFrame(records, columns=[*header_by_column, 'message'])

In [None]:
df

Unnamed: 0,from,to,cc,bcc,date,subject,message
0,phillip.allen@enron.com,tim.belden@enron.com,,,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",,Here is our forecast\n\n
1,phillip.allen@enron.com,john.lavorato@enron.com,,,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",Re:,Traveling to have a business meeting takes the...
2,phillip.allen@enron.com,leah.arsdall@enron.com,,,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",Re: test,test successful. way to go!!!
3,phillip.allen@enron.com,randall.gay@enron.com,,,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",,"Randy,\n\n Can you send me a schedule of the s..."
4,phillip.allen@enron.com,greg.piper@enron.com,,,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",Re: Hello,Let's shoot for Tuesday at 11:45.
...,...,...,...,...,...,...,...
517396,john.zufferli@enron.com,kori.loibl@enron.com,,,"Wed, 28 Nov 2001 13:30:11 -0800 (PST)",Trade with John Lavorato,This is a trade with OIL-SPEC-HEDGE-NG (John L...
517397,john.zufferli@enron.com,john.lavorato@enron.com,,,"Wed, 28 Nov 2001 12:47:48 -0800 (PST)",Gas Hedges,Some of my position is with the Alberta Term b...
517398,john.zufferli@enron.com,dawn.doucet@enron.com,,,"Wed, 28 Nov 2001 07:20:00 -0800 (PST)",RE: CONFIDENTIAL,2\n\n -----Original Message-----\nFrom: \tDouc...
517399,john.zufferli@enron.com,jeanie.slone@enron.com,,,"Tue, 27 Nov 2001 11:52:45 -0800 (PST)",Calgary Analyst/Associate,Analyst\t\t\t\t\tRank\n\nStephane Brodeur\t\t\...


# Email Summarization

In [None]:
%pip install -U sentence-transformers

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/c4/87/49dc49e13ac107ce912c2f3f3fd92252c6d4221e88d1e6c16747044a11d8/sentence-transformers-1.1.0.tar.gz (78kB)
[K     |████████████████████████████████| 81kB 6.5MB/s 
[?25hCollecting transformers<5.0.0,>=3.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 13.9MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 38.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)

In [None]:
from sentence_transformers import SentenceTransformer
# Instantiate the sentence-embedding model by name; later cells call model.encode().
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

HBox(children=(FloatProgress(value=0.0, max=305584576.0), HTML(value='')))




In [None]:
# Smoke-test the model on three example sentences.
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]
sentence_embeddings = model.encode(sentences)

In [None]:
sentence_embeddings

array([[ 0.19731508,  0.1210537 , -0.2581328 , ...,  0.16053145,
         0.10602241,  0.55408746],
       [-0.257479  ,  0.2462943 ,  0.0946965 , ..., -0.14210276,
         0.14410567,  0.51566   ],
       [ 0.05094927,  0.24235828,  0.15774007, ..., -0.20625304,
        -0.08491805,  0.06364136]], dtype=float32)

In [None]:
%pip install lexrank nltk

Collecting lexrank
[?25l  Downloading https://files.pythonhosted.org/packages/e1/25/f139d8526e014b6bf6021305492cd7ccffbfa10999802fce4813808b04e4/lexrank-0.1.0-py3-none-any.whl (69kB)
[K     |████████████████████████████████| 71kB 6.2MB/s  eta 0:00:01
Collecting path.py>=10.5
  Downloading https://files.pythonhosted.org/packages/8f/04/130b7a538c25693c85c4dee7e25d126ebf5511b1eb7320e64906687b159e/path.py-12.5.0-py3-none-any.whl
Collecting urlextract>=0.7
  Downloading https://files.pythonhosted.org/packages/c3/24/0f5c690a4ef9b5d30845517ef14c35ce6a3d96e5b0ae0db6895bb194ab10/urlextract-1.2.0-py3-none-any.whl
Collecting path
  Downloading https://files.pythonhosted.org/packages/d3/2a/b0f97e1b736725f6ec48a8bd564ee1d1f3f945bb5d39cb44ef8bbe66bd14/path-15.1.2-py3-none-any.whl
Collecting uritools
  Downloading https://files.pythonhosted.org/packages/3d/cf/b081118f4505e3092bfaad94d14b78ec8344976cea142ed767b240cbf243/uritools-3.0.1-py3-none-any.whl
Installing collected packages: path, path.py, ur

In [None]:
import numpy as np
from scipy.sparse.csgraph import connected_components

def degree_centrality_scores(
    similarity_matrix,
    threshold=None,
    increase_power=True,
):
    """
    Score every node of a similarity graph via ``stationary_distribution``.

    :param similarity_matrix: matrix of pairwise similarities.
    :param threshold: ``None`` to use the similarities directly as weights
        (``create_markov_matrix``), or a float in ``[0, 1)`` to binarise them
        first (``create_markov_matrix_discrete``).
    :param increase_power: forwarded to ``stationary_distribution``.
    :return: the un-normalised scores from ``stationary_distribution``.
    :raises ValueError: if ``threshold`` is neither ``None`` nor a float in ``[0, 1)``.
    """
    threshold_ok = threshold is None or (
        isinstance(threshold, float) and 0 <= threshold < 1
    )
    if not threshold_ok:
        raise ValueError(
            '\'threshold\' should be a floating-point number '
            'from the interval [0, 1) or None',
        )

    if threshold is None:
        transition = create_markov_matrix(similarity_matrix)
    else:
        transition = create_markov_matrix_discrete(similarity_matrix, threshold)

    return stationary_distribution(
        transition,
        increase_power=increase_power,
        normalized=False,
    )


def _power_method(transition_matrix, increase_power=True):
    eigenvector = np.ones(len(transition_matrix))
    if len(eigenvector) == 1:
        return eigenvector
    transition = transition_matrix.transpose()
    while True:
        eigenvector_next = np.dot(transition, eigenvector)
        if np.allclose(eigenvector_next, eigenvector):
            return eigenvector_next
        eigenvector = eigenvector_next
        if increase_power:
            transition = np.dot(transition, transition)

def connected_nodes(matrix):
    """
    Group node indices by the connected component they belong to.

    :return: list of index arrays, one per component label, in ascending label order.
    """
    _, component_of = connected_components(matrix)
    return [
        np.where(component_of == label)[0]
        for label in np.unique(component_of)
    ]


def create_markov_matrix(weights_matrix):
    """
    Row-normalise a square weight matrix: each row is divided by its own sum.

    :raises ValueError: if ``weights_matrix`` is not square.
    """
    rows, cols = weights_matrix.shape
    if rows != cols:
        raise ValueError('\'weights_matrix\' should be square')
    # keepdims keeps the sums as a column so the division broadcasts per row.
    return weights_matrix / weights_matrix.sum(axis=1, keepdims=True)

def create_markov_matrix_discrete(weights_matrix, threshold):
    """
    Binarise ``weights_matrix`` at ``threshold`` and row-normalise the result.

    Entries ``>= threshold`` become 1 and all others 0 before the matrix is
    passed to ``create_markov_matrix``.
    """
    adjacency = np.where(weights_matrix >= threshold, 1.0, 0.0)
    return create_markov_matrix(adjacency)


def graph_nodes_clusters(transition_matrix, increase_power=True):
    """
    Split the graph into connected components and score the nodes of each.

    :return: ``(clusters, centroid_scores)`` — the index arrays ordered from the
        largest component to the smallest, and for each one its power-method
        vector divided by the component size.
    """
    clusters = sorted(connected_nodes(transition_matrix), key=len, reverse=True)
    centroid_scores = [
        _power_method(
            transition_matrix[np.ix_(members, members)],
            increase_power=increase_power,
        ) / len(members)
        for members in clusters
    ]
    return clusters, centroid_scores


def stationary_distribution(
    transition_matrix,
    increase_power=True,
    normalized=True,
):
    """
    Run the power method on each connected component and collect the results.

    :param transition_matrix: square transition matrix.
    :param increase_power: forwarded to ``_power_method``.
    :param normalized: when true, the result is divided by the number of nodes.
    :return: 1-D array with one value per node.
    :raises ValueError: if ``transition_matrix`` is not square.
    """
    rows, cols = transition_matrix.shape
    if rows != cols:
        raise ValueError('\'transition_matrix\' should be square')

    distribution = np.zeros(rows)
    for members in connected_nodes(transition_matrix):
        # Restrict the matrix to this component's rows and columns.
        block = transition_matrix[np.ix_(members, members)]
        distribution[members] = _power_method(block, increase_power=increase_power)

    return distribution / rows if normalized else distribution

In [None]:
from torch import Tensor, device
import torch 

def cos_sim(a: Tensor, b: Tensor):
    """
    Pairwise cosine similarity between the rows of ``a`` and the rows of ``b``.

    Non-tensor inputs are converted with ``torch.tensor`` and 1-D inputs are
    treated as a single row.
    :return: Matrix with res[i][j]  = cos_sim(a[i], b[j])
    """
    def _unit_rows(x):
        # Coerce to a tensor, promote a vector to a one-row matrix, then
        # scale every row to unit L2 norm.
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x)
        if x.dim() == 1:
            x = x.unsqueeze(0)
        return torch.nn.functional.normalize(x, p=2, dim=1)

    # Dot products of unit-length rows are the cosine similarities.
    return torch.mm(_unit_rows(a), _unit_rows(b).transpose(0, 1))

In [None]:
import nltk
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Fetch the 'punkt' resource before nltk.sent_tokenize is called further down.
nltk.download('punkt')
# NOTE(review): this rebinds `model` using the same model name as the earlier
# SentenceTransformer cell — the reload looks redundant; confirm before removing.
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

document = """
New York City (NYC), often called simply New York, is the most populous city in the United States. With an estimated 2019 population of 8,336,817 distributed over about 302.6 square miles (784 km2), New York City is also the most densely populated major city in the United States. Located at the southern tip of the U.S. state of New York, the city is the center of the New York metropolitan area, the largest metropolitan area in the world by urban landmass. With almost 20 million people in its metropolitan statistical area and approximately 23 million in its combined statistical area, it is one of the world's most populous megacities. New York City has been described as the cultural, financial, and media capital of the world, significantly influencing commerce, entertainment, research, technology, education, politics, tourism, art, fashion, and sports. Home to the headquarters of the United Nations, New York is an important center for international diplomacy.
Situated on one of the world's largest natural harbors, New York City is composed of five boroughs, each of which is a county of the State of New York. The five boroughs—Brooklyn, Queens, Manhattan, the Bronx, and Staten Island—were consolidated into a single city in 1898. The city and its metropolitan area constitute the premier gateway for legal immigration to the United States. As many as 800 languages are spoken in New York, making it the most linguistically diverse city in the world. New York is home to more than 3.2 million residents born outside the United States, the largest foreign-born population of any city in the world as of 2016. As of 2019, the New York metropolitan area is estimated to produce a gross metropolitan product (GMP) of $2.0 trillion. If the New York metropolitan area were a sovereign state, it would have the eighth-largest economy in the world. New York is home to the highest number of billionaires of any city in the world.
New York City traces its origins to a trading post founded by colonists from the Dutch Republic in 1624 on Lower Manhattan; the post was named New Amsterdam in 1626. The city and its surroundings came under English control in 1664 and were renamed New York after King Charles II of England granted the lands to his brother, the Duke of York. The city was regained by the Dutch in July 1673 and was subsequently renamed New Orange for one year and three months; the city has been continuously named New York since November 1674. New York City was the capital of the United States from 1785 until 1790, and has been the largest U.S. city since 1790. The Statue of Liberty greeted millions of immigrants as they came to the U.S. by ship in the late 19th and early 20th centuries, and is a symbol of the U.S. and its ideals of liberty and peace. In the 21st century, New York has emerged as a global node of creativity, entrepreneurship, and environmental sustainability, and as a symbol of freedom and cultural diversity. In 2019, New York was voted the greatest city in the world per a survey of over 30,000 people from 48 cities worldwide, citing its cultural diversity.
Many districts and landmarks in New York City are well known, including three of the world's ten most visited tourist attractions in 2013. A record 62.8 million tourists visited New York City in 2017. Times Square is the brightly illuminated hub of the Broadway Theater District, one of the world's busiest pedestrian intersections, and a major center of the world's entertainment industry. Many of the city's landmarks, skyscrapers, and parks are known around the world. Manhattan's real estate market is among the most expensive in the world. Providing continuous 24/7 service and contributing to the nickname The City that Never Sleeps, the New York City Subway is the largest single-operator rapid transit system worldwide, with 472 rail stations. The city has over 120 colleges and universities, including Columbia University, New York University, Rockefeller University, and the City University of New York system, which is the largest urban public university system in the United States. Anchored by Wall Street in the Financial District of Lower Manhattan, New York City has been called both the world's leading financial center and the most financially powerful city in the world, and is home to the world's two largest stock exchanges by total market capitalization, the New York Stock Exchange and NASDAQ.
"""

# Rank the sentences of `document` by centrality and print the five best.
sentences = nltk.sent_tokenize(document)
print("Num sentences:", len(sentences))
embeddings = model.encode(sentences, convert_to_tensor=True)
# Copy the similarity matrix to host memory first: Tensor.numpy() raises for a
# tensor that lives on a GPU, and the embeddings are returned as tensors.
cos_scores = cos_sim(embeddings, embeddings).cpu().numpy()
centrality_scores = degree_centrality_scores(cos_scores, threshold=None)
# argsort of the negated scores gives indices from highest to lowest score.
most_central_sentence_indices = np.argsort(-centrality_scores)
print("\n\nSummary:")
for idx in most_central_sentence_indices[0:5]:
    print(sentences[idx].strip())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Num sentences: 29


Summary:
Located at the southern tip of the U.S. state of New York, the city is the center of the New York metropolitan area, the largest metropolitan area in the world by urban landmass.
New York City (NYC), often called simply New York, is the most populous city in the United States.
Anchored by Wall Street in the Financial District of Lower Manhattan, New York City has been called both the world's leading financial center and the most financially powerful city in the world, and is home to the world's two largest stock exchanges by total market capitalization, the New York Stock Exchange and NASDAQ.
New York City has been described as the cultural, financial, and media capital of the world, significantly influencing commerce, entertainment, research, technology, education, politics, tourism, art, fashion, and sports.
If the New York metropolitan area were a so

In [None]:
# Take the message bodies straight from the column; DataFrame.iterrows() builds
# a Series for every row just to read one field from it.
messages = df['message'].tolist()

In [None]:
# Order the message bodies by length, shortest first.
messages = sorted(messages, key=len)

In [None]:
# Flip the order of the list (a reversed copy).
messages = messages[::-1]

In [None]:
# Summarise a slice of `messages` by picking the highest-scoring sentences.
summaries = []
for x in range(100,101):
    message = messages[x]
    sentences = nltk.sent_tokenize(message)
    print(len(summaries))
    if not sentences:
        # Nothing to embed or rank: record an empty summary and move on.
        summaries.append([message, []])
        continue
    embeddings = model.encode(sentences, convert_to_tensor=True)
    # Copy to host memory before converting: Tensor.numpy() raises for a tensor
    # that lives on a GPU.
    cos_scores = cos_sim(embeddings, embeddings).cpu().numpy()
    centrality_scores = degree_centrality_scores(cos_scores, threshold=None)
    # argsort of the negated scores gives indices from highest to lowest score.
    most_central_sentence_indices = np.argsort(-centrality_scores)
    top_5 = [sentences[idx].strip() for idx in most_central_sentence_indices[0:5]]
    summaries.append([message, top_5])

0


In [None]:
summaries[0]

 ['The curre=\nnt=20\nfutures market price for California-delivered power in July and August is=\n=20\nabout $500 per megawatt-hour, but administration officials insist they have=\n=20\ncontracted for much of the summer peak load at lower costs, leaving less=20\nexposure to the spot market.=20\nIf it all works as Davis hopes, customers of the three major utilities --=\n=20\nabout 70 percent of Californians -- will see a sharp boost in their rates=\n=20\nsoon, and that will be enough to finance the $20 billion in power purchase=\n=20\ndebts incurred by the utilities and the state so far, plus pay for future=\n=20\npurchases.=20\nThe bonds would pick up the costs not covered by the raised rates in the=20\nearly years of the scheme, then be paid off later as rates remain high but=\n=20\npower costs go down.',
  '"=\n=20\nBut energy producers challenged the bill, saying it would simply discourage=\n=20\ncompanies from building plants in California or from upgrading existing=20\nfacilities.

In [None]:
# Tabulate the [message, selected sentences] pairs collected above.
summaries_df = pd.DataFrame(
    summaries,
    columns=['content', 'summary'],
)
summaries_df

Unnamed: 0,content,summary
0,"Please see the following articles:\n\nSac Bee,...",[The curre=\nnt=20\nfutures market price for C...


# Community Detection

In [None]:
import networkx as nx
import community as community_louvain
# Start from an empty graph; nodes and edges are added in later cells.
G = nx.Graph()

In [None]:
import re
# Sanity-check the address pattern on a sample string before applying it to the data.
match = re.findall(r'[\w\.-]+@[\w\.-]+', 'this is a test string this is to see blah blah blah ... nive@iiit.com ... nive123@iiit.com')
match

['nive@iiit.com', 'nive123@iiit.com']

In [None]:
def email_list(search_string):
    """
    Extract every email-like token from ``search_string``.

    :return: list of matches in order of appearance; an empty list when
        ``search_string`` is falsy (``None`` or ``''``).
    """
    return re.findall(r'[\w\.-]+@[\w\.-]+', search_string) if search_string else []

In [None]:
# Replace each raw 'to' value with the list of addresses found in it.
# NOTE(review): the column is overwritten in place, so this cell is not
# re-runnable — a second run would pass lists to email_list, whose re.findall
# call expects a string.
df['to'] = df['to'].apply(email_list)

In [None]:
# Replace each raw 'from' value with the list of addresses found in it.
# NOTE(review): the column is overwritten in place, so this cell is not
# re-runnable — a second run would pass lists to email_list, whose re.findall
# call expects a string.
df['from'] = df['from'].apply(email_list)

In [None]:
# Concatenate the per-message 'from' lists and 'to' lists into one list of lists.
list_all_emails = list(df['from']) + list(df['to'])

In [None]:
import itertools
# Flatten the list of address lists into a single flat list.
merged = list(itertools.chain.from_iterable(list_all_emails))

In [None]:
# Number of distinct addresses.
len(set(merged))

78558

In [None]:
# Add every address in `merged` to the graph as a node.
G.add_nodes_from(merged)

In [None]:
# nx.info() no longer exists in NetworkX 3.x; build the summary from Graph
# methods, which are available in every release.
'Type: {}\nNumber of nodes: {}\nNumber of edges: {}'.format(
    type(G).__name__, G.number_of_nodes(), G.number_of_edges()
)

'Name: \nType: Graph\nNumber of nodes: 78558\nNumber of edges: 0\nAverage degree:   0.0000'

In [None]:
df['from']

0         [phillip.allen@enron.com]
1         [phillip.allen@enron.com]
2         [phillip.allen@enron.com]
3         [phillip.allen@enron.com]
4         [phillip.allen@enron.com]
                    ...            
517396    [john.zufferli@enron.com]
517397    [john.zufferli@enron.com]
517398    [john.zufferli@enron.com]
517399    [john.zufferli@enron.com]
517400    [john.zufferli@enron.com]
Name: from, Length: 517401, dtype: object

In [None]:
df['to']

0               [tim.belden@enron.com]
1            [john.lavorato@enron.com]
2             [leah.arsdall@enron.com]
3              [randall.gay@enron.com]
4               [greg.piper@enron.com]
                      ...             
517396          [kori.loibl@enron.com]
517397       [john.lavorato@enron.com]
517398         [dawn.doucet@enron.com]
517399        [jeanie.slone@enron.com]
517400    [livia_zufferli@monitor.com]
Name: to, Length: 517401, dtype: object

In [None]:
# Build one [sender, recipient] pair for every recipient of every message.
list_edges = []
for sender, recipients in zip(df['from'], df['to']):
    if not sender:
        # No sender address is available, so there is nothing to link. Report
        # the affected recipients explicitly rather than relying on a bare
        # `except:`, which would also swallow unrelated errors and interrupts.
        for recipient in recipients:
            print(sender, recipient)
        continue
    list_edges.extend([sender[0], recipient] for recipient in recipients)

[] skilling@enron.com
[] jeff.skilling@enron.com


In [None]:
len(list_edges)

3129827

In [None]:
# De-duplicate the pairs; each pair is converted to a tuple so it can be put in a set.
list_of_tuples = list({tuple(edge) for edge in list_edges})

In [None]:
len(list_of_tuples)

310679

In [None]:
# Add one edge per distinct (sender, recipient) pair.
G.add_edges_from(list_of_tuples)

In [None]:
# nx.info() no longer exists in NetworkX 3.x; build the summary from Graph
# methods, which are available in every release.
'Type: {}\nNumber of nodes: {}\nNumber of edges: {}'.format(
    type(G).__name__, G.number_of_nodes(), G.number_of_edges()
)

'Name: \nType: Graph\nNumber of nodes: 78558\nNumber of edges: 288064\nAverage degree:   7.3338'

In [None]:
# Compute a community assignment for the graph.
# random_state=0 is presumably there to make the result repeatable — TODO confirm.
partition = community_louvain.best_partition(G, random_state = 0)

In [None]:
# One row per (key, value) item of `partition`; the columns are named in the next cell.
comm_list = pd.DataFrame(partition.items())

In [None]:
# Name the two columns: partition key -> 'email_id', partition value -> 'community_number'.
comm_list.columns = ['email_id', 'community_number']

In [None]:
comm_list

Unnamed: 0,email_id,community_number
0,phillip.allen@enron.com,0
1,ina.rangel@enron.com,0
2,1.11913372.-2@multexinvestornetwork.com,1
3,messenger@ecm.bloomberg.com,2
4,aod@newsdata.com,3
...,...,...
78553,admin@johnscotti.com,0
78554,frank.senders@pandora.be,0
78555,karen.mazuryk@powerpool.ab.ca,0
78556,astrid.montes@powerpool.ab.ca,0


In [None]:
# Collect the email_id values of each community_number into a list.
grouped = comm_list.groupby(['community_number'])['email_id'].apply(list)

In [None]:
grouped

community_number
0       [phillip.allen@enron.com, ina.rangel@enron.com...
1       [1.11913372.-2@multexinvestornetwork.com, webm...
2       [messenger@ecm.bloomberg.com, rebecca.cantrell...
3       [aod@newsdata.com, articles-email@ms1.lga2.nyt...
4       [market-reply@listserv.dowjones.com, market_al...
                              ...                        
1924    [cathylendel@canada.com, bin459f@mailman.enron...
1925                             [marissa345@hotmail.com]
1926    [energy-i@petroleumargus.com, freetrial@petrol...
1927                           [gmburres@purvingertz.com]
1928                           [cmaliaga@purvingertz.com]
Name: email_id, Length: 1929, dtype: object