In [44]:
import pandas as pd
import matplotlib.pyplot as plt
import email
from nltk.tokenize.regexp import RegexpTokenizer
import itertools
import re
from nltk.corpus import stopwords
import string
import nltk
from sklearn.metrics.pairwise import linear_kernel

In [45]:
def get_text_from_email(msg):
    '''To get the content from email objects'''
    parts = []
    for part in msg.walk():
        if part.get_content_type() == 'text/plain':
            parts.append(part.get_payload())
    return ''.join(parts)


def split_email_addresses(line):
    '''To separate multiple email addresses'''
    if line:
        addrs = line.split(',')
        addrs = frozenset(map(lambda x: x.strip(), addrs))
    else:
        addrs = None
    return addrs


In [46]:
emails_df = pd.read_csv('../../Data/enron_emails.csv')

In [47]:
emails_df_sub = emails_df.sample(5000, random_state=1).reset_index(drop=True)

In [48]:
messages = list(map(email.message_from_string, emails_df_sub['message']))
emails_df_sub.drop('message', axis=1, inplace=True)

In [49]:
# Get fields from parsed email objects
keys = messages[0].keys()
for key in keys:
    emails_df_sub[key] = [doc[key] for doc in messages]

In [50]:
# Parse content from emails
emails_df_sub['content'] = list(map(get_text_from_email, messages))

In [51]:
# Split multiple email addresses
emails_df_sub['From'] = emails_df_sub['From'].map(split_email_addresses)
emails_df_sub['To'] = emails_df_sub['To'].map(split_email_addresses)

In [52]:
# Extract the root of 'file' as 'user'
emails_df_sub['user'] = emails_df_sub['file'].map(lambda x:x.split('/')[0])
del messages

In [53]:
emails_df_sub_grouped = emails_df_sub.groupby('user')['content'].apply(lambda x: x.sum()).reset_index().groupby('user')['content'].apply(lambda x: x.sum()).reset_index()

In [54]:
emails_df_sub_grouped.head()

Unnamed: 0,user,content
0,allen-p,If you wish to unsubscribe please CLICK HERE: ...
1,arnold-j,we have 4 do you want them?\n\n\n\n\nErrol ...
2,arora-h,This request has been pending your approval fo...
3,badeer-r,"Last Thursday, I made the first attached prese..."
4,bailey-s,"Edward,\n\nI thought I noticed a Senior design..."


In [55]:
stop = stopwords.words('english')
exclude = set(string.punctuation)
exclude.update({'\n', '\t', '75pt', 'font size','padding', '0pt', 'td', 'font'})
chat_words_lower = sorted(set(w.lower() for w in nltk.corpus.nps_chat.words()))

In [56]:
def clean(doc):
    try:
        doc['content'] = ''.join(''.join(s)[:2] for _, s in itertools.groupby(doc['content']))
        doc['content'] = re.sub(r'http\S+', '', doc['content'])
        doc['content'] = ''.join([i for i in doc['content'] if not i.isdigit()])
        doc['content'] = ' '.join([i for i in doc['content'].lower().split(' ') if i not in stop])
        doc['content'] = ' '.join([i for i in doc['content'].lower().split(' ') if i not in chat_words_lower])
        doc['content'] = ''.join(ch for ch in doc['content'] if ch not in exclude)
    except:
        doc['content'] = ''
    return doc['content']

In [57]:
emails_df_sub_grouped['content']=emails_df_sub_grouped.apply(clean, axis=1)

In [58]:
emails_df_sub_grouped.head()

Unnamed: 0,user,content
0,allen-p,unsubscribe click here if received error reply...
1,arnold-j,themerrol mclaughlinenron amto arnoldhouectect...
2,arora-h,request pending approval days click review upo...
3,badeer-r,thursday attached presentation ferc staff the ...
4,bailey-s,edwardi noticed senior designation strange yes...


In [60]:
from textblob import TextBlob as tb

emails_df_sub_grouped['featured'] = emails_df_sub_grouped['content'].map(lambda each: (tb(str(each)).sentiment.polarity))

In [61]:
emails_df_sub_grouped.head()

Unnamed: 0,user,content,featured
0,allen-p,unsubscribe click here if received error reply...,0.045169
1,arnold-j,themerrol mclaughlinenron amto arnoldhouectect...,0.029053
2,arora-h,request pending approval days click review upo...,0.0
3,badeer-r,thursday attached presentation ferc staff the ...,0.050794
4,bailey-s,edwardi noticed senior designation strange yes...,0.196429


In [62]:
from wordcloud import WordCloud



plt.figure(figsize=(18,12))
wordcloud = WordCloud(background_color='white', width=600, height=300, max_font_size=50, max_words=40).generate(' '.join(emails_df_sub_grouped['content']))
wordcloud.recolor(random_state=0)
plt.imshow(wordcloud)
plt.title("Wordcloud for content", fontsize=30)
plt.axis("off")
plt.show()

ModuleNotFoundError: No module named 'wordcloud'