In [1]:
# pandas & numpy:
import numpy as np
import pandas as pd

# visualization:
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import plotly.express as px

# tokenization:
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# gensim:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

# scikit-learn:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix


In [2]:
df=pd.read_csv('Data/emails_cleaned.csv')
pd.set_option('display.max_columns', None)
df.head()


Unnamed: 0,file,message,content,Message-ID,Date,From,To,Subject,Cc,Mime-Version,Content-Type,Content-Transfer-Encoding,Bcc,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,has_other_content,if_forwarded
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,Here is our forecast\n\n,<18782981.1075855378110.JavaMail.evans@thyme,"Mon, 14 May 2001 16:39:00 -0700 (PDT",phillip.allen@enron.co,tim.belden@enron.co,,,1.0,text/plain; charset=us-asci,7bi,,Phillip K Alle,Tim Belden <Tim Belden/Enron@EnronXGate,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-,pallen (Non-Privileged).pst,False,False
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,Traveling to have a business meeting takes the...,<15464986.1075855378456.JavaMail.evans@thyme,"Fri, 4 May 2001 13:51:00 -0700 (PDT",phillip.allen@enron.co,john.lavorato@enron.co,Re,,1.0,text/plain; charset=us-asci,7bi,,Phillip K Alle,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-,pallen (Non-Privileged).pst,False,False
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,test successful. way to go!!!,<24216240.1075855687451.JavaMail.evans@thyme,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT",phillip.allen@enron.co,leah.arsdall@enron.co,Re: tes,,1.0,text/plain; charset=us-asci,7bi,,Phillip K Alle,Leah Van Arsdal,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mai,Allen-,pallen.nsf,False,False
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,"Randy,\n\n Can you send me a schedule of the s...",<13505866.1075863688222.JavaMail.evans@thyme,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT",phillip.allen@enron.co,randall.gay@enron.co,,,1.0,text/plain; charset=us-asci,7bi,,Phillip K Alle,Randall L Ga,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mai,Allen-,pallen.nsf,False,False
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,Let's shoot for Tuesday at 11:45.,<30922949.1075863688243.JavaMail.evans@thyme,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT",phillip.allen@enron.co,greg.piper@enron.co,Re: Hell,,1.0,text/plain; charset=us-asci,7bi,,Phillip K Alle,Greg Pipe,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mai,Allen-,pallen.nsf,False,False


In [3]:
def clean_data(df):
    
    df=df.drop(columns=['file', 'message', 'Cc', 'Mime-Version', 'Content-Type','Content-Transfer-Encoding','Date',
                        'Bcc', 'X-From', 'X-To', 'X-cc', 'X-bcc','X-Folder', 'X-Origin', 'X-FileName', 'has_other_content', 'if_forwarded'])
    df.columns=[e.lower().replace(' ', '_') for e in df.columns]
    
    return df

In [4]:
df=clean_data(df)
df

Unnamed: 0,content,message-id,from,to,subject
0,Here is our forecast\n\n,<18782981.1075855378110.JavaMail.evans@thyme,phillip.allen@enron.co,tim.belden@enron.co,
1,Traveling to have a business meeting takes the...,<15464986.1075855378456.JavaMail.evans@thyme,phillip.allen@enron.co,john.lavorato@enron.co,Re
2,test successful. way to go!!!,<24216240.1075855687451.JavaMail.evans@thyme,phillip.allen@enron.co,leah.arsdall@enron.co,Re: tes
3,"Randy,\n\n Can you send me a schedule of the s...",<13505866.1075863688222.JavaMail.evans@thyme,phillip.allen@enron.co,randall.gay@enron.co,
4,Let's shoot for Tuesday at 11:45.,<30922949.1075863688243.JavaMail.evans@thyme,phillip.allen@enron.co,greg.piper@enron.co,Re: Hell
...,...,...,...,...,...
517392,This is a trade with OIL-SPEC-HEDGE-NG (John L...,<26807948.1075842029936.JavaMail.evans@thyme,john.zufferli@enron.co,kori.loibl@enron.co,Trade with John Lavorat
517393,Some of my position is with the Alberta Term b...,<25835861.1075842029959.JavaMail.evans@thyme,john.zufferli@enron.co,john.lavorato@enron.co,Gas Hedge
517394,2\n\n -----Original Message-----\nFrom: \tDouc...,<28979867.1075842029988.JavaMail.evans@thyme,john.zufferli@enron.co,dawn.doucet@enron.co,RE: CONFIDENTIA
517395,Analyst\t\t\t\t\tRank\n\nStephane Brodeur\t\t\...,<22052556.1075842030013.JavaMail.evans@thyme,john.zufferli@enron.co,jeanie.slone@enron.co,Calgary Analyst/Associat


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517397 entries, 0 to 517396
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   content     491133 non-null  object
 1   message-id  517397 non-null  object
 2   from        517397 non-null  object
 3   to          495552 non-null  object
 4   subject     498118 non-null  object
dtypes: object(5)
memory usage: 19.7+ MB


In [6]:
df.isnull().sum()

content       26264
message-id        0
from              0
to            21845
subject       19279
dtype: int64

In [7]:
df=df.dropna().reset_index()
df

Unnamed: 0,index,content,message-id,from,to,subject
0,1,Traveling to have a business meeting takes the...,<15464986.1075855378456.JavaMail.evans@thyme,phillip.allen@enron.co,john.lavorato@enron.co,Re
1,2,test successful. way to go!!!,<24216240.1075855687451.JavaMail.evans@thyme,phillip.allen@enron.co,leah.arsdall@enron.co,Re: tes
2,4,Let's shoot for Tuesday at 11:45.,<30922949.1075863688243.JavaMail.evans@thyme,phillip.allen@enron.co,greg.piper@enron.co,Re: Hell
3,5,"Greg,\n\n How about either next Tuesday or Thu...",<30965995.1075863688265.JavaMail.evans@thyme,phillip.allen@enron.co,greg.piper@enron.co,Re: Hell
4,7,any morning between 10 and 11:30,<17189699.1075863688308.JavaMail.evans@thyme,phillip.allen@enron.co,joyce.teixeira@enron.co,Re: PRC review - phone call
...,...,...,...,...,...,...
451600,517392,This is a trade with OIL-SPEC-HEDGE-NG (John L...,<26807948.1075842029936.JavaMail.evans@thyme,john.zufferli@enron.co,kori.loibl@enron.co,Trade with John Lavorat
451601,517393,Some of my position is with the Alberta Term b...,<25835861.1075842029959.JavaMail.evans@thyme,john.zufferli@enron.co,john.lavorato@enron.co,Gas Hedge
451602,517394,2\n\n -----Original Message-----\nFrom: \tDouc...,<28979867.1075842029988.JavaMail.evans@thyme,john.zufferli@enron.co,dawn.doucet@enron.co,RE: CONFIDENTIA
451603,517395,Analyst\t\t\t\t\tRank\n\nStephane Brodeur\t\t\...,<22052556.1075842030013.JavaMail.evans@thyme,john.zufferli@enron.co,jeanie.slone@enron.co,Calgary Analyst/Associat


In [8]:
#we want to join all text data in one single column.We will create a new column called "completed_text"
df["completed_text"]=df["subject"]+df["content"]

In [9]:
df

Unnamed: 0,index,content,message-id,from,to,subject,completed_text
0,1,Traveling to have a business meeting takes the...,<15464986.1075855378456.JavaMail.evans@thyme,phillip.allen@enron.co,john.lavorato@enron.co,Re,ReTraveling to have a business meeting takes t...
1,2,test successful. way to go!!!,<24216240.1075855687451.JavaMail.evans@thyme,phillip.allen@enron.co,leah.arsdall@enron.co,Re: tes,Re: testest successful. way to go!!!
2,4,Let's shoot for Tuesday at 11:45.,<30922949.1075863688243.JavaMail.evans@thyme,phillip.allen@enron.co,greg.piper@enron.co,Re: Hell,Re: HellLet's shoot for Tuesday at 11:45.
3,5,"Greg,\n\n How about either next Tuesday or Thu...",<30965995.1075863688265.JavaMail.evans@thyme,phillip.allen@enron.co,greg.piper@enron.co,Re: Hell,"Re: HellGreg,\n\n How about either next Tuesda..."
4,7,any morning between 10 and 11:30,<17189699.1075863688308.JavaMail.evans@thyme,phillip.allen@enron.co,joyce.teixeira@enron.co,Re: PRC review - phone call,Re: PRC review - phone callany morning between...
...,...,...,...,...,...,...,...
451600,517392,This is a trade with OIL-SPEC-HEDGE-NG (John L...,<26807948.1075842029936.JavaMail.evans@thyme,john.zufferli@enron.co,kori.loibl@enron.co,Trade with John Lavorat,Trade with John LavoratThis is a trade with OI...
451601,517393,Some of my position is with the Alberta Term b...,<25835861.1075842029959.JavaMail.evans@thyme,john.zufferli@enron.co,john.lavorato@enron.co,Gas Hedge,Gas HedgeSome of my position is with the Alber...
451602,517394,2\n\n -----Original Message-----\nFrom: \tDouc...,<28979867.1075842029988.JavaMail.evans@thyme,john.zufferli@enron.co,dawn.doucet@enron.co,RE: CONFIDENTIA,RE: CONFIDENTIA2\n\n -----Original Message----...
451603,517395,Analyst\t\t\t\t\tRank\n\nStephane Brodeur\t\t\...,<22052556.1075842030013.JavaMail.evans@thyme,john.zufferli@enron.co,jeanie.slone@enron.co,Calgary Analyst/Associat,Calgary Analyst/AssociatAnalyst\t\t\t\t\tRank\...


In [10]:
df['completed_text'][0]

"ReTraveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.\n\nAs far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not.  Too often the presenter speaks and the others are quiet just waiting for their turn.   The meetings might be better if held in a round table discussion format.  \n\nMy suggestion for where to go is Austin.  Play golf and rent a ski boat and jet ski's.  Flying somewhere takes too much time.\n"