# Exploratory Data Analysis of the Enron Dataset

## Objectives of this notebook excercise:
#### 1) Read the dataset provided in the link http://www2.aueb.gr/users/ion/data/enron-spam/
#### 2) Perform a basic counts of spam and ham emails provided and visulaise how the data is provided
#### 3) Perform simple NLP processes to build a tf-idf statisitics
#### 4) Visualize the results of the NLP process
#### 5) Explore some graphs that can be built with the data

In [1]:
# Importing the required libraries and using python 2.7
import glob
import os
import string
from collections import Counter

try:
    import pandas as pd
except:
    !sudo pip install pandas
    import pandas as pd
import numpy as np
try:
    from nltk.corpus import stopwords
except:
    !sudo pip install -U nltk
    from nltk.corpus import stopwords

try:
    import matplotlib
except:
    !sudo pip install matplotlib
    import matplotlib

try:
    matplotlib.use("TkAgg")
except:
    matplotlib.use("agg")
import matplotlib.pyplot as plt

# Changing the directory level to the prject name
if("Code" in os.getcwd()):
    os.chdir("..")
print(os.getcwd())

/Users/gouravsanjuktabhabesh/pycharmProjects/freelance_work_1


#### Objective 1: Reading the dataset from the data downloaded from the provided link

In [2]:
# Reading the data set enron1 to enron6 from the Res folder
enron_folders = ["Res/enron"+str(x) for x in range(1,7)]
enron_spam_folders = [x + "/spam/" for x in enron_folders]
enron_ham_folders = [x + "/ham/" for x in enron_folders]

# Listing all the file names respectively with the type of mail
spam_mails = []
ham_mails = []
for i in range(len(enron_folders)):
    spam_mails.extend(glob.glob(enron_spam_folders[i] + "*.txt"))
    ham_mails.extend(glob.glob(enron_ham_folders[i] + "*.txt"))

# Stats for different types of mails in our data set
print("Number of spam mails in the Enron dataset provided = %s" % str(len(spam_mails)))
print("Number of ham mails in the Enron dataset provided = %s" % str(len(ham_mails)))

Number of spam mails in the Enron dataset provided = 17171
Number of ham mails in the Enron dataset provided = 16545


#### Objective 2: Building a basic statistic of the labelled email data provided 

In [3]:
# Function to extract various meta info from the row passed
# Adding following new meta columns:
# 1) id : some identity of the mail 
# 2) dataset : name of the enron datasets (enron 1 to enron 6)
# 3) date_mail : date on which mail was received
# 4) owner_mail : owner of the mail
# 5) content_message : contents of the mail
# 6) suject_message : subject of the mail
def extract_date_owner_content(row):
    val = row["file_name"].split(".")
    row["id"] = val[0].split("/")[-1]
    row["dataset"] = val[0].split("/")[1]
    row["date_mail"] = val[1]
    row["owner_mail"] = val[2]
    f = open(row["file_name"],"r")
    row["content_message"] = f.read()
    row["subject_message"] = str(row["content_message"]).split("\r\n")[0]
    return row

#### A) Spam Mails and its stats:

In [26]:
# Extracting the various meta info of the spam mails from the file name of the mail lists and construct a data frame
spam_metadata_df = pd.DataFrame(columns=["id","mail_label","file_name",
                                         "date_mail","owner_mail","content_message","subject_message"])
spam_metadata_df["file_name"] = spam_mails
spam_metadata_df["mail_label"] = "spam"
spam_metadata_df = spam_metadata_df.apply(lambda row: extract_date_owner_content(row), axis=1)
spam_metadata_df["date_mail"] = spam_metadata_df.apply(lambda x: pd.to_datetime(x["date_mail"], 
                                                                                format="%Y-%m-%d"),axis=1)
spam_metadata_df.head(5)

Unnamed: 0,id,mail_label,file_name,date_mail,owner_mail,content_message,subject_message,dataset
0,4743,spam,Res/enron1/spam/4743.2005-06-25.GP.spam.txt,2005-06-25,GP,"Subject: what up , , your cam babe\r\nwhat are...","Subject: what up , , your cam babe",enron1
1,1309,spam,Res/enron1/spam/1309.2004-06-08.GP.spam.txt,2004-06-08,GP,Subject: want to make more money ?\r\norder co...,Subject: want to make more money ?,enron1
2,726,spam,Res/enron1/spam/0726.2004-03-26.GP.spam.txt,2004-03-26,GP,Subject: food for thoughts\r\n[\r\njoin now - ...,Subject: food for thoughts,enron1
3,202,spam,Res/enron1/spam/0202.2004-01-13.GP.spam.txt,2004-01-13,GP,Subject: miningnews . net newsletter - tuesday...,Subject: miningnews . net newsletter - tuesday...,enron1
4,3988,spam,Res/enron1/spam/3988.2005-03-06.GP.spam.txt,2005-03-06,GP,Subject: your pharmacy ta\r\nwould you want ch...,Subject: your pharmacy ta,enron1


#### Visualize the counts of spam mails being received by various owners

In [27]:
print(pd.DataFrame(spam_metadata_df.groupby("owner_mail").count()["id"].reset_index()).set_index("owner_mail"))
spam_mail_owner_date_count_df = pd.DataFrame(spam_metadata_df\
                        .groupby(["owner_mail","date_mail"]).count()["id"].reset_index()).set_index("date_mail")

# Plotiing the curve for the Spam mails date line
plt.close()
fig_size=[12,9]
plt.rcParams["figure.figsize"] = fig_size
plt.rcParams["figure.figsize"]

plt.subplot(3,1,1)
plt.title("Owner -> BG")
plt.ylabel("count")
spam_mail_owner_date_count_df.loc[spam_mail_owner_date_count_df["owner_mail"]=="BG"]["id"].reset_index().plot()

plt.subplot(3,1,2)
plt.title("Owner -> GP")
plt.ylabel("count")
spam_mail_owner_date_count_df.loc[spam_mail_owner_date_count_df["owner_mail"]=="GP"]["id"].reset_index().plot()

plt.subplot(3,1,3)
plt.title("Owner -> SA_and_HP")
plt.ylabel("count")
spam_mail_owner_date_count_df.loc[spam_mail_owner_date_count_df["owner_mail"]=="SA_and_HP"]["id"].reset_index().plot()
plt.subplots_adjust(hspace=2.0)

plt.savefig("Res/plot_spam_mails_date_curve.jpg")
print("Plot done!")

              id
owner_mail      
BG          6000
GP          6000
SA_and_HP   5171


#### B) Ham Mails and its stats:

In [4]:
# Extracting the various meta info of the ham mails from the file name of the mail lists and construct a data frame
ham_metadata_df = pd.DataFrame(columns=["id","mail_label","file_name","date_mail","owner_mail",
                                        "content_message","subject_message"])
ham_metadata_df["file_name"] = ham_mails
ham_metadata_df["mail_label"] = "ham"
ham_metadata_df = ham_metadata_df.apply(lambda row: extract_date_owner_content(row), axis=1)
spam_metadata_df["date_mail"] = spam_metadata_df.apply(lambda x: pd.to_datetime(x["date_mail"],
                                                                                format="%Y-%m-%d"),axis=1)
ham_metadata_df.head(5)

NameError: name 'spam_metadata_df' is not defined

#### Visualize the counts of ham mails being received by various owners

In [32]:
print(pd.DataFrame(ham_metadata_df.groupby("owner_mail").count()["id"].reset_index()).set_index("owner_mail"))
ham_mail_owner_date_count_df = pd.DataFrame(ham_metadata_df\
                        .groupby(["owner_mail","date_mail"]).count()["id"].reset_index()).set_index("date_mail")

plt.close()
fig_size=[18,12]
plt.rcParams["figure.figsize"] = fig_size
plt.rcParams["figure.figsize"]
# Plotiing the curve for the Ham mails vs date line
plt.subplot(6,1,1)
plt.title("Owner -> farmer")
plt.ylabel("count")
ham_mail_owner_date_count_df.loc[ham_mail_owner_date_count_df["owner_mail"]=="farmer"]["id"].reset_index().plot()

plt.subplot(6,1,2)
plt.title("Owner -> beck")
plt.ylabel("count")
ham_mail_owner_date_count_df.loc[ham_mail_owner_date_count_df["owner_mail"]=="beck"]["id"].reset_index().plot()

plt.subplot(6,1,3)
plt.title("Owner -> kaminski")
plt.ylabel("count")
ham_mail_owner_date_count_df.loc[ham_mail_owner_date_count_df["owner_mail"]=="kaminski"]["id"].reset_index().plot()

plt.subplot(6,1,4)
plt.title("Owner -> kitchen")
plt.ylabel("count")
ham_mail_owner_date_count_df.loc[ham_mail_owner_date_count_df["owner_mail"]=="kitchen"]["id"].reset_index().plot()

plt.subplot(6,1,5)
plt.title("Owner -> lokay")
plt.ylabel("count")
ham_mail_owner_date_count_df.loc[ham_mail_owner_date_count_df["owner_mail"]=="lokay"]["id"].reset_index().plot()

plt.subplot(6,1,6)
plt.title("Owner -> williams")
plt.ylabel("count")
ham_mail_owner_date_count_df.loc[ham_mail_owner_date_count_df["owner_mail"]=="williams"]["id"].reset_index().plot()

plt.subplots_adjust(hspace=0.6)
plt.savefig("Res/plot_ham_mails_date_curve.jpg")
print("Plot done!")

              id
owner_mail      
beck        1500
farmer      3672
kaminski    4361
kitchen     4012
lokay       1500
williams    1500


## Working on feature building from the messages of the enron dataset mails
#### 1) Spam messages

In [33]:
# Counting the number of lines including the subject line
spam_metadata_df["line_count"] = spam_metadata_df.apply(lambda row: row["content_message"].count("\r\n"),axis=1)

# Extracting the tokens out of the messages in a column called "tokens"
# Removing the words "Subject:" from the mails as that is not the part of the mail
spam_metadata_df["tokens"] = spam_metadata_df.apply(lambda row: row["content_message"]\
    .replace("\\r\\n"," ").replace("."," . ").replace(","," , ").replace(";"," ; ")\
    .replace(":"," : ").replace("!"," ! ").replace("?"," ? ").replace("$"," $ ")\
    .replace("-"," - ").replace("="," = ").replace("&"," & ").replace("Subject :","")\
    .replace("/"," / ").split(),axis=1)

# Creating a column to count the total number of tokens
spam_metadata_df["token_count"] = spam_metadata_df.apply(lambda row: len(row["tokens"]),axis=1)

# Creating a column to count the number of punctuations
spam_metadata_df["punctuations_count"] = spam_metadata_df\
    .apply(lambda row: sum([True if x in string.punctuation else False for x in row["tokens"]]),axis=1)

# Single character count in the list of tokens
spam_metadata_df["single_char_count"] = spam_metadata_df\
    .apply(lambda row: sum([True if (len(x)==1 and x not in string.punctuation) else False for x in row["tokens"]]),axis=1)

# Column to count the tokens that are numbers 
spam_metadata_df["number_token_count"] = spam_metadata_df\
    .apply(lambda row: sum([True if x.isdigit() else False for x in row["tokens"]]),axis=1)

# Tokens that are years mentioned in between 1970 to 2050
spam_metadata_df["year_count"] = spam_metadata_df\
    .apply(lambda row: sum([True if (x >= "1970" and x <= "2050") else False for x in row["tokens"]]),axis=1)


In [34]:
spam_metadata_df[["line_count","tokens","token_count","punctuations_count","single_char_count",\
                  "number_token_count","year_count"]].head(5)

Unnamed: 0,line_count,tokens,token_count,punctuations_count,single_char_count,number_token_count,year_count
0,14,"[what, up, ,, ,, your, cam, babe, what, are, y...",184,45,4,0,0
1,9,"[want, to, make, more, money, ?, order, confir...",72,15,1,0,0
2,6,"[food, for, thoughts, [, join, now, -, take, a...",18,4,1,0,0
3,97,"[miningnews, ., net, newsletter, -, tuesday, ,...",1026,161,33,58,17
4,2,"[your, pharmacy, ta, would, you, want, cheap, ...",19,7,0,0,0


#### 2) Ham messages

In [35]:
# Counting the number of lines including the subject line
ham_metadata_df["line_count"] = ham_metadata_df.apply(lambda row: row["content_message"].count("\r\n"),axis=1)

# Extracting the tokens out of the messages in a column called "tokens"
ham_metadata_df["tokens"] = ham_metadata_df.apply(lambda row: row["content_message"]\
    .replace("\\r\\n"," ").replace("."," . ").replace(","," , ").replace(";"," ; ")\
    .replace(":"," : ").replace("!"," ! ").replace("?"," ? ").replace("$"," $ ")\
    .replace("-"," - ").replace("="," = ").replace("&"," & ").replace("/"," / ")\
    .replace("Subject :","").split(),axis=1)

# Creating a column to count the total number of tokens
ham_metadata_df["token_count"] = ham_metadata_df.apply(lambda row: len(row["tokens"]),axis=1)

# Creating a column to count the number of punctuations
ham_metadata_df["punctuations_count"] = ham_metadata_df\
    .apply(lambda row: sum([True if x in string.punctuation else False for x in row["tokens"]]),axis=1)

# Single character count in the list of tokens
ham_metadata_df["single_char_count"] = ham_metadata_df\
    .apply(lambda row: sum([True if (len(x)==1 and x not in string.punctuation) else False for x in row["tokens"]]),axis=1)

# Column to count the tokens that are numbers  
ham_metadata_df["number_token_count"] = ham_metadata_df\
    .apply(lambda row: sum([True if x.isdigit() else False for x in row["tokens"]]),axis=1)

# Tokens that are years mentioned in between 1970 to 2050
ham_metadata_df["year_count"] = ham_metadata_df\
    .apply(lambda row: sum([True if (x >= "1970" and x <= "2050") else False for x in row["tokens"]]),axis=1)


In [36]:
ham_metadata_df[["line_count","tokens","token_count","punctuations_count","single_char_count",\
                 "number_token_count","year_count"]].head(5)

Unnamed: 0,line_count,tokens,token_count,punctuations_count,single_char_count,number_token_count,year_count
0,45,"[ena, sales, on, hpl, just, to, update, you, o...",588,146,41,12,2
1,10,"[98, -, 6736, &, 98, -, 9638, for, 1997, (, ua...",144,37,13,38,1
2,2,"[hpl, nominations, for, december, 28, ,, 1999,...",22,7,0,4,1
3,14,"[revised, nom, -, kcs, resources, daren, ,, it...",159,80,4,17,2
4,30,"[new, production, -, sitara, deals, needed, da...",352,135,18,41,2


In [37]:
# Building a list of all spam and ham words
spam_word_list = []
for x in spam_metadata_df["tokens"]:
    spam_word_list.extend(x)

# Filtering the punctuations and numbers
spam_word_list = filter(lambda x: x.isdigit()==False and len(x)>3,spam_word_list)
    
ham_word_list = []
for x in ham_metadata_df["tokens"]:
    ham_word_list.extend(x)
# Filtering the punctuations and numbers
ham_word_list = filter(lambda x: x.isdigit()==False and len(x)>3,ham_word_list)

print("Spam word count = {}\nHam  word count = {}\nRatio of Spam to Ham word count = {}"\
      .format(len(spam_word_list),len(ham_word_list),len(spam_word_list)/float(len(ham_word_list))))

Spam word count = 2105555
Ham  word count = 2567675
Ratio of Spam to Ham word count = 0.820023951629


In [38]:
# Removing stop words and then finding the frequencies
stop_words = set(stopwords.words('english'))
spam_words_stop_filtered = []
for word in spam_word_list:
    if(word not in stop_words):
        spam_words_stop_filtered.append(word)
    
ham_words_stop_filtered = []
for word in ham_word_list:
    if(word not in stop_words):
        ham_words_stop_filtered.append(word)
        
print("Spam word count = {}\nHam  word count = {}\nRatio of Spam to Ham word count = {}"\
      .format(len(spam_words_stop_filtered),len(ham_words_stop_filtered),len(spam_words_stop_filtered)/float(len(ham_words_stop_filtered))))


Spam word count = 1819296
Ham  word count = 2214541
Ratio of Spam to Ham word count = 0.821522834754


In [39]:
# Threshold number of words to be accepted in the bag of words to be counted as significant
freq_tolerance_low = 100 # The number of word count that must occur for a word to be counted as significant
freq_tolerance_high = 4000 # The maximum number of word count to be considered significant
spam_words_freq = dict(Counter(spam_words_stop_filtered))
a1_sorted_keys = sorted(spam_words_freq, key=spam_words_freq.get, reverse=True)
spam_words_freq_dict_values = []
bag_of_words_spam = {}
for r in a1_sorted_keys:
    spam_words_freq_dict_values.append(spam_words_freq[r])
    if(spam_words_freq[r]>=freq_tolerance_low and spam_words_freq[r]<=freq_tolerance_high):
        bag_of_words_spam[r] = spam_words_freq[r]

In [40]:
# Threshold number of words to be accepted in the bag of words to be counted as significant
freq_tolerance_low = 100 # The number of word count that must occur for a word to be counted as significant
freq_tolerance_high = 4000 # The maximum number of word count to be considered significant
ham_words_freq = dict(Counter(ham_words_stop_filtered))
a1_sorted_keys = sorted(ham_words_freq, key=ham_words_freq.get, reverse=True)
ham_words_freq_dict_values = []
bag_of_words_ham = {}
for r in a1_sorted_keys:
    ham_words_freq_dict_values.append(ham_words_freq[r])
    if(ham_words_freq[r]>=freq_tolerance_low and ham_words_freq[r]<=freq_tolerance_high):
        bag_of_words_ham[r] = ham_words_freq[r]

In [46]:
spam_words_freq_dict_values1 = filter(lambda x: x>=freq_tolerance_low and x<=freq_tolerance_high,\
                                      spam_words_freq_dict_values)
plt.close()
fig_size=[9,6]
plt.rcParams["figure.figsize"] = fig_size
plt.rcParams["figure.figsize"]
plt.plot(range(0,len(spam_words_freq_dict_values1)),spam_words_freq_dict_values1)
plt.ylabel("Frequency count")
plt.xlabel("Word")
plt.title("Spam-Bag Of Words plot")
plt.savefig("Res/plot_spam_freq.jpg")

In [47]:
ham_words_freq_dict_values1 = filter(lambda x: x>=freq_tolerance_low and x<=freq_tolerance_high,\
                                      ham_words_freq_dict_values)
plt.close()
fig_size=[9,6]
plt.rcParams["figure.figsize"] = fig_size
plt.rcParams["figure.figsize"]
plt.plot(range(0,len(ham_words_freq_dict_values1)),ham_words_freq_dict_values1)
plt.ylabel("Frequency count")
plt.xlabel("Word")
plt.title("Ham-Bag Of Words plot")
plt.savefig("Res/plot_ham_freq.jpg")

In [19]:
# Defining a collection of words that are very RAREWORD
NOT_rareword = []
NOT_rareword.extend(bag_of_words_ham.keys())
NOT_rareword.extend(bag_of_words_spam.keys())

# Any word not in this list of NOT_garbage will be called as rareword
NOT_rareword = list(set(NOT_rareword))

with open('Res/NOT_rareword.txt', 'w') as f:
    for item in NOT_rareword:
        f.write("%s\n" % item)

In [20]:
# <-- SPAM messages -->

# Number of stopwords count in spam messages
spam_metadata_df["stopword_count"] = spam_metadata_df\
    .apply(lambda row: np.sum([True if x in stop_words else False for x in row["tokens"]]),axis=1)

# Number of useful words in the spam messages
spam_metadata_df["useful_tokens"] = spam_metadata_df\
    .apply(lambda row: filter(lambda x: x not in stop_words and 
                              x not in string.punctuation and 
                              x.isdigit()==False, row["tokens"]),axis=1)

# Median length of useful tokens in the spam messages
spam_metadata_df["median_useful_token_len"] = spam_metadata_df\
    .apply(lambda row: np.nanmedian([len(x) if len(x)!=1 else np.nan for x in row["useful_tokens"]]),axis=1)

# Average length of useful tokens in the spam messages
spam_metadata_df["avg_useful_token_len"] = spam_metadata_df\
    .apply(lambda row: np.nanmean([len(x) if len(x)!=1 else np.nan for x in row["useful_tokens"]]),axis=1)

# Extracting the rarewords in spam messages and storing them in a new column
spam_metadata_df["rareword_count"] = spam_metadata_df\
    .apply(lambda row: np.sum([True if x not in NOT_rareword else False for x in set(row["useful_tokens"])]),axis=1)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  from ipykernel import kernelapp as app


In [21]:
# <-- HAM messages -->

# Number of stopwords count in ham messages
ham_metadata_df["stopword_count"] = ham_metadata_df\
    .apply(lambda row: np.sum([True if x in stop_words else False for x in row["tokens"]]),axis=1)

# Number of useful words in the ham messages
ham_metadata_df["useful_tokens"] = ham_metadata_df\
    .apply(lambda row: filter(lambda x: x not in stop_words and 
                              x not in string.punctuation and 
                              x.isdigit()==False, row["tokens"]),axis=1)

# Median length of useful tokens in the message
ham_metadata_df["median_useful_token_len"] = spam_metadata_df\
    .apply(lambda row: np.nanmedian([len(x) if len(x)!=1 else np.nan for x in row["useful_tokens"]]),axis=1)

# Average length of useful tokens in the message
ham_metadata_df["avg_useful_token_len"] = ham_metadata_df\
    .apply(lambda row: np.nanmean([len(x) if len(x)!=1 else np.nan for x in row["useful_tokens"]]),axis=1)

# Extracting the rarewords in ham messages and storing them in a new column
ham_metadata_df["rareword_count"] = ham_metadata_df\
    .apply(lambda row: np.sum([True if x not in NOT_rareword else False for x in set(row["useful_tokens"])]),axis=1)

In [22]:
# Stemming and Lemmatization of the useful tokens
from nltk.stem import PorterStemmer
porter = PorterStemmer()

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def is_ascii(s):
    return all(ord(c) < 128 for c in s)

spam_metadata_df["stemmed_tokens"] = \
spam_metadata_df.apply(lambda row: [porter.stem(word) for word in filter(lambda x: len(x)>2 and is_ascii(x),row["useful_tokens"])],axis=1)

spam_metadata_df["lemma_tokens"] = \
spam_metadata_df.apply(lambda row: [wordnet_lemmatizer.lemmatize(word) for word in filter(lambda x: len(x)>2 and is_ascii(x),row["useful_tokens"])],axis=1)

ham_metadata_df["stemmed_tokens"] = \
ham_metadata_df.apply(lambda row: [porter.stem(word) for word in filter(lambda x: len(x)>2 and is_ascii(x),row["useful_tokens"])],axis=1)

ham_metadata_df["lemma_tokens"] = \
ham_metadata_df.apply(lambda row: [wordnet_lemmatizer.lemmatize(word) for word in filter(lambda x: len(x)>2 and is_ascii(x),row["useful_tokens"])],axis=1)

In [23]:
# The number of rarewords must be more for spam messages as they contain more vocabs and disparate words as compared to
# ham messages that contain simple and common words
print("Average for rarewords in spam messages = {}\nAverage for rarewords in ham  messages = {}"\
      .format(int(np.mean(spam_metadata_df[["rareword_count"]])),int(np.mean(ham_metadata_df[["rareword_count"]]))))
# which means more the count for rarewords would means it will be spam

Average for rarewords in spam messages = 40
Average for rarewords in ham  messages = 28


In [24]:
final_spam_ham_df = pd.concat([spam_metadata_df,ham_metadata_df])
final_spam_ham_df.to_csv("Res/processed_data/final_spam_ham_df.csv",sep=",",index=False)
print("Length of the final spam plus ham dataset = {}".format(len(final_spam_ham_df)))
print("Completed !")

Length of the final spam plus ham dataset = 33716
Completed !


In [25]:
final_spam_ham_df.groupby(["dataset","mail_label","owner_mail"]).count()["id"]

dataset  mail_label  owner_mail
enron1   ham         farmer        3672
         spam        GP            1500
enron2   ham         kaminski      4361
         spam        SA_and_HP     1496
enron3   ham         kitchen       4012
         spam        BG            1500
enron4   ham         williams      1500
         spam        GP            4500
enron5   ham         beck          1500
         spam        SA_and_HP     3675
enron6   ham         lokay         1500
         spam        BG            4500
Name: id, dtype: int64

In [26]:
final_spam_ham_df.head()

Unnamed: 0,id,mail_label,file_name,date_mail,owner_mail,content_message,subject_message,dataset,line_count,tokens,...,single_char_count,number_token_count,year_count,stopword_count,useful_tokens,median_useful_token_len,avg_useful_token_len,rareword_count,stemmed_tokens,lemma_tokens
0,4743,spam,Res/enron1/spam/4743.2005-06-25.GP.spam.txt,2005-06-25 00:00:00,GP,"Subject: what up , , your cam babe\r\nwhat are...","Subject: what up , , your cam babe",enron1,14,"[what, up, ,, ,, your, cam, babe, what, are, y...",...,4,0,0,59.0,"[cam, babe, looking, looking, companion, frien...",5.0,5.712500,46.0,"[cam, babe, look, look, companion, friendship,...","[cam, babe, looking, looking, companion, frien..."
1,1309,spam,Res/enron1/spam/1309.2004-06-08.GP.spam.txt,2004-06-08 00:00:00,GP,Subject: want to make more money ?\r\norder co...,Subject: want to make more money ?,enron1,9,"[want, to, make, more, money, ?, order, confir...",...,1,0,0,20.0,"[want, make, money, order, confirmation, order...",6.0,6.081081,8.0,"[want, make, money, order, confirm, order, shi...","[want, make, money, order, confirmation, order..."
2,0726,spam,Res/enron1/spam/0726.2004-03-26.GP.spam.txt,2004-03-26 00:00:00,GP,Subject: food for thoughts\r\n[\r\njoin now - ...,Subject: food for thoughts,enron1,6,"[food, for, thoughts, [, join, now, -, take, a...",...,1,0,0,6.0,"[food, thoughts, join, take, free, tour, click...",4.0,5.000000,0.0,"[food, thought, join, take, free, tour, click,...","[food, thought, join, take, free, tour, click,..."
3,0202,spam,Res/enron1/spam/0202.2004-01-13.GP.spam.txt,2004-01-13 00:00:00,GP,Subject: miningnews . net newsletter - tuesday...,Subject: miningnews . net newsletter - tuesday...,enron1,97,"[miningnews, ., net, newsletter, -, tuesday, ,...",...,33,58,17,239.0,"[miningnews, net, newsletter, tuesday, january...",6.0,6.264654,165.0,"[miningnew, net, newslett, tuesday, januari, t...","[miningnews, net, newsletter, tuesday, january..."
4,3988,spam,Res/enron1/spam/3988.2005-03-06.GP.spam.txt,2005-03-06 00:00:00,GP,Subject: your pharmacy ta\r\nwould you want ch...,Subject: your pharmacy ta,enron1,2,"[your, pharmacy, ta, would, you, want, cheap, ...",...,0,0,0,2.0,"[pharmacy, ta, would, want, cheap, perscriptio...",4.5,5.200000,5.0,"[pharmaci, would, want, cheap, perscript, http...","[pharmacy, would, want, cheap, perscriptions, ..."
5,0758,spam,Res/enron1/spam/0758.2004-04-02.GP.spam.txt,2004-04-02 00:00:00,GP,Subject: bigger breast just from a pill\r\nima...,Subject: bigger breast just from a pill,enron1,3,"[bigger, breast, just, from, a, pill, image, i...",...,13,0,0,39.0,"[bigger, breast, pill, image, loading, cli, k,...",5.0,5.517241,9.0,"[bigger, breast, pill, imag, load, cli, info, ...","[bigger, breast, pill, image, loading, cli, in..."
6,5145,spam,Res/enron1/spam/5145.2005-09-04.GP.spam.txt,2005-09-04 00:00:00,GP,Subject: cant find you on msn . . .\r\nbut rid...,Subject: cant find you on msn . . .,enron1,15,"[cant, find, you, on, msn, ., ., ., but, ride,...",...,3,0,0,28.0,"[cant, find, msn, ride, january, exercise, beb...",6.0,6.222222,47.0,"[cant, find, msn, ride, januari, exercis, bebu...","[cant, find, msn, ride, january, exercise, beb..."
7,4382,spam,Res/enron1/spam/4382.2005-04-26.GP.spam.txt,2005-04-26 00:00:00,GP,Subject: majestic rx\r\n- - - - 86738490004042...,Subject: majestic rx,enron1,8,"[majestic, rx, -, -, -, -, 8673849000404220545...",...,6,3,0,30.0,"[majestic, rx, hi, varou, quickly, work, take,...",6.0,6.183673,16.0,"[majest, varou, quickli, work, take, minut, lo...","[majestic, varou, quickly, work, take, minute,..."
8,2794,spam,Res/enron1/spam/2794.2004-11-11.GP.spam.txt,2004-11-11 00:00:00,GP,Subject: quick way to buy soft - ware\r\nvarie...,Subject: quick way to buy soft - ware,enron1,37,"[quick, way, to, buy, soft, -, ware, variety, ...",...,16,33,7,23.0,"[quick, way, buy, soft, ware, variety, top, ma...",6.0,6.642105,16.0,"[quick, way, buy, soft, ware, varieti, top, ma...","[quick, way, buy, soft, ware, variety, top, ma..."
9,1489,spam,Res/enron1/spam/1489.2004-07-03.GP.spam.txt,2004-07-03 00:00:00,GP,"Subject: so , it ' s me , varenukha\r\nsorry f...","Subject: so , it ' s me , varenukha",enron1,46,"[so, ,, it, ', s, me, ,, varenukha, sorry, for...",...,12,2,1,45.0,"[varenukha, sorry, taking, long, finally, foun...",7.0,6.800699,267.0,"[varenukha, sorri, take, long, final, found, s...","[varenukha, sorry, taking, long, finally, foun..."


In [48]:
print("Completed!")

Completed!
