## Conducting the final study by changing the combination of the data from the Enron dataset

In [1]:
# Importing the required libraries
import pandas as pd
import string
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import numpy as np
import os
# Move to the project root so that the relative "Res/..." paths used below resolve.
# Guarded so that re-running this cell does not climb one more directory each time:
# the directory is only changed while "Res" is not reachable from the current one.
if not os.path.isdir("Res"):
    os.chdir("..")



In [2]:
# Load the raw mails extracted by code file 5 and rename the columns
# to the names used throughout the rest of this notebook
column_renames = {
    "content_mail": "content_entire_mail",
    "filepath": "file_name",
    "owner": "owner_mail",
}
df = pd.read_csv("Res/accepted_raw_data/filtered_raw_data.csv", sep=",").rename(columns=column_renames)

In [3]:
# Functions to extract the messages from the mails supplied in the raw mails
def html_extract(html):
    """Return the visible text of an HTML fragment, one non-empty chunk per line.

    <script> and <style> elements are removed before the text is read. Every
    text line is stripped, split on double spaces, and the resulting chunks are
    stripped again; empty chunks are dropped and the rest joined with newlines.
    """
    # NOTE(review): no parser is named here, so BeautifulSoup picks one itself;
    # presumably the result can differ between environments - confirm.
    soup = BeautifulSoup(html)
    for tag in soup(["script", "style"]):
        tag.extract()
    pieces = []
    for raw_line in soup.get_text().splitlines():
        for phrase in raw_line.strip().split("  "):
            phrase = phrase.strip()
            if phrase:
                pieces.append(phrase)
    return '\n'.join(pieces)
def extract_content(entire_mail):
    """Extract the message text from one raw mail.

    Three cases are tried in order:
      1. The mail contains both "<html" and "</html>" (case-insensitive): the
         span from the first "<html" up to the last "</html>" is handed to
         html_extract.
      2. The mail contains "Content-Type: text/plain": in the text after the
         last such header, the line that follows the first empty line is
         returned. Lines are split on "\\r\\n" first, then on "\\n".
      3. Otherwise: whatever lies between "<body" and the last "</body>" is
         wrapped in <html> tags and handed to html_extract.

    Returns the extracted text, or the string "False" (not the boolean) when
    the extraction fails.
    """
    lowered = entire_mail.lower()
    if "<html" in lowered and "</html>" in lowered:
        # Offsets are taken from the plain lower-cased copy so that they line up
        # with the original text. Offsets taken from a copy with extra newlines
        # inserted around the tags would be shifted and cut the wrong span.
        # NOTE(review): assumes lower() keeps the string length - confirm for
        # non-ASCII mails.
        start = lowered.find("<html")
        end = lowered.rfind("</html>")
        html = entire_mail[start:end] + "</html>"
        return html_extract(html)
    elif "Content-Type: text/plain" in entire_mail:
        tail = entire_mail.split("Content-Type: text/plain")[-1]
        # NOTE(review): only the single line after the blank separator is
        # returned, not the whole body - confirm this is intended.
        for newline in ("\r\n", "\n"):
            lines = tail.split(newline)
            try:
                return lines[lines.index("") + 1]
            except (ValueError, IndexError):
                # No blank line, or nothing after it, with this line ending
                continue
        return "False"
    else:
        try:
            body = entire_mail[lowered.find("<body"):lowered.rfind("</body>")]
            return html_extract("<html>" + body + "</body></html>")
        except Exception:
            # Best effort: a mail that cannot be parsed is marked as failed
            return "False"
df["content_message"] = df["content_entire_mail"].apply(extract_content)
# extract_content marks a failed extraction with the string "False" (not the boolean),
# so the filter has to compare against the string; comparing against the boolean
# False never removes a row.
# The index is reset so that frames built later with a fresh 0..n-1 index
# (e.g. the tf-idf frame) still line up row by row.
df = df.loc[df["content_message"] != "False"].reset_index(drop=True)

In [4]:
# Row counts per mailbox owner and label after cleaning and extracting the Enron raw data
group_keys = ["owner_mail", "mail_label"]
df.groupby(group_keys).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,file_name,content_entire_mail,content_message
owner_mail,mail_label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BG,spam,1664,1664,1664
GP,spam,13703,13703,13703
SH,spam,7780,7780,7780
beck-s,ham,1966,1966,1966
farmer-d,ham,3669,3669,3669
kaminski-v,ham,4363,4363,4363
kitchen-l,ham,4012,4012,4012
lokay-m,ham,2364,2364,2364
williams-w3,ham,2714,2714,2714


## Creating the same features that we created for the enron 1 to 6 data

In [5]:
# Counting the number of lines including the subject line
df["line_count"] = df["content_message"].apply(
    lambda message: message.replace("\r\n", "\n").replace("\n\n", "\n").count("\n"))

# Extracting the tokens out of the messages in a column called "tokens".
# Punctuation marks are padded with spaces so that split() yields them as separate tokens.
# NOTE(review): the first two replacements target the literal character sequences
# backslash-r-backslash-n and backslash-n-backslash-n, not real newlines; real newlines
# are handled by split() anyway.
df["tokens"] = df["content_message"].apply(lambda message: message
    .replace("\\r\\n", " ").replace("\\n\\n", " ").replace(".", " . ").replace(",", " , ").replace(";", " ; ")
    .replace(":", " : ").replace("!", " ! ").replace("?", " ? ").replace("$", " $ ")
    .replace("-", " - ").replace("=", " = ").replace("&", " & ").replace("/", " / ").split())

# Creating a column to count the total number of tokens
df["token_count"] = df["tokens"].apply(len)

# Creating a column to count the number of punctuations
df["punctuations_count"] = df["tokens"].apply(
    lambda tokens: sum(x in string.punctuation for x in tokens))

# Single character count in the list of tokens
df["single_char_count"] = df["tokens"].apply(
    lambda tokens: sum(len(x) == 1 and x not in string.punctuation for x in tokens))

# Column to count the tokens that are numbers
df["number_token_count"] = df["tokens"].apply(
    lambda tokens: sum(x.isdigit() for x in tokens))

# Tokens that are years mentioned in between 1970 to 2050.
# Only four-digit tokens qualify: a bare string comparison would also count tokens
# such as "2", "20" or "1999abc", because strings compare character by character.
# NOTE(review): if the enron 1 to 6 features were built with the bare string comparison,
# regenerate them the same way so both datasets stay comparable.
df["year_count"] = df["tokens"].apply(
    lambda tokens: sum(len(x) == 4 and x.isdigit() and "1970" <= x <= "2050" for x in tokens))


In [6]:
stop_words = set(stopwords.words('english'))

# Words that must not be counted as rare, one per line in the resource file
NOT_rareword = []
with open('Res/NOT_rareword.txt', 'r') as f:
    for word in f:
        NOT_rareword.append(word.split("\n")[0])
# Set copy for constant-time membership tests in the rare-word count below
not_rareword_set = set(NOT_rareword)

# Number of stopwords in each message
df["stopword_count"] = df["tokens"].apply(
    lambda tokens: np.sum([x in stop_words for x in tokens]))

# Useful tokens of each message: not a stopword, not punctuation, not a number.
# Stored as a real list: a filter() result is a one-shot iterator on Python 3 and
# would be empty by the time the second feature below reads it.
df["useful_tokens"] = df["tokens"].apply(
    lambda tokens: [x for x in tokens
                    if x not in stop_words and x not in string.punctuation and not x.isdigit()])

# Median length of the useful tokens (single-character tokens are ignored);
# NaN when a message has no such token
df["median_useful_token_len"] = df["useful_tokens"].apply(
    lambda tokens: np.nanmedian([len(x) if len(x) != 1 else np.nan for x in tokens]))

# Average length of the useful tokens (single-character tokens are ignored)
df["avg_useful_token_len"] = df["useful_tokens"].apply(
    lambda tokens: np.nanmean([len(x) if len(x) != 1 else np.nan for x in tokens]))

# Number of distinct useful tokens that are not listed in NOT_rareword
df["rareword_count"] = df["useful_tokens"].apply(
    lambda tokens: np.sum([x not in not_rareword_set for x in set(tokens)]))

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [7]:
# Stemming and Lemmatization of the useful tokens
from nltk.stem import PorterStemmer
# Single stemmer instance, used to build the "stemmed_tokens" column
porter = PorterStemmer()

from nltk.stem import WordNetLemmatizer
# Single lemmatizer instance, used to build the "lemma_tokens" column
wordnet_lemmatizer = WordNetLemmatizer()

def is_ascii(s):
    """Return True when every character of s has a code point below 128."""
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True

def _long_ascii_tokens(tokens):
    """Keep the tokens that are longer than two characters and pure ASCII."""
    return [tok for tok in tokens if len(tok) > 2 and is_ascii(tok)]

# Stemmed form of every kept useful token
df["stemmed_tokens"] = df.apply(
    lambda row: [porter.stem(tok) for tok in _long_ascii_tokens(row["useful_tokens"])],
    axis=1)

# Lemmatized form of every kept useful token
df["lemma_tokens"] = df.apply(
    lambda row: [wordnet_lemmatizer.lemmatize(tok) for tok in _long_ascii_tokens(row["useful_tokens"])],
    axis=1)

In [8]:
# NOTE(review): eval() executes whatever the file contains, so this is only safe while
# Res/useful_tokens_dict.txt is a trusted, locally generated file. If it holds a plain
# dict literal, ast.literal_eval would be a safer replacement - confirm before switching.
with open('Res/useful_tokens_dict.txt', 'r') as useful_tokens_file:
    useful_tokens_dict = eval(useful_tokens_file.read())

# Key set built once here instead of once per row inside the function
useful_token_keys = set(useful_tokens_dict.keys())

def intersect_useful_tokens(lemma_tokens_row):
    """Return the lower-cased tokens of one row that are keys of useful_tokens_dict.

    The result is a sorted list without duplicates; sorting keeps the
    "attributes" and "corpus" columns identical from one run to the next.
    """
    lowered = {token.lower() for token in lemma_tokens_row}
    return sorted(lowered & useful_token_keys)

df["attributes"] = df["lemma_tokens"].apply(intersect_useful_tokens)
df["attributes_len"] = df["attributes"].apply(lambda attributes: float(len(attributes)))
df["corpus"] = df["attributes"].apply(" ".join)

In [9]:
# Building the tf-idf data frame
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = list(df["corpus"])
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Newer scikit-learn releases only provide get_feature_names_out();
# fall back to get_feature_names() on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# Reuse df's index: concat with axis=1 aligns on index labels, so a fresh 0..n-1 index
# would pair tf-idf rows with the wrong mails whenever df's index is not 0..n-1.
attributes_df = pd.DataFrame(X.toarray(), columns=feature_names, index=df.index)

# NOTE(review): a vocabulary word equal to an existing column name (e.g. "tokens",
# "corpus") produces a duplicate column label, and re-running this cell appends
# the tf-idf columns a second time.
df = pd.concat([df, attributes_df], axis=1)

In [10]:
# Write the complete dataset to complete_data_attributes_df.csv
# without the two raw-text columns "content_entire_mail" and "content_message"
raw_text_columns = ["content_entire_mail", "content_message"]
df1 = df.drop(raw_text_columns, axis=1)
df1.to_csv(
    "Res/Complete_data_processed/complete_data_attributes_df.csv",
    sep=",",
    index=False,
)

In [11]:
# Creating the new Enron datasets 7 to 12 from the raw data
def _owner_rows(owner, n=None, seed=None):
    """Return all rows of one mailbox owner, or a seeded random sample of n of them."""
    rows = df.loc[df["owner_mail"] == owner]
    if n is None:
        return rows
    return rows.sample(n=n, random_state=seed)

# Each dataset pairs one ham mailbox with one spam source
enron7 = pd.concat([_owner_rows("farmer-d"), _owner_rows("BG", 1500, 7)])
enron8 = pd.concat([_owner_rows("kaminski-v"), _owner_rows("GP", 1500, 8)])
enron9 = pd.concat([_owner_rows("lokay-m"), _owner_rows("SH", 1500, 9)])
enron10 = pd.concat([_owner_rows("williams-w3", 1500, 10), _owner_rows("BG")])
enron11 = pd.concat([_owner_rows("beck-s", 1500, 11), _owner_rows("GP", 4500, 11)])
enron12 = pd.concat([_owner_rows("kitchen-l", 1500, 12), _owner_rows("SH", 4500, 12)])

In [12]:
# Removing those mails that have no text inside them:
# rows whose median useful-token length is NaN are dropped in place
for enron_set in (enron7, enron8, enron9, enron10, enron11, enron12):
    empty_rows = np.where(np.isnan(enron_set[["median_useful_token_len"]]))[0]
    enron_set.drop(enron_set.index[empty_rows], inplace=True)

In [13]:
# Writing the individual datasets enron 7 to 12 to their respective csv files,
# each without the two raw-text columns
enron_outputs = [
    (enron7, "Res/Complete_data_processed/enron7.csv"),
    (enron8, "Res/Complete_data_processed/enron8.csv"),
    (enron9, "Res/Complete_data_processed/enron9.csv"),
    (enron10, "Res/Complete_data_processed/enron10.csv"),
    (enron11, "Res/Complete_data_processed/enron11.csv"),
    (enron12, "Res/Complete_data_processed/enron12.csv"),
]
for enron_frame, csv_path in enron_outputs:
    trimmed = enron_frame.drop(["content_entire_mail", "content_message"], axis=1)
    trimmed.to_csv(csv_path, sep=",", index=False)

In [14]:
# Printed once execution reaches the end of the notebook
print("Done!")

Done!
