# Fraud Detection Using Text - The Enron Scandal

![enron](enron.jpg)

## Welcome

## Dataset description

## Important libraries

In [1]:
# pandas & numpy:
import numpy as np
import pandas as pd

# visualization:
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import plotly.express as px

# tokenization:
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.porter import PorterStemmer
import string

# gensim:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

# scikit-learn:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


#keras:
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences #We use this because some text are shorter than others. All text should have the same number of words

## Loading & understanding our data

In [2]:
# Remove the cap on displayed columns so previews show every column.
pd.set_option('display.max_columns', None)

# Load the e-mail dataset from disk and preview its first rows.
df = pd.read_csv('Data/emails_cleaned.csv')
df.head()


KeyboardInterrupt: 

In [None]:
def clean_data(df):
    """Drop the unused e-mail header columns and normalise the remaining names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``unused_columns``;
        ``DataFrame.drop`` raises ``KeyError`` if one of them is missing.

    Returns
    -------
    pandas.DataFrame
        A new frame without those columns, whose remaining column names are
        lower-cased with spaces replaced by underscores. ``df`` itself is
        left unmodified.
    """
    unused_columns = [
        'file', 'message', 'Cc', 'Mime-Version', 'Content-Type',
        'Content-Transfer-Encoding', 'Date', 'Bcc', 'X-From', 'X-To', 'X-cc',
        'X-bcc', 'X-Folder', 'X-Origin', 'X-FileName', 'has_other_content',
        'if_forwarded',
    ]
    trimmed = df.drop(columns=unused_columns)
    return trimmed.rename(columns=lambda name: name.lower().replace(' ', '_'))

In [None]:
# Drop the unused columns, normalise the column names, and display the result.
df=clean_data(df)
df

In [None]:
# Print a summary of the dataframe: column dtypes and non-null counts.
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Drop every row that has a missing value, then renumber the rows from 0.
# drop=True discards the old index instead of inserting it into the data as
# a stray "index" column.
df = df.dropna().reset_index(drop=True)
df

In [None]:
# Join all the text data into a single new column, "completed_text".
# A space is inserted between subject and body so the last word of the
# subject does not fuse with the first word of the body, which would corrupt
# both the keyword search and the tokenization further down.
df["completed_text"] = df["subject"] + " " + df["content"]

In [None]:
# Display the dataframe, which now includes the "completed_text" column.
df

In [None]:
# Inspect the combined text of the row labelled 0 in "completed_text".
df['completed_text'][0]

In [None]:
# "subject" and "content" are now merged into "completed_text", so the two
# source columns can be removed.
df = df.drop(columns=['content', 'subject'])

## Fraud flags

In [None]:
# Now that we have all the text data together, we can flag some terms as "fraud suspect"
# WE know that Enron employees activaley participated in the fraud by keeping the stock price manually high. 
# We can create a list of fraudulent terms that helps us to find the emails with reference to the word "stock".

fraud_list=['stock','enron stock','sell stock','bonus','wall street','board','the market','dow jones']

# Now we filter the column completed text using the list "fraud_list".


filtered_emails = df.loc[df['completed_text'].str.contains('|'.join(fraud_list), na=False)]
print(filtered_emails)


In [None]:
# "Filtered_emails" represents the emails with fraudulent terminology. 
# We will create a new column named fraud in the dataframe. 
# The new column will have 2 values: 0: non-fraud ; 1: fraud

df['fraud'] = np.where((df['completed_text'].str.contains('|'.join(fraud_list)) == True), 1, 0)
        
df      

In [None]:
# Tally how many e-mails carry each value of the "fraud" flag.
count = df['fraud'].value_counts()
print(count)

In [None]:
# Horizontal count plot of the non-fraud (0) and fraud (1) e-mails.
fig, ax = plt.subplots(figsize=(4, 4))
sns.countplot(y="fraud", data=df, palette='vlag', ax=ax)
ax.set_title('Fraud Distributions \n (0:Non_Fraud || 1: Fraud)', fontsize=11)
plt.show()

## Detecting fraud using text mining

In order to clean our text data we need to follow these steps:

<ol>
  <li>Tokenization</li>
  <li>Remove all stopwords</li>
  <li>Lemmatize your words</li>
  <li>Stem your words</li>
</ol>

In [None]:
# Start from NLTK's English stop-word list, held in a plain list so that it
# can be extended afterwards.
stop_words = list(stopwords.words('english'))
stop_words

In [None]:
# Add extra, hand-picked stop words on top of the NLTK list: e-mail header
# words and URL fragments.
stop_words.extend(['from','to','cc','http', 're', 'www', 'com'])

In [None]:
# Tokenize a text, dropping stop words and tokens of 3 characters or fewer
def preprocess(text):
    """Tokenize ``text`` and keep only the informative tokens.

    ``gensim.utils.simple_preprocess`` produces the tokens; a token is kept
    when it is longer than 3 characters and appears neither in gensim's
    ``STOPWORDS`` nor in the module-level ``stop_words`` list.

    Parameters
    ----------
    text : str
        Raw text of one e-mail.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # One set covering both stop-word sources gives constant-time lookups
    # instead of scanning the ``stop_words`` list for every token. It is
    # built on each call so later changes to ``stop_words`` are picked up.
    excluded = gensim.parsing.preprocessing.STOPWORDS.union(stop_words)
    return [
        token
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in excluded
    ]

In [None]:
# Apply preprocess to every e-mail; each entry of "clean_text" is the list
# of tokens it returns.
df['clean_text'] = df['completed_text'].apply(preprocess)

In [None]:
# Now we can compare the columns "completed_text" and "clean_text".
# First, the raw text of the row labelled 0:
df['completed_text'][0]

In [None]:
# Token list stored in "clean_text" for the row labelled 0.
print(df['clean_text'][0])

In [None]:
# Flatten the per-e-mail token lists into one list holding every token
# occurrence in the dataset.
list_of_words = [word for tokens in df.clean_text for word in tokens]

In [None]:
# Display the flattened list of tokens.
list_of_words

In [None]:
# Total number of tokens, duplicates included.
len(list_of_words)

In [None]:
# Vocabulary size: the number of distinct tokens in the dataset.
total_words = len(set(list_of_words))
total_words

In [None]:
# Turn each token list back into a single space-separated string.
df['clean_joined'] = df['clean_text'].apply(" ".join)

In [None]:
# Joined, cleaned text of the row labelled 0.
df['clean_joined'][0]

## Visualize cleaned up dataset

In [None]:
# Display the dataframe before plotting.
df

In [None]:
# Word cloud of the e-mails flagged as fraud (fraud == 1).
fraud_corpus = " ".join(df[df.fraud == 1].clean_joined)
wc = WordCloud(
    max_words=2000,
    width=1920,
    height=1080,
    colormap='vlag',
    stopwords=stop_words,
).generate(fraud_corpus)

plt.figure(figsize=(7, 7))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Word cloud of the e-mails flagged as non-fraud (fraud == 0).
non_fraud_corpus = " ".join(df[df.fraud == 0].clean_joined)
wc = WordCloud(
    max_words=2000,
    width=1920,
    height=1080,
    colormap='vlag',
    stopwords=stop_words,
).generate(non_fraud_corpus)

plt.figure(figsize=(7, 7))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
# The length of the longest document (in tokens) will be needed to create
# word embeddings. -1 is kept as the fallback for an empty dataframe.
maxlen = max((len(nltk.word_tokenize(doc)) for doc in df.clean_joined), default=-1)
print("The maximum number of words in any email is =", maxlen)

In [None]:
# Visualize how the number of tokens per e-mail is distributed.
words_per_email = [len(nltk.word_tokenize(text)) for text in df.clean_joined]

plt.figure(figsize=(5, 5))
sns.histplot(x=words_per_email, color='#366f88', bins=100)
plt.show()

## Prepare the data by performing tokenization and padding

The Keras `Tokenizer` lets us vectorize a text corpus by turning each text into a sequence of integers.

In [None]:
# Features: the cleaned, joined text of each e-mail. Target: the fraud flag.
X=df['clean_joined']
y=df['fraud']

In [None]:
# Split the data into train and test sets, holding out 30% for testing.
# random_state is fixed so the split is reproducible.

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=42)

In [None]:
# Create a tokenizer and fit its vocabulary on the training texts only, then
# encode both splits as sequences of integer word indices.
# num_words limits encoding to the most frequent words; here the limit is
# the vocabulary size counted over the whole dataset.
tokenizer = Tokenizer(num_words = total_words)
tokenizer.fit_on_texts(X_train)
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

In [None]:
# train_sequences[0] encodes the first *training* document. After the
# shuffled split that is generally not row 0 of df, so print X_train.iloc[0]
# to show the text that actually produced the encoding.
print("The encoding for document:\n",X_train.iloc[0],"\n is : ",train_sequences[0])

In [None]:
# Pad or truncate every sequence to a fixed length. maxlen could be the
# longest e-mail (4406) or a smaller number; maxlen = 40 seems to work well
# based on results.
# Both splits must use the same layout: pad_sequences pads at the front by
# default, so padding='post' is passed explicitly for the test set as well.
padded_train = pad_sequences(train_sequences, maxlen=40, padding='post', truncating='post')
padded_test = pad_sequences(test_sequences, maxlen=40, padding='post', truncating='post')

In [None]:
# Show the padded encoding of the first training document (numbered from 1).
for position, encoded in enumerate(padded_train[:1], start=1):
    print("The padded encoding for document",position," is : ",encoded)

## Build the model

In [None]:
# Random forest classifier; random_state is fixed for reproducible results.
model = RandomForestClassifier(random_state=42, n_estimators=10, max_depth=None)

# scikit-learn estimators need numeric feature arrays, so train and predict
# on the padded integer sequences rather than on the raw text held in
# X_train / X_test.
result = model.fit(padded_train, y_train)
y_pred = result.predict(padded_test)


print("Classification report for the test set")
print(classification_report(y_test,y_pred))

print('Confusion matrix')
print(confusion_matrix(y_test,y_pred))
