# AI Research - Phishing Detection

## Import

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import wordcloud
import matplotlib.pyplot as plt
import numpy as np
import re
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from langdetect import detect
from sklearn.model_selection import GridSearchCV
import enchant 
from keras.layers import Input, Dense, Embedding, Flatten, Dropout
from keras.models import Model

Shortcut:

In [58]:
# Read cleaned Data from CSV
df_train = pd.read_csv("Datasets/cleaned/train.csv")
df_valid = pd.read_csv("Datasets/cleaned/validation.csv")
df_test = pd.read_csv("Datasets/cleaned/test.csv")
df_gpt = pd.read_csv("Datasets/cleaned/gpt.csv")
df = pd.read_csv("Datasets/cleaned/spam.csv")

The datasets we work with in the further steps are loaded. 

In [14]:
df = pd.read_csv("Datasets/spam2.csv")
df_train_roh = pd.read_csv("Datasets/train.csv")
df_test_roh = pd.read_csv("Datasets/test.csv")
df_valid_roh = pd.read_csv("Datasets/validation.csv")
df_gpt = pd.read_csv("gpt.csv")

In the preprocessing step we lemmatize the dataset and remove stopwords. Therefore we use NLTK.

In [51]:
nltk.download ("wordnet")
nltk.download ("stopwords")
stopWords = set(stopwords.words('english'))
regexp = RegexpTokenizer('\w+')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tim\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Clean up old Dataframe

In [16]:
# Rename the Columns to one uniform format
df_train_roh = df_train_roh.rename(columns={"sentence1": "text"})
df_valid_roh = df_valid_roh.rename(columns={"sentence1": "text"})
df_test_roh = df_test_roh.rename(columns={"sentence1": "text"})
df = df.rename(columns={"v2": "text", "v1": "label"})

# Replace Labels
df.label = df.label.str.replace("ham", "normal")

# Drop NaN values
df = df.loc[np.logical_and(df.label.notnull(), df.text.notnull())]
df_train_roh = df_train_roh.loc[np.logical_and(df_train_roh.label.notnull(), df_train_roh.text.notnull())]
df_valid_roh = df_valid_roh.loc[np.logical_and(df_valid_roh.label.notnull(), df_valid_roh.text.notnull())]
df_test_roh = df_test_roh.loc[np.logical_and(df_test_roh.label.notnull(), df_test_roh.text.notnull())]

# Drop Columns
df_train_roh.drop("id", axis=1, inplace=True)
df_valid_roh.drop("id", axis=1, inplace=True)
df_test_roh.drop("id", axis=1, inplace=True)
df = df[["text", "label"]]

### Import GPT generated test Mails

## Preprocessing

To prepare our data to train our model, we go through typical preprocessing steps. Therefore we create a lemmatize function, which we can apply on the different datasets. In this part the data is also upsampled, so that tere is the same amount of spam mails and normal mails in the dataset.

In [52]:
dic = enchant.Dict("en_US")

In [53]:
def is_english(text):
    try:
        if detect(text) == "en":
            return 1
        else:
            return 0
    except:
        return 0

wnl = WordNetLemmatizer()
def Lemmatize(x):
    x = regexp.tokenize(x)
    text = ""
    for i in x:
        if i not in stopWords and dic.check(i):
            #lemm = wnl.lemmatize(i)
            text += i + " "
    return text

In [22]:
# Daten mit label "normal" herausfiltern
df_train_normal = df_train_roh.loc[df_train_roh.label == "normal"].copy()
df_valid_normal = df_valid_roh.loc[df_valid_roh.label == "normal"].copy()
df_test_normal = df_test_roh.loc[df_test_roh.label == "normal"].copy()

In [23]:
# Selbe Menge an Daten mit label "spam" herausfiltern
df_train_spam = df_train_roh.loc[df_train_roh.label == "spam"].sample(len(df_train_normal))
df_valid_spam = df_valid_roh.loc[df_valid_roh.label == "spam"].sample(len(df_valid_normal))
df_test_spam = df_test_roh.loc[df_test_roh.label == "spam"].sample(len(df_test_normal))

In [24]:
# Daten zusammenführen
df_train = pd.concat([df_test_normal, df_train_spam])
df_valid = pd.concat([df_valid_normal, df_valid_spam])
df_test = pd.concat([df_train_normal, df_test_spam])

In [26]:
# Daten mischen
df_train = df_train.sample(frac=1)
df_valid = df_valid.sample(frac=1)
df_test = df_test.sample(frac=1)

In [25]:
# Daten vorbereiten
df_train["text"] = df_train.text.apply(Lemmatize)
df_valid["text"] = df_valid.text.apply(Lemmatize)
df_test["text"] = df_test.text.apply(Lemmatize)
df_gpt.text = df_gpt.text.apply(Lemmatize)
df.text = df.text.apply(Lemmatize)

In [None]:
df_train["english"] = df_train.text.apply(is_english)
df_test["english"] = df_test.text.apply(is_english)
df["english"] = df.text.apply(is_english)

In [28]:
df_train = df_train.loc[df_train.english == 1]
df_test = df_test.loc[df_test.english == 1]
df = df.loc[df.english == 1]

### train & test datasets

In the following steps we prepare the data of the train and test dataset. The data comes from one dataset, which we split into training and test data. Therefore we rename the columns, remove a column named "id", that is not important for the next steps, and check if the datasets contains data that is not available. 

We searched for further features, except of the text itself, that we can use to train the model. Therefore we create to functions to see if the text contains links and ip adresses. With the following function we check if the the texts contain links. We want to create a new column with this information, so we can use it to train the model later on. We also create a column with the lenght of the text. 

In [31]:
def containslink(text):
  pattern = r"(http|ftp|https)://([\w-]+(?:(?:.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
  return int(bool(re.search(pattern, text)))

df_train['contains_link'] = df_train['text'].apply(containslink)
df_test['contains_link'] = df_test['text'].apply(containslink)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['contains_link'] = df_train['text'].apply(containslink)


In [32]:
def containsip(text):
  pattern = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"
  return int(bool(re.search(pattern, text)))

df_train['contains_ip'] = df_train['text'].apply(containsip)
df_test['contains_ip'] = df_test['text'].apply(containsip)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['contains_ip'] = df_train['text'].apply(containsip)


In [33]:
df_train['length'] = df_train['text'].apply(len)
df_test['length'] = df_test['text'].apply(len)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['length'] = df_train['text'].apply(len)


In [34]:
df_train

Unnamed: 0,text,label,english,contains_link,contains_ip,length
0,Free shipping jewelry accessories orders today...,spam,1,0,0,66
2,U Room may anything Chad anyone would somethin...,spam,1,0,0,827
3,Fantastic luxury items less half price Want Di...,spam,1,0,0,59
4,Dreams achievable websites like let enjoy life...,spam,1,0,0,76
5,place Well standing She engineered teeth detai...,spam,1,0,0,1416
...,...,...,...,...,...,...
1503,PUBLIC 1 0 Transitional EN org TR transitional...,spam,1,0,0,6192
1504,We help recharge health skull,spam,1,0,0,30
1505,Any man last 40 minutes MEN S JOURNAL Health P...,spam,1,0,0,1969
1506,M hate It decision entirely blasphemy meriting...,spam,1,0,0,2885


# SVM

In [13]:
cv = CountVectorizer()
features = cv.fit_transform(df.text)

model = svm.SVC()
model.fit(features, df.label)

Test if the model detect our testMail.txt as a spam mail:

In [14]:
f = open("testMail.txt", "r")

features_test = cv.transform(f)
# print(model.score(features_test,y_test))
print(model.predict(features_test))

['normal']


In [15]:
features_test = cv.transform(df.text)
print(model.score(features_test,df.label))

0.9967695620961953


The SVM model, that we trained with the spam2 dataset, has an accuracy of 0.99.

### SVM - train dataset

Training the model with the train dataset:

In [17]:
cv = CountVectorizer()
features = cv.fit_transform(df_train.iloc[0:10000].text)

In [18]:
model = svm.SVC(degree=1)
model.fit(features,df_train.iloc[0:10000].label)

In [19]:
features_test = cv.transform(df_test.iloc[0:10000].text)
print(model.score(features_test,df_test.iloc[0:10000].label))

0.987


We test the model with the test dataset and get an accuracy of 0.98.

Training the model with the test data to compare if it has a better accuracy.

In [16]:
df_test = df_test.loc[np.logical_and(df_test.label.notnull(), df_test.text.notnull())]

features_test = cv.transform(df_test.text)
print(model.score(features_test,df_test.label))

0.04664433451485997


The testmail is now detected as spam:

In [20]:
f = open("testMail.txt", "r")

features_test = cv.transform(f)
# print(model.score(features_test,y_test))
print(model.predict(features_test))

['spam']


# Gaussian Naive Bayes Classifier

We limit our the test and training dataset by using only the first 2000 lines for training because of the size of the dataset and therefore resulting performance issues. 
We train the model with the **train** data.

In [20]:
df_train = df_train.loc[np.logical_and(df_train.label.notnull(), df_train.text.notnull())]
df_test = df_test.loc[np.logical_and(df_test.label.notnull(), df_test.text.notnull())]
df_valid = df_valid.loc[np.logical_and(df_valid.label.notnull(), df_valid.text.notnull())]
df_gpt = df_gpt.loc[np.logical_and(df_gpt.label.notnull(), df_gpt.text.notnull())]
df = df.loc[np.logical_and(df.label.notnull(), df.text.notnull())]

In [21]:
cv = CountVectorizer()
features = cv.fit_transform(df_train.iloc[0:3500].text).toarray()
valid_features = cv.transform(df_valid.iloc[0:3500].text).toarray()
test_features = cv.transform(df_test.iloc[0:3500].text).toarray()
gpt_features = cv.transform(df_gpt.iloc[0:3500].text).toarray()
df_features = cv.transform(df.iloc[0:3500].text).toarray()

Test the Model

In [25]:
gnb.score(feat, df_test.iloc[:3500].label)

0.9631428571428572

In [15]:
y_pred = gnb.predict(valid_features)
print("Number of mislabeled points out of a total %d points : %d" % (valid_features.shape[0], (df_valid.iloc[:3500].label != y_pred).sum()))

Number of mislabeled points out of a total 2240 points : 85


# Lineare Regression

Text-Analyse mit Linearer Regression

In [132]:
cv = CountVectorizer()
features = cv.fit_transform(df_train.iloc[0:2000].text).toarray()
test_features = cv.transform(df_test.iloc[0:2000].text).toarray()

In [134]:
def label_to_numerical(label):
    if label == "spam":
        return 0
    if label == "normal":
        return 1
    else:
        return np.NAN

label = df_train.iloc[:2000].label.apply(label_to_numerical)
test_label = df_test.iloc[:2000].label.apply(label_to_numerical)

In [135]:
reg = LinearRegression().fit(features, label)

In [144]:
reg.score(test_features, test_label)

-3.0893262944415882

In [149]:
reg = LogisticRegression().fit(features, label)
reg.score(test_features, test_label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.992

In [192]:
reg.score(test_features, test_label)

0.14

Lineare Regression ohne Text-Analyse

In [150]:
def label_to_numerical(label):
    if label == "spam":
        return 0
    if label == "normal":
        return 1
    else:
        return np.NAN

data = df_train[["contains_link", "contains_ip", "length"]].iloc[:2000]
test_data = df_test[["contains_link", "contains_ip", "length"]].iloc[:2000]
label = df_train.iloc[:2000].label.apply(label_to_numerical)
test_label = df_test.iloc[:2000].label.apply(label_to_numerical)
data["prediction"] =  reg.predict(features)
test_data["prediction"] = reg.predict(test_features)

reg2 = LinearRegression().fit(data, label)

In [152]:
reg2.score(test_data, test_label)

0.628608423279085

# Decision Tree Classifier

In [84]:
cv = CountVectorizer()
features = cv.fit_transform(df_train.text)
test_features = cv.transform(df_test.text)
spam2_features = cv.transform((df.text))

In [None]:
DcsTree = DecisionTreeClassifier()
DcsTree.fit(features, labels)

In [None]:
DcsTree.score(spam2_features, df.label)

# Random Forest

In [88]:
cv = CountVectorizer()
features = cv.fit_transform(df_train.text)
test_features = cv.transform(df_test.text)
spam2_features = cv.transform((df.text))

In [None]:
RndFrst = RandomForestClassifier()
RndFrst.fit(features, df_train.label)

##### Test mit selben Dataset

In [90]:
RndFrst.score(test_features, df_test.label)

0.9913333333333333

##### Test mit anderem Dataset

In [91]:
RndFrst.score(spam2_features, df.label)

0.15466666666666667

# Gridsearch

In [92]:
# Define the parameters you want to search over
param_grid = {'max_depth': [1, 2, 3, 4, 5],
              'min_samples_leaf': [1, 2, 3, 4, 5],
              'splitter': ["best", "random"]}

# Create a decision tree model
model = DecisionTreeClassifier()

# Create the grid search object
grid_search = GridSearchCV(model, param_grid, cv=5)

# Fit the grid search object to the training data
grid_search.fit(features, df_train.label)

# Print the best hyperparameters found by the grid search
print(grid_search.best_params_)

{'max_depth': 1, 'min_samples_leaf': 1, 'splitter': 'best'}


In [None]:
DcsTree = DecisionTreeClassifier(max_depth = 1, min_samples_leaf = 1, splitter = 'best')
DcsTree.fit(features, df_train.label)
print(f"Score mit Testdaten aus dem selben Dataset: {DcsTree.score(test_features, df_test.label)}")
print(f"Score mit Testdaten aus dem alten Dataset: {DcsTree.score(spam2_features, df.label)}")

In [103]:
# Define the parameters you want to search over
param_grid = {
                'max_depth': [1, 2, 3, 4, 5], 
                'min_samples_leaf': [1, 2, 3, 4, 5],}

# Create a decision tree model
model = RandomForestClassifier()

# Create the grid search object
grid_search = GridSearchCV(model, param_grid, cv=5)

# Fit the grid search object to the training data
grid_search.fit(features, df_train.label)

# Print the best hyperparameters found by the grid search
print(grid_search.best_params_)

{'max_depth': 1, 'min_samples_leaf': 1}


In [None]:
RndFrst = RandomForestClassifier(max_depth = 1, min_samples_leaf = 1)
RndFrst.fit(features, df_train.label)
print(f"Score mit Testdaten aus dem selben Dataset: {RndFrst.score(test_features, df_test.label)}")
print(f"Score mit Testdaten aus dem alten Dataset: {RndFrst.score(spam2_features, df.label)}")

# Neural Network with Keras

Prepare the Data

In [159]:
cv = CountVectorizer()
features = cv.fit_transform(df_train.iloc[0:3500].text).toarray()
labels = df_train.iloc[0:3500].label.factorize()[0]
valid_features = cv.transform(df_valid.iloc[0:3500].text).toarray()
valid_labels = df_valid.iloc[0:3500].label.factorize()[0]
test_features = cv.transform(df_test.iloc[0:3500].text).toarray()
test_labels = df_test.iloc[0:3500].label.factorize()[0]
gpt_features = cv.transform(df_gpt.iloc[0:3500].text).toarray()
gpt_labels = df_gpt.iloc[0:3500].label.factorize()[0]
df_features = cv.transform(df.iloc[0:3500].text).toarray()
df_labels = df.iloc[0:3500].label.factorize()[0]

In [84]:
# Concatenate all the data
data = pd.concat([df_train, df_test, df_valid, df, df_gpt])

# Mix data
data = data.sample(frac=1).reset_index(drop=True)

# Split data into train, validation and test
data_train, data_test = train_test_split(data, test_size=0.1)
data_train, data_valid = train_test_split(data_train, test_size=0.5)

# Vectorize the data
cv = CountVectorizer()
cv.fit(data.text)
features = cv.transform(data_train.text).toarray()
test_features = cv.transform(data_test.text).toarray()
valid_features = cv.transform(data_valid.text).toarray()

# Factorize the labels
labels = data_train.label.apply(lambda x: 1 if x == 'spam' else 0)
test_labels = data_test.label.apply(lambda x: 1 if x == 'spam' else 0)
valid_labels = data_valid.label.apply(lambda x: 1 if x == 'spam' else 0)

# Cut training and validation data to the same length
samples = features.shape[0] - 10
features = features[:samples]
labels = labels[:samples]
valid_features = valid_features[:samples]
valid_labels = valid_labels[:samples]

# transform old data 
df_features = cv.transform(df.text).toarray()
gpt_features = cv.transform(df_gpt.text).toarray()
old_test_features = cv.transform(df_test.text).toarray()

# Vectorize old labels
df_labels = df.label.apply(lambda x: 1 if x == 'spam' else 0)
gpt_labels = df_gpt.label.apply(lambda x: 1 if x == 'spam' else 0)
old_test_labels = df_test.label.apply(lambda x: 1 if x == 'spam' else 0)

In [86]:
def train_network(features, labels, valid_features, valid_labels):
    # Define the input layer
    input_layer = Input(shape=(features.shape[1],))

    # Define the hidden layers
    hidden_layer = Dense(units=256, activation='relu')(input_layer)
    hidden_layer = Dropout(0.2)(hidden_layer)
    hidden_layer = Dense(units=128, activation='relu')(hidden_layer)

    # Define the output layer
    output_layer = Dense(units=1, activation='sigmoid')(hidden_layer)

    # Create the model
    model = Model(inputs=input_layer, outputs=output_layer)

    # Compile the model
    model.compile(optimizer='adam', loss='BinaryCrossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(features, labels, validation_data=[valid_features, valid_labels], epochs=6, batch_size=32)

    return model

In [87]:
model = train_network(features, labels, valid_features, valid_labels)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [88]:
model.evaluate(valid_features, valid_labels)



[0.15895545482635498, 0.9648559093475342]

In [89]:
model.evaluate(test_features, test_labels)



[0.18650828301906586, 0.9607400894165039]

In [90]:
model.evaluate(df_features, df_labels)



[0.13335511088371277, 0.9609755873680115]

In [91]:
model.evaluate(gpt_features, gpt_labels)



[0.06460397690534592, 0.9885714054107666]

In [92]:
model.evaluate(old_test_features, old_test_labels)



[0.08588790893554688, 0.9856407642364502]

# Tests with manually written Mails

In [72]:
df_man = pd.DataFrame({
    "text": [
        "Dear Customer, we are happy to inform you that you have won an iPhone 17 in our annual customer lottery. Please click on the link to claim your price. Warning: the link will expire within 7 Days and so will your price. ",
        "Warning, your bank account has been compromised. It seems like your account has been hacked. Click on the link to prevent your account from being deactivated.",
        "Attention last warning! The bill with the number 123 is still open. Please pay the invoice within 7 days. Otherwise, we will be forced to take legal action.",
        "Viagra without prescription. Buy Viagra without a doctor's prescription on our website. We are the market leader for sexual enhancers. Buy cheap and effective medicine now.",
        "Dear Peter, I am writing to you regarding the appointment on Thursday. Unfortunately, I can only in the afternoon. Would you mind postponing the date?",
        "Dear Mr. Soundso, as we already discussed during our last conversation, the delivery on Thursday can take place as planned. Please send me the exact address and your phone number.",
        "Dear customer, enclosed you will find the invoice for your order. Please note that the invoice has already been paid. She is only for her records.",
        "Students Attention, this semester our summer camp takes place again. You can expect 3 weeks of hard work in the labour camp. The course can be credited with half an ECTS."
        ],
    "label":["spam", "spam", "spam", "spam", "normal", "normal", "normal", "normal"]
})

In [93]:
man_features = cv.transform(df_man.text).toarray()
man_labels = df_man.label.apply(lambda x: 1 if x == 'spam' else 0)

In [94]:
model.evaluate(man_features, man_labels)



[0.3966676592826843, 0.875]

In [96]:
df_man

Unnamed: 0,text,label
0,"Dear Customer, we are happy to inform you that...",spam
1,"Warning, your bank account has been compromise...",spam
2,Attention last warning! The bill with the numb...,spam
3,Viagra without prescription. Buy Viagra withou...,spam
4,"Dear Peter, I am writing to you regarding the ...",normal
5,"Dear Mr. Soundso, as we already discussed duri...",normal
6,"Dear customer, enclosed you will find the invo...",normal
7,"Students Attention, this semester our summer c...",normal


In [118]:
test = pd.Series(["Dear Mr. Soundso, as we already discussed during our last conversation, the delivery on Thursday can take place as planned. Please send me the exact address and your phone number"])
test = cv.transform(test)

In [122]:
model.predict(test)[0][0] > 0.5



False