# AI Research - Phishing Detection

## Import

In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import wordcloud
import matplotlib.pyplot as plt
import numpy as np
import re
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from langdetect import detect
from sklearn.model_selection import GridSearchCV

The datasets we work with in the further steps are read in. 

In [38]:
df = pd.read_csv("Datasets/spam2.csv")
df_train = pd.read_csv("Datasets/train.csv")
df_test = pd.read_csv("Datasets/test.csv")
df_valid = pd.read_csv("Datasets/validation.csv")

df_train = df_train.iloc[:2000]
df_test = df_test.iloc[:2000]

In the preprocessing step we lemmatize the dataset and remove stopwords. Therefore we use NLTK.

In [39]:
nltk.download ("wordnet")
nltk.download ("stopwords")
stopWords = set(stopwords.words('english'))
regexp = RegexpTokenizer('\w+')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tim\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Clean up old Dataframe

In [40]:
df = df.rename(columns={"v2": "text", "v1": "label"})
df.label = df.label.str.replace("ham", "normal")
df = df[["text", "label"]]
df = df.loc[np.logical_and(df.label.notnull(), df.text.notnull())]

In [41]:
df_train = df_train.rename(columns={"sentence1": "text"})
df_test = df_test.rename(columns={"sentence1": "text"})

df_train = df_train.loc[np.logical_and(df_train.label.notnull(), df_train.text.notnull())]
df_test = df_test.loc[np.logical_and(df_test.label.notnull(), df_test.text.notnull())]

df_train.drop("id", axis=1, inplace=True)
df_test.drop("id", axis=1, inplace=True)

In [78]:
df_train = df_train.loc[df_train.english == 1]
df_test = df_test.loc[df_test.english == 1]
df = df.loc[df.english == 1]

In [82]:
df_train = df_train.iloc[:1500]
df_test = df_test.iloc[:1500]
df = df.iloc[:1500]

## Preprocessing

### spam2

To prepare our data to train our model, we go through typical preprocessing steps. Therefore we create a lemmatize function, which we can apply on the different datasets.

In [42]:
def is_english(text):
    try:
        if detect(text) == "en":
            return 1
        else:
            return 0
    except:
        return 0

wnl = WordNetLemmatizer()
def Lemmatize(x):
    x = regexp.tokenize(x)
    text = ""
    for i in x:
        if i not in stopWords:
            lemm = wnl.lemmatize(i)
            text += lemm + " "
    return text

In [43]:
df_train["english"] = df_train.text.apply(is_english)
df_test["english"] = df_test.text.apply(is_english)
df["english"] = df.text.apply(is_english)

We lemmatize the spam2 dataset and create the columns "text" and "label". The "text" column contains the text of the mail and the "label" column contains the label, if the text is spam or ham. 

In [44]:
df_train.text = df_train.text.transform(Lemmatize)
df_test.text = df_test.text.transform(Lemmatize)
df.text = df.text.transform(Lemmatize)

### train & test datasets

In the following steps we prepare the data of the train and test dataset. The data comes from one dataset, which we split into training and test data. Therefore we rename the columns, remove a column named "id", that is not important for the next steps, and check if the datasets contains data that is not available. 

We searched for further features, except of the text itself, that we can use to train the model. Therefore we create to functions to see if the text contains links and ip adresses. With the following function we check if the the texts contain links. We want to create a new column with this information, so we can use it to train the model later on. We also create a column with the lenght of the text. 

In [28]:
def containslink(text):
  pattern = r"(http|ftp|https)://([\w-]+(?:(?:.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
  return int(bool(re.search(pattern, text)))

df_train['contains_link'] = df_train['text'].apply(containslink)
df_test['contains_link'] = df_test['text'].apply(containslink)


In [29]:
def containsip(text):
  pattern = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"
  return int(bool(re.search(pattern, text)))

df_train['contains_ip'] = df_train['text'].apply(containsip)
df_test['contains_ip'] = df_test['text'].apply(containsip)

In [30]:
df_train['length'] = df_train['text'].apply(len)
df_test['length'] = df_test['text'].apply(len)

In [31]:
df_train

Unnamed: 0,text,label,contains_link,contains_ip,length
0,Free shipping jewelry accessory order today Up...,spam,0,0,63
1,206 165 76 175 ï ½ï ½ï ½ ï ½Sï ½ï ½ï ½ï ½ï ½ï ...,spam,0,0,411
2,______________________________________________...,spam,0,0,4002
3,Fantastic luxury item le half price Want Disco...,spam,0,0,56
4,Dreams achievable website like let enjoy life ...,spam,0,0,75
...,...,...,...,...,...
5995,Detailed step step instruction earnings Intern...,spam,0,0,66
5996,havingsomeone pulled pushing chicken Lotfi pum...,spam,0,0,3014
5997,Maybe may working Good day aÖ dult master 0Wou...,spam,0,0,261
5998,u307f u304d u3055 u3093 u304b u3089 u65b0 u774...,spam,0,0,2586


## Models 

We train two different models to see which of them performs better. Therefore we use the Support Vector Machine and the Naive Bayes Classifier.

### SVM - spam2 dataset

In [13]:
cv = CountVectorizer()
features = cv.fit_transform(df.text)

model = svm.SVC()
model.fit(features, df.label)

Test if the model detect our testMail.txt as a spam mail:

In [14]:
f = open("testMail.txt", "r")

features_test = cv.transform(f)
# print(model.score(features_test,y_test))
print(model.predict(features_test))

['normal']


In [15]:
features_test = cv.transform(df.text)
print(model.score(features_test,df.label))

0.9967695620961953


The SVM model, that we trained with the spam2 dataset, has an accuracy of 0.99.

### SVM - train dataset

Training the model with the train dataset:

In [17]:
cv = CountVectorizer()
features = cv.fit_transform(df_train.iloc[0:10000].text)

In [18]:
model = svm.SVC(degree=1)
model.fit(features,df_train.iloc[0:10000].label)

In [19]:
features_test = cv.transform(df_test.iloc[0:10000].text)
print(model.score(features_test,df_test.iloc[0:10000].label))

0.987


We test the model with the test dataset and get an accuracy of 0.98.

Training the model with the test data to compare if it has a better accuracy.

In [16]:
df_test = df_test.loc[np.logical_and(df_test.label.notnull(), df_test.text.notnull())]

features_test = cv.transform(df_test.text)
print(model.score(features_test,df_test.label))

0.04664433451485997


The testmail is now detected as spam:

In [20]:
f = open("testMail.txt", "r")

features_test = cv.transform(f)
# print(model.score(features_test,y_test))
print(model.predict(features_test))

['spam']


### Gaussian Naive Bayes Classifier

We limit our the test and training dataset by using only the first 2000 lines for training because of the size of the dataset and therefore resulting performance issues. 
We train the model with the **train** data.

In [22]:
cv = CountVectorizer()
features = cv.fit_transform(df_train.iloc[0:2000].text).toarray()
test_features = cv.transform(df_test.iloc[0:2000].text).toarray()

In [23]:
gnb = GaussianNB()
gnb = gnb.fit(features, df_train.iloc[0:2000].label)
y_pred = gnb.predict(test_features)

In [59]:
print("Number of mislabeled points out of a total %d points : %d" % (test_features.shape[0], (df_test.iloc[:2000].label != y_pred).sum()))

Number of mislabeled points out of a total 2000 points : 17


We also train the model with the **spam2** dataset to compare the predictions.

In [60]:
test_features = cv.transform(df.iloc[:2000].text).toarray()

In [61]:
y_pred = gnb.predict(test_features)

In [62]:
print("Number of mislabeled points out of a total %d points : %d" % (test_features.shape[0], (df_test.iloc[:2000].label != y_pred).sum()))

Number of mislabeled points out of a total 2000 points : 157


In [63]:
arr = gnb.predict_proba(test_features)

In [52]:
arr

array([[-1998248.74661218,        0.        ],
       [ -151849.39494892,        0.        ],
       [-4212733.68842394,        0.        ],
       ...,
       [ -152078.83560315,        0.        ],
       [-3475190.26864323,        0.        ],
       [ -151967.19980339,        0.        ]])

In [49]:
df_train.iloc[0]

text             Free shipping on all jewelry and accessories o...
label                                                         spam
contains_link                                                False
Name: 0, dtype: object

In [51]:
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_iris

# Load the iris dataset
iris = load_iris()
X = iris['data']
y = iris['target']

# Create a GaussianNB classifier
gnb = GaussianNB()

# Fit the classifier to the data
gnb.fit(X, y)

# Get the predicted probabilities for the first sample
probs = gnb.predict_proba([X[0]])[0]

# Print the predicted probabilities
print(probs)

[1.00000000e+00 1.35784265e-18 7.11283512e-26]


# Lineare Regression

Text-Analyse mit Linearer Regression

In [132]:
cv = CountVectorizer()
features = cv.fit_transform(df_train.iloc[0:2000].text).toarray()
test_features = cv.transform(df_test.iloc[0:2000].text).toarray()

In [134]:
def label_to_numerical(label):
    if label == "spam":
        return 0
    if label == "normal":
        return 1
    else:
        return np.NAN

label = df_train.iloc[:2000].label.apply(label_to_numerical)
test_label = df_test.iloc[:2000].label.apply(label_to_numerical)

In [135]:
reg = LinearRegression().fit(features, label)

In [144]:
reg.score(test_features, test_label)

-3.0893262944415882

In [149]:
reg = LogisticRegression().fit(features, label)
reg.score(test_features, test_label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.992

In [192]:
reg.score(test_features, test_label)

0.14

Lineare Regression ohne Text-Analyse

In [150]:
def label_to_numerical(label):
    if label == "spam":
        return 0
    if label == "normal":
        return 1
    else:
        return np.NAN

data = df_train[["contains_link", "contains_ip", "length"]].iloc[:2000]
test_data = df_test[["contains_link", "contains_ip", "length"]].iloc[:2000]
label = df_train.iloc[:2000].label.apply(label_to_numerical)
test_label = df_test.iloc[:2000].label.apply(label_to_numerical)
data["prediction"] =  reg.predict(features)
test_data["prediction"] = reg.predict(test_features)

In [151]:
reg2 = LinearRegression().fit(data, label)

In [152]:
reg2.score(test_data, test_label)

0.628608423279085

In [126]:
data
data["prediction"] = pred

In [None]:
reg2.score(features, label)

In [None]:
Logistic_reg = LogisticRegression().fit(data, label)

Test mit alten Daten

In [184]:
test_features = cv.transform(df.iloc[0:2000].text).toarray()

In [188]:
def label_to_numerical(label):
    if label == "spam":
        return 0
    if label == "normal":
        return 1
    else:
        return np.NAN

test_label = df.iloc[:2000].label.apply(label_to_numerical)

In [189]:

Logistic_reg.score(test_data, test_label)

0.149

### Decision Tree Classifier

In [84]:
cv = CountVectorizer()
features = cv.fit_transform(df_train.text)
test_features = cv.transform(df_test.text)
spam2_features = cv.transform((df.text))

In [98]:
DcsTree = DecisionTreeClassifier()
DcsTree.fit(features, df_train.label)

In [99]:
DcsTree.score(test_features, df_test.label)

0.9953333333333333

In [100]:
DcsTree.score(spam2_features, df.label)

0.15466666666666667

### Random Forest

In [88]:
cv = CountVectorizer()
features = cv.fit_transform(df_train.text)
test_features = cv.transform(df_test.text)
spam2_features = cv.transform((df.text))

In [89]:
RndFrst = RandomForestClassifier()
RndFrst.fit(features, df_train.label)

##### Test mit selben Dataset

In [90]:
RndFrst.score(test_features, df_test.label)

0.9913333333333333

##### Test mit anderem Dataset

In [91]:
RndFrst.score(spam2_features, df.label)

0.15466666666666667

### RandomForest mit Gridsearch Fine-Tunen

In [92]:
# Define the parameters you want to search over
param_grid = {'max_depth': [1, 2, 3, 4, 5],
              'min_samples_leaf': [1, 2, 3, 4, 5],
              'splitter': ["best", "random"]}

# Create a decision tree model
model = DecisionTreeClassifier()

# Create the grid search object
grid_search = GridSearchCV(model, param_grid, cv=5)

# Fit the grid search object to the training data
grid_search.fit(features, df_train.label)

# Print the best hyperparameters found by the grid search
print(grid_search.best_params_)

{'max_depth': 1, 'min_samples_leaf': 1, 'splitter': 'best'}


In [101]:
DcsTree = DecisionTreeClassifier(max_depth = 1, min_samples_leaf = 1, splitter = 'best')
DcsTree.fit(features, df_train.label)
print(f"Score mit Testdaten aus dem selben Dataset: {DcsTree.score(test_features, df_test.label)}")
print(f"Score mit Testdaten aus dem alten Dataset: {DcsTree.score(spam2_features, df.label)}")

Score mit Testdaten aus dem selben Dataset: 0.994
Score mit Testdaten aus dem alten Dataset: 0.15466666666666667


In [103]:
# Define the parameters you want to search over
param_grid = {
                'max_depth': [1, 2, 3, 4, 5], 
                'min_samples_leaf': [1, 2, 3, 4, 5],}

# Create a decision tree model
model = RandomForestClassifier()

# Create the grid search object
grid_search = GridSearchCV(model, param_grid, cv=5)

# Fit the grid search object to the training data
grid_search.fit(features, df_train.label)

# Print the best hyperparameters found by the grid search
print(grid_search.best_params_)

{'max_depth': 1, 'min_samples_leaf': 1}


In [104]:
RndFrst = RandomForestClassifier(max_depth = 1, min_samples_leaf = 1)
RndFrst.fit(features, df_train.label)
print(f"Score mit Testdaten aus dem selben Dataset: {RndFrst.score(test_features, df_test.label)}")
print(f"Score mit Testdaten aus dem alten Dataset: {RndFrst.score(spam2_features, df.label)}")

Score mit Testdaten aus dem selben Dataset: 0.986
Score mit Testdaten aus dem alten Dataset: 0.15466666666666667
