# AI Research - Phishing Detection

## Import

In [146]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import wordcloud
import matplotlib.pyplot as plt
import numpy as np
import re
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression

The datasets we work with in the further steps are read in. 

In [175]:
df = pd.read_csv("Datasets/spam2.csv")
df_train = pd.read_csv("Datasets/train.csv")
df_test = pd.read_csv("Datasets/test.csv")
df_valid = pd.read_csv("Datasets/validation.csv")

In the preprocessing step we lemmatize the dataset and remove stopwords. Therefore we use NLTK.

In [176]:
nltk.download ("wordnet")
nltk.download ("stopwords")
stopWords = set(stopwords.words('english'))
regexp = RegexpTokenizer('\w+')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tim\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Clean up old Dataframe

In [177]:
df = df.rename(columns={"v2": "text", "v1": "label"})
df.label = df.label.str.replace("ham", "normal")

## Preprocessing

We create a lemmatize function, which we apply on the mail texts in the spam2.csv dataset.

In [178]:
wnl = WordNetLemmatizer()
def Lemmatize(x):
    x = regexp.tokenize(x)
    text = ""
    for i in x:
        if i not in stopWords:
            lemm = wnl.lemmatize(i)
            text += lemm + " "
    return text

df.text = df.text.transform(Lemmatize)


In [179]:
df = df[["text", "label"]]

In [180]:
df_train = df_train.rename(columns={"sentence1": "text"})
df_test = df_test.rename(columns={"sentence1": "text"})

In [181]:
df_train = df_train.loc[np.logical_and(df_train.label.notnull(), df_train.text.notnull())]
df_test = df_test.loc[np.logical_and(df_test.label.notnull(), df_test.text.notnull())]
df = df.loc[np.logical_and(df.label.notnull(), df.text.notnull())]

In [182]:
df_train.drop("id", axis=1, inplace=True)
df_test.drop("id", axis=1, inplace=True)

With the following function we check if the the texts contain links. 

In [71]:
def containslink(text):
  pattern = r"(http|ftp|https)://([\w-]+(?:(?:.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
  return int(bool(re.search(pattern, text)))

df_train['contains_link'] = df_train['text'].apply(containslink)
df_test['contains_link'] = df_test['text'].apply(containslink)


In [76]:
def containslink(text):
  pattern = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"
  return int(bool(re.search(pattern, text)))

df_train['contains_ip'] = df_train['text'].apply(containslink)
df_test['contains_ip'] = df_test['text'].apply(containslink)

In [77]:
df_train['length'] = df_train['text'].apply(len)
df_test['length'] = df_test['text'].apply(len)

In [78]:
df_train

Unnamed: 0,text,label,contains_link,contains_ip,length
0,Free shipping on all jewelry and accessories o...,spam,0,0,76
1,206.165.76.175\nï¿½ï¿½ï¿½~ï¿½Sï¿½ï¿½ï¿½ï¿½ï¿½ï...,spam,1,1,412
2,______________________________________________...,spam,0,0,4521
3,Fantastic luxury items at less than half the p...,spam,0,0,79
4,"Dreams are achievable with websites like ours,...",spam,0,0,118
...,...,...,...,...,...
120521,"TEXT, NOTES & Mark Kinkead-Weekes Mark, only s...",spam,0,0,4451
120522,Bathroom with lyle was beginning of them.\nWha...,spam,1,0,417
120523,M. Smullyan. Is it cannot few is really are a ...,spam,0,0,4035
120524,"Be in for a good surprise, reps come cheaper b...",spam,0,0,98


## Models 

We train two different models to see which of them performs better. Therefore we use the Support Vector Machine and the Naive Bayes Classifier.

### SVM

In [12]:

cv = CountVectorizer()
features = cv.fit_transform(df.text)

model = svm.SVC()
model.fit(features, df.label)

Test if the model detect our testMail.txt as a spam mail:

In [13]:
f = open("testMail.txt", "r")

features_test = cv.transform(f)
# print(model.score(features_test,y_test))
print(model.predict(features_test))

['normal']


In [14]:
features_test = cv.transform(df.text)
print(model.score(features_test,df.label))

0.9967695620961953


In [15]:
df_test = df_test.loc[np.logical_and(df_test.label.notnull(), df_test.text.notnull())]

features_test = cv.transform(df_test.text)
print(model.score(features_test,df_test.label))

0.04664433451485997


Training the model with new data:

In [16]:
cv = CountVectorizer()
features = cv.fit_transform(df_train.iloc[0:10000].text)

In [17]:
model = svm.SVC(degree=1)
model.fit(features,df_train.iloc[0:10000].label)

In [18]:

features_test = cv.transform(df_test.iloc[0:10000].text)
print(model.score(features_test,df_test.iloc[0:10000].label))

0.987


The testmail is now detected as spam:

In [19]:
f = open("testMail.txt", "r")

features_test = cv.transform(f)
# print(model.score(features_test,y_test))
print(model.predict(features_test))

['spam']


### Gaussian Naive Bayes Classifier

In [57]:
cv = CountVectorizer()
features = cv.fit_transform(df_train.iloc[0:2000].text).toarray()
test_features = cv.transform(df_test.iloc[0:2000].text).toarray()

In [58]:
gnb = GaussianNB()
gnb = gnb.fit(features, df_train.iloc[0:2000].label)
y_pred = gnb.predict(test_features)

In [59]:
print("Number of mislabeled points out of a total %d points : %d" % (test_features.shape[0], (df_test.iloc[:2000].label != y_pred).sum()))


Number of mislabeled points out of a total 2000 points : 17


In [60]:
test_features = cv.transform(df.iloc[:2000].text).toarray()

In [61]:
y_pred = gnb.predict(test_features)

In [62]:
print("Number of mislabeled points out of a total %d points : %d" % (test_features.shape[0], (df_test.iloc[:2000].label != y_pred).sum()))

Number of mislabeled points out of a total 2000 points : 157


In [63]:
arr = gnb.predict_proba(test_features)

In [52]:
arr

array([[-1998248.74661218,        0.        ],
       [ -151849.39494892,        0.        ],
       [-4212733.68842394,        0.        ],
       ...,
       [ -152078.83560315,        0.        ],
       [-3475190.26864323,        0.        ],
       [ -151967.19980339,        0.        ]])

In [49]:
df_train.iloc[0]

text             Free shipping on all jewelry and accessories o...
label                                                         spam
contains_link                                                False
Name: 0, dtype: object

In [51]:
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_iris

# Load the iris dataset
iris = load_iris()
X = iris['data']
y = iris['target']

# Create a GaussianNB classifier
gnb = GaussianNB()

# Fit the classifier to the data
gnb.fit(X, y)

# Get the predicted probabilities for the first sample
probs = gnb.predict_proba([X[0]])[0]

# Print the predicted probabilities
print(probs)

[1.00000000e+00 1.35784265e-18 7.11283512e-26]


# Lineare Regression

Text-Analyse mit Linearer Regression

In [132]:
cv = CountVectorizer()
features = cv.fit_transform(df_train.iloc[0:2000].text).toarray()
test_features = cv.transform(df_test.iloc[0:2000].text).toarray()

In [134]:
def label_to_numerical(label):
    if label == "spam":
        return 0
    if label == "normal":
        return 1
    else:
        return np.NAN

label = df_train.iloc[:2000].label.apply(label_to_numerical)
test_label = df_test.iloc[:2000].label.apply(label_to_numerical)

In [135]:
reg = LinearRegression().fit(features, label)

In [144]:
reg.score(test_features, test_label)

-3.0893262944415882

In [149]:
reg = LogisticRegression().fit(features, label)
reg.score(test_features, test_label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.992

In [192]:
reg.score(test_features, test_label)

0.14

Lineare Regression ohne Text-Analyse

In [150]:
def label_to_numerical(label):
    if label == "spam":
        return 0
    if label == "normal":
        return 1
    else:
        return np.NAN

data = df_train[["contains_link", "contains_ip", "length"]].iloc[:2000]
test_data = df_test[["contains_link", "contains_ip", "length"]].iloc[:2000]
label = df_train.iloc[:2000].label.apply(label_to_numerical)
test_label = df_test.iloc[:2000].label.apply(label_to_numerical)
data["prediction"] =  reg.predict(features)
test_data["prediction"] = reg.predict(test_features)

In [151]:
reg2 = LinearRegression().fit(data, label)

In [152]:
reg2.score(test_data, test_label)

0.628608423279085

In [126]:
data
data["prediction"] = pred

In [None]:
reg2.score(features, label)

In [None]:
Logistic_reg = LogisticRegression().fit(data, label)

Test mit alten Daten

In [184]:
test_features = cv.transform(df.iloc[0:2000].text).toarray()

In [188]:
def label_to_numerical(label):
    if label == "spam":
        return 0
    if label == "normal":
        return 1
    else:
        return np.NAN

test_label = df.iloc[:2000].label.apply(label_to_numerical)

In [189]:

Logistic_reg.score(test_data, test_label)

0.149