In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# Load the labelled URL dataset (expects a "url" and a "type" column, both of
# which are used further down in this cell).
urls = pd.read_csv("malicious_phish.csv")
# Trim leading/trailing whitespace with the vectorized string accessor rather
# than a per-row Python lambda.
urls["url"] = urls["url"].str.strip()

def mark_as_malicious(name):
    """Map a class label to a binary flag.

    Returns 0 when ``name`` equals ``"benign"`` and 1 for every other value.
    """
    return 0 if name == "benign" else 1

# Collapse the class label to a binary flag (0 = benign, 1 = anything else),
# then give the column a name that matches its new meaning.
urls["type"] = urls["type"].apply(mark_as_malicious)
urls = urls.rename(columns={"type": "is_malicious"})
urls.head()

Unnamed: 0,url,is_malicious
0,br-icloud.com.br,1
1,mp3raid.com/music/krizz_kaliko.html,0
2,bopsecrets.org/rexroth/cr/1.htm,0
3,http://www.garage-pirenne.be/index.php?option=...,1
4,http://adventure-nicaragua.net/index.php?optio...,1


In [3]:
# Feature Extraction
# As referenced in this article: https://towardsdatascience.com/predicting-the-maliciousness-of-urls-24e12067be5

def entropy(url):
    """Return the Shannon entropy of the characters in ``url``, in bits.

    Computed as ``-sum(p * log2(p))`` over the relative frequency ``p`` of
    each distinct character, so the result is always >= 0: it is 0 for an
    empty string or a string made of one repeated character, and grows as
    the character distribution becomes more uniform.

    Parameters
    ----------
    url : str
        The URL (or any string) to measure.

    Returns
    -------
    float
        Entropy in bits per character.
    """
    if not url:
        return 0.0

    # Count every character in a single pass instead of calling
    # ``str.count`` once per distinct character.
    counts = {}
    for ch in url:
        counts[ch] = counts.get(ch, 0) + 1

    total = len(url)
    # Each term is negated: p * log2(p) is <= 0 for 0 < p <= 1, and Shannon
    # entropy is defined as the negative of that sum.
    return float(sum(-(n / total) * np.log2(n / total) for n in counts.values()))

def count_digits(url):
    """Return how many characters of ``url`` satisfy ``str.isdigit``."""
    digit_chars = [ch for ch in url if ch.isdigit()]
    return len(digit_chars)

def length(url):
    """Return the number of characters in ``url``."""
    n_chars = len(url)
    return n_chars

def num_params(url):
    """Return the number of non-empty query parameters in ``url``.

    The query is taken to be everything after the first ``?`` and before any
    ``#`` fragment; it is split on ``&`` and only non-empty pieces are
    counted. A URL with no ``?`` or with an empty query (``"a.com/x?"``)
    therefore yields 0, and a trailing or doubled ``&`` adds nothing.

    Parameters
    ----------
    url : str
        The URL to inspect.

    Returns
    -------
    int
        Count of non-empty ``&``-separated query segments.
    """
    # A fragment is not part of the query, so discard it before looking for
    # the query separator.
    without_fragment = url.partition('#')[0]
    _, separator, query = without_fragment.partition('?')
    if not separator:
        return 0
    return sum(1 for segment in query.split('&') if segment)
    
def num_frags(url):
    """Return how many ``#`` characters appear in ``url``."""
    return url.count('#')

def has_HTTP(url):
    """Return 1 if the substring ``"http:"`` occurs anywhere in ``url``, else 0.

    The match is case-sensitive and does not fire on ``"https:"``.
    """
    return int("http:" in url)

# Column name -> function that derives that feature from a raw URL string.
# Insertion order defines the column order of the feature matrix.
feature_extractors = {
    "entropy": entropy,
    "# of digits": count_digits,
    "length": length,
    "# of params": num_params,
    "# of fragments": num_frags,
    "has_HTTP": has_HTTP,
}
for column, extractor in feature_extractors.items():
    urls[column] = urls["url"].apply(extractor)

# Select the model inputs by name rather than by dropping the non-feature
# columns, so that any additional column in `urls` cannot silently become a
# feature and the matrix always has exactly these six columns in this order.
X = urls[list(feature_extractors)]
y = urls["is_malicious"]

# A fixed seed makes the split (and every metric computed from it)
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

# Alternative feature pipeline: TF-IDF over the raw URL text, reduced to the
# 100 terms with the highest chi-squared score against the label.
# NOTE(review): this cell rebinds X, y, X_train, X_test, y_train and y_test,
# replacing whatever an earlier cell assigned to those names.
y = urls["is_malicious"]

# Split the raw URLs BEFORE fitting anything, so that neither the TF-IDF
# vocabulary/IDF weights nor the chi-squared feature selection ever see the
# held-out rows or their labels.
url_train, url_test, y_train, y_test = train_test_split(
    urls["url"], y, test_size=0.33, random_state=42
)

vectorizer = TfidfVectorizer()
selector = SelectKBest(chi2, k=100)

# Fit on the training portion only; the test portion is merely transformed.
X_train = selector.fit_transform(vectorizer.fit_transform(url_train), y_train)
X_test = selector.transform(vectorizer.transform(url_test))

# Selected features for the whole dataset, kept under the name `X` for any
# cell that still expects it; produced with the train-fitted transformers.
X = selector.transform(vectorizer.transform(urls["url"]))

In [4]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree on the current X_train/y_train and report per-class
# precision, recall and F1 on the held-out split.
# random_state is pinned so repeated runs on the same split build the same tree.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
predictions = dt.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.85      0.95      0.90    141628
           1       0.87      0.68      0.76     73266

    accuracy                           0.86    214894
   macro avg       0.86      0.81      0.83    214894
weighted avg       0.86      0.86      0.85    214894



In [None]:
def test(url, model):
    """Predict the label of a single URL using the TF-IDF feature pipeline.

    The URL is passed through the module-level ``vectorizer`` and then the
    module-level ``selector``; the result is handed to ``model.predict`` and
    that call's return value is returned unchanged.

    NOTE(review): a later cell in this notebook defines another function
    that is also named ``test``; whichever cell runs last wins.
    """
    tfidf_features = vectorizer.transform([url])
    return model.predict(selector.transform(tfidf_features))

In [None]:
def test(url, model, verbose=False):
    """Predict the label of a single URL using the hand-crafted features.

    Builds a one-row DataFrame with the columns ``entropy``, ``# of digits``,
    ``length``, ``# of params``, ``# of fragments`` and ``has_HTTP`` (in that
    order) and returns ``model.predict`` on it.

    NOTE(review): an earlier cell in this notebook defines another function
    that is also named ``test`` (TF-IDF based); whichever cell runs last wins.

    Parameters
    ----------
    url : str
        The URL to classify. Surrounding whitespace is stripped first, the
        same way the training URLs are stripped when the dataset is loaded.
    model : object
        Any object with a ``predict`` method accepting the one-row frame.
    verbose : bool, default False
        When True, print the extracted feature dict before predicting. Off
        by default so that applying this function to a whole column of URLs
        does not emit one line per row.

    Returns
    -------
    Whatever ``model.predict`` returns for the one-row frame.
    """
    url = url.strip()
    data = {
        "entropy": [entropy(url)],
        "# of digits": [count_digits(url)],
        "length": [length(url)],
        "# of params": [num_params(url)],
        "# of fragments": [num_frags(url)],
        "has_HTTP": [has_HTTP(url)],
    }
    if verbose:
        print(data)
    df = pd.DataFrame(data)
    return model.predict(df)

In [None]:
# Smoke-test the trained classifier on one hand-picked URL and print the
# prediction returned by test().
test_url = "https://github.com/NUS-FinTechLab/nce-frontend"
single_prediction = test(test_url, dt)
print(single_prediction)

In [None]:
# Run the classifier over every URL in the "url" column of verified_online.csv
# and print the sum of the per-URL predictions (i.e. how many were flagged 1).
test_urls = pd.read_csv("verified_online.csv")["url"]
results = test_urls.apply(test, args=(dt,))
print(results.sum())

In [None]:
# Persist the current `urls` frame (without the index) for reuse outside
# this notebook.
cleaned_csv_path = "malicious_phish_cleaned.csv"
urls.to_csv(cleaned_csv_path, index=False)

In [5]:
import pickle

# Serialize the trained decision tree to disk.
# NOTE(review): only `dt` is saved; a consumer must reproduce the same feature
# extraction that was used for training before calling predict.
filename = "url_classifier.sav"
# The context manager guarantees the file is flushed and closed even if
# pickling raises, instead of leaving the handle open.
with open(filename, "wb") as model_file:
    pickle.dump(dt, model_file)
print("Decision Tree Model Saved")

Decision Tree Model Saved
