In [1]:
import pickle
import re
from urllib.parse import urlparse

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tld import get_tld


## 1. Load dataset

In [2]:
# Load the URL dataset from a CSV file (path is relative to the notebook's working directory).
df = pd.read_csv('../dataset/clean_malicious_phish.csv')

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,url,type
0,0,br-icloud.com.br,phishing
1,1,mp3raid.com/music/krizz_kaliko.html,safe
2,2,bopsecrets.org/rexroth/cr/1.htm,safe
3,3,http://www.garage-pirenne.be/index.php?option=...,phishing
4,4,http://adventure-nicaragua.net/index.php?optio...,phishing


In [4]:
# The CSV carries a leftover 'Unnamed: 0' column; remove it so only `url` and `type` remain.
df = df.drop(columns=['Unnamed: 0'])

# Preview the frame without the leftover column.
df.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,safe
2,bopsecrets.org/rexroth/cr/1.htm,safe
3,http://www.garage-pirenne.be/index.php?option=...,phishing
4,http://adventure-nicaragua.net/index.php?optio...,phishing


In [5]:
# Keep the first 1000 rows as a small sample of the dataset.
df_1000 = df.head(1000)

In [6]:
# Write the 1000-row sample to disk.
# NOTE(review): unlike the other exports in this notebook, `index=False` is not passed here,
# so the row index is written as an extra unnamed column — confirm this is intended.
df_1000.to_csv('../dataset/df_1000.csv')

In [7]:
# Summarise column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 672857 entries, 0 to 672856
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     672857 non-null  object
 1   type    672857 non-null  object
dtypes: object(2)
memory usage: 10.3+ MB


## 2. Feature engineering for the models

#### 1. Convert the string labels to numeric labels

In [8]:
# Map the string labels to integers: "safe" -> 0, "phishing" -> 1.
df["type"] = df["type"].replace({"safe": 0, "phishing": 1})
# Preview the frame to confirm the numeric labels.
df.head()

Unnamed: 0,url,type
0,br-icloud.com.br,1
1,mp3raid.com/music/krizz_kaliko.html,0
2,bopsecrets.org/rexroth/cr/1.htm,0
3,http://www.garage-pirenne.be/index.php?option=...,1
4,http://adventure-nicaragua.net/index.php?optio...,1


In [9]:
# Export the url / numeric-label frame (without the row index) to bert.csv.
df.to_csv('../dataset/bert.csv', index=False)

#### 2. Extract lexical features from the URLs

In [10]:
def having_ip_address(url):
    """Return 1 if ``url`` contains an IP-address-like token, otherwise 0.

    Three forms are recognised, each as an independent regex alternative:

    * dotted-decimal IPv4 followed by ``/`` (e.g. ``192.168.0.1/``),
    * dotted-hexadecimal IPv4 followed by ``/`` (e.g. ``0x58.0xCC.0xCA.0x62/``),
    * a full IPv6 address written as eight colon-separated hex groups.

    The alternatives must be joined with ``|``; without the separator between
    the hexadecimal and IPv6 patterns the two are concatenated and neither form
    can match on its own.
    """
    match = re.search(
        r'(([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.'
        r'([01]?\d\d?|2[0-4]\d|25[0-5])\/)|'  # IPv4
        r'((0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\/)|'  # IPv4 in hexadecimal
        r'(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}',  # IPv6
        url)
    return 1 if match else 0


def abnormal_url(url):
    """Return 1 if the hostname parsed from ``url`` occurs literally in ``url``, else 0.

    ``urlparse`` only yields a hostname when the URL has a ``//`` authority
    part, so scheme-less inputs such as ``'br-icloud.com.br'`` give 0.

    The hostname is escaped before being searched for, so regex
    metacharacters in it (``.``, ``(``, ``+`` ...) are matched literally
    instead of being interpreted as a pattern or raising ``re.error``. A
    missing hostname returns 0 directly rather than being converted to the
    string ``'None'`` and searched for.
    """
    hostname = urlparse(url).hostname
    if not hostname:
        return 0
    return 1 if re.search(re.escape(hostname), url) else 0

In [11]:
# Derive the IP-address flag and the hostname flag for every URL.
df['use_of_ip'] = df['url'].apply(having_ip_address)
df['abnormal_url'] = df['url'].apply(abnormal_url)

In [12]:
# def google_index(url):
#     site = search(url, 5)
#     return 1 if site else 0
# df['google_index'] = df['url'].apply(lambda i: google_index(i))
# def count_dot(url):
#     count_dot = url.count('.')
#     return count_dot

# df['count.'] = df['url'].apply(lambda i: count_dot(i))
# df.head()

In [13]:

def count_www(url):
    """Return the number of non-overlapping occurrences of ``'www'`` in ``url``."""
    return url.count('www')

def count_point(url):
    """Return how many ``'.'`` characters ``url`` contains."""
    dot_total = url.count('.')
    return dot_total

def count_atrate(url):
    """Return how many ``'@'`` characters ``url`` contains."""
    at_total = url.count('@')
    return at_total


def count_https(url):
    """Return the number of occurrences of the substring ``'https'`` in ``url``."""
    https_total = url.count('https')
    return https_total


def count_http(url):
    """Return the number of occurrences of the substring ``'http'`` in ``url``.

    Every ``'https'`` also contains ``'http'``, so those occurrences are
    included in this count.
    """
    http_total = url.count('http')
    return http_total


def count_per(url):
    """Return how many ``'%'`` characters ``url`` contains."""
    percent_total = url.count('%')
    return percent_total


def count_ques(url):
    """Return how many ``'?'`` characters ``url`` contains."""
    question_total = url.count('?')
    return question_total


def count_hyphen(url):
    """Return how many ``'-'`` characters ``url`` contains."""
    hyphen_total = url.count('-')
    return hyphen_total


def count_equal(url):
    """Return how many ``'='`` characters ``url`` contains."""
    equal_total = url.count('=')
    return equal_total


def no_of_dir(url):
    """Return the number of ``'/'`` characters in the path component of ``url``."""
    parsed = urlparse(url)
    return parsed.path.count('/')


def no_of_embed_domain(url):
    """Return the number of ``'//'`` sequences in the path component of ``url``.

    Only the parsed path is inspected, so the ``//`` that follows the scheme
    is not counted.
    """
    parsed = urlparse(url)
    return parsed.path.count('//')


def shortening_service(url):
    """Return 1 if ``url`` contains the name of a known URL-shortening service, else 0.

    The pattern is written with raw string literals so that ``\\.`` reaches the
    regex engine as an escaped dot without relying on Python tolerating an
    invalid string escape sequence.

    NOTE(review): the alternatives are unanchored substrings, so short entries
    also match inside longer hosts (e.g. ``t\\.co`` matches ``reddit.com``).
    The matching behaviour is deliberately left unchanged here.
    """
    match = re.search(r'bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                      r'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                      r'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                      r'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                      r'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                      r'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                      r'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
                      r'tr\.im|link\.zip\.net', url)
    return 1 if match else 0

In [14]:
# Character / substring counts computed over the full URL string.
df['count-www'] = df['url'].apply(count_www)
df['count-point'] = df['url'].apply(count_point)
df['count@'] = df['url'].apply(count_atrate)
df['count-https'] = df['url'].apply(count_https)
df['count-http'] = df['url'].apply(count_http)
df['count%'] = df['url'].apply(count_per)
df['count?'] = df['url'].apply(count_ques)
df['count-'] = df['url'].apply(count_hyphen)
df['count='] = df['url'].apply(count_equal)

# Path-based counts and the URL-shortener flag.
df['count_dir'] = df['url'].apply(no_of_dir)
df['count_embed_domain'] = df['url'].apply(no_of_embed_domain)
df['short_url'] = df['url'].apply(shortening_service)

In [15]:
def url_length(url):
    """Return the length of ``url`` after converting it with ``str``."""
    text = str(url)
    return len(text)


def hostname_length(url):
    """Return the length of the ``netloc`` component parsed from ``url``.

    The whole network-location string is measured (including any port), and
    it is empty for inputs without a ``//`` authority part.
    """
    network_location = urlparse(url).netloc
    return len(network_location)


def suspicious_words(url):
    """Return 1 if ``url`` contains one of the suspicious keywords, otherwise 0.

    Matching is case-insensitive: the keyword list spells ``PayPal`` in mixed
    case, and a case-sensitive search would miss the lowercase ``paypal`` form.
    Keywords are matched as plain substrings anywhere in the URL.
    """
    match = re.search(
        'PayPal|login|signin|bank|account|update|free|lucky|service|bonus|ebayisapi|webscr|gift|win|giving',
        url,
        flags=re.IGNORECASE,
    )
    return 1 if match else 0


def digit_count(url):
    """Return the number of characters in ``url`` for which ``str.isnumeric`` is true."""
    return sum(1 for ch in url if ch.isnumeric())


def letter_count(url):
    """Return the number of characters in ``url`` for which ``str.isalpha`` is true."""
    return sum(1 for ch in url if ch.isalpha())

In [16]:
def firstdirectory_length(url):
    """Return the length of the first directory segment in the path of ``url``.

    The path is split on ``'/'``; element 0 is whatever precedes the first
    slash, so the first directory is element 1. When the path contains no
    slash there is no such segment and 0 is returned.
    """
    segments = urlparse(url).path.split('/')
    # Explicit length check instead of a bare ``except`` so that unrelated
    # errors are not silently turned into a 0.
    if len(segments) < 2:
        return 0
    return len(segments[1])


def tld_length(tld):
    """Return ``len(tld)``, or -1 when ``tld`` is ``None`` or has no length.

    -1 is the sentinel used for URLs whose top-level domain could not be
    determined.
    """
    if tld is None:
        return -1
    try:
        return len(tld)
    except TypeError:
        # Objects without a length are treated like a missing TLD.
        return -1

In [17]:
# Length- and keyword-based features computed from the URL string.
df['url_length'] = df['url'].apply(url_length)
df['hostname_length'] = df['url'].apply(hostname_length)
df['sus_url'] = df['url'].apply(suspicious_words)
df['count-digits'] = df['url'].apply(digit_count)
df['count-letters'] = df['url'].apply(letter_count)
df['fd_length'] = df['url'].apply(firstdirectory_length)

# Top-level domain (None when it cannot be extracted) and its length.
df['tld'] = df['url'].apply(lambda address: get_tld(address, fail_silently=True))
df['tld_length'] = df['tld'].apply(tld_length)

In [18]:
# Preview the frame with the engineered feature columns.
df.head()

Unnamed: 0,url,type,use_of_ip,abnormal_url,count-www,count-point,count@,count-https,count-http,count%,...,count_embed_domain,short_url,url_length,hostname_length,sus_url,count-digits,count-letters,fd_length,tld,tld_length
0,br-icloud.com.br,1,0,0,0,2,0,0,0,0,...,0,0,16,0,0,0,13,0,,-1
1,mp3raid.com/music/krizz_kaliko.html,0,0,0,0,2,0,0,0,0,...,0,0,35,0,0,1,29,5,,-1
2,bopsecrets.org/rexroth/cr/1.htm,0,0,0,0,2,0,0,0,0,...,0,0,31,0,0,1,25,7,,-1
3,http://www.garage-pirenne.be/index.php?option=...,1,0,1,1,3,0,0,1,0,...,0,0,88,21,0,7,63,9,be,2
4,http://adventure-nicaragua.net/index.php?optio...,1,0,1,0,2,0,0,1,0,...,0,0,235,23,0,22,199,9,net,3


In [19]:
# The raw TLD string is no longer needed once `tld_length` has been derived.
df = df.drop(columns=["tld"])

# List the remaining columns.
df.columns

Index(['url', 'type', 'use_of_ip', 'abnormal_url', 'count-www', 'count-point',
       'count@', 'count-https', 'count-http', 'count%', 'count?', 'count-',
       'count=', 'count_dir', 'count_embed_domain', 'short_url', 'url_length',
       'hostname_length', 'sus_url', 'count-digits', 'count-letters',
       'fd_length', 'tld_length'],
      dtype='object')

In [20]:
# Class distribution of the target (0 = safe, 1 = phishing).
df['type'].value_counts()

type
0    428080
1    244777
Name: count, dtype: int64

### Encoding

In [21]:
# Fit a LabelEncoder on the `type` column and store the encoded labels in `type_code`.
# NOTE(review): `type` was already mapped to 0/1 earlier, so this encoding looks redundant — confirm.
lb_make = LabelEncoder()
df["type_code"] = lb_make.fit_transform(df["type"])
# Class distribution of the encoded target.
df["type_code"].value_counts()

type_code
0    428080
1    244777
Name: count, dtype: int64

In [22]:
# Give every feature column a plain snake_case name (no '-', '@', '%', '?' or '=').
df = df.rename(
    columns={
        'count-www': 'count_www',
        'count-point': 'count_point',
        'count@': 'count_at',
        'count-https': 'count_https',
        'count-http': 'count_http',
        'count%': 'count_percent',
        'count?': 'count_question',
        'count-': 'count_dash',
        'count=': 'count_equal',
        'count-digits': 'count_digits',
        'count-letters': 'count_letters',
    }
)

# Check the columns after renaming
print(df.columns)

Index(['url', 'type', 'use_of_ip', 'abnormal_url', 'count_www', 'count_point',
       'count_at', 'count_https', 'count_http', 'count_percent',
       'count_question', 'count_dash', 'count_equal', 'count_dir',
       'count_embed_domain', 'short_url', 'url_length', 'hostname_length',
       'sus_url', 'count_digits', 'count_letters', 'fd_length', 'tld_length',
       'type_code'],
      dtype='object')


In [23]:
# Feature matrix: the column order defined here is the order the models are trained on.
X = df.loc[:, [
    'use_of_ip',
    'abnormal_url',
    'count_point',
    'count_www',
    'count_at',
    'count_dir',
    'count_embed_domain',
    'short_url',
    'count_https',
    'count_http',
    'count_percent',
    'count_question',
    'count_dash',
    'count_equal',
    'url_length',
    'hostname_length',
    'sus_url',
    'fd_length',
    'tld_length',
    'count_digits',
    'count_letters',
]]

# Target vector: the encoded label column.
y = df['type_code']

In [24]:
# Export the full frame (url, labels and engineered features) without the row index.
df.to_csv('../dataset/db_model.csv', index=False)

## 🤖 Model training 🤖

In [25]:
# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=5,
)

#### 1. RandomForestClassifier

In [26]:
# Random forest baseline. `metrics` and `RandomForestClassifier` are imported in the
# notebook's top import cell. The forest is seeded (same seed as the train/test split)
# so that the reported metrics are reproducible from run to run.
rf = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=5)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print(classification_report(y_test, y_pred_rf, target_names=['safe', 'phishing']))

# Overall accuracy on the hold-out set.
score = metrics.accuracy_score(y_test, y_pred_rf)
print("accuracy:   %0.3f" % score)

              precision    recall  f1-score   support

        safe       0.97      0.98      0.98     85616
    phishing       0.97      0.95      0.96     48956

    accuracy                           0.97    134572
   macro avg       0.97      0.97      0.97    134572
weighted avg       0.97      0.97      0.97    134572

accuracy:   0.973


#### 2. XGBoost

In [27]:
# Gradient-boosted trees with 100 estimators, evaluated on the same hold-out set.
xgb_c = xgb.XGBClassifier(n_estimators=100)
xgb_c.fit(X_train, y_train)
y_pred_x = xgb_c.predict(X_test)

print(
    classification_report(
        y_test,
        y_pred_x,
        target_names=['safe', 'phishing'],
    )
)

# Overall accuracy on the hold-out set.
score = metrics.accuracy_score(y_test, y_pred_x)
print("accuracy:   %0.3f" % score)

              precision    recall  f1-score   support

        safe       0.97      0.99      0.98     85616
    phishing       0.98      0.95      0.96     48956

    accuracy                           0.97    134572
   macro avg       0.97      0.97      0.97    134572
weighted avg       0.97      0.97      0.97    134572

accuracy:   0.972


### Apply the preprocessing functions to new URLs

In [28]:
def main(url):
    """Build the feature vector for a single URL.

    The returned list must follow exactly the column order of the training
    matrix ``X``, because the model receives it as a plain positional array:

        use_of_ip, abnormal_url, count_point, count_www, count_at, count_dir,
        count_embed_domain, short_url, count_https, count_http, count_percent,
        count_question, count_dash, count_equal, url_length, hostname_length,
        sus_url, fd_length, tld_length, count_digits, count_letters

    In particular ``fd_length`` and ``tld_length`` come *before* the digit
    and letter counts; emitting them in any other order feeds the model
    misaligned features.

    Args:
        url: The URL string to featurise.

    Returns:
        A list of 21 numeric feature values in training-column order.
    """
    # None when the top-level domain cannot be extracted; tld_length maps that to -1.
    tld = get_tld(url, fail_silently=True)

    return [
        having_ip_address(url),        # use_of_ip
        abnormal_url(url),             # abnormal_url
        count_point(url),              # count_point
        count_www(url),                # count_www
        count_atrate(url),             # count_at
        no_of_dir(url),                # count_dir
        no_of_embed_domain(url),       # count_embed_domain
        shortening_service(url),       # short_url
        count_https(url),              # count_https
        count_http(url),               # count_http
        count_per(url),                # count_percent
        count_ques(url),               # count_question
        count_hyphen(url),             # count_dash
        count_equal(url),              # count_equal
        url_length(url),               # url_length
        hostname_length(url),          # hostname_length
        suspicious_words(url),         # sus_url
        firstdirectory_length(url),    # fd_length
        tld_length(tld),               # tld_length
        digit_count(url),              # count_digits
        letter_count(url),             # count_letters
    ]

In [29]:
def get_prediction_from_url(test_url):
    """Classify ``test_url`` with the module-level ``xgb_c`` model.

    The URL is featurised with ``main``, reshaped into a single-row array and
    passed to ``xgb_c.predict``.

    Returns:
        ``"SAFE"`` for predicted class 0, ``"PHISHING"`` for predicted class 1,
        and ``None`` for any other predicted value.
    """
    feature_row = np.array(main(test_url)).reshape((1, -1))
    label = int(xgb_c.predict(feature_row)[0])
    if label == 0:
        return "SAFE"
    if label == 1:
        return "PHISHING"
    return None

In [30]:
# Smoke-test the prediction helper on a few hand-picked URLs.
urls = [
    'titaniumcorporate.co.za',
    'en.wikipedia.org/wiki/North_Dakota',
    'zedelivery.fun/',
]
for url in urls:
    print(get_prediction_from_url(url))

PHISHING
SAFE
SAFE


In [31]:
# Serialise the fitted XGBoost classifier with pickle so it can be loaded elsewhere.
# NOTE(review): pickle files execute code on load — only unpickle this file from a trusted location.
with open('../src/api/model/model.pkl', 'wb') as f:
    pickle.dump(xgb_c, f)

## 🚧 Other models 🚧