In [52]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from urllib.parse import urlparse
import re
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from tld import get_tld

# Load Data

In [53]:
# Read the training CSV into a DataFrame and display it. The recorded output
# shows domain-age columns plus the raw URL and a 0/1 Label column.
dataset_csv = 'Training Data/Phishing_Mitre_Dataset_Summer_of_AI.csv'
df = pd.read_csv(dataset_csv)
df

Unnamed: 0,create_age(months),expiry_age(months),update_age(days),URL,Label
0,-1,-1,-1,http://account-google-com.ngate.my/c44cca40176...,1
1,212,16,663,http://www.coffeespecialties.com/...,0
2,-1,-1,-1,http://black.pk/wp-content/2013/04/bp.postale/...,1
3,198,6,186,http://atomicsoda.com/manutd...,0
4,240,24,1684,http://bostoncoffeecake.com/...,0
...,...,...,...,...,...
4794,52,8,118,http://aridfoods.com/V4/MGen/F97a8a294cf7c5e90...,1
4795,-1,-1,-1,http://www.mazda.co.jp/...,0
4796,-1,-1,-1,http://www.fotografaemsaopaulo.com.br/wp-admin...,1
4797,-1,-1,-1,http://agenda.wehrensarl.ch/libraries/joomla/h...,1


# Feature Creation

## URL Length

In [54]:
# Feature: number of characters in each URL string.
df['url_length'] = df['URL'].map(len)

## URL Entropy

In [55]:
def entropy(url_string):
    """Return the Shannon entropy, in bits per character, of a string.

    Leading and trailing whitespace is stripped before the character
    frequencies are computed. A string that is empty after stripping
    yields 0.
    """
    text = url_string.strip()
    total_chars = len(text)
    weighted_log_sum = 0
    # dict.fromkeys de-duplicates the characters while keeping first-seen order.
    for char in dict.fromkeys(text):
        p = float(text.count(char)) / total_chars
        # Base-2 logarithm via change of base; every term is <= 0.
        weighted_log_sum += (p * math.log(p) / math.log(2.0))
    # The accumulated sum is non-positive, so its magnitude is the entropy.
    return abs(weighted_log_sum)

# Feature: per-character Shannon entropy of each URL, computed by entropy().
df['url_entropy'] = df['URL'].map(entropy)

## Third Level Domain

In [56]:
def has_www(url_string):
    """Return True when the URL's hostname starts with the ``www.`` label.

    Only the host part is inspected: a ``www.`` that appears in the path or
    query string, or in the middle of another host label (``awww.example.com``),
    does not count. A plain substring search would report those as True even
    though the feature is meant to describe the domain.

    Parameters
    ----------
    url_string : str
        URL to inspect; the scheme may be missing.

    Returns
    -------
    bool
        True if the hostname begins with ``www.``, otherwise False. Also False
        for empty or unparseable input.
    """
    candidate = url_string.strip()
    # urlparse only populates the host when a scheme or a leading '//' is present.
    if '://' not in candidate and not candidate.startswith('//'):
        candidate = '//' + candidate
    try:
        host = urlparse(candidate).hostname
    except ValueError:
        # Raised for malformed netlocs (e.g. unbalanced IPv6 brackets).
        return False
    return bool(host) and host.startswith('www.')
    
def find_between(url_string):
    """Return the value ``get_tld`` reports for the URL, or '' on failure.

    ``fix_protocol=True`` is passed so that ``get_tld`` can handle URLs given
    without a scheme. Any ordinary exception raised by ``get_tld`` is turned
    into an empty string, so the function can be applied to a whole column
    without aborting on a bad row.

    NOTE(review): the name does not describe what the function does, and it is
    not called anywhere in the visible part of the notebook.
    """
    try:
        return get_tld(url_string, fix_protocol=True)
    except Exception:
        # Deliberately best-effort, but narrower than a bare ``except:`` so
        # KeyboardInterrupt / SystemExit still propagate.
        return ''

# Feature: boolean flag from has_www() for each URL.
df['3rd_level_domain'] = df['URL'].map(has_www)

## Non Alpha Characters

In [57]:
# Feature: how many characters in each URL are not ASCII letters
# (digits, punctuation, separators, ...).
non_alpha_matches = df['URL'].str.findall(r'[^a-zA-Z]')
df['count_non_alpha'] = non_alpha_matches.str.len()

# One Hot Encode and Drop Unnecessary Columns

In [58]:
# One-hot encode the boolean www flag into 3rd_level_domain_False / _True.
# dtype=int keeps the dummy columns as 0/1 integers regardless of pandas
# version (pandas >= 2.0 defaults to bool). The guard makes the cell safe to
# re-run: after the first run the source column no longer exists and
# get_dummies would raise KeyError.
if '3rd_level_domain' in df.columns:
    df = pd.get_dummies(df, columns=['3rd_level_domain'], dtype=int)

In [51]:
# Separate the feature matrix from the target: the label and the raw URL text
# are removed from the features, and the label column becomes the target.
X_df = df.drop(columns=['Label', 'URL'])
y_df = df['Label']
X_df

Unnamed: 0,create_age(months),expiry_age(months),update_age(days),url_length,url_entropy,3rd_level_domain_False,3rd_level_domain_True,count_non_alpha
0,-1,-1,-1,70,4.612783,1,0,32
1,212,16,663,36,3.819114,0,1,9
2,-1,-1,-1,73,4.370385,1,0,35
3,198,6,186,31,3.695501,1,0,8
4,240,24,1684,31,3.760017,1,0,8
...,...,...,...,...,...,...,...,...
4794,52,8,118,73,4.742582,1,0,31
4795,-1,-1,-1,26,3.507380,0,1,10
4796,-1,-1,-1,73,4.579566,0,1,21
4797,-1,-1,-1,73,4.441882,1,0,18


# Split Data

In [39]:
# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle
# reproducible, so re-running the notebook yields the same partition.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X_df, y_df, train_size=0.8, test_size=0.2, random_state=RANDOM_STATE
)

In [42]:
# Inspect the training features.
# NOTE(review): the recorded output still shows a URL column, which the X_df
# cell drops — it appears to come from an earlier kernel state; re-run to confirm.
X_train

Unnamed: 0,create_age(months),expiry_age(months),update_age(days),URL,url_length,url_entropy,3rd_level_domain_False,3rd_level_domain_True,count_non_alpha
807,226,14,8,http://secure.square.loginidqx.usa.cc/...,41,4.054481,1,0,11
3265,-1,-1,-1,http://www.nichuizen.co.za/errors/sess/6fcfacd...,73,4.583545,0,1,29
2426,19,5,576,http://service-paypal-information.sweddy.com/S...,73,4.876478,1,0,16
3800,24,12,480,http://citibuildersgroup.com/account-updating-...,73,4.517042,1,0,18
2182,165,63,303,http://broadwayworld.com/...,28,3.824863,1,0,8
...,...,...,...,...,...,...,...,...,...
4608,99,9,104,http://practical-management.com/...,35,3.864578,1,0,9
4761,-1,-1,-1,http://www.eloahair.com.br/Zone1/...,36,3.974938,0,1,12
4242,30,6,169,http://kingjchild.com/mu.php...,31,3.861477,1,0,9
1716,196,8,133,http://www.orpheuscomputing.com/PCworks-comput...,72,4.303567,0,1,14


In [43]:
# Inspect the held-out test features.
# NOTE(review): the recorded output still shows a URL column, which the X_df
# cell drops — it appears to come from an earlier kernel state; re-run to confirm.
X_test

Unnamed: 0,create_age(months),expiry_age(months),update_age(days),URL,url_length,url_entropy,3rd_level_domain_False,3rd_level_domain_True,count_non_alpha
2584,262,2,326,http://www.iop.org/EJ/journal/0031-9155...,42,4.327340,0,1,20
1467,194,46,81,http://maths4cet.150m.com/...,29,3.715932,1,0,13
824,226,14,8,http://hiroba.dqx.jp.zicrl.usa.cc/account/app/...,63,4.222226,1,0,16
3182,-1,-1,-1,http://kor.nkhumanrights.or.kr/campaign/region...,73,4.574572,1,0,20
4452,159,21,114,http://www.doneex.com/...,25,3.559080,0,1,9
...,...,...,...,...,...,...,...,...,...
646,197,19,217,http://www.balmzara.com/...,27,3.648432,0,1,9
4268,184,8,114,http://www.trainingpia.com/flash/uk...,38,4.073475,0,1,10
3904,109,11,33,http://www.heathervictoriaheld.com/wp-includes...,64,4.309172,0,1,12
379,3,9,3,http://fasonery.net/machine...,30,3.872906,1,0,8


In [44]:
# Inspect the training labels produced by train_test_split.
y_train

807     1
3265    1
2426    1
3800    1
2182    0
       ..
4608    0
4761    1
4242    1
1716    0
3776    0
Name: Label, Length: 3839, dtype: int64

In [45]:
# Inspect the held-out test labels produced by train_test_split.
y_test

2584    0
1467    0
824     1
3182    1
4452    0
       ..
646     0
4268    1
3904    1
379     1
3078    0
Name: Label, Length: 960, dtype: int64