<a href="https://colab.research.google.com/github/maleepicface/brianhelenfakenews/blob/main/Final_URL_Classifier_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Colab setup: install the third-party packages this notebook needs.
# `whois` and `pyquery` are imported by the feature-extraction cell below;
# `tqdm` is referenced only by the commented-out extraction loop.
# NOTE(review): versions are unpinned, so a re-run may pull newer releases
# than the ones shown in the captured output.
!pip install whois
!pip install pyquery
!pip install tqdm

Collecting whois
  Downloading whois-0.9.13.tar.gz (9.2 kB)
Building wheels for collected packages: whois
  Building wheel for whois (setup.py) ... [?25l[?25hdone
  Created wheel for whois: filename=whois-0.9.13-py3-none-any.whl size=9609 sha256=d752d155fd95259bfd19ee4925927ed708e1a3f74c431ad88e292f052252b334
  Stored in directory: /root/.cache/pip/wheels/f5/85/a6/2ac688355ea1a85b0a2b9c0243012f3021333b6c65b7305d86
Successfully built whois
Installing collected packages: whois
Successfully installed whois-0.9.13
Collecting pyquery
  Downloading pyquery-1.4.3-py3-none-any.whl (22 kB)
Collecting cssselect>0.7.9
  Downloading cssselect-1.1.0-py2.py3-none-any.whl (16 kB)
Installing collected packages: cssselect, pyquery
Successfully installed cssselect-1.1.0 pyquery-1.4.3


In [14]:
# `interruptingcow.timeout` is a context manager used by the (commented-out)
# extraction loop to abandon a URL after a fixed number of seconds.
!pip install interruptingcow
from interruptingcow import timeout

Collecting interruptingcow
  Downloading interruptingcow-0.8.tar.gz (5.0 kB)
Building wheels for collected packages: interruptingcow
  Building wheel for interruptingcow (setup.py) ... [?25l[?25hdone
  Created wheel for interruptingcow: filename=interruptingcow-0.8-py3-none-any.whl size=5822 sha256=e6a349f1b7a5f2cac27a5202f08e4cdc0254081f69a7c8f0ab55ae1258c7a356
  Stored in directory: /root/.cache/pip/wheels/fe/83/45/99297454c39c203890ab65c08db879940862778921cc61a416
Successfully built interruptingcow
Installing collected packages: interruptingcow
Successfully installed interruptingcow-0.8


In [3]:
import whois
from datetime import datetime, timezone
import math
import pandas as pd
import numpy as np
from pyquery import PyQuery
from requests import get

class UrlFeaturizer(object):
    """Extract lexical, WHOIS and page-content features from a single URL.

    Constructing an instance performs network I/O: a WHOIS lookup for the
    URL's host and an HTTP GET of the URL itself.  Either lookup may fail;
    the corresponding attribute is then left as ``None`` and every feature
    that depends on it evaluates to 0.

    Args:
        url: The URL string to featurize.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            single unresponsive host cannot stall a whole extraction run.

    Attributes set by ``__init__``:
        url, domain, today, whois (dict or None), response (or None),
        pq (parsed page or None).
    """

    def __init__(self, url, timeout=10):
        self.url = url
        # Host part: text after the scheme's '//' up to the first '/'.
        self.domain = url.split('//')[-1].split('/')[0]
        self.today = datetime.now().replace(tzinfo=None)

        # `except Exception` (not a bare except) so Ctrl-C / SystemExit still
        # propagate while lookup failures remain best-effort.
        try:
            self.whois = whois.query(self.domain).__dict__
        except Exception:
            self.whois = None

        try:
            self.response = get(self.url, timeout=timeout)
            self.pq = PyQuery(self.response.text)
        except Exception:
            self.response = None
            self.pq = None

    ## URL string Features
    def entropy(self):
        """Return the character-level entropy term of the stripped URL.

        NOTE: the sum is *not* negated, so the value is the negative of the
        Shannon entropy in bits (<= 0).  The sign is kept as-is because the
        previously exported feature tables contain these negative values.
        An empty URL yields 0.
        """
        text = self.url.strip()
        # dict.fromkeys keeps first-seen order, so the float summation order
        # is deterministic.
        probabilities = [float(text.count(ch)) / len(text) for ch in dict.fromkeys(text)]
        return sum([(p * math.log(p) / math.log(2.0)) for p in probabilities])

    def ip(self):
        """Return 1 if the URL's host is a dotted-quad IPv4 address, else 0.

        The check runs on the host only (user-info and port stripped), so
        ``http://1.2.3.4:8080/path`` is recognised.
        """
        host = self.domain.split('@')[-1].split(':')[0].strip()
        octets = host.split('.')
        is_ipv4 = len(octets) == 4 and all(
            # isascii() guards against unicode digits that int() rejects.
            part.isascii() and part.isdigit() and int(part) <= 255
            for part in octets
        )
        return 1 if is_ipv4 else 0

    def numDigits(self):
        """Return how many characters of the URL are digits."""
        return sum(1 for ch in self.url if ch.isdigit())

    def urlLength(self):
        """Return the length of the URL string."""
        return len(self.url)

    def numParameters(self):
        """Return the number of '&' separators in the URL."""
        return self.url.count('&')

    def numFragments(self):
        """Return the number of '#' characters in the URL."""
        return self.url.count('#')

    def numSubDomains(self):
        """Return the number of '/'-separated pieces after the scheme, minus one.

        Despite the name this counts path separators, not DNS sub-domains.
        It is not part of the dictionary produced by ``run``.
        """
        pieces = self.url.split('http')[-1].split('//')[-1].split('/')
        return len(pieces) - 1

    def domainExtension(self):
        """Return the text after the URL's last '.', cut at the next '/'."""
        return self.url.split('.')[-1].split('/')[0]

    ## URL domain features
    def hasHttp(self):
        """Return True if the substring 'http:' occurs in the URL."""
        return 'http:' in self.url

    def hasHttps(self):
        """Return True if the substring 'https:' occurs in the URL."""
        return 'https:' in self.url

    def _whois_datetime(self, key):
        """Return the naive datetime stored under ``key`` in the WHOIS record, or None."""
        if not self.whois:
            return None
        value = self.whois.get(key)
        if not isinstance(value, datetime):
            return None
        return value.replace(tzinfo=None)

    def daysSinceRegistration(self):
        """Return whole days between the WHOIS creation date and today.

        Returns an ``int`` (0 when no creation date is available).  Using
        ``timedelta.days`` avoids the malformed values that string-parsing
        ``str(timedelta)`` produces for sub-day or one-day differences.
        """
        created = self._whois_datetime('creation_date')
        if created is None:
            return 0
        return (self.today - created).days

    def daysSinceExpiration(self):
        """Return whole days from today until the WHOIS expiration date.

        Positive while the registration is still valid; 0 when no expiration
        date is available.
        """
        expires = self._whois_datetime('expiration_date')
        if expires is None:
            return 0
        return (expires - self.today).days

    ## URL Page Features
    def bodyLength(self):
        """Return the length of the page's text, or 0 if the page is not live."""
        if self.pq is None or not self.urlIsLive():
            return 0
        return len(self.pq('html').text())

    def numTitles(self):
        """Return the number of heading elements (h1 through h6) on the page."""
        if self.pq is None:
            return 0
        heading_tags = ['h{}'.format(level) for level in range(1, 7)]
        return sum(1 for tag in heading_tags for _ in self.pq(tag).items())

    def numImages(self):
        """Return the number of <img> elements on the page."""
        if self.pq is None:
            return 0
        return sum(1 for _ in self.pq('img').items())

    def numLinks(self):
        """Return the number of <a> elements on the page."""
        if self.pq is None:
            return 0
        return sum(1 for _ in self.pq('a').items())

    def scriptLength(self):
        """Return the combined text length of the page's <script> elements."""
        if self.pq is None:
            return 0
        return len(self.pq('script').text())

    def specialCharacters(self):
        """Return how many characters of the page text are neither digits nor letters.

        Whitespace is counted as well, since it is neither.
        """
        if self.pq is None:
            return 0
        page_text = self.pq('html').text()
        return sum(1 for ch in page_text if not ch.isdigit() and not ch.isalpha())

    def scriptToSpecialCharsRatio(self):
        """Return scriptLength / specialCharacters, or 0 when undefined."""
        special = self.specialCharacters()
        if self.pq is None or special == 0:
            return 0
        return self.scriptLength() / special

    def scriptTobodyRatio(self):
        """Return scriptLength / bodyLength, or 0 when undefined."""
        body = self.bodyLength()
        if self.pq is None or body == 0:
            return 0
        return self.scriptLength() / body

    def bodyToSpecialCharRatio(self):
        """Return specialCharacters / bodyLength, or 0 when undefined."""
        body = self.bodyLength()
        if self.pq is None or body == 0:
            return 0
        return self.specialCharacters() / body

    def urlIsLive(self):
        """Return True if the HTTP request succeeded with status code 200."""
        return self.response is not None and self.response.status_code == 200

    def run(self):
        """Compute every feature and return them as a dict keyed by feature name."""
        data = {}
        data['entropy'] = self.entropy()
        data['numDigits'] = self.numDigits()
        data['urlLength'] = self.urlLength()
        data['numParams'] = self.numParameters()
        data['hasHttp'] = self.hasHttp()
        data['hasHttps'] = self.hasHttps()
        data['urlIsLive'] = self.urlIsLive()
        data['bodyLength'] = self.bodyLength()
        data['numTitles'] = self.numTitles()
        data['numImages'] = self.numImages()
        data['numLinks'] = self.numLinks()
        data['scriptLength'] = self.scriptLength()
        data['specialChars'] = self.specialCharacters()
        data['ext'] = self.domainExtension()
        data['dsr'] = self.daysSinceRegistration()
        data['dse'] = self.daysSinceExpiration()
        data['sscr'] = self.scriptToSpecialCharsRatio()
        data['sbr'] = self.scriptTobodyRatio()
        data['bscr'] = self.bodyToSpecialCharRatio()
        data['num_%20'] = self.url.count("%20")
        data['num_@'] = self.url.count("@")
        data['has_ip'] = self.ip()

        return data

In [11]:
# Mount Google Drive so the CSV files under /content/drive/MyDrive are readable.
# Colab-only: `google.colab` is not available in a plain Jupyter kernel.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

# NOTE(review): this cell has no execution count, and the feature-extraction
# cell above already does `from requests import get`, so this install/import
# looks redundant — confirm before removing.
!pip install requests

import requests



In [82]:
# Path(s) to the CSV this cell reads.  Only the first entry is used below.
l = ['/content/drive/MyDrive/feature_data.csv']

# --- Disabled: one-off feature extraction -----------------------------------
# The commented-out block below reads URLs from each CSV in `l`, runs
# UrlFeaturizer on every URL with a 30-second limit per URL, tags each row
# with the source file name ("File"), and writes the table to features12.csv.
# It is kept for reference only.  To re-enable it, note that it uses `tqdm`
# and `os`, which are not imported in the cells shown, and `DataFrame.append`,
# which was removed in pandas 2.0.
# emp = UrlFeaturizer("").run().keys()

# A = pd.DataFrame(columns = emp)
# t=[]
# for j in l:
#     print(j)
#     d=pd.read_csv(j,header=None).to_numpy().flatten()
#     print(d)
#     #for i in d:
#     for i in tqdm(d):
#         try: 
#             with timeout(30, exception = RuntimeError):  
#                 temp=UrlFeaturizer(i).run()
#                 temp["File"]=j.split(".")[0]
#                 t.append(temp)
#         except RuntimeError: 
#             pass 
# A=A.append(t)
#os.chdir('../')
#A.to_csv("features12.csv")

# Load the previously exported feature table, using the first CSV column as
# the index, and preview the first rows.
data = pd.read_csv(l[0], index_col=0)
data.head(5)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,File,bodyLength,bscr,dse,dsr,entropy,ext,hasHttp,hasHttps,has_ip,numDigits,numImages,numLinks,numParams,numTitles,num_%20,num_@,sbr,scriptLength,specialChars,sscr,urlIsLive,urlLength
0,spam_dataset,31,0.064516,0,0,-4.708307,uk,True,False,0,27,1,0,0,0,0,0,0.0,0,2,0.0,False,81
1,spam_dataset,64790,0.30605,0,0,-4.576882,html,True,False,0,12,6,183,0,19,0,0,0.9238,59853,19829,3.018458,False,58
2,spam_dataset,757,0.195509,0,0,-4.205536,html,True,False,0,0,6,8,0,1,0,0,0.0,0,148,0.0,False,46
3,spam_dataset,266310,0.230915,0,0,-4.550931,html,True,False,0,13,3,831,0,648,0,0,0.312647,83261,61495,1.353947,False,52
4,spam_dataset,40,0.2,0,0,-4.689336,php?cPath=193_195_197&products_id=1395&op=list,True,False,0,14,0,0,2,1,0,0,0.0,0,8,0.0,False,83


In [83]:
# Featurize one sample URL.  This performs a live WHOIS lookup and HTTP GET.
features = UrlFeaturizer("https://classroom.google.com/u/1/h").run()
# Feature names, in the order `run` inserts them.
feature_keys = features.keys()

In [84]:
import tensorflow as tf
import keras
import pandas as pd
import gensim
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, LSTM, Embedding, Input
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix
from numpy import unique
from tensorflow.keras import layers
from tensorflow.math import confusion_matrix

In [120]:
# Fully connected classifier: five ReLU hidden layers narrowing 256 -> 16,
# followed by a 3-way softmax output.
#
# The first layer deliberately declares no input size.  Keras then infers it
# from the training matrix on the first `fit` call, so the model no longer
# depends on which version of `data` (raw or preprocessed) happens to be in
# the kernel when this cell runs.  Previously the width was taken from
# `len(data.columns)`, which on a fresh top-to-bottom run still included
# non-feature columns and did not match the training matrix.
model = Sequential()
model.add(Dense(256, activation = 'relu'))
model.add(Dense(128, activation = 'relu'))
model.add(Dense(64, activation = 'relu'))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(16, activation = 'relu'))
# The output width must equal the number of distinct labels fed to fit().
model.add(Dense(3, activation = 'softmax'))
model.compile(loss = 'categorical_crossentropy' ,optimizer='adam' , metrics = ['accuracy', 'Recall', 'Precision'] )

In [121]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [122]:
# Reload the feature table (this time without index_col, so the CSV's index
# column appears as an extra "Unnamed" column) and turn booleans into 1/0.
# The next cell repeats this load before preprocessing.
data = pd.read_csv('/content/drive/MyDrive/feature_data.csv')
data.replace(True,1,inplace = True)    
data.replace(False,0,inplace = True)
# Displays the whole frame; pandas truncates the rendering.
data

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0.1,Unnamed: 0,File,bodyLength,bscr,dse,dsr,entropy,ext,hasHttp,hasHttps,has_ip,numDigits,numImages,numLinks,numParams,numTitles,num_%20,num_@,sbr,scriptLength,specialChars,sscr,urlIsLive,urlLength
0,0,spam_dataset,31,0.064516,0,0,-4.708307,uk,1.0,0.0,0,27,1,0,0,0,0,0,0.000000,0,2,0.000000,0.0,81
1,1,spam_dataset,64790,0.306050,0,0,-4.576882,html,1.0,0.0,0,12,6,183,0,19,0,0,0.923800,59853,19829,3.018458,0.0,58
2,2,spam_dataset,757,0.195509,0,0,-4.205536,html,1.0,0.0,0,0,6,8,0,1,0,0,0.000000,0,148,0.000000,0.0,46
3,3,spam_dataset,266310,0.230915,0,0,-4.550931,html,1.0,0.0,0,13,3,831,0,648,0,0,0.312647,83261,61495,1.353947,0.0,52
4,4,spam_dataset,40,0.200000,0,0,-4.689336,php?cPath=193_195_197&products_id=1395&op=list,1.0,0.0,0,14,0,0,2,1,0,0,0.000000,0,8,0.000000,0.0,83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54526,9996,Defacement,33,0.121212,0,0,-4.755244,tpl&product_id=336&option=com_virtuemart&Itemi...,1.0,0.0,0,8,0,0,5,1,0,0,0.000000,0,4,0.000000,0.0,155
54527,9997,Defacement,862930,0.207603,0,0,-4.419173,html,1.0,0.0,0,6,2,22,0,3,0,0,0.011485,9911,179147,0.055323,0.0,79
54528,9998,Defacement,183,0.185792,0,0,-4.196164,com,1.0,0.0,0,3,0,0,0,1,0,0,0.000000,0,34,0.000000,0.0,65
54529,9999,Defacement,0,0.000000,0,0,-3.896292,jpg,1.0,0.0,0,2,0,0,0,0,0,0,0.000000,0,0,0.000000,0.0,28


In [123]:
# Load the exported feature table.  low_memory=False makes pandas infer each
# column's dtype from the whole file rather than chunk by chunk, which avoids
# mixed-type columns.
data = pd.read_csv('/content/drive/MyDrive/feature_data.csv', low_memory=False)
data = data.replace(True, 1).replace(False, 0)

# Some rows carry a malformed `dse` / `dsr` value (a long timedelta string
# instead of a day count).  Coerce both columns to numbers and drop the rows
# that cannot be parsed.  A string-length filter is not reliable here: it
# evaluates to NaN, and therefore drops the row, for every value pandas has
# already parsed as a number, and it fails outright on a numeric column.
day_columns = ["dse", "dsr"]
data = data.assign(
    **{name: pd.to_numeric(data[name], errors="coerce") for name in day_columns}
)
data = data.dropna(subset=day_columns)

# Target: the source-file label of each row.
y = data["File"]

# Remove non-feature columns.  Every "Unnamed: ..." column is a saved row
# index; because the file is ordered by class, leaving any of them in would
# leak the label into the features.
index_columns = [name for name in data.columns if str(name).startswith("Unnamed")]
data = data.drop(columns=["File", "ext"] + index_columns)

# Integer-encode the string labels.
encoder = LabelEncoder()
Y = encoder.fit_transform(y)

# Scale every feature to [0, 1], keeping the column names for readability.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# in a later cell, so test-set statistics influence the scaling.
scaler = MinMaxScaler(feature_range=(0, 1))
X = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)


  interactivity=interactivity, compiler=compiler, result=result)


In [111]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [96]:
from keras.utils.np_utils import to_categorical

In [126]:
# Train for 10 epochs; integer labels are one-hot encoded to match the
# categorical cross-entropy loss.
model.fit(X_train,to_categorical(y_train), epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f748f4decd0>

In [128]:
# Second fit on the same `model` object: training continues from the current
# weights rather than starting over, now holding out 20% of X_train for
# per-epoch validation metrics.
# NOTE(review): X_test / y_test are never evaluated in the cells shown.
model.fit(X_train,to_categorical(y_train), epochs = 10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f748f4a9790>

In [76]:
# Inspect the raw string labels (the "File" column) before encoding.
y

0        spam_dataset
1        spam_dataset
2        spam_dataset
3        spam_dataset
4        spam_dataset
             ...     
54526      Defacement
54527      Defacement
54528      Defacement
54529      Defacement
54530      Defacement
Name: File, Length: 54531, dtype: object

In [81]:
# Inspect the feature frame.
# NOTE(review): the saved output was produced at In [81], before the
# preprocessing cell (In [123]) last ran, so it may not reflect current state.
data

Unnamed: 0.1,Unnamed: 0,bodyLength,bscr,dse,dsr,entropy,ext,hasHttp,hasHttps,has_ip,numDigits,numImages,numLinks,numParams,numTitles,num_%20,num_@,sbr,scriptLength,specialChars,sscr,urlIsLive,urlLength
0,0,31,0.064516,0,0,-4.708307,uk,1.0,0.0,0,27,1,0,0,0,0,0,0.000000,0,2,0.000000,0.0,81
1,1,64790,0.306050,0,0,-4.576882,html,1.0,0.0,0,12,6,183,0,19,0,0,0.923800,59853,19829,3.018458,0.0,58
2,2,757,0.195509,0,0,-4.205536,html,1.0,0.0,0,0,6,8,0,1,0,0,0.000000,0,148,0.000000,0.0,46
3,3,266310,0.230915,0,0,-4.550931,html,1.0,0.0,0,13,3,831,0,648,0,0,0.312647,83261,61495,1.353947,0.0,52
4,4,40,0.200000,0,0,-4.689336,php?cPath=193_195_197&products_id=1395&op=list,1.0,0.0,0,14,0,0,2,1,0,0,0.000000,0,8,0.000000,0.0,83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54526,9996,33,0.121212,0,0,-4.755244,tpl&product_id=336&option=com_virtuemart&Itemi...,1.0,0.0,0,8,0,0,5,1,0,0,0.000000,0,4,0.000000,0.0,155
54527,9997,862930,0.207603,0,0,-4.419173,html,1.0,0.0,0,6,2,22,0,3,0,0,0.011485,9911,179147,0.055323,0.0,79
54528,9998,183,0.185792,0,0,-4.196164,com,1.0,0.0,0,3,0,0,0,1,0,0,0.000000,0,34,0.000000,0.0,65
54529,9999,0,0.000000,0,0,-3.896292,jpg,1.0,0.0,0,2,0,0,0,0,0,0,0.000000,0,0,0.000000,0.0,28


In [80]:
# Inspect the model input matrix.
# NOTE(review): the saved output was produced at In [80], before the
# preprocessing cell (In [123]) last ran, and shows unscaled values.
X

Unnamed: 0.1,Unnamed: 0,bodyLength,bscr,dse,dsr,entropy,ext,hasHttp,hasHttps,has_ip,numDigits,numImages,numLinks,numParams,numTitles,num_%20,num_@,sbr,scriptLength,specialChars,sscr,urlIsLive,urlLength
0,0,31,0.064516,0,0,-4.708307,uk,1.0,0.0,0,27,1,0,0,0,0,0,0.000000,0,2,0.000000,0.0,81
1,1,64790,0.306050,0,0,-4.576882,html,1.0,0.0,0,12,6,183,0,19,0,0,0.923800,59853,19829,3.018458,0.0,58
2,2,757,0.195509,0,0,-4.205536,html,1.0,0.0,0,0,6,8,0,1,0,0,0.000000,0,148,0.000000,0.0,46
3,3,266310,0.230915,0,0,-4.550931,html,1.0,0.0,0,13,3,831,0,648,0,0,0.312647,83261,61495,1.353947,0.0,52
4,4,40,0.200000,0,0,-4.689336,php?cPath=193_195_197&products_id=1395&op=list,1.0,0.0,0,14,0,0,2,1,0,0,0.000000,0,8,0.000000,0.0,83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54526,9996,33,0.121212,0,0,-4.755244,tpl&product_id=336&option=com_virtuemart&Itemi...,1.0,0.0,0,8,0,0,5,1,0,0,0.000000,0,4,0.000000,0.0,155
54527,9997,862930,0.207603,0,0,-4.419173,html,1.0,0.0,0,6,2,22,0,3,0,0,0.011485,9911,179147,0.055323,0.0,79
54528,9998,183,0.185792,0,0,-4.196164,com,1.0,0.0,0,3,0,0,0,1,0,0,0.000000,0,34,0.000000,0.0,65
54529,9999,0,0.000000,0,0,-3.896292,jpg,1.0,0.0,0,2,0,0,0,0,0,0,0.000000,0,0,0.000000,0.0,28
