In [182]:
import pandas as pd
import numpy as np
import klib
from sklearn.utils import shuffle
import validators

In [183]:
# load the data from csv file
df = pd.read_csv('urldata.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,url,label,result
0,0,https://www.google.com,benign,0
1,1,https://www.youtube.com,benign,0
2,2,https://www.facebook.com,benign,0
3,3,https://www.baidu.com,benign,0
4,4,https://www.wikipedia.org,benign,0


# Data Preprocessing

In [184]:
# shuffle data
df = shuffle(df)
df.head()

Unnamed: 0.1,Unnamed: 0,url,label,result
119315,119315,https://www.wn.com/Interstate_10,benign,0
253379,253379,https://www.musicstack.com/album/doobie+brothe...,benign,0
259413,259413,https://www.myspace.com/suicidekingfilms,benign,0
422589,422589,http://www.arrowlife.com/Ourtimela/ourtime/v3/...,malicious,1
433205,433205,http://yellohm.com/wp-content/uploads/2017/12/...,malicious,1


In [185]:
# rename index column
df = df.rename(columns={"Unnamed: 0":"index"})
df.head()

Unnamed: 0,index,url,label,result
119315,119315,https://www.wn.com/Interstate_10,benign,0
253379,253379,https://www.musicstack.com/album/doobie+brothe...,benign,0
259413,259413,https://www.myspace.com/suicidekingfilms,benign,0
422589,422589,http://www.arrowlife.com/Ourtimela/ourtime/v3/...,malicious,1
433205,433205,http://yellohm.com/wp-content/uploads/2017/12/...,malicious,1


In [186]:
df.shape

(450176, 4)

In [187]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 450176 entries, 119315 to 223050
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   index   450176 non-null  int64 
 1   url     450176 non-null  object
 2   label   450176 non-null  object
 3   result  450176 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 17.2+ MB


In [188]:
# using klib to convert existing dtypes to more efficient dtypes
df = klib.convert_datatypes(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 450176 entries, 119315 to 223050
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype   
---  ------  --------------   -----   
 0   index   450176 non-null  int32   
 1   url     450176 non-null  string  
 2   label   450176 non-null  category
 3   result  450176 non-null  int8    
dtypes: category(1), int32(1), int8(1), string(1)
memory usage: 9.4 MB


In [189]:
df.isnull().sum().sum()

0

In [190]:
df['url'].nunique() 

450176

In [191]:
df['result'].value_counts()

0    345738
1    104438
Name: result, dtype: int64

In [192]:
# Class count
count_class_0, count_class_1 = df.result.value_counts()

# Divide by class
df_class_0 = df[df['result'] == 0]
df_class_1 = df[df['result'] == 1]

In [193]:
# oversample 1-class and concat the DataFrames of both classes

df_class_1_over = df_class_1.sample(count_class_0, replace=True)
df_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_over.result.value_counts())

Random over-sampling:
0    345738
1    345738
Name: result, dtype: int64


In [194]:
df_over = shuffle(df_over)

In [195]:
X = df_over['url']
y = df_over['result']

# text preprocessing

In [196]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

In [197]:
text = X

In [198]:
Tokenizer = Tokenizer()

In [199]:
Tokenizer.fit_on_texts(text) 
Tokenizer_vocab_size = len(Tokenizer.word_index)+1
Tokenizer_vocab_size

416143

In [200]:
X.shape, y.shape

((691476,), (691476,))

In [201]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.4, random_state=42)

In [202]:
X_train.shape, X_val.shape

((414885,), (276591,))

In [203]:
X_train_encoded_words = Tokenizer.texts_to_sequences(X_train)
X_val_encoded_words = Tokenizer.texts_to_sequences(X_val)

In [204]:
X_train_encoded_padded_words = sequence.pad_sequences(X_train_encoded_words, maxlen=100) 
X_val_encoded_padded_words = sequence.pad_sequences(X_val_encoded_words, maxlen=100)

X_train_encoded_padded_words.shape, X_val_encoded_padded_words.shape

((414885, 100), (276591, 100))

In [205]:
X_train_encoded_padded_words, X_val_encoded_padded_words

(array([[     0,      0,      0, ..., 175951,      1,    687],
        [     0,      0,      0, ...,      1,      9,      5],
        [     0,      0,      0, ...,    249,   2345, 273748],
        ...,
        [     0,      0,      0, ...,  38769,    213,  89329],
        [     0,      0,      0, ...,    153,   7399,      6],
        [     0,      0,      0, ...,   4862,  14702,    427]]),
 array([[     0,      0,      0, ...,   2257,     81,  26501],
        [     0,      0,      0, ...,   1625,      1, 185777],
        [     0,      0,      0, ...,   1274, 107017,      6],
        ...,
        [     0,      0,      0, ...,   4690, 247817,   1643],
        [     0,      0,      0, ...,  53127,     30,  94346],
        [     0,      0,      0, ...,  69022,      5,     53]]))

# Model

In [206]:
# xgboost
from xgboost import XGBClassifier
from sklearn.metrics  import accuracy_score

model = XGBClassifier().fit(X_train_encoded_padded_words , y_train)





In [207]:
ypred_xgb = model.predict(X_val_encoded_padded_words)

In [208]:
np.array(y_val.head())

array([0, 0, 0, 0, 1], dtype=int8)

In [209]:
ypred_xgb[0:5]

array([0, 0, 0, 0, 1], dtype=int8)

In [210]:
accuracy = accuracy_score(y_val, ypred_xgb)
print("accuracy: {:.2f}".format(accuracy*100))

accuracy: 99.67


In [211]:
# logistic regression
from sklearn.linear_model import LogisticRegression

logi = LogisticRegression().fit(X_train_encoded_padded_words , y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [212]:
ypred_logistic = logi.predict(X_val_encoded_padded_words)

In [213]:
ypred_logistic[0:5]

array([0, 0, 0, 0, 0], dtype=int8)

In [214]:
np.array(y_val.head())

array([0, 0, 0, 0, 1], dtype=int8)

In [215]:
accuracy = accuracy_score(y_val, ypred_logistic)
print("accuracy: {:.2f}".format(accuracy*100))

accuracy: 50.93


In [216]:
# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier().fit(X_train_encoded_padded_words , y_train)

In [217]:
ypred_gbc = gbc.predict(X_val_encoded_padded_words)

In [218]:
ypred_gbc[0:5]

array([0, 0, 0, 0, 1], dtype=int8)

In [219]:
np.array(y_val.head())

array([0, 0, 0, 0, 1], dtype=int8)

In [220]:
accuracy = accuracy_score(y_val, ypred_gbc)
print("accuracy: {:.2f}".format(accuracy*100))

accuracy: 98.81


# Predictions

In [221]:
# save the model and tokenizer
import pickle
pickle.dump(Tokenizer, open('models/tokenizer.pkl','wb'))
pickle.dump(model, open('models/xgb_model.pkl','wb'))
pickle.dump(logi, open('models/logistic.pkl','wb'))
pickle.dump(gbc, open('models/gbc.pkl','wb'))

In [222]:
# load the saved model  and tokenizer
Tokenizer = pickle.load(open('models/tokenizer.pkl','rb'))
saved_model = pickle.load(open('models/xgb_model.pkl','rb'))
saved_model1 = pickle.load(open('models/logistic.pkl','rb'))
saved_model2 = pickle.load(open('models/gbc.pkl','rb'))

In [223]:
def predict(text):
    url = text
    valid = validators.url(url)
    if(valid==True):
        
        tokens = Tokenizer.texts_to_sequences([url])
        tokens = sequence.pad_sequences(tokens, maxlen=100)
        pred = saved_model2.predict(np.array(tokens),)
        classes = ['Benign url','Malicious url']
        result = classes[pred[0]]
        return result
    else:
        return ("Entered url is Invalid")

In [224]:
predict("https://www.youtube.com/")

'Benign url'

In [225]:
predict("https://www.w3schools.com/sql/sql_orderby.asp")

'Benign url'

In [226]:
predict("https://insidethestorex.com/sd/")

'Malicious url'

In [227]:
predict("http://i_am_not_url.com")

'Entered url is Invalid'

In [228]:
predict("https://1fichier.com/?0ewdpuct0nz0ck46vi6h")

'Malicious url'

In [229]:
predict("https://skidrowreloaded.ufile.io/k1mfa3fi")

'Malicious url'