In [255]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [239]:
# Read the full feature table from disk.
data = pd.read_csv(filepath_or_buffer='data/data.csv')
# Last expression of the cell: summary statistics, rendered as the cell output.
data.describe()

Unnamed: 0,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,DNS_Record,Web_Traffic,Domain_Age,Domain_End,iFrame,Mouse_Over,Right_Click,Web_Forwards,Label
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,0.0055,0.0226,0.7734,3.072,0.0135,0.0002,0.0903,0.0932,0.1008,0.8457,0.4137,0.8099,0.0909,0.0666,0.9993,0.1053,0.5
std,0.073961,0.148632,0.418653,2.128631,0.115408,0.014141,0.286625,0.290727,0.301079,0.361254,0.492521,0.3924,0.287481,0.24934,0.02645,0.306955,0.500025
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
50%,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.5
75%,0.0,0.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
max,1.0,1.0,1.0,20.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [233]:
# Source column -> name used in the exported "domain" feature group.
# The key order is also the column order of the resulting frame.
domain_columns = {
    'Domain': 'domain',
    'Have_IP': 'is_ip_address',
    'Have_At': 'have_at_sign',
    'URL_Length': 'is_long_url',
    'Redirection': 'redirection',
    'https_Domain': 'is_http',
    'TinyURL': 'is_shortened',
    'Prefix/Suffix': 'is_using_prefix',
    'Label': 'label',
}

# Select exactly those columns, then apply the new names.
domain = data[list(domain_columns)].rename(columns=domain_columns)

# Persist the subset; to_csv is called with defaults, so the index is written too.
domain.to_csv('data/domain.csv')

In [234]:
# Source column -> name used in the exported "dns" feature group.
# The key order is also the column order of the resulting frame.
dns_columns = {
    'Domain': 'domain',
    'DNS_Record': 'is_in_dns',
    'Web_Traffic': 'is_top_100k',
    'Domain_Age': 'is_domain_new',
    'Domain_End': 'is_domain_about_to_expire',
    'Label': 'label',
}

# Select exactly those columns, then apply the new names.
dns = data[list(dns_columns)].rename(columns=dns_columns)

# Persist the subset; to_csv is called with defaults, so the index is written too.
dns.to_csv('data/dns.csv')

In [235]:
# Source column -> name used in the exported "html_js" feature group.
# The key order is also the column order of the resulting frame.
html_js_columns = {
    'Domain': 'domain',
    'iFrame': 'is_empty_iframe',
    'Mouse_Over': 'is_fake_status_bar',
    'Right_Click': 'is_disabled_right_click',
    'Web_Forwards': 'redirect_count',
    'Label': 'label',
}

# Select exactly those columns, then apply the new names.
html_js = data[list(html_js_columns)].rename(columns=html_js_columns)

# Persist the subset; to_csv is called with defaults, so the index is written too.
html_js.to_csv('data/html_js.csv')

# Domain

### Preprocess

#### Drop domain

In [240]:
# Work on an independent copy without the 'domain' column; `domain` itself is left untouched.
domain_data = domain.drop(columns=['domain']).copy()
# Per-column count of missing values (cell output).
domain_data.isna().sum()

is_ip_address      0
have_at_sign       0
is_long_url        0
redirection        0
is_http            0
is_shortened       0
is_using_prefix    0
label              0
dtype: int64

#### Shuffle ordered frame

In [241]:
# Shuffle every row (frac=1) and renumber the index from 0.
# random_state pins the permutation so a fresh "Restart & Run All" yields the
# same row order; 12 matches the seed already used for train_test_split.
domain_data = domain_data.sample(frac=1, random_state=12).reset_index(drop=True)
# Preview the first rows of the shuffled frame (cell output).
domain_data.head()

Unnamed: 0,is_ip_address,have_at_sign,is_long_url,redirection,is_http,is_shortened,is_using_prefix,label
0,0,0,0,0,0,0,0,1
1,0,0,1,1,0,0,0,1
2,0,0,1,0,0,1,0,0
3,0,0,1,0,0,1,0,0
4,0,0,1,0,0,1,0,0


### Splitting to X and y

In [246]:
# Features are every column except 'label'; the target is the 'label' column.
X = domain_data.drop(columns='label')
y = domain_data['label']
# Show both shapes as a sanity check (cell output).
X.shape, y.shape

((10000, 7), (10000,))

#### Splitting to train / test

In [247]:
# Hold out 20% of the rows for testing; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12,
)
# Show the shapes of the two feature splits (cell output).
X_train.shape, X_test.shape

((8000, 7), (2000, 7))

In [248]:
# Training
# MLP with three hidden layers of 100 units each and L2 penalty alpha=0.001.
# random_state fixes the weight initialisation and batch shuffling so the
# accuracies reported below can be reproduced on a fresh kernel
# (12 matches the seed already used for train_test_split).
mlp_domain = MLPClassifier(
    alpha=0.001,
    hidden_layer_sizes=(100, 100, 100),
    random_state=12,
)

# fit the model
mlp_domain.fit(X_train, y_train)

#### Accuracy

In [250]:
# Predict labels for both splits so train and test accuracy can be compared.
domain_y_train = mlp_domain.predict(X_train)
domain_y_test = mlp_domain.predict(X_test)

In [253]:
# Fraction of correctly classified rows on each split.
domain_acc_test = accuracy_score(y_true=y_test, y_pred=domain_y_test)
domain_acc_train = accuracy_score(y_true=y_train, y_pred=domain_y_train)

# Report train first, then test.
print(f"Accuracy of MLP on domain with train data: {domain_acc_train}")
print(f"Accuracy of MLP on domain with test data: {domain_acc_test}")

Accuracy of MLP on domain with train data: 0.793125
Accuracy of MLP on domain with test data: 0.786


# Trying different models

In [258]:
from sklearn.ensemble import RandomForestClassifier

# instantiate the model
# max_depth=5 caps the depth of each tree. random_state fixes the bootstrap
# samples and feature draws so the reported accuracies are reproducible
# (12 matches the seed already used for train_test_split).
forest = RandomForestClassifier(max_depth=5, random_state=12)

# fit the model
forest.fit(X_train, y_train)

In [259]:
# Random-forest predictions on both splits (train first, then test).
y_train_forest = forest.predict(X_train)
y_test_forest = forest.predict(X_test)

In [260]:
# Accuracy of the random forest on each split.
acc_test_forest = accuracy_score(y_true=y_test, y_pred=y_test_forest)
acc_train_forest = accuracy_score(y_true=y_train, y_pred=y_train_forest)

# Report both values rounded to three decimals.
print(
    "Random forest: Accuracy on training Data: {:.3f}".format(acc_train_forest)
)
print(
    "Random forest: Accuracy on test Data: {:.3f}".format(acc_test_forest)
)

Random forest: Accuracy on training Data: 0.793
Random forest: Accuracy on test Data: 0.786


In [261]:

# XGBoost classification model
from xgboost import XGBClassifier

# Configure the classifier: learning rate 0.4, trees limited to depth 7.
xgb = XGBClassifier(
    max_depth=7,
    learning_rate=0.4,
)
# Fit on the training split; the fitted estimator is the cell output.
xgb.fit(X_train, y_train)

In [262]:
# XGBoost predictions on both splits (train first, then test).
y_train_xgb = xgb.predict(X_train)
y_test_xgb = xgb.predict(X_test)

In [263]:
# Accuracy of the XGBoost model on each split.
acc_test_xgb = accuracy_score(y_true=y_test, y_pred=y_test_xgb)
acc_train_xgb = accuracy_score(y_true=y_train, y_pred=y_train_xgb)

# Report both values rounded to three decimals.
print(
    "XGBoost: Accuracy on training Data: {:.3f}".format(acc_train_xgb)
)
print(
    "XGBoost : Accuracy on test Data: {:.3f}".format(acc_test_xgb)
)

XGBoost: Accuracy on training Data: 0.793
XGBoost : Accuracy on test Data: 0.785


# Chose MLP

In [280]:
import pickle

# Serialise the fitted domain MLP to mlp_domain.pkl using the newest pickle protocol.
# Pickle files should only ever be loaded back from a trusted source.
with open("mlp_domain.pkl", mode="wb") as model_file:
    pickle.dump(mlp_domain, model_file, protocol=pickle.HIGHEST_PROTOCOL)

# DNS

In [282]:
# Work on an independent copy without the 'domain' column; `dns` itself is left untouched.
dns_data = dns.drop(columns=['domain']).copy()
# Per-column count of missing values (cell output).
dns_data.isna().sum()

is_in_dns                    0
is_top_100k                  0
is_domain_new                0
is_domain_about_to_expire    0
label                        0
dtype: int64

#### Shuffle ordered frame

In [283]:
# Shuffle every row (frac=1) and renumber the index from 0.
# random_state pins the permutation so a fresh "Restart & Run All" yields the
# same row order; 12 matches the seed already used for train_test_split.
dns_data = dns_data.sample(frac=1, random_state=12).reset_index(drop=True)
# Preview the first rows of the shuffled frame (cell output).
dns_data.head()

Unnamed: 0,is_in_dns,is_top_100k,is_domain_new,is_domain_about_to_expire,label
0,0,0,1,1,1
1,0,0,0,1,1
2,0,1,0,1,1
3,0,1,0,1,0
4,0,0,0,0,1


### Splitting to X and y

In [284]:
# Features are every column except 'label'; the target is the 'label' column.
# Note: this rebinds the X / y names used earlier in the notebook.
X = dns_data.drop(columns='label')
y = dns_data['label']
# Show both shapes as a sanity check (cell output).
X.shape, y.shape

((10000, 4), (10000,))

#### Splitting to train / test

In [285]:
# Hold out 20% of the DNS rows for testing; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12,
)
# Show the shapes of the two feature splits (cell output).
X_train.shape, X_test.shape

((8000, 4), (2000, 4))

In [296]:
# Training
# MLP with three hidden layers of 100 units each and L2 penalty alpha=0.001.
# random_state fixes the weight initialisation and batch shuffling so the
# accuracies reported below can be reproduced on a fresh kernel
# (12 matches the seed already used for train_test_split).
mlp_dns = MLPClassifier(
    alpha=0.001,
    hidden_layer_sizes=(100, 100, 100),
    random_state=12,
)

# fit the model
mlp_dns.fit(X_train, y_train)

#### Accuracy

In [297]:
# Predictions of mlp_dns on both splits.
# The `domain_` prefix on these names is reused from the Domain section.
domain_y_train = mlp_dns.predict(X_train)
domain_y_test = mlp_dns.predict(X_test)

In [298]:
# Fraction of correctly classified rows on each split.
# The `domain_` prefix on these names is reused from the Domain section.
domain_acc_test = accuracy_score(y_true=y_test, y_pred=domain_y_test)
domain_acc_train = accuracy_score(y_true=y_train, y_pred=domain_y_train)

# Report train first, then test.
print(f"Accuracy of MLP on DNS with train data: {domain_acc_train}")
print(f"Accuracy of MLP on DNS with test data: {domain_acc_test}")

Accuracy of MLP on DNS with train data: 0.56325
Accuracy of MLP on DNS with test data: 0.5535


In [299]:
import pickle

# Serialise the fitted mlp_dns model to mlp_dns.pkl using the newest pickle protocol.
# Pickle files should only ever be loaded back from a trusted source.
with open("mlp_dns.pkl", mode="wb") as model_file:
    pickle.dump(mlp_dns, model_file, protocol=pickle.HIGHEST_PROTOCOL)

# HTML & JS

In [300]:
# Work on an independent copy without the 'domain' column; `html_js` itself is left untouched.
html_js_data = html_js.drop(columns=['domain']).copy()
# Per-column count of missing values (cell output).
html_js_data.isna().sum()

is_empty_iframe            0
is_fake_status_bar         0
is_disabled_right_click    0
redirect_count             0
label                      0
dtype: int64

#### Shuffle ordered frame

In [301]:
# Shuffle every row (frac=1) and renumber the index from 0.
# random_state pins the permutation so a fresh "Restart & Run All" yields the
# same row order; 12 matches the seed already used for train_test_split.
html_js_data = html_js_data.sample(frac=1, random_state=12).reset_index(drop=True)
# Preview the first rows of the shuffled frame (cell output).
html_js_data.head()

Unnamed: 0,is_empty_iframe,is_fake_status_bar,is_disabled_right_click,redirect_count,label
0,0,0,1,0,0
1,0,0,1,0,1
2,0,0,1,0,0
3,0,0,1,0,0
4,0,0,1,0,1


### Splitting to X and y

In [302]:
# Features are every column except 'label'; the target is the 'label' column.
# Note: this rebinds the X / y names used earlier in the notebook.
X = html_js_data.drop(columns='label')
y = html_js_data['label']
# Show both shapes as a sanity check (cell output).
X.shape, y.shape

((10000, 4), (10000,))

#### Splitting to train / test

In [303]:
# Hold out 20% of the HTML & JS rows for testing; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12,
)
# Show the shapes of the two feature splits (cell output).
X_train.shape, X_test.shape

((8000, 4), (2000, 4))

In [304]:
# Training
# NOTE(review): this rebinds `mlp_dns` to a model trained on the HTML & JS
# features; the later cells in this section read the model through that name,
# so it is kept here. Renaming it (e.g. to mlp_html_js) needs a coordinated
# change across those cells.
# MLP with three hidden layers of 100 units each and L2 penalty alpha=0.001.
# random_state fixes the weight initialisation and batch shuffling so the
# accuracies reported below can be reproduced on a fresh kernel
# (12 matches the seed already used for train_test_split).
mlp_dns = MLPClassifier(
    alpha=0.001,
    hidden_layer_sizes=(100, 100, 100),
    random_state=12,
)

# fit the model
mlp_dns.fit(X_train, y_train)

#### Accuracy

In [305]:
# Predictions on both HTML & JS splits.
# In this section `mlp_dns` is the model fitted on the HTML & JS features, and
# the `domain_` prefix on the result names is reused from the Domain section.
domain_y_train = mlp_dns.predict(X_train)
domain_y_test = mlp_dns.predict(X_test)

In [306]:
# Fraction of correctly classified rows on each split.
# The `domain_` prefix on these names is reused from the Domain section.
domain_acc_test = accuracy_score(y_true=y_test, y_pred=domain_y_test)
domain_acc_train = accuracy_score(y_true=y_train, y_pred=domain_y_train)

# Report train first, then test.
print(f"Accuracy of MLP on HTML & JS with train data: {domain_acc_train}")
print(f"Accuracy of MLP on HTML & JS with test data: {domain_acc_test}")

Accuracy of MLP on HTML & JS with train data: 0.531875
Accuracy of MLP on HTML & JS with test data: 0.5235


In [307]:
import pickle

# Serialise the HTML & JS model to mlp_html_js.pkl using the newest pickle protocol.
# NOTE: at this point `mlp_dns` is the estimator fitted on the HTML & JS
# features in the training cell of this section, despite its name.
# Pickle files should only ever be loaded back from a trusted source.
with open("mlp_html_js.pkl", mode="wb") as model_file:
    pickle.dump(mlp_dns, model_file, protocol=pickle.HIGHEST_PROTOCOL)