<a href="https://colab.research.google.com/github/monindew/Pytorch_Study/blob/main/20240819.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!kaggle datasets download -d shashwatwork/web-page-phishing-detection-dataset

Dataset URL: https://www.kaggle.com/datasets/shashwatwork/web-page-phishing-detection-dataset
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading web-page-phishing-detection-dataset.zip to /content
  0% 0.00/1.01M [00:00<?, ?B/s]
100% 1.01M/1.01M [00:00<00:00, 85.3MB/s]


In [2]:
!unzip 'web-page-phishing-detection-dataset.zip'

Archive:  web-page-phishing-detection-dataset.zip
  inflating: dataset_phishing.csv    


In [3]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [4]:
df_data = pd.read_csv("dataset_phishing.csv")
df_data.shape

(11430, 89)

In [5]:
df_data["status"]

Unnamed: 0,status
0,legitimate
1,phishing
2,phishing
3,legitimate
4,legitimate
...,...
11425,legitimate
11426,phishing
11427,legitimate
11428,legitimate


In [7]:
df_data['target'] = pd.get_dummies(df_data['status'])['legitimate'].astype('int')
df_data.drop('status', axis=1, inplace=True)
df_data[['url', 'target']].head(5)

Unnamed: 0,url,target
0,http://www.crestonwood.com/router.php,1
1,http://shadetreetechnology.com/V4/validation/a...,0
2,https://support-appleld.com.secureupdate.duila...,0
3,http://rgipt.ac.in,1
4,http://www.iracing.com/tracks/gateway-motorspo...,1


In [9]:
from sklearn.model_selection import train_test_split
X = df_data.iloc[:, 1:-1]
y = df_data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(9144, 87) (9144,)
(2286, 87) (2286,)


In [14]:
std_scaler = StandardScaler()
std_scaler.fit(X_train)

In [15]:
X_train_tensor = torch.from_numpy(std_scaler.transform(X_train)).float()
X_test_tensor = torch.from_numpy(std_scaler.transform(X_test)).float()
y_train_tensor = torch.from_numpy(y_train.values).float()
y_train_tensor = y_train_tensor.unsqueeze(1)
y_test_tensor = torch.from_numpy(y_test.values).float()
y_test_tensor = y_test_tensor.unsqueeze(1)

In [16]:
nb_epochs = 1000
minibatch_size = 256

In [19]:
class FunModel(nn.Module):
  def __init__(self, input_dim, output_dim):
    super().__init__()

    self.linear_layers = nn.Sequential(
        nn.Linear(input_dim, 200),
        nn.LeakyReLU(0.1),
        nn.Linear(200, 100),
        nn.LeakyReLU(0.1),
        nn.Linear(100, 20),
        nn.LeakyReLU(0.1),
        nn.Linear(20, 5),
        nn.LeakyReLU(0.1),
        nn.Linear(5, output_dim),
        nn.Sigmoid()
    )

  def forward(self, x):
    y = self.linear_layers(x)
    return y

In [20]:
input_dim = X_train_tensor.size(-1)
output_dim = y_train_tensor.size(-1)
print(input_dim, output_dim)
model = FunModel(input_dim, output_dim)
loss_func = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters())

87 1


In [21]:
for index in range(nb_epochs):
  indices = torch.randperm(X_train_tensor.size(0))

  x_batch_list = torch.index_select(X_train_tensor, 0, index = indices)
  y_batch_list = torch.index_select(y_train_tensor, 0, index = indices)
  x_batch_list = x_batch_list.split(minibatch_size, 0)
  y_batch_list = y_batch_list.split(minibatch_size, 0)

  epoch_loss = list()
  for x_minibatch, y_minibatch in zip(x_batch_list, y_batch_list):
    y_minibatch_pred = model(x_minibatch)
    loss = loss_func(y_minibatch_pred, y_minibatch)
    epoch_loss.append(loss)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  if index % 100 == 0:
    print(index, sum(epoch_loss) / len(epoch_loss))

0 tensor(0.5706, grad_fn=<DivBackward0>)
100 tensor(0.0219, grad_fn=<DivBackward0>)
200 tensor(0.0869, grad_fn=<DivBackward0>)
300 tensor(0.0868, grad_fn=<DivBackward0>)
400 tensor(0.0868, grad_fn=<DivBackward0>)
500 tensor(0.0869, grad_fn=<DivBackward0>)
600 tensor(0.0868, grad_fn=<DivBackward0>)
700 tensor(0.0934, grad_fn=<DivBackward0>)
800 tensor(0.0868, grad_fn=<DivBackward0>)
900 tensor(0.0868, grad_fn=<DivBackward0>)


In [36]:
y_pred_list = []
model.eval()
with torch.no_grad():
  y_test_pred_sigmoid = model(X_test_tensor)
  y_test_pred = torch.round(y_test_pred_sigmoid)

In [38]:
y_pred_list = list()
x_test_batch_list = X_test_tensor.split(minibatch_size, 0)
model.eval()
with torch.no_grad():
  for x_minibatch in x_test_batch_list:
    y_test_pred_sigmoid = model(x_minibatch)
    y_test_pred = torch.round(y_test_pred_sigmoid)
    y_pred_list.extend(y_test_pred.squeeze().detach().tolist())

y_pred_list = torch.tensor(y_pred_list).unsqueeze(1)

In [39]:
print(y_pred_list.shape, y_test_tensor.shape)

torch.Size([2286, 1]) torch.Size([2286, 1])


In [40]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

print("Confusion Matrix\n", str(confusion_matrix(y_test_tensor, y_test_pred)))
print("Precision:\t" + str(precision_score(y_test_tensor, y_test_pred)))
print("Recall:\t\t" + str(recall_score(y_test_tensor, y_test_pred)))
print("F1 Score:\t" + str(f1_score(y_test_tensor, y_test_pred)))

ValueError: Found input variables with inconsistent numbers of samples: [2286, 238]