## Import required libraries

In [1]:
import numpy as np
import pandas as pd
import os
import random
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
dataset_path = '/kaggle/input/phishing-url'
dataset_files = os.listdir(dataset_path)
print("Files in the dataset folder:", dataset_files)

csv_file_path = os.path.join(dataset_path, dataset_files[0])   # Replace 'example.csv' with your file name
df = pd.read_csv(csv_file_path)

Files in the dataset folder: ['Data_processed.csv']


In [3]:
df.dtypes

url                            object
login_form                      int64
links_in_tags                 float64
submit_email                    int64
sfh                             int64
iframe                          int64
popup_window                    int64
safe_anchor                   float64
onmouseover                     int64
right_clic                      int64
whois_registered_domain         int64
domain_registration_length      int64
domain_age                      int64
web_traffic                     int64
dns_record                      int64
google_index                    int64
page_rank                       int64
status                         object
embedded_domain                 int64
having_ip_address               int64
no_of_dots                      int64
lexical_features               object
no_of_sensitive_words           int64
out_of_position_tld             int64
https_token                     int64
url_length                      int64
tinyURL     

## Remove columns with non-integer data types

In [4]:
df.drop(columns=['url','lexical_features','submit_email','sfh'], inplace=True)

In [5]:
df.columns

Index(['login_form', 'links_in_tags', 'iframe', 'popup_window', 'safe_anchor',
       'onmouseover', 'right_clic', 'whois_registered_domain',
       'domain_registration_length', 'domain_age', 'web_traffic', 'dns_record',
       'google_index', 'page_rank', 'status', 'embedded_domain',
       'having_ip_address', 'no_of_dots', 'no_of_sensitive_words',
       'out_of_position_tld', 'https_token', 'url_length', 'tinyURL',
       'prefixSuffix'],
      dtype='object')

## Standardization of data

In [6]:
scaler = StandardScaler()
# Select columns to standardize
columns_to_standardize = df.columns.tolist()
columns_to_standardize.remove('status')

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform the selected columns
df[columns_to_standardize] = scaler.fit_transform(df[columns_to_standardize])

## Train-test Split

In [7]:
df['target'] = np.where(df['status'] == 'phishing', 1, -1)
df.drop(columns=['status'], inplace=True)
Y=df['target']
X = df.drop(columns=['target'])
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4,random_state=0) # creating testing and training set

In [8]:
df

Unnamed: 0,login_form,links_in_tags,iframe,popup_window,safe_anchor,onmouseover,right_clic,whois_registered_domain,domain_registration_length,domain_age,...,embedded_domain,having_ip_address,no_of_dots,no_of_sensitive_words,out_of_position_tld,https_token,url_length,tinyURL,prefixSuffix,target
0,-0.260624,0.674877,-0.03625,-0.077932,-0.948614,-0.033744,-0.03744,-0.28037,-0.549299,-1.307594,...,-1.044195,0.095825,0.379180,-0.316390,0.301770,-0.798018,-0.436245,-0.375276,-0.424327,-1
1,-0.260624,1.156557,-0.03625,-0.077932,1.610785,-0.033744,-0.03744,-0.28037,-0.510022,0.548471,...,0.957675,0.095825,-1.081073,-0.316390,0.301770,-0.798018,0.287212,-0.375276,-0.424327,1
2,-0.260624,1.156557,-0.03625,-0.077932,1.610785,-0.033744,-0.03744,-0.28037,-0.587348,-0.018839,...,0.957675,0.095825,1.109307,2.486975,-3.313779,1.253105,1.173447,-0.375276,2.356671,1
3,-0.260624,1.156557,-0.03625,-0.077932,0.651011,-0.033744,-0.03744,-0.28037,-0.528433,-1.307594,...,-1.044195,0.095825,-0.350946,-0.316390,0.301770,-0.798018,-0.779888,-0.375276,-0.424327,-1
4,3.836945,0.589875,-0.03625,-0.077932,-0.948614,-0.033744,-0.03744,-0.28037,-0.329595,1.323334,...,-1.044195,0.095825,-0.350946,-0.316390,0.301770,-0.798018,-0.110690,-0.375276,-0.424327,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11425,-0.260624,0.674877,-0.03625,-0.077932,-0.948614,-0.033744,-0.03744,-0.28037,-0.054659,0.429088,...,-1.044195,0.095825,-0.350946,-0.316390,0.301770,-0.798018,-0.291554,-0.375276,-0.424327,-1
11426,-0.260624,1.156557,-0.03625,-0.077932,-0.948614,-0.033744,-0.03744,-0.28037,-0.345551,0.857708,...,-1.044195,0.095825,1.839434,-0.316390,0.301770,-0.798018,0.413817,-0.375276,-0.424327,1
11427,-0.260624,-1.101319,-0.03625,-0.077932,1.098905,-0.033744,-0.03744,-0.28037,2.843221,1.432741,...,0.957675,0.095825,-0.350946,-0.316390,0.301770,1.253105,0.793632,-0.375276,-0.424327,-1
11428,-0.260624,-0.850444,-0.03625,-0.077932,-0.948614,-0.033744,-0.03744,-0.28037,-0.500203,-0.394685,...,0.957675,0.095825,-0.350946,-0.316390,0.301770,-0.798018,-0.418159,-0.375276,-0.424327,-1


<hr>
<center><h1> Creating SVM models </h1></center>


<hr>
<center><h1> Linear SVM: Hard Margin Classifier
</h1></center>

For n-dimensional feature space, SVM creates an n-1 dimensional hyperplane to seprate the classes.
Any hyperplane can be represented as:

$$
w^T X - b = 0
$$


![](https://storage.googleapis.com/kagglesdsdata/datasets/4643493/7905393/lin_svm.webp?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240321%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240321T131516Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=1bde03a92b323ad5c935bea16593a8f92b16344629597f56be2a859c39d3f564f4d314b53ca54fa4074a3dee83da6a82639bfc9f12ca61035d40774d95a806050b96f9e0016bbef90345f9d617aea1f372b6e282681ca8894f20834ddebb37a98003dfe79bbfbae3a697f0b8b5459f27a4c217d76258f741e22157b67ba4f16da470c0f6e5ebbbbcc6df4ede226ee422db5a81c351935393fb643ffd88b60ad91a09d4571b728df5eada8516a8335ad2a7671b0e87b8feed0fde5d4f55bc87a049d4b8444702dce826f1a1eeecea64cd5388219ea0d9c560469de6adb0398639279a8a1d3f622dfd33987230fee33d6ebfa9c9661dee5836543a8f41c000ef4f)

If the dataset can be linearly sperated, we can create two parallel hyperplane(lines here) for each classes and contain each data points under these hyperplanes and our goal would be to maximize the distance between these two parallel hyperplane. The region between these two plane is called **margin**. The equations for both hyperplane is:

$$
w^T X - b = -1
$$
$$
w^T X - b = 1
$$

The distance between them is 2/||w|| and to maximize the distance, ||w|| should be minimum. 

To prevent any data point falling inside margin we add the restriction,

$$
y_i(w^T X_i - b) \geq 1
$$

where y<sub>i</sub> = ith row in y
and X<sub>i</sub> = ith row in X

<div align="center">

### Objective Function:
$$
\min_{w} \left( \frac{||w||^2}{2} \right)
$$

### Linear Constraint:
$$
y_i (w^T x_i - b) \geq 1
$$

</div>

<hr>
<center><h1> Linear SVM: Soft Margin Classifier
</h1></center>


![](https://storage.googleapis.com/kagglesdsdata/datasets/4643565/7905497/soft.webp?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240321%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240321T132710Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=3a6ed2f6859d27302904317dba4b1bd5a89c82affafccc46e41a12cd5989819d026d97e492949643280ae2796aff03d0027feb88c9409c738aaf907089e6970324fddf9de1bd981b37e95579a093cb5444d16e15fec3490536f2ee141b5398690e82a080edf66d805738cf2753933f168aff9a17729b2f0a7d86f74dbe94d2c6cba316979ffc5cc1adef2afbf557d719d10072875d649f5e2ba29d509efc87be96dc7ab2181ad4cee57c8e166f16ab8494ca0c2464d4be54935a36585215e9faafb4747ca03714b10d323404d826b1f4ba2a4fe7ed598793da0de04e8bafa944ce32dddbaf30d9c994640fb8b4b2fbd1665e6138ff8a71042605b0e436baee3a)

We have assumed that the dataset is perfectly linearly separable, which doesn’t really happen in real scenario. Some of the classes overlap such way that a perfect separation is impossible. Thera are mainly two possibilities:
1) Single outlier can push the decision boundary greatly, so that the margin becomes very narrow.

2) Even though a linear decision boundary can classify the target classes properly, the data may not be separable using a straight line.

### Slack Variable

We can solve this problem by introducing a new variable named Slack Variable and then redefine our inequality constraint as,

$$
y_i(w^T X_i - b) \geq 1 - \xi_i
$$


where ξ<sub>i</sub> is the slack variable for the point X<sub>i.

<div align="center">

### Objective Function:
$$
\min_{w, b, \xi_i} \left( \frac{||w||^2}{2} + C\sum_{i=1}^{n} \xi_i \right)
$$

### Linear Constraint:
$$
y_i (w^T x_i - b) \geq 1 - \xi_i, \quad \text{where} \quad \xi_i \geq 0
$$

</div>

C and k are constants which balance the cost of misclassification. C is a hyperparameter which controls the trade-off between maximizing the margin and minimizing the loss.
    
We can’t use Gradient Descent to optimize the Soft Margin Objective Function due to the inequality constraint. Hence need to first find a way to change the constraint to equality constraint.
    
So, we use hinge loss for loss function: <br>
$$
\xi_i = \max(0, 1 - y_i (w^T x_i - b))
$$

If the datapoint has class = 1, then the loss will be 0, otherwise it will be the distance between the margin and the datapoint.

![](https://storage.googleapis.com/kagglesdsdata/datasets/4644487/7906755/1_PGqpYm7o5GCbDXxXErr2JA.png?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240321%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240321T152959Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=3303ef53700713ad57a9f87e8646aac63fb83c93e9573d90ace5b3ff816a2cc77a032f1bd20ab305c0623f5cd4c665fbeacadffa503408e229f04d387faa8bc5f3ea6669069e033e4c49323625d86b2c100fc256b34739a2cbe716fde04872017c74d5d0ba2f46fb322808d1c21e92e379c91815c8a58b25a29c4aa428272428d158ebb02f1343cf04b654ad0c8ca9c253f93583424d9f2744d7ce7bc7dde459c6487ceb3b9b0c25d2da7bc5aa4205cf761a1e8a64865919362b73a55da94be3f0731d353c1fbaeb13de2ba60dd34ee367bef12365a4aa42b56a35f7c8b18edcf0d38d2d690e80ef1a23160400b5dfe0f9bf721943503fd881e55b2008ec9005)

and our goal is to minimize

$$
L = ||w||^2 + C \sum_{i=1} \max(0, 1 - y_i (\beta^T x_i - b))
$$
 OR   
$$
L = \lambda ||w||^2 + \sum_{i=1} \max(0, 1 - y_i (\beta^T x_i - b))
$$


### Updating weights

In order to find the minima, we need to take derivative w.r.t w and b and then use them in Gradient Descent formula

<span style="background-color:transparent">Case 1: y<sub>i</sub>(w<sup>T</sup>X<sub>i</sub> -b) ≥ 1</span>
$$
    dw = 2\lambda w
$$
    $$
    db = 0
    $$

<span style="background-color:transparent">Case 2:y<sub>i</sub>(w<sup>T</sup>X<sub>i</sub> -b) \(\leq\) 1
</span>
$$
    dw = 2\lambda w - y_i x_i
$$
    $$
    db = y_i
    $$

In [9]:
class SVM_soft_margin:

    def __init__(self, alpha = 0.001, lambda_ = 0.01, n_iterations = 1000):
        self.alpha = alpha # learning rate
        self.lambda_ = lambda_ # tradeoff
        self.n_iterations = n_iterations # number of iterations
        self.w = None # weights or slopes
        self.b = None # intercept


    def fit(self, X, y):
        
        n_samples, n_features = X.shape       
        self.w = np.zeros(n_features) # initalizing with 0
        self.b = 0 # initialize with 0
        
        for iteration in range(self.n_iterations):
            for i, Xi in X.iterrows():
                # yi(xiw-b)≥1
                if y[i] * (np.dot(Xi, self.w) - self.b) >= 1 : 
                    self.w -= self.alpha * (2 * self.lambda_ * self.w) # w = w + α* (2λw)
                else:
                    self.w -= self.alpha * (2 * self.lambda_ * self.w - np.dot(Xi, y[i])) # w = w + α* (2λw - yixi)
                    self.b -= self.alpha * y[i] # b = b - α* (yi)
        return self.w, self.b


    def predict(self, X):
        pred = np.dot(X, self.w) - self.b 
        result = [1 if val > 0 else -1 for val in pred] # returning in the form of -1 and 1
        return result

In [10]:
svm = SVM_soft_margin()   # Initialize svm classifier
w,b = svm.fit(X_train,y_train)   # training

In [11]:
y_pred=svm.predict(X_test)
print("Accuracy: ",accuracy_score(y_pred,y_test))

Accuracy:  0.8639545056867891


<hr>
<br><br>
<center><h2> SVM using PEGASOS and Stochastic Gradient Descent </h2></center>

![j](https://miro.medium.com/max/700/1*R1GhsOV0UuMbnTwx8A47Zw.jpeg)

The above equation shows Pegasos algorithm, which is used with Stochastic Gradient Descent. We use Pegasos algorithm with stochastic gradient descent for faster convergence.
For implementing Stochastic Gradient Descent, we will append one more feature containing 1 to X and we will remove b. Thus, our equation <br>

y = w<sub>1</sub>X<sub>1</sub> + w<sub>2</sub>X<sub>2</sub> - b <br>

will change to<br>

y = w<sub>0</sub>X<sub>0</sub> + w<sub>1</sub>X<sub>1</sub> + w<sub>2</sub>X<sub>2</sub> <br>

but as x<sub>0</sub> = 1, thus w<sub>0</sub> = -b

<br>

In [12]:
# implementing SVM using Stochastic Gradient Descent and Pegasos algorithms

def SVM_SGD(X,Y,lambda_,epoches):
    
    X = np.c_[np.ones(X.shape[0]), X] # adding feature containing 1
    w = np.zeros(X.shape[1]) # initializing w with 0s
    
    # converting arrays to matrix
    X = np.matrix(X)
    Y = np.matrix(Y) 
    Y = Y.T
    w = np.matrix(w)

    for t in range(1,epoches+1):
        
        # choosing random points
        random_pos = random.randrange(0, X.shape[0]-1)
        x = X[random_pos,:]
        y = Y[random_pos,:]
    
        # computing eta and finding class
        eta = 1/(lambda_*t)
        cls = y * (w*x.T)

        if cls < 1:
            w = (1 - eta*lambda_)*w + eta*y*x
        elif cls >= 1:
            w = (1 - eta*lambda_)*w

    pred = w*X.T
    # converting matrix back to array
    pred = np.array(pred)
    return np.array(w)[0]
        

In [13]:
w = SVM_SGD(X_train,y_train,lambda_ = 0.01,epoches = 1000)

# Add bias feature to test data
X_test_bias = np.c_[np.ones(X_test.shape[0]), X_test]

# Predict labels for test data
pred = np.dot(X_test_bias, w)

result = [1 if val > 0 else -1 for val in pred]
print("SGD Accuracy: ", accuracy_score(result,y_test))

SGD Accuracy:  0.8541119860017498


<br><br>
<hr>
<br><br>
<center><h2>SVM for Non-Linear Dataset using Kernel</h2><center>

![](https://storage.googleapis.com/kagglesdsdata/datasets/4643587/7905527/non_lin.webp?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240321%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240321T132828Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=3ab1e33b68cd5340aa1c5d4e8a59c06545d727dbb4f1c582b7bbb02eed4147047f8543b5e3c9a88919a410991a99e0539ec0232f5f19cfc59dc281802d1e05840b5e176afa42133535addccb4dfd64504082e98447fabe86eadcfbb2bac362a94e785c233e355fe0391525c724a3ba92beef7967d35fa2e9f918e359c6f7bee86be2cec4c32ca5fa520711c9616c3f30295cd2fde6b9c8ecb7a72a2b68e4338adb52ddfdaefe247e09df4e298deb7490943fea0e33d0a936bb08d1e31f583e1653f68d866c70fa3e87dfe6a68ff3f92a13ca894ca1783d3182468495ceea3a01ce7dc230869768a8b3bfb5b62af3f72ded0554b3a7ef376b30233c37df29f769)

We can use Linear Classifier for Non Linear Classification by expanding the input space to a feature space. We can use the Kernel Trick to implement Non Linear SVM, without the need to have any higher dimensional feature vector (Less time and space complexity) . The Objective Function of Primal Problem works fine for Linearly Separable Dataset, however doesn’t solve Non-Linear Dataset. However our final goal is to solve Non-Linear SVM where Primal Problem is not helpful. In order to find a solution,we generally solve its dual problem.

## Polynomial Kernel
$$
K_q(x, z) = (c + x^Tz)^q
$$

## Gaussian Kernel
$$
K(x, z) = \exp \left\{ - \frac{||x - z||^2}{2\sigma^2} \right\}
$$
It is important to note that the feature space of Gaussian Kernel has Infinite Dimensionality.

## Loss
The Dual Lagrangian loss function which we are trying to maximize is:

$$
L_{\text{dual}} = \sum \alpha_i - \frac{1}{2} \sum_i \sum_j \alpha_i \alpha_j y_i y_j K(x_i, x_j)
$$

## Gradient

Differentiating the loss wrt α<sub>k</sub> , using kth term for Gradient Ascent:

$$
\frac{\delta L_{\text{dual}}}{\delta \alpha_k} = 1 - y_k \sum_j \alpha_j y_j K(x_j, x_k)
$$

where, <br>

K(x<sub>i</sub>, x<sub>j</sub>) is our Kernal function which could be linear, polynomial or gaussian(rbf).

## Updates

α = α + η*(gradient)

where η = learning rate

After training, calculate intercept b:

$$
b = \text{avg}_{0 \leq \alpha_i \leq C} \left\{ y_i - \sum_j \alpha_j y_j K(x_j, x_i) \right\}
$$


## Prediction

For α>0 :

$$
\hat{y} = \text{sign} \left( \sum \alpha_i y_i k(x_i, x_i) + b \right)
$$



In [14]:
class SVM_Dual:

    def __init__(self, kernel='poly', degree=2, sigma=0.1, epoches=1000, learning_rate= 0.001):
        self.alpha = None
        self.b = 0
        self.degree = degree
        self.c = 1
        self.C = 1
        self.sigma = sigma
        self.epoches = epoches
        self.learning_rate = learning_rate

        if kernel == 'poly':
            self.kernel = self.polynomial_kernal # for polynomial kernal
        elif kernel == 'rbf':
            self.kernel =  self.gaussian_kernal # for guassian

    def polynomial_kernal(self,X,Z):
        return (self.c + X.dot(Z.T))**self.degree #(c + X.y)^degree


    def gaussian_kernal(self, X,Z):
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        if isinstance(Z, pd.DataFrame):
            Z = Z.to_numpy()
        return np.exp(-(1 / self.sigma ** 2) * np.linalg.norm(X[:, np.newaxis] - Z[np.newaxis, :], axis=2) ** 2) #e ^-(1/ σ2) ||X-y|| ^2
    
    def train(self,X,y):
        self.X = X
        self.y = y
        self.alpha = np.random.random(X.shape[0])
        self.b = 0
        self.ones = np.ones(X.shape[0]) 

        y_mul_kernal = np.outer(y, y) * self.kernel(X, X) # yi yj K(xi, xj)

        for i in range(self.epoches):
            gradient = self.ones - y_mul_kernal.dot(self.alpha) # 1 – yk ∑ αj yj K(xj, xk)

            self.alpha += self.learning_rate * gradient # α = α + η*(1 – yk ∑ αj yj K(xj, xk)) to maximize
            self.alpha[self.alpha > self.C] = self.C # 0<α<C
            self.alpha[self.alpha < 0] = 0 # 0<α<C

            loss = np.sum(self.alpha) - 0.5 * np.sum(np.outer(self.alpha, self.alpha) * y_mul_kernal) # ∑αi – (1/2) ∑i ∑j αi αj yi yj K(xi, xj)
            
        alpha_index = np.where((self.alpha) > 0 & (self.alpha < self.C))[0]
        
        # for intercept b, we will only consider α which are 0<α<C 
        b_list = []        
        for index in alpha_index:
            b_list.append(y.iloc[index] - (self.alpha * y).dot(self.kernel(X, X.iloc[index])))

        self.b = np.mean(b_list) # avgC≤αi≤0{ yi – ∑αjyj K(xj, xi) }
            
    def predict(self, X):
        return np.sign(self.decision_function(X))
    
    def score(self, X, y):
        y_hat = self.predict(X)
        return np.mean(y == y_hat)
    
    def decision_function(self, X):
        return (self.alpha * self.y).dot(self.kernel(self.X, X)) + self.b

## Training SVM with RBF kernel

In [15]:
svm_dual = SVM_Dual(kernel='rbf')
svm_dual.train(X_train,y_train)



In [16]:
print("Accuracy with RBF kernel: ",svm_dual.score(X_test,y_test))

Accuracy with RBF kernel:  0.69750656167979


## Training SVM with Polynomial Kernel

In [17]:
svm_dual_poly = SVM_Dual(kernel='poly', degree=2)
svm_dual_poly.train(X_train,y_train)

In [18]:
print("Accuracy with Polynomial Kernel: ",svm_dual_poly.score(X_test,y_test))

Accuracy with Polynomial Kernel:  0.6128608923884514


## Results of standard SVM implementation

In [19]:
# Training
svm_lin = SVC(kernel='linear') 
svm_lin.fit(X_train, y_train) 

svm_poly = SVC(kernel='poly') 
svm_poly.fit(X_train, y_train) 

svm_rbf = SVC(kernel='rbf') 
svm_rbf.fit(X_train, y_train) 

SVC()

In [20]:
print("Accuracy of linear kernel:" ,accuracy_score(svm_lin.predict(X_test),y_test))
print("Accuracy of polynomial kernel:" ,accuracy_score(svm_poly.predict(X_test),y_test))
print("Accuracy of RBF kernel:" ,accuracy_score(svm_rbf.predict(X_test),y_test))

Accuracy of linear kernel: 0.8825459317585301
Accuracy of polynomial kernel: 0.9094488188976378
Accuracy of RBF kernel: 0.9230096237970253
