In [19]:
import numpy as np
import pandas as pd
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [20]:
# Load the raw phishing table from its CSV file into a DataFrame.
# NOTE(review): '/content/...' looks like a Colab-specific absolute path —
# confirm the file location before running elsewhere.
dataset = pd.read_csv(filepath_or_buffer='/content/web-page-phishing.csv')

In [21]:
# Quick data-quality summary: total missing cells, the distinct values of the
# 'phishing' column, and the number of fully duplicated rows.
missing_total = dataset.isnull().sum().sum()
label_values = dataset['phishing'].unique()
duplicate_total = dataset.duplicated().sum()

print("Any Null or missing values :", missing_total)
print("Total number of Categories :", label_values)
print("Duplicates :", duplicate_total)

Any Null or missing values : 0
Total number of Categories : [0 1]
Duplicates : 78186


In [22]:
# Keep only the first occurrence of every fully identical row, then confirm
# that no duplicated rows remain.
dataset = dataset.drop_duplicates()
remaining_duplicates = dataset.duplicated().sum()
print("After removing duplicates :", remaining_duplicates)

After removing duplicates : 0


In [23]:
# Split FIRST, then balance only the training rows.
# Upsampling draws minority rows with replacement; doing that before the split
# lets identical copies of one row land in both the training and the test set,
# which leaks test data into training and inflates every test metric.
X = dataset.iloc[:, :-1]   # every column except the last one
y = dataset.iloc[:, -1]    # last column, used as the target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Re-attach the target to the training features so classes can be selected by
# the 'phishing' label; the test split is left untouched.
df_train = pd.concat([X_train, y_train], axis=1)

# NOTE(review): class 1 is treated as the majority and class 0 as the minority,
# as in the original cell — confirm with dataset['phishing'].value_counts().
df_majority = df_train[df_train['phishing'] == 1]
df_minority = df_train[df_train['phishing'] == 0]

# Upsample minority class (training rows only)
df_minority_upsampled = resample(df_minority,
                                 replace=True,     # Sample with replacement
                                 n_samples=len(df_majority),  # Match majority class
                                 random_state=42)  # For reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_minority_upsampled, df_majority])

# Display the training class distribution after upsampling
print(df_upsampled['phishing'].value_counts())

# The balanced training set replaces the raw training split.
X_train = df_upsampled.iloc[:, :-1]
y_train = df_upsampled.iloc[:, -1]

Standardize the features as part of preprocessing so the ML model fits better. The scaler is fitted on the training split only and then applied to the test split.

In [25]:
# Fit the scaler on the training features only, then apply that same fitted
# scaler to both the training and the test features.
sc = StandardScaler()
sc.fit(X_train.values)

X_train_scaled = sc.transform(X_train.values)
X_test_scaled = sc.transform(X_test.values)

# Random Forest Classifier

In [26]:
# Random forest with 100 trees and a fixed seed, trained on the standardized
# training features. The trailing fit() expression is kept as the last line so
# the cell still displays the fitted estimator.
rfc = RandomForestClassifier(random_state=42, n_estimators=100)
rfc.fit(X=X_train_scaled, y=y_train)

# Classification Report


In [27]:
import pickle

# Persist the trained forest to model.pkl. The context manager closes (and
# flushes) the file handle as soon as the dump finishes; the previous bare
# open() call never closed it.
# NOTE(review): rfc was fitted on StandardScaler output, so whoever loads
# model.pkl must apply the same fitted scaler (`sc`), which is not saved here.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(rfc, model_file)