# Dataset Phishing

## Import Dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

In [2]:
df = pd.read_csv('./Phising_dataset_predict.csv')

In [3]:
df

Unnamed: 0.1,Unnamed: 0,NumDots,UrlLength,AtSymbol,NumDash,NumPercent,NumQueryComponents,IpAddress,HttpsInHostname,PathLevel,PathLength,NumNumericChars,Phising
0,0,3,72,0,0,0,0,0,0,5,44,0,1.0
1,1,3,144,0,0,0,2,0,0,3,16,41,1.0
2,2,3,58,0,0,0,0,0,0,2,24,0,1.0
3,3,3,79,0,1,0,0,0,0,6,50,0,1.0
4,4,3,46,0,0,0,0,0,0,4,29,2,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
662586,662586,3,39,0,0,0,0,0,0,4,39,1,1.0
662587,662587,2,44,0,2,0,0,0,0,4,44,1,1.0
662588,662588,2,42,0,0,0,0,0,0,4,42,1,1.0
662589,662589,2,45,0,0,0,0,0,0,3,45,0,1.0


In [4]:
df.sample(10)

Unnamed: 0.1,Unnamed: 0,NumDots,UrlLength,AtSymbol,NumDash,NumPercent,NumQueryComponents,IpAddress,HttpsInHostname,PathLevel,PathLength,NumNumericChars,Phising
215752,215752,2,61,0,3,0,0,0,0,5,61,0,0.0
630587,630587,2,29,0,0,0,0,0,0,2,29,0,1.0
474209,474209,2,76,0,0,2,0,0,0,4,76,1,0.0
94415,94415,1,49,0,0,0,0,0,0,4,26,1,0.0
392453,392453,3,86,0,1,0,1,0,0,1,10,1,0.0
123337,123337,1,13,0,0,0,0,0,0,1,13,1,1.0
118456,118456,1,41,0,0,0,0,0,0,4,41,1,0.0
39191,39191,2,51,0,1,0,1,0,0,2,26,1,0.0
276193,276193,2,103,0,13,0,0,0,0,2,87,1,0.0
253428,253428,3,159,0,3,0,1,0,0,4,38,1,0.0


## Data Preprocessing

In [5]:
# Ubah semua nama kolom jadi snake_case
df.columns = (
    df.columns
    .str.strip()                # buang spasi kalau ada
    .str.replace(' ', '_')      # ganti spasi jadi underscore
    .str.replace('-', '_')      # ganti tanda '-' jadi underscore
    .str.replace(r'([a-z])([A-Z])', r'\1_\2', regex=True)  # camelCase -> snake_case
    .str.lower()                # ubah semua huruf jadi kecil
)

In [6]:
dfColumns = pd.DataFrame(df.columns, columns=['Column Name'])
dfColumns

Unnamed: 0,Column Name
0,unnamed:_0
1,num_dots
2,url_length
3,at_symbol
4,num_dash
5,num_percent
6,num_query_components
7,ip_address
8,https_in_hostname
9,path_level


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 662591 entries, 0 to 662590
Data columns (total 13 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   unnamed:_0            662591 non-null  int64  
 1   num_dots              662591 non-null  int64  
 2   url_length            662591 non-null  int64  
 3   at_symbol             662591 non-null  int64  
 4   num_dash              662591 non-null  int64  
 5   num_percent           662591 non-null  int64  
 6   num_query_components  662591 non-null  int64  
 7   ip_address            662591 non-null  int64  
 8   https_in_hostname     662591 non-null  int64  
 9   path_level            662591 non-null  int64  
 10  path_length           662591 non-null  int64  
 11  num_numeric_chars     662591 non-null  int64  
 12  phising               630071 non-null  float64
dtypes: float64(1), int64(12)
memory usage: 65.7 MB


In [8]:
df.drop(columns=['unnamed:_0'], inplace=True)

In [9]:
df.dropna(subset=['phising'], inplace=True)

In [10]:
df.isnull().sum()

num_dots                0
url_length              0
at_symbol               0
num_dash                0
num_percent             0
num_query_components    0
ip_address              0
https_in_hostname       0
path_level              0
path_length             0
num_numeric_chars       0
phising                 0
dtype: int64

In [11]:
X = df.drop(columns=["phising"], axis=True)   # semua fitur
y = df["phising"]                  # target

print(X.shape, y.shape)

(630071, 11) (630071,)


In [12]:
dfColumns = pd.DataFrame(df.columns, columns=['Column Name'])
dfColumns

Unnamed: 0,Column Name
0,num_dots
1,url_length
2,at_symbol
3,num_dash
4,num_percent
5,num_query_components
6,ip_address
7,https_in_hostname
8,path_level
9,path_length


## Training model

In [13]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [14]:
rf_model = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')

In [15]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [77]:
cv_scores = cross_val_score(rf_model, x_train, y_train, cv=kf, scoring='accuracy')

KeyboardInterrupt: 

cv_scores.mean()

In [16]:
rf_model.fit(x_train, y_train)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [17]:
y_pred = rf_model.predict(x_test)
y_prob = rf_model.predict_proba(x_test)[:, 1]

In [18]:
test_accuracy = accuracy_score(y_test, y_pred)
test_precision = precision_score(y_test, y_pred)
test_recall = recall_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)
test_roc_auc = roc_auc_score(y_test, y_prob)

In [19]:
print(test_accuracy, test_precision, test_recall, test_f1, test_roc_auc)

0.8304011427211047 0.48 0.821917808219178 0.6060606060606061 0.9093175745106481


## Save Model

In [20]:
import joblib
joblib.dump(rf_model, 'phishing_detector_model.joblib', compress=3)

['phishing_detector_model.joblib']