# **Import**

In [None]:
# Import các thư viện cần thiết
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  # Thay thế bằng mô hình máy học phù hợp

In [None]:
# Colab-only helper: exposes Google Drive inside the runtime's filesystem.
from google.colab import drive

# Mount Drive at /content/drive so the dataset path used below resolves.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Load dataset**

In [None]:
# Load the dataset CSV from the mounted Google Drive folder into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/PROJECTS/28 09/data_final.csv')

In [None]:
# Preview the first five rows of the freshly loaded data (head() defaults to 5).
df.head()

Unnamed: 0,url,host,page_entropy,num_script_tags,script_to_body_ratio,html_length,page_tokens,num_sentences,num_punctuations,distinct_tokens,...,onmouseover,right_clic,empty_title,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,dns_record,page_rank,label
0,http://caixa.gov.br/empresa/credito-financiame...,200.201.165.253,5.048844,59,0.089609,175618,5542,982,10792,2140,...,0,0,0,0,0,0,0,0,5,normal
1,http://anandtech.com/show/9199/lg-announces-th...,18.66.97.83,4.655234,20,0.200274,80235,5602,740,7677,2442,...,0,0,0,0,1,0,155,0,6,normal
2,http://ap.org/Content/Press-Release/2013/AP-an...,23.42.164.175,4.889102,27,0.069433,43956,3151,359,5440,1555,...,0,0,0,1,1,0,1952,0,6,normal
3,http://ap.org/Content/AP-In-The-News/2014/AP-n...,23.42.164.175,4.890379,27,0.083729,36451,2109,292,5134,1039,...,0,0,0,1,1,0,1952,0,6,normal
4,http://mylust.com/videos/226488/stunning-young...,172.67.152.14,4.683243,32,0.126247,84596,4530,641,7958,1757,...,0,0,0,0,0,0,608,0,4,normal


# **Thống kê các giá trị missing (đã được đánh dấu bằng số 0) trong từng feature**

In [None]:
# Zeros are this dataset's marker for "missing", so turn every 0 into NaN.
df = df.replace(to_replace=0, value=np.nan)

In [None]:
# Number of NaN entries in each column.
missing_values = df.isna().sum()
missing_values

url                              0
host                             0
page_entropy                     0
num_script_tags                 60
script_to_body_ratio            66
                              ... 
whois_registered_domain       1903
domain_registration_length     325
dns_record                    2008
page_rank                      233
label                            0
Length: 139, dtype: int64

In [None]:
# Per-column NaN totals, narrowed to the columns that have at least one NaN.
missing_counts = df.isna().sum()
missing_columns = missing_counts.loc[missing_counts.gt(0)]

# Header and the per-column counts, each on its own line, then the grand total.
print("Các cột có giá trị bị khuyết:", missing_columns, sep="\n")
print("Tổng số lượng giá trị bị khuyết: ", missing_columns.sum())

Các cột có giá trị bị khuyết:
num_script_tags                 60
script_to_body_ratio            66
num_punctuations                20
num_hidden_tags               1846
num_iframes                   1225
                              ... 
domain_with_copyright         1096
whois_registered_domain       1903
domain_registration_length     325
dns_record                    2008
page_rank                      233
Length: 95, dtype: int64
Tổng số lượng giá trị bị khuyết:  130517


# **Thực hiện Scaling cho các numeric features**

In [None]:
# Every non-object column is treated as a numeric feature. This is the same
# selection as checking `dtype != 'object'` column by column, written with
# select_dtypes like the categorical selection later in the notebook.
numeric_features = df.select_dtypes(exclude='object').columns.tolist()

# Standardise each numeric column to zero mean / unit variance, in place in df.
scaler = StandardScaler()

# The scaler emits "invalid value encountered in divide" RuntimeWarnings here.
# NOTE(review): presumably caused by columns that became entirely NaN once the
# zeros were marked as missing (0 observed samples -> 0/0) - confirm.
# Only the floating-point warning is silenced; the fitted scaler and the
# transformed values are unchanged.
with np.errstate(divide='ignore', invalid='ignore'):
    df[numeric_features] = scaler.fit_transform(df[numeric_features])

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [None]:
# Preview the first five rows after scaling (head() defaults to 5).
df.head()

Unnamed: 0,url,host,page_entropy,num_script_tags,script_to_body_ratio,html_length,page_tokens,num_sentences,num_punctuations,distinct_tokens,...,onmouseover,right_clic,empty_title,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,dns_record,page_rank,label
0,http://caixa.gov.br/empresa/credito-financiame...,200.201.165.253,-0.091484,1.073689,-0.685022,-0.021556,-0.223,-0.419142,-0.297979,-0.16164,...,,,,,,,,,-0.291521,normal
1,http://anandtech.com/show/9199/lg-announces-th...,18.66.97.83,-0.803542,-0.327197,-0.229741,-0.380838,-0.214319,-0.477308,-0.340264,-0.03305,...,,,,,0.0,,-0.622697,,0.423075,normal
2,http://ap.org/Content/Press-Release/2013/AP-an...,23.42.164.175,-0.380464,-0.075756,-0.768028,-0.517491,-0.568904,-0.568884,-0.370631,-0.41073,...,,,,0.0,0.0,,1.549697,,0.423075,normal
3,http://ap.org/Content/AP-In-The-News/2014/AP-n...,23.42.164.175,-0.378154,-0.075756,-0.709214,-0.54576,-0.71965,-0.584988,-0.374785,-0.63044,...,,,,0.0,0.0,,1.549697,,0.423075,normal
4,http://mylust.com/videos/226488/stunning-young...,172.67.152.14,-0.752872,0.103845,-0.534292,-0.364411,-0.369405,-0.501103,-0.33645,-0.324719,...,,,,,,,-0.075065,,-1.006116,normal


# **Thực hiện Encoding cho các Categorical features**

In [None]:
# Names of the columns still stored with object dtype.
df.select_dtypes(include='object').columns

Index(['url', 'host', 'registration_date', 'expiration_date',
       'last_updates_dates', 'registrar', 'reg_country', 'host_country',
       'open_ports', 'isp', 'tld', 'scheme', 'label'],
      dtype='object')

In [None]:
# Columns stored as Python objects are the ones that get integer-encoded.
categorical_features = df.select_dtypes(include=['object']).columns

# Keep one fitted encoder per column. Refitting a single shared encoder would
# discard every mapping except the last one, making the integer codes
# impossible to translate back (label_encoders[col].inverse_transform(...)).
label_encoders = {}

# Defined up front so `encoder` exists even when there are no object columns.
# After the loop it refers to the encoder of the last encoded column, exactly
# as it did when one shared instance was refit for every column.
encoder = LabelEncoder()

for col in categorical_features:
    encoder = LabelEncoder()
    # Replace the column with integer codes (one code per distinct value).
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

In [None]:
# Preview the first five rows after encoding (head() defaults to 5).
df.head()

Unnamed: 0,url,host,page_entropy,num_script_tags,script_to_body_ratio,html_length,page_tokens,num_sentences,num_punctuations,distinct_tokens,...,onmouseover,right_clic,empty_title,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,dns_record,page_rank,label
0,157,325,-0.091484,1.073689,-0.685022,-0.021556,-0.223,-0.419142,-0.297979,-0.16164,...,,,,,,,,,-0.291521,0
1,46,228,-0.803542,-0.327197,-0.229741,-0.380838,-0.214319,-0.477308,-0.340264,-0.03305,...,,,,,0.0,,-0.622697,,0.423075,0
2,69,371,-0.380464,-0.075756,-0.768028,-0.517491,-0.568904,-0.568884,-0.370631,-0.41073,...,,,,0.0,0.0,,1.549697,,0.423075,0
3,53,371,-0.378154,-0.075756,-0.709214,-0.54576,-0.71965,-0.584988,-0.374785,-0.63044,...,,,,0.0,0.0,,1.549697,,0.423075,0
4,871,191,-0.752872,0.103845,-0.534292,-0.364411,-0.369405,-0.501103,-0.33645,-0.324719,...,,,,,,,-0.075065,,-1.006116,0


# **Thực hiện KNN hoặc Model-based Imputation để "lấp" các giá trị missing**

In [None]:
def ktra(df):
    """Print a missing-value report for a DataFrame.

    Prints a header line, then the NaN count of every column that has at
    least one NaN, then the total of those counts. Returns nothing.

    Args:
        df: the pandas DataFrame to inspect.
    """
    null_totals = df.isna().sum()
    # Keep only the columns where something is actually missing.
    columns_with_gaps = null_totals.loc[null_totals.gt(0)]

    print("Các cột có giá trị bị khuyết:", columns_with_gaps, sep="\n")
    print("Tổng số lượng giá trị bị khuyết: ", columns_with_gaps.sum())

In [None]:
# Missing-value report for df before any imputation is applied.
ktra(df)

Các cột có giá trị bị khuyết:
num_script_tags                 60
script_to_body_ratio            66
num_punctuations                20
num_hidden_tags               1846
num_iframes                   1225
                              ... 
domain_with_copyright         1096
whois_registered_domain       1903
domain_registration_length     325
dns_record                    2008
page_rank                      233
Length: 92, dtype: int64
Tổng số lượng giá trị bị khuyết:  127821


## *Phương pháp Linear Interpolation*

In [None]:
# Fill gaps with pandas' linear interpolation; the result goes to a new frame
# (df_linear) and df itself is left unchanged.
# NOTE(review): interpolation follows row order, so it presumably assumes
# neighbouring rows are related - confirm this is intended for this dataset.
df_linear = df.interpolate(method='linear')
# Disabled alternative: zero-fill df directly instead of interpolating.
#df.replace(np.nan, 0, inplace=True)

# Report what is still missing after interpolation.
ktra(df_linear)

Các cột có giá trị bị khuyết:
num_hidden_tags                 8
num_embeds                     62
num_objects                    62
num_double_documents            5
num_eval_functions             22
                             ... 
domain_in_title                 2
domain_with_copyright           1
whois_registered_domain         5
domain_registration_length      1
dns_record                    318
Length: 63, dtype: int64
Tổng số lượng giá trị bị khuyết:  49251


In [None]:
# Anything interpolation could not fill is set to 0.
df_linear = df_linear.replace(to_replace=np.nan, value=0)

# Re-run the report to confirm nothing is missing any more.
ktra(df_linear)

Các cột có giá trị bị khuyết:
Series([], dtype: int64)
Tổng số lượng giá trị bị khuyết:  0


# **Kiểm tra**

In [None]:
# Inspect the first ten rows of the imputed frame.
df_linear.head(n=10)

Unnamed: 0,url,host,page_entropy,num_script_tags,script_to_body_ratio,html_length,page_tokens,num_sentences,num_punctuations,distinct_tokens,...,onmouseover,right_clic,empty_title,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,dns_record,page_rank,label
0,157,325,-0.091484,1.073689,-0.685022,-0.021556,-0.223,-0.419142,-0.297979,-0.16164,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.291521,0
1,46,228,-0.803542,-0.327197,-0.229741,-0.380838,-0.214319,-0.477308,-0.340264,-0.03305,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.622697,0.0,0.423075,0
2,69,371,-0.380464,-0.075756,-0.768028,-0.517491,-0.568904,-0.568884,-0.370631,-0.41073,...,0.0,0.0,0.0,0.0,0.0,0.0,1.549697,0.0,0.423075,0
3,53,371,-0.378154,-0.075756,-0.709214,-0.54576,-0.71965,-0.584988,-0.374785,-0.63044,...,0.0,0.0,0.0,0.0,0.0,0.0,1.549697,0.0,0.423075,0
4,871,191,-0.752872,0.103845,-0.534292,-0.364411,-0.369405,-0.501103,-0.33645,-0.324719,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.075065,0.0,-1.006116,0
5,241,424,0.506733,-0.722319,-0.511369,-0.211117,-0.367814,0.059168,-0.153179,-0.172711,...,0.0,0.0,0.0,0.0,0.0,0.0,0.737316,0.0,-0.291521,0
6,61,371,-0.390662,-0.075756,-0.707743,-0.546344,-0.718493,-0.585949,-0.37568,-0.636827,...,0.0,0.0,0.0,0.0,0.0,0.0,1.549697,0.0,0.423075,0
7,49,371,-0.37055,-0.075756,-0.710044,-0.545429,-0.714442,-0.585709,-0.375341,-0.622776,...,0.0,0.0,0.0,0.0,0.0,0.0,1.549697,0.0,0.423075,0
8,783,17,0.470812,-0.327197,-0.625512,-0.432065,-0.500331,-0.347275,-0.317147,-0.400511,...,0.0,0.0,0.0,0.0,0.0,0.0,0.659946,0.0,1.13767,0
9,50,371,-0.389837,-0.075756,-0.737014,-0.533707,-0.633138,-0.574893,-0.373278,-0.516753,...,0.0,0.0,0.0,0.0,0.0,0.0,1.549697,0.0,0.423075,0
