In [17]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from io import StringIO

In [2]:
# fetch the PhishingVSBenignURL dataset
file_path = '/Users/nisha/Desktop/spring2024/MachineLearning/DataSetForPhishingVSBenignUrl.csv'
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,Querylength,domain_token_count,path_token_count,avgdomaintokenlen,longdomaintokenlen,avgpathtokenlen,tld,charcompvowels,charcompace,ldl_url,...,SymbolCount_FileName,SymbolCount_Extension,SymbolCount_Afterpath,Entropy_URL,Entropy_Domain,Entropy_DirectoryName,Entropy_Filename,Entropy_Extension,Entropy_Afterpath,URL_Type_obf_Type
0,0,4,5,5.5,14,4.4,4,8,3,0,...,1,0,-1,0.726298,0.784493,0.894886,0.850608,,-1.0,Defacement
1,0,4,5,5.5,14,6.0,4,12,4,0,...,0,0,-1,0.688635,0.784493,0.814725,0.859793,0.0,-1.0,Defacement
2,0,4,5,5.5,14,5.8,4,12,5,0,...,0,0,-1,0.695049,0.784493,0.814725,0.80188,0.0,-1.0,Defacement
3,0,4,12,5.5,14,5.5,4,32,16,0,...,0,0,-1,0.64013,0.784493,0.814725,0.66321,0.0,-1.0,Defacement
4,0,4,6,5.5,14,7.333334,4,18,11,0,...,0,0,-1,0.681307,0.784493,0.814725,0.804526,0.0,-1.0,Defacement


In [3]:
# encode target column "URL_Type_obf_Type" using LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(df['URL_Type_obf_Type'])
encoded_label = label_encoder.transform(df['URL_Type_obf_Type'])

# replace old "URL_Type_obf_Type" with encoded_label
feature_name = ['URL_Type_obf_Type']
encoded_df = pd.DataFrame(encoded_label, columns=feature_name)
df_encoded = pd.concat([df.drop('URL_Type_obf_Type', axis=1), encoded_df], axis=1)

print(df_encoded.head())

   Querylength  domain_token_count  path_token_count  avgdomaintokenlen  \
0            0                   4                 5                5.5   
1            0                   4                 5                5.5   
2            0                   4                 5                5.5   
3            0                   4                12                5.5   
4            0                   4                 6                5.5   

   longdomaintokenlen  avgpathtokenlen  tld  charcompvowels  charcompace  \
0                  14         4.400000    4               8            3   
1                  14         6.000000    4              12            4   
2                  14         5.800000    4              12            5   
3                  14         5.500000    4              32           16   
4                  14         7.333334    4              18           11   

   ldl_url  ...  SymbolCount_FileName  SymbolCount_Extension  \
0        0  ...             

In [4]:
# create X and y
X = df_encoded.iloc[:, :-1]
y = df_encoded.iloc[:, -1]

In [5]:
# check for infinite values in X
has_infinite = np.isinf(X)
if np.any(has_infinite):
    print("There are infinite values in the dataset.")
else:
    print("There are no infinite values in the dataset.")

There are infinite values in the dataset.


In [6]:
# sklearn's DecisionTreeClassifier can't work w/ infinite values
    # find exact location of infinite values
rows_with_infinite = np.any(has_infinite, axis=1)
cols_with_infinite = np.any(has_infinite, axis=0)

print(f"Rows with infinite values: {np.where(rows_with_infinite)[0]}")
print(f"Columns with infinite values: {np.where(cols_with_infinite)[0]}")

Rows with infinite values: [22422 23183 26468 27804 28029 28435 29619 29766 29944 29995]
Columns with infinite values: [31]


In [7]:
# drop samples w/ infinite values
arr = np.where(rows_with_infinite)[0]
for i in arr:
    X.drop(i,inplace=True)
    y.drop(i,inplace=True)

In [8]:
# check for NaN values in X
has_NaN = X.isnull()
if np.any(has_NaN):
    print("There are NaN values in the dataset.")
else:
    print("There are no NaN values in the dataset.")

There are NaN values in the dataset.


In [9]:
# sklearn's AdaBoostClassifier can't work w/ NaN values
    # find exact location of NaN values
rows_with_NaN = np.any(has_NaN, axis=1)
cols_with_NaN = np.any(has_NaN, axis=0)

print(f"Rows with NaN values: {np.where(rows_with_NaN)[0]}")
print(f"Columns with NaN values: {np.where(cols_with_NaN)[0]}")

Rows with NaN values: [    0     1     2 ... 36672 36675 36677]
Columns with NaN values: [ 5 65 66 75 76 77 78]


In [10]:
# impute NaN values w/ mean
    # there's too many rows w/ NaN values to drop
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_imputed = imputer.fit_transform(X)

In [11]:
# split X and y into test and train data
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

In [12]:
depths = [1, 3, 6, 9, 12, 15, 18]
criterion = ['gini', 'entropy']
accuracy = []
accuracy.append(['Impurity Measure', 'Depth', 'Train Accuracy', 'Test Accuracy'])

for i in depths:
    for j in criterion:
        tree = DecisionTreeClassifier(criterion=j, max_depth=i)
        tree.fit(X_train, y_train)
        clf = AdaBoostClassifier(estimator=tree, algorithm='SAMME')
        clf.fit(X_train, y_train)
        y_pred_train = clf.predict(X_train)
        y_pred_test = clf.predict(X_test)
        train_accuracy = accuracy_score(y_train, y_pred_train)
        test_accuracy = accuracy_score(y_test, y_pred_test)
        curr_acc = [j, i, train_accuracy, test_accuracy]
        accuracy.append(curr_acc)

In [13]:
# compare results of DecisionTrees
accuracy_df = pd.DataFrame(accuracy)
print(accuracy_df)

                   0      1               2              3
0   Impurity Measure  Depth  Train Accuracy  Test Accuracy
1               gini      1        0.720748        0.72139
2            entropy      1        0.658242       0.656131
3               gini      3        0.857206       0.857221
4            entropy      3        0.850734       0.848365
5               gini      6        0.970263       0.949046
6            entropy      6         0.97074       0.949864
7               gini      9             1.0       0.978202
8            entropy      9             1.0       0.979155
9               gini     12             1.0       0.980654
10           entropy     12             1.0       0.980245
11              gini     15             1.0       0.980109
12           entropy     15             1.0       0.979428
13              gini     18             1.0       0.980381
14           entropy     18             1.0       0.978883


In [20]:
# last weeks results:
accuracy_str_lw = """
                   0      1               2              3
0   Impurity_Measure  Depth  Train_Accuracy  Test_Accuracy
1               gini      1        0.377218       0.375511
2            entropy      1        0.377218       0.375511
3               gini      2          0.4945       0.493326
4            entropy      2        0.495249       0.492373
5               gini      3        0.484727        0.48243
6            entropy      3        0.592712       0.586897
7               gini      4        0.675192       0.671343
8            entropy      4        0.669947       0.665622
9               gini      5        0.730598       0.730046
10           entropy      5        0.728214       0.728139
11              gini      6        0.754163       0.753609
12           entropy      6        0.737238       0.742713
"""
accuracy_io_lw = StringIO(accuracy_str_lw)
accuracy_df_lw = pd.read_csv(accuracy_io_lw, sep="\s+", engine='python')
print(accuracy_df_lw)

                   0      1               2              3
0   Impurity_Measure  Depth  Train_Accuracy  Test_Accuracy
1               gini      1        0.377218       0.375511
2            entropy      1        0.377218       0.375511
3               gini      2          0.4945       0.493326
4            entropy      2        0.495249       0.492373
5               gini      3        0.484727        0.48243
6            entropy      3        0.592712       0.586897
7               gini      4        0.675192       0.671343
8            entropy      4        0.669947       0.665622
9               gini      5        0.730598       0.730046
10           entropy      5        0.728214       0.728139
11              gini      6        0.754163       0.753609
12           entropy      6        0.737238       0.742713


Using AdaBoostClassifier resulted in far better classifiers than achieved last week, when just using DecisionTreeClassifiers. At deeper depths, the AdaBoostClassifier was actually able to achieve perfect train accuracy and near perfect test accuracy. For both AdaBoostClassifier and just DecisionTreeClassifier, the impurity measure has far less impact on accuracy than depth.