<a href="https://colab.research.google.com/github/neal-logan/dsba6211-summer2024/blob/main/nophishing/01_exploratory_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 01 Exploratory Analysis


#### Load and Prepare Data

In [None]:
# Load and prepare training data
import pandas as pd

train_url = 'https://raw.githubusercontent.com/neal-logan/dsba6211-summer2024/main/nophishing/data/phishing-url-pirochet-train.csv'
df = pd.read_csv(train_url)

#Create numeric target variable column
df['y'] = df['status'].replace('legitimate', 0).replace('phishing', 1)

#Drop unnecessary columns
df = df.drop(columns=['status','url'])

#X/y split
X = df.drop(columns=['y'])
y = df['y']

Unnamed: 0.1,Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,0,https://www.todayshomeowner.com/how-to-make-ho...,82,23,0,2,7,0,0,0,...,1,1,0,240,8892,67860,0,1,4,legitimate
1,1,http://thapthan.ac.th/information/confirmation...,93,14,1,2,0,0,0,0,...,1,0,1,0,2996,4189860,0,1,2,phishing
2,2,http://app.dialoginsight.com/T/OFC4/L2S/3888/B...,121,21,1,3,0,0,0,0,...,1,1,0,30,2527,346022,0,1,3,phishing
3,3,https://www.bedslide.com,24,16,0,2,0,0,0,0,...,0,0,0,139,7531,1059151,0,0,4,legitimate
4,4,https://tabs.ultimate-guitar.com/s/sex_pistols...,73,24,0,3,1,0,0,0,...,0,0,0,3002,7590,635,0,1,5,legitimate


In [None]:
#Split training set into training and validation set (test set not yet loaded)

from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42)



#### Categorize Features by Source

In [None]:
#TODO - split features into categories based on origin: from URL, from site, or from third parties

X_url_train = pd.DataFrame()
X_site_train = pd.DataFrame()
X_3p_train = pd.DataFrame()


In [None]:
#Recombine X,y training data for exploration
Xy_train = X_train.copy()
Xy_train['y'] = y_train


#### Overview of the Data

Most of the columns are a mix of binary, small discrete numbers, or ratios in decimal format.  Some columns have significantly larger values.  Some columns contain negative values that are apparently invalid.

While the column names follow some degree of convention, there's no simple way to delineate how each of them should be handled.  I will need to analyze each feature individually and organize a preprocessing pipeline that takes into account what each feature needs.

Some of the fields are obtained from third-party providers.  Because these might not always be available, I will develop a set of models that use this data as well as a second set of models that do not.

In [None]:
Xy_train.shape

(6126, 89)

In [None]:
#Calculate correlation matrix
corr_matrix = Xy_train.corr().abs()

In [None]:
corr_matrix['y'].sort_values(ascending=False).head(25)

Unnamed: 0,y
y,1.0
google_index,0.738908
page_rank,0.503734
nb_www,0.434337
ratio_digits_url,0.357516
domain_in_title,0.338552
nb_hyperlinks,0.334818
phish_hints,0.326646
domain_age,0.325361
ip,0.316615


In [None]:
#Summary Stats
import numpy as np

includes = [np.number]

for i in range (0,90,14):
    print(df.iloc[:,i:i+14].describe(include=includes))

In [None]:
#Boxplots
import pandas as pd
import matplotlib.pyplot as plt

# Define the number of columns per plot
columns_per_plot = 14

# Split the numeric columns into chunks
chunks = [Xy_train[i:i + columns_per_plot] for i in range(0, len(Xy_train), columns_per_plot)]

# Create boxplots for each chunk
for i, chunk in enumerate(chunks):
    plt.figure()
    df[chunk].boxplot()
    plt.title(f'Boxplots for Columns {i * columns_per_plot + 1} to {(i + 1) * columns_per_plot}')
    plt.xticks(rotation=90)  # Rotate x-axis labels for better readability
    plt.show()


#### Group coorelated variables & look at parallel coordinate plots

In [None]:
# Group


#Create empty dataframes

dfs_eda = []
for i in range(0,5):
  dfs_eda.append(pd.DataFrame)

low_corr_threshold = 0.25

for col in corr_matrix.columns:

  #Get the next most-correlated column other than col itself
  most_correlated = corr_matrix[col].sort_values(ascending=False).index[1]
  correlation = corr_matrix.loc[col, most_correlated]

  for df in dfs_eda:
    if correlation > low_corr_threshold:
      if df.empty:
        df[col] = Xy_train[col]


In [1]:
%%capture
!pip install hiplot==0.1.33


In [None]:
import hiplot as hip

# add y to df1
df['y'] = y_train

# convert df1 to list of dicts because hiplot requires
df1_list = df1.to_dict('records')
hip.Experiment.from_iterable(df1_list).display()

#### Small Multiples -
heatmaps showing valid/bad ratio based on 2 variables at a time

#### Preliminary Modeling - Feature Importance

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
import pandas as pd

# Define model evaluation function

def print_model_evaluation(
    title: str,
    pipe : Pipeline,
    X : pd.DataFrame,
    y : pd.DataFrame):

    print("\n" + title)
    pred_y = pipe.predict(X)
    print(confusion_matrix(pred_y, y))
    print("\nROC-AUC: " + str(roc_auc_score(pred_y, y)))
    print("Precision: " + str(precision_score(pred_y, y)))
    print("Recall: " + str(recall_score(pred_y, y)))

####Logistic Regression

In [None]:
#Set up pipeline

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

pipe_lr = make_pipeline(
      StandardScaler(),
      LogisticRegression(random_state=42)
)

pipe_lr.fit(X_train, y_train)


In [None]:
print_model_evaluation("Logistic Regression\nPerformance on Training Set",
                       pipe_lr, X_train, y_train)

print_model_evaluation("Logistic Regression\nPerformance on Validation Set",
                       pipe_lr, X_validation, y_validation)

#### Random Forest

In [None]:
#Set up & run pipeline - random forest

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

pipe_rf = make_pipeline(
      StandardScaler(),
      RandomForestClassifier(random_state=42)
)

pipe_rf.fit(X_train, y_train)


In [None]:
print_model_evaluation("Random Forest\nPerformance on Training Set",
                       pipe_rf, X_train, y_train)

print_model_evaluation("Random Forest\nPerformance on Validation Set",
                       pipe_rf, X_validation, y_validation)


#### Gradient-boosted Trees

In [None]:
# Set up and run pipeline - gradient boosted trees

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import make_pipeline

pipe_gbt = make_pipeline(
      StandardScaler(),
      HistGradientBoostingClassifier(random_state=42)
)

pipe_gbt.fit(X_train, y_train)

In [None]:
print_model_evaluation("Gradient-boosted Trees\nPerformance on Training Set",
                       pipe_gbt, X_train, y_train)

print_model_evaluation("Gradient-boosted Trees\nPerformance on Validation Set",
                       pipe_gbt, X_validation, y_validation)

### Feature Engineering and Selection

### Preprocessing