<a href="https://colab.research.google.com/github/Scox97/DS7331_Group_Project/blob/main/DS7331_GP2_Mini_Lab_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DS7331 Project: Phishing Dataset

- Hayoung Cheon
- Steven Cox
- Erika Dupond
- Miguel


## Mini-Lab: Logistic Regression and SVMs

You are to perform predictive analysis (classification) upon a data set: model the dataset using
methods we have discussed in class (logistic regression and support vector machines) and make
conclusions from the analysis.

### General Import

In [None]:
# general
import math
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt



# SKLearn
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.decomposition import PCA
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
from collections import Counter
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the libraries used below; consider narrowing the filter.
warnings.filterwarnings("ignore")

# CSS selector / property rules for table display.
# NOTE(review): not referenced in the visible part of the notebook; presumably
# intended for a pandas Styler (set_table_styles) - confirm or remove.
TABLE_LAYOUT = [
    {'selector': 'table', 'props': [('width', '100%')]},
    {'selector': 'thead th', 'props': [('text-align', 'center')]},
    {'selector': 'td', 'props': [('text-align', 'left')]}
]

%matplotlib inline


In [None]:
# URL to the dataset
url = "https://archive.ics.uci.edu/static/public/967/phiusiil+phishing+url+dataset.zip"

# Read the CSV directly from the zip archive at the URL.
# NOTE(review): presumably relies on pandas inferring zip compression from the
# ".zip" suffix and on the archive holding a single CSV - confirm.
df = pd.read_csv(url, encoding="utf-8")
# The column is dropped by name, not by position.
df = df.drop("FILENAME", axis=1)  # Column "FILENAME" can be ignored.
print("\nShape of data:", df.shape)
df.head()


In [None]:
# Summary statistics for the loaded frame, used to spot features whose mean and
# standard deviation are close to zero.
df.describe()

### Encoding and Finding High Cardinality Features


In [None]:
# Helper functions
"""
# Optional Code: This is just to get the exercise going.

- Since URL and others high cardinality, can cause the model not to scale.
- In Addition, it is expensive computationally to produce such high cardinality features.
- There is an opportunity to do a Transformation to continuous attribute

Reference on:
Julie Moeyersoms, David Martens
Including high-cardinality attributes in predictive models: A case study in churn prediction in the energy sector, Decision Support Systems, Volume 72,2015
Pages 72-81,
ISSN 0167-9236
https://www.sciencedirect.com/science/article/pii/S0167923615000275#s0120

Hash Encoding:  https://towardsdatascience.com/4-ways-to-encode-categorical-features-with-high-cardinality-1bc6d8fd7b13/
"""


def shannon_entropy(text):
    """Return the Shannon entropy of ``text`` in bits per character.

    Falsy values and anything that is not a ``str`` yield ``0``.
    """
    if not (text and isinstance(text, str)):
        return 0
    length = len(text)
    # Relative frequency of every distinct character in the string.
    char_probs = (occurrences / length for occurrences in Counter(text).values())
    return -sum(prob * np.log2(prob) for prob in char_probs if prob > 0)


def pre_process_data(
    input_df, drop_cols=[], use_entropy=True, include_aggregates=False
):
    """Build an encoded feature matrix ``X`` and target ``y`` from a labelled frame.

    Steps, in order:
      1. Copy ``input_df``; take ``label`` as the target and remove it, together
         with any ``drop_cols`` that are present, from the features.
      2. Cast two-valued numeric/bool columns to ``int``.
      3. Split the remaining ``object`` columns at 200 distinct values into
         low- and high-cardinality groups.
      4. High-cardinality columns are dropped (``use_entropy=True``) or replaced
         by ``FeatureHasher`` buckets (``use_entropy=False``).
      5. Low-cardinality columns are dummy encoded with ``drop_first=True``.
      6. Optionally add two row-sum aggregate columns.

    Args:
        input_df: DataFrame containing a ``label`` column. It is not modified.
        drop_cols: Column names to exclude from the features; names that are
            not present are ignored. The default list is only read, never mutated.
        use_entropy: When True, add ``<col>_entropy`` for ``URL``, ``Domain`` and
            ``Title`` if they exist in ``input_df``. The entropy is computed from
            the input frame, so it is added even for columns listed in
            ``drop_cols``. When False, hash the high-cardinality columns instead.
        include_aggregates: When True, add ``TotalSpecialChars`` and
            ``TotalExternalResources`` if all of their source columns exist in
            ``input_df``.

    Returns:
        ``(X, y)``. Both keep the index of ``input_df`` so they stay row-aligned.
    """
    cardinality_threshold = 200

    df = input_df.copy()
    y = df["label"]

    X = df.drop(columns=[col for col in drop_cols if col in df.columns] + ["label"])

    # Only numeric/bool columns without missing values can be cast safely; a
    # two-valued string column would make astype(int) raise, so it is left for
    # the dummy encoding below.
    numeric_like = set(X.select_dtypes(include=["number", "bool"]).columns)
    two_valued = X.columns[X.nunique() == 2]
    for col in two_valued:
        if col in numeric_like and X[col].notna().all():
            X[col] = X[col].astype(int)

    categorical_cols = X.select_dtypes(include=["object"]).columns
    cardinality = {col: X[col].nunique() for col in categorical_cols}
    low_cardinality_cols = [
        col for col, n_unique in cardinality.items() if n_unique < cardinality_threshold
    ]
    high_cardinality_cols = [
        col for col, n_unique in cardinality.items() if n_unique >= cardinality_threshold
    ]

    if use_entropy:
        for col in ["URL", "Domain", "Title"]:
            if col in df.columns:
                X[f"{col}_entropy"] = df[col].astype(str).apply(shannon_entropy)
                print(f"Entropy for {col}: {X[f'{col}_entropy'].mean():.4f}")

        X = X.drop(columns=high_cardinality_cols, errors="ignore")
    else:
        hashed_dfs = []
        for col in high_cardinality_cols:
            unique_count = cardinality[col]
            # One bucket per ten distinct values, clamped to [10, 500].
            n_features = min(max(unique_count // 10, 10), 500)
            print(
                f"Hashing {col} with {n_features} features (from {unique_count} unique values)"
            )
            hasher = FeatureHasher(n_features=n_features, input_type="string")
            # Each sample is passed as a one-element row so the whole string is
            # hashed as a single token.
            hashed_features = hasher.fit_transform(
                X[col].astype(str).values.reshape(-1, 1)
            )
            # Reuse X's index so the hashed columns stay aligned with X, y and
            # the aggregate columns instead of being renumbered from 0.
            hashed_dfs.append(
                pd.DataFrame(
                    hashed_features.toarray(),
                    columns=[f"{col}_hash_{i}" for i in range(n_features)],
                    index=X.index,
                )
            )
        if hashed_dfs:
            X = pd.concat(
                [X.drop(columns=high_cardinality_cols), *hashed_dfs], axis=1
            )

    for col in low_cardinality_cols:
        dummies = pd.get_dummies(X[col], prefix=col, drop_first=True)
        X = pd.concat([X.drop(columns=[col]), dummies], axis=1)
        print(f"Dummy encoding {col} with {dummies.shape[1]} features")

    if include_aggregates:
        special_cols = [
            "NoOfEqualsInURL",
            "NoOfQMarkInURL",
            "NoOfAmpersandInURL",
            "NoOfOtherSpecialCharsInURL",
        ]
        resource_cols = ["NoOfImage", "NoOfCSS", "NoOfJS", "NoOfExternalRef"]
        # Aggregates are computed from the input copy, so they are available
        # even when the source columns were listed in drop_cols.
        if all(col in df.columns for col in special_cols):
            X["TotalSpecialChars"] = df[special_cols].sum(axis=1)
            print("Added TotalSpecialChars")
        if all(col in df.columns for col in resource_cols):
            X["TotalExternalResources"] = df[resource_cols].sum(axis=1)
            print("Added TotalExternalResources")

    return X, y


def check_leakage(df, columns=["URL", "Title"]):
    """Plot and tabulate how many distinct values of each column map to one label.

    For every requested column that exists in ``df``, rows are grouped by the
    column value. A value counts as a "leak" when all rows sharing it carry a
    single ``label``. Note that a value occurring only once is always counted
    as a leak under this definition.

    Args:
        df: DataFrame containing a ``label`` column.
        columns: Column names to inspect. Names missing from ``df`` are skipped
            with a message. The default list is only read, never mutated.

    Returns:
        None. Shows one bar chart per inspected column and displays a summary
        table with ``feature``, ``total``, ``leaks`` and ``leak_pct``.
    """
    present = [col for col in columns if col in df.columns]
    missing = [col for col in columns if col not in df.columns]
    if missing:
        print(f"Skipping columns not found in the data: {missing}")
    if not present:
        return

    # squeeze=False keeps a 2-D axes array even for a single column, so the
    # grid can follow the number of inspected columns.
    fig, axes = plt.subplots(
        1, len(present), figsize=(8 * len(present), 6), squeeze=False
    )

    leakages = []
    for ax, col in zip(axes[0], present):
        labels_per_value = df.groupby(col)["label"].nunique()
        leaks = (labels_per_value == 1).sum()
        total = len(labels_per_value)
        leak_pct = leaks / total if total else 0.0

        leakages.append(
            {
                "feature": col,
                "total": total,
                "leaks": leaks,
                "leak_pct": float(leak_pct * 100),
            }
        )

        categories = ["Leakage", "No Leakage"]
        values = [leaks, total - leaks]

        bars = ax.bar(categories, values)
        ax.set_title(f"Data Leakage in {col} Column")
        ax.set_ylabel("Count of Unique Values")

        # Print the count above each bar.
        for bar in bars:
            height = bar.get_height()
            ax.annotate(
                f"{height}",
                xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3),
                textcoords="offset points",
                ha="center",
                va="bottom",
            )

    plt.tight_layout()
    plt.show()

    leakages = pd.DataFrame(leakages)
    display(leakages)


def pca_analysis(df):
    """Plot the cumulative explained variance of a PCA fitted on ``df``.

    The features are standardized first and PCA keeps its default number of
    components, so the curve covers the whole component range.
    """
    standardized = StandardScaler().fit_transform(df)
    fitted_pca = PCA().fit(standardized)
    cumulative_variance = np.cumsum(fitted_pca.explained_variance_ratio_)

    plt.plot(cumulative_variance)
    plt.title("PCA: Explained Variance")
    plt.xlabel("Number of Components")
    plt.ylabel("Cumulative Explained Variance")
    plt.grid(True)
    plt.show()

# Validation Function
def validate_holdout(X_train, X_holdout, y_true=None, model=None):
    """Report train/hold-out overlap and hold-out classification metrics.

    Args:
        X_train: Training features, used only for the overlap check.
        X_holdout: Hold-out features to score.
        y_true: True hold-out labels. Defaults to the notebook-level
            ``y_holdout`` so existing two-argument calls keep working.
        model: Fitted estimator exposing ``predict``. Defaults to the
            notebook-level ``pipeline``.

    Returns:
        None. Prints the overlap count and displays a metrics table, a labelled
        confusion matrix and a heatmap.

    Notes:
        Precision, recall and F1 use scikit-learn's default positive class,
        label 1. Per the UCI documentation for this dataset, label 1 is the
        legitimate (non-phishing) class and label 0 is phishing.
    """
    if model is None:
        model = pipeline
    if y_true is None:
        y_true = y_holdout

    # index=False hashes the feature values only. With the default
    # (index=True) the row index is part of the hash, so rows from disjoint
    # splits could never match and the check would always report 0.
    train_hashes = pd.util.hash_pandas_object(
        pd.DataFrame(X_train), index=False
    ).values
    holdout_hashes = pd.util.hash_pandas_object(
        pd.DataFrame(X_holdout), index=False
    ).values
    # Distinct feature rows that occur in both sets.
    overlap = np.intersect1d(train_hashes, holdout_hashes)
    print(f"Number of overlapping samples: {len(overlap)}")

    y_pred = model.predict(X_holdout)
    # Pin the label order so the matrix is 2x2 even if a class is absent.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0.0
    f1 = f1_score(y_true, y_pred, zero_division=0)

    metrics_df = pd.DataFrame({
        "Metric": [
            "Accuracy",
            "Precision (PPV)",
            "Recall (Sensitivity)",
            "Specificity (TNR)",
            "Negative Predictive Value (NPV)",
            "F1 Score",
        ],
        "Value": [accuracy, precision, recall, specificity, npv, f1],
    })

    metrics_df["Value"] = (metrics_df["Value"] * 100).round(2).astype(str) + "%"
    display(metrics_df)

    conf_matrix_df = pd.DataFrame(
        cm,
        index=["Actual 0 (Phishing)", "Actual 1 (Non-phishing)"],
        columns=["Predicted 0 (Phishing)", "Predicted 1 (Non-phishing)"]
    )
    display(conf_matrix_df)

    labels = ['0', '1']

    # Reuse the matrix computed above instead of recomputing it.
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()

#### PCA Analysis and Scaling

In [None]:
# PCA Check on preprocessed data
"""
Pre-process and encoding features. We are exploring those categorical columns looking and addressing:

- High cardinality features and how with more advanced encoding (hashing) can improve the variance and reducing the feature space.
"""
# use_entropy=False selects the feature-hashing branch of pre_process_data, so
# X holds the original features plus the hashed buckets of every
# high-cardinality text column and the dummy-encoded low-cardinality ones.
X, y = pre_process_data(df, use_entropy=False)
# Standardizes X and plots the cumulative explained-variance curve.
pca_analysis(X)

##### PCA Summary - Feature Space

The feature space has 1619 features (original + hashed)

###### Explained Variance Distribution

- The cumulative explained variance curve increases linearly with the number of components.
- About 1200 components are needed to explain ~80% of the total variance.
- The first few components do not capture a dominant share of variance, suggesting:
    - Our data is high-dimensional, with variance spread out across many features.
    - Little redundancy is apparent; most features contribute independently to variance.

###### Implications

- Dimensionality reduction (PCA) won’t significantly condense information into a small number of components.
- The hashing of high-cardinality features (e.g., URL, Title) spreads variance across many features, as each hashed bucket captures partial variance.

In [None]:
# Logistic Regression with elastic-net regularization (l1_ratio=0.5) on the hashed and dummy-encoded data.
# Warning: this takes a long time to run, which is why the cell is left commented out.
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# X_train, X_test, y_train, y_test = train_test_split(
#     X_scaled, y, test_size=0.3, stratify=y, random_state=42
# )

# lasso = LogisticRegression(
#     C=0.05,
#     verbose=2,
#     l1_ratio=0.5,
#     max_iter=250,
#     solver="saga",
#     penalty="elasticnet",
# )

# lasso.fit(X_train, y_train)
# y_pred_proba = lasso.predict_proba(X_test)[:, 1]
# auc_lasso = roc_auc_score(y_test, y_pred_proba)
# print(f"Logistic Regression AUC: {auc_lasso:.3f}")

##### Leakages summary


- Initial Findings
  - Engineered hashed features from `URL` and `Title`, those categorical features initially displayed significant signals (domain knowledge).

- Discovery of Data Leakage
  - Analysis showed:
    - 100% of `URL`s correspond to a unique label – direct leakage.
    - 99.9% of `Title`s correspond to a unique label.
  - Hashing did not prevent leakage, as uniqueness was preserved in the hash.

- Dimensionality & Redundancy Check
  - PCA revealed:
    - ~80% variance explained by ~1200 components of ~1600 total features.

- Logistic Regression Modeling
  - Trained models with scikit-learn and PyTorch (LASSO).
  - Observed AUC=1.0 – clear overfitting due to leakage from raw `URL` and `Title`.

- Key Decision: Feature Dropping
  - Dropped `URL`, `Title`, and their hashed versions to eliminate data leakage.
  - Retained engineered features (e.g., `URLLength`, `ObfuscationRatio`, `LetterRatioInURL`).
- Recognized Logistic Regression’s limitations for high-dimensional, correlated data.

- Learning & Next Steps
  - Valuable lesson on how high-cardinality features can introduce leakage.
  - What to do next:
    - Focus on regularization, cross-validation, and robust validation strategies.


In [None]:
# Finding features that could be dropped
def plot_relation_dist(ax, df, feature, xlim=(-2, 2)):
    """Draw the per-class distribution of ``feature`` on ``ax``.

    Each class is drawn from its own data: a KDE curve when the values vary,
    otherwise a unit-width bar centred on the single value the class takes.
    Previously the label-1 series was always drawn as one bar over
    [-0.5, 0.5], whatever its values were.

    Class names follow the UCI documentation for this dataset: label 0 is
    phishing and label 1 is legitimate (non-phishing).

    Args:
        ax: Matplotlib axes to draw on.
        df: DataFrame with a ``label`` column and the ``feature`` column.
        feature: Name of the column to plot.
        xlim: ``(low, high)`` limits of the x axis.
    """
    class_names = {
        0: "Label 0 (Phishing)",
        1: "Label 1 (Non-phishing)",
    }

    for label_value, legend_label in class_names.items():
        values = df.loc[df["label"] == label_value, feature].dropna()
        if values.empty:
            continue
        if values.std() > 0:
            values.plot(kind="kde", ax=ax, label=legend_label, alpha=0.5)
        else:
            # A KDE is undefined for constant data; show a bar at that value.
            center = values.iloc[0]
            ax.hist(
                values,
                bins=1,
                range=(center - 0.5, center + 0.5),
                density=True,
                alpha=0.5,
                label=legend_label,
            )

    ax.set_xlim(*xlim)
    ax.set_xlabel(feature)
    ax.set_ylabel('Density')
    ax.set_title(f'Distribution of {feature}')
    ax.grid(alpha=0.3)
    ax.legend()

# Features under consideration for removal.
potential_drop = [
    "NoOfImage",
    "ObfuscationRatio",
    "NoOfObfuscatedChar",
    "NoOfQMarkInURL",
    "NoOfEqualsInURL",
    "NoOfAmpersandInURL",
]

n_features = len(potential_drop)
n_cols = 3
n_rows = -(-n_features // n_cols)  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows))
axes = axes.flatten()

# One panel per candidate feature.
for panel, feature in zip(axes, potential_drop):
    plot_relation_dist(panel, df, feature)

# Remove the grid cells that did not receive a feature.
for unused_panel in axes[n_features:]:
    fig.delaxes(unused_panel)

plt.tight_layout()
plt.show()

df[potential_drop].describe()

These features turned out not to be very useful. In the summary statistics, their means and standard deviations are close to 0, which raised suspicion. The visualizations above suggest that they could act as shortcuts for class separation, especially in logistic regression, leading to overfitting and poor generalization.

### Modeling Logistic Regression

#### Feature Selection

In [None]:
## Feature Selection
# Identifier-like text columns and binary flags that are excluded from the model.
base_drop = [
    "URL",
    "Domain",
    "Title",
    "TLD",
    "URLSimilarityIndex",
    "HasSocialNet",
    "HasCopyrightInfo",
    "HasTitle",
    "IsHTTPS",
    "IsResponsive",
    "IsDomainIP",
    "HasDescription",
    "HasObfuscation",
    "HasSubmitButton",
    "HasExternalSubmitForm",
]
# Same candidate list that was inspected visually earlier in the notebook.
potential_drop = [
    "NoOfImage",
    "ObfuscationRatio",
    "NoOfObfuscatedChar",
    "NoOfQMarkInURL",
    "NoOfEqualsInURL",
    "NoOfAmpersandInURL",
]
drop = [*base_drop, *potential_drop]

X_final, y_final = pre_process_data(
    df,
    drop_cols=drop,
    use_entropy=False,
    include_aggregates=False,
)

# "Continuous" here means more than two distinct values.
continuous_features = X_final.columns[X_final.nunique() > 2].tolist()
continuous_frame = X_final[continuous_features]

skewness = continuous_frame.skew().sort_values(ascending=False)
print("\nFeatures with high skewness:")
print(skewness)

low_variance = continuous_frame.std().sort_values()
print("\n=== Features with low variance:")
print(low_variance[low_variance < 0.1])

continuous_frame.hist(bins=50, figsize=(20, 15))
plt.suptitle("\n=== Feature Distributions")
plt.show()


In [None]:
## Feature Analysis
# Numeric features with more than two distinct values.
numerical_cols = [
    col for col in X_final.select_dtypes(include=['float64', 'int64'])
    if X_final[col].nunique() > 2
]

num_cols = 5
num_rows = int(np.ceil(len(numerical_cols) / num_cols))
fig, axes = plt.subplots(num_rows, num_cols, figsize=(5*num_cols, 4*num_rows))
axes = axes.flatten()

# Positional alignment of features and target for boolean masking.
y_aligned = y_final.reset_index(drop=True)
X_aligned = X_final.reset_index(drop=True)

# Per the UCI documentation for this dataset, label 1 is the legitimate
# (non-phishing) class and label 0 is phishing, so the legend names below are
# bound to those label values.
for i, feature in enumerate(numerical_cols):
    ax = axes[i]
    data_non_phishing = X_aligned[feature][y_aligned == 1].dropna()
    data_phishing = X_aligned[feature][y_aligned == 0].dropna()

    # A KDE needs at least two distinct values.
    if data_non_phishing.nunique() > 1:
        sns.kdeplot(data_non_phishing, fill=True, color='skyblue', label='Non-Phishing', alpha=0.5, ax=ax)
    if data_phishing.nunique() > 1:
        sns.kdeplot(data_phishing, fill=True, color='salmon', label='Phishing', alpha=0.5, ax=ax)

    ax.set_title(f"{feature} by Target")
    ax.set_xlabel(feature)
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid(alpha=0.3)

# Remove unused grid cells.
for j in range(len(numerical_cols), len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout(pad=2.0)
plt.show()


# Linear (Pearson) and rank (Spearman) correlation of every feature with the target.
df_temp = X_final.copy()
df_temp['label'] = y_final
corr_pearson = df_temp.corr(method='pearson')['label'].sort_values(ascending=False)
corr_spearman = df_temp.corr(method='spearman')['label'].sort_values(ascending=False)

corr_df = pd.DataFrame({'Pearson': corr_pearson, 'Spearman': corr_spearman}).drop(index='label')
print("\nCorrelations with Target:\n")
display(corr_df.sort_values(by='Pearson', key=abs, ascending=False))

In [None]:
## Feature Analysis

feature_correlations = X_final.copy()
feature_correlations["label"] = y_final
# Drop the target's correlation with itself (always 1.0) so it is not plotted
# as the strongest "feature".
corrs = (
    feature_correlations.corr()["label"]
    .drop("label")
    .sort_values(ascending=False)
)

binary_features = [col for col in X_final.columns if X_final[col].nunique() == 2]

# First panel holds the correlation chart; one further panel per binary feature.
num_cols = 6
total_plots = 1 + len(binary_features)
num_rows = math.ceil(total_plots / num_cols)

fig, axes = plt.subplots(num_rows, num_cols, figsize=(28, 5 * num_rows))
axes = axes.flatten()

top_n = 15
# Show the strongest positive and negative correlations; with few features the
# two slices would overlap and duplicate bars, so plot everything instead.
if len(corrs) > 2 * top_n:
    plot_corrs = pd.concat([corrs.iloc[:top_n], corrs.iloc[-top_n:]])
else:
    plot_corrs = corrs
colors = ["#1e88e5" if c > 0 else "#ff0d57" for c in plot_corrs]

ax0 = axes[0]
plot_corrs.plot(kind="barh", ax=ax0, color=colors)
ax0.set_title("Top Feature Correlations with Target", fontsize=12)
ax0.axvline(x=0, color="black", linestyle="-", alpha=0.3)
ax0.grid(axis="x", linestyle="--", alpha=0.5)
ax0.set_xlabel("Correlation Coefficient")

for ax, feature in zip(axes[1:], binary_features):
    # Row-normalized: share of each target class within every feature value.
    ct = pd.crosstab(X_final[feature], y_final, normalize="index") * 100

    ct.plot(kind="bar", stacked=True, ax=ax)

    for c in ax.containers:
        ax.bar_label(c, fmt="%.1f%%")

    ax.set_title(f"{feature} vs Target", fontsize=12)
    ax.set_ylabel("Percentage")
    ax.set_xlabel("Feature Value")
    # Per the UCI documentation for this dataset, label 0 is phishing and
    # label 1 is legitimate (non-phishing).
    ax.legend(title="Target", labels=["Phishing (0)", "Non-Phishing (1)"])

    # A zero cell means some feature value never occurs with one of the classes.
    ct_raw = pd.crosstab(X_final[feature], y_final)
    perfect_split = (ct_raw == 0).any().any()

    if perfect_split:
        ax.text(
            0.5,
            -0.15,
            "Potential data leakage detected!",
            horizontalalignment="center",
            color="red",
            transform=ax.transAxes,
            fontsize=11,
        )

# Remove unused grid cells.
for i in range(total_plots, len(axes)):
    fig.delaxes(axes[i])

plt.tight_layout(pad=2.0)
plt.show()

After looking at the features whose mean and standard deviation looked suspicious, we examined the remaining features for skewness and low variance. The highly skewed features are candidates for a log transform, which gives them a more constant variance and makes them more appropriate for logistic regression.

#### Logistic Regression Modeling

For the modeling phase, we created a pipeline that transforms the candidate features into log space and standardizes the remaining continuous features. After multiple runs, it seemed appropriate to evaluate the model on unseen data, so we created a hold-out set.


In [None]:
## Logistic Regression
# Features routed through log1p instead of standardization.
high_skew_features = [
    "NoOfCSS",
    "NoOfJS",
    "NoOfEmptyRef",
    "NoOfiFrame",
    "NoOfDegitsInURL",
    "NoOfPopup",
    "NoOfExternalRef",
    "NoOfSelfRef",
    "NoOfLettersInURL",
    "URLLength",
    "LineOfCode",
    "LargestLineLength",
    "NoOfOtherSpecialCharsInURL",
    "DegitRatioInURL",
    "DomainLength",
    "NoOfSubDomain",
    "TLDLength",
    "SpacialCharRatioInURL",
]
# Every other continuous feature is standardized.
standard_scale_features = [
    feature for feature in continuous_features if feature not in high_skew_features
]

# NOTE(review): the log-transformed columns are not standardized afterwards, so
# they reach the L1-penalized model on a different scale than the rest.
log_step = ("log", FunctionTransformer(np.log1p, validate=True), high_skew_features)
scale_step = ("scale", StandardScaler(), standard_scale_features)
preprocessor = ColumnTransformer(
    transformers=[log_step, scale_step],
    remainder="passthrough",
)

# L1-penalized logistic regression.
lasso = LogisticRegression(
    C=0.0001,
    penalty="l1",
    solver="saga",
    max_iter=5000,
    random_state=42,
    n_jobs=-1,
)

pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("lasso", lasso)])

print("\nRunning hold-out set evaluation...")
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X_final,
    y_final,
    test_size=0.2,
    stratify=y_final,
    random_state=42,
)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_holdout)
y_pred_proba = pipeline.predict_proba(X_holdout)[:, 1]

holdout_acc = accuracy_score(y_holdout, y_pred)
holdout_f1 = f1_score(y_holdout, y_pred)
holdout_auc = roc_auc_score(y_holdout, y_pred_proba)

print(f"\nHold-out AUC: {holdout_auc:.3f}")
print(f"Hold-out Accuracy: {holdout_acc:.3f}")
print(f"Hold-out F1 Score: {holdout_f1:.3f}")

# validate_holdout reads the notebook-level `pipeline` and `y_holdout` set above.
validate_holdout(X_train, X_holdout)


### SVM - Stochastic Gradient Descent

In [None]:
## SVM
# All continuous features are standardized for the SVM; no log transform here.
to_scale = [*standard_scale_features, *high_skew_features]

preprocessor = ColumnTransformer(
    transformers=[("scale", StandardScaler(), to_scale)],
    remainder="passthrough",
)

# Hinge loss trained with stochastic gradient descent, L1 penalty.
sdg = SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, random_state=42)
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("sdg", sdg)])

# Same split parameters as the logistic-regression cell.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X_final,
    y_final,
    test_size=0.2,
    stratify=y_final,
    random_state=42,
)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_holdout)
# The AUC is computed from the decision-function scores.
y_decision = pipeline.decision_function(X_holdout)

holdout_acc = accuracy_score(y_holdout, y_pred)
holdout_f1 = f1_score(y_holdout, y_pred)
holdout_auc = roc_auc_score(y_holdout, y_decision)

print(f"\nHold-out AUC: {holdout_auc:.3f}")
print(f"Hold-out Accuracy: {holdout_acc:.3f}")
print(f"Hold-out F1 Score: {holdout_f1:.3f}")

# validate_holdout reads the notebook-level `pipeline` and `y_holdout` set above.
validate_holdout(X_train, X_holdout)