In [1]:
# ==================================================================
#  SINGLE-CELL: NEW MULTINOMIAL NAIVE BAYES MODEL (SEPARATE)
#  Creates:
#      vectorizer_mnb_new.pkl
#      model_mnb_new.pkl
#  Does NOT touch any old .pkl files.
# ==================================================================

import pandas as pd
import pickle
import re
import time

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ------------------------------------------------------------------
# 1. CONFIG ‚Äì change if your file/columns are different
# ------------------------------------------------------------------
DATA_PATH = "phishing_site_urls.csv"   # your CSV file
TEXT_COL  = "URL"                      # column with URL string
LABEL_COL = "Label"                    # column with 'good'/'bad' or 0/1

def clean_url(url: str) -> str:
    """URL normalization: remove http/https, www, trailing /."""
    if not isinstance(url, str):
        return ""
    u = url.strip()
    u = re.sub(r"^https?://(www\.)?", "", u, flags=re.IGNORECASE)
    return u.rstrip("/")

# ------------------------------------------------------------------
# 2. LOAD + CLEAN DATA
# ------------------------------------------------------------------
print("üì• Loading dataset from:", DATA_PATH)
df = pd.read_csv(DATA_PATH)

# keep only needed columns and drop missing
df = df[[TEXT_COL, LABEL_COL]].dropna()
df[TEXT_COL] = df[TEXT_COL].astype(str)
df["clean_url"] = df[TEXT_COL].apply(clean_url)

X = df["clean_url"]
y = df[LABEL_COL]

print("‚úÖ Dataset shape:", df.shape)
print("‚úÖ Label counts:\n", y.value_counts())

# train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(f"üîÄ Train size: {len(X_train)}")
print(f"üîÄ Test size : {len(X_test)}")

# ------------------------------------------------------------------
# 3. PIPELINE: TF-IDF (char) + MNB
# ------------------------------------------------------------------
mnb_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(analyzer="char")),
    ("clf", MultinomialNB())
])

# GridSearch ‚Äì also a bit heavy (good for overnight but not insane)
param_grid_mnb = {
    "tfidf__ngram_range": [(2, 4), (3, 5)],
    "tfidf__min_df": [1, 2],
    "clf__alpha": [0.1, 0.3, 0.5, 1.0],
}

print("\nüöÄ Starting GridSearchCV for MultinomialNB (this may take some time)...")
start = time.time()

grid_mnb = GridSearchCV(
    mnb_pipe,
    param_grid_mnb,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2
)

grid_mnb.fit(X_train, y_train)

end = time.time()
print(f"\n‚è≥ GridSearch finished in {(end - start)/60:.2f} minutes")
print("‚úÖ Best MNB parameters:\n", grid_mnb.best_params_)

best_mnb_pipeline = grid_mnb.best_estimator_

# ------------------------------------------------------------------
# 4. EVALUATION
# ------------------------------------------------------------------
print("\nüìä Evaluating best MNB model on test set...")
y_pred = best_mnb_pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print("\n==============================")
print(f"üéØ NEW MNB TEST ACCURACY: {acc:.4f}")
print("==============================\n")

print("üìÑ Classification Report:\n", classification_report(y_test, y_pred))
print("üìâ Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# ------------------------------------------------------------------
# 5. SAVE SEPARATE PKL FILES
# ------------------------------------------------------------------
best_vec = best_mnb_pipeline.named_steps["tfidf"]
best_clf = best_mnb_pipeline.named_steps["clf"]

with open("vectorizer_mnb_new.pkl", "wb") as f:
    pickle.dump(best_vec, f)

with open("model_mnb_new.pkl", "wb") as f:
    pickle.dump(best_clf, f)

print("\nüíæ Saved new MNB model as:")
print("   vectorizer_mnb_new.pkl")
print("   model_mnb_new.pkl")
print("\nüéâ DONE ‚Äì you can now compare LR (‚âà?, say), MNB (‚âà90%) and RF (‚âà98%) in your report.")


üì• Loading dataset from: phishing_site_urls.csv
‚úÖ Dataset shape: (549346, 3)
‚úÖ Label counts:
 Label
good    392924
bad     156422
Name: count, dtype: int64
üîÄ Train size: 439476
üîÄ Test size : 109870

üöÄ Starting GridSearchCV for MultinomialNB (this may take some time)...
Fitting 3 folds for each of 16 candidates, totalling 48 fits

‚è≥ GridSearch finished in 12.24 minutes
‚úÖ Best MNB parameters:
 {'clf__alpha': 0.1, 'tfidf__min_df': 1, 'tfidf__ngram_range': (3, 5)}

üìä Evaluating best MNB model on test set...

üéØ NEW MNB TEST ACCURACY: 0.9746

üìÑ Classification Report:
               precision    recall  f1-score   support

         bad       0.97      0.94      0.95     31285
        good       0.98      0.99      0.98     78585

    accuracy                           0.97    109870
   macro avg       0.97      0.96      0.97    109870
weighted avg       0.97      0.97      0.97    109870

üìâ Confusion Matrix:
 [[29340  1945]
 [  843 77742]]

üíæ Saved new MNB m