In [1]:
import os
import pickle

import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from preprocess import extract_features

In [None]:
def _combine_features(tfidf_matrix, frame, columns):
    """Stack TF-IDF features and handcrafted columns into one CSR matrix."""
    # Cast to float so a mix of bool/int columns cannot yield an object array.
    handcrafted = frame[columns].to_numpy(dtype=float)
    # Request CSR explicitly so the result is row-indexable regardless of the
    # format hstack would pick by default.
    return hstack([tfidf_matrix, handcrafted], format="csr")


def main():
    """Train the phishing-URL classifier, report held-out metrics, save artifacts.

    Pipeline:
      1. Read ../data/phishing_site_urls.csv and run extract_features on it.
      2. Map the "Label" column to 1 ("bad") / 0 ("good").
      3. Split the rows 80/20 (random_state=42) BEFORE fitting anything.
      4. Fit a character n-gram TF-IDF vectorizer on the training rows only,
         then transform the test rows with it.
      5. Append the handcrafted feature columns to the TF-IDF matrix.
      6. Fit LogisticRegression on the training split.
      7. Print a classification report computed on the held-out split.
      8. Pickle the model and vectorizer under ../model/.

    Raises:
        ValueError: if "Label" contains a value other than "bad" or "good".
    """
    handcrafted_columns = [
        "url_length",
        "num_special_chars",
        "num_digits",
        "has_ip",
        "num_subdomains",
        "has_suspicious_word",
    ]
    model_path = "../model/model.pkl"
    vectorizer_path = "../model/vectorizer.pkl"

    # 1. Load data
    df = pd.read_csv("../data/phishing_site_urls.csv")

    # 2. Preprocess and feature engineering
    df = extract_features(df)

    # 3. Labels
    y = df["Label"].map({"bad": 1, "good": 0})
    # .map() turns any unexpected label into NaN; fail here with a clear
    # message instead of deep inside the estimator.
    unmapped = y.isna()
    if unmapped.any():
        unexpected = sorted(df.loc[unmapped, "Label"].astype(str).unique())
        raise ValueError(
            f"Unexpected values in 'Label' column (expected 'bad'/'good'): {unexpected}"
        )
    y = y.astype(int)

    # 4. Train-test split (on rows, so every fitted step sees training data only)
    df_train, df_test, y_train, y_test = train_test_split(
        df, y, test_size=0.2, random_state=42
    )

    # 5. TF-IDF on cleaned URLs: fit on the training rows, reuse for test rows
    vectorizer = TfidfVectorizer(
        analyzer="char",
        ngram_range=(2, 5),
        max_features=50000
    )
    X_tfidf_train = vectorizer.fit_transform(df_train["clean_url"])
    X_tfidf_test = vectorizer.transform(df_test["clean_url"])

    # 6. Combine TF-IDF and handcrafted features
    X_train = _combine_features(X_tfidf_train, df_train, handcrafted_columns)
    X_test = _combine_features(X_tfidf_test, df_test, handcrafted_columns)

    # 7. Train model
    model = LogisticRegression(max_iter=3000, n_jobs=-1)
    model.fit(X_train, y_train)

    # 8. Predictions on the held-out split only
    y_pred = model.predict(X_test)

    # 9. Evaluation
    print("Classification Report:\n")
    print(classification_report(y_test, y_pred))

    # 10. Save model and vectorizer
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    with open(model_path, "wb") as f:
        pickle.dump(model, f)

    with open(vectorizer_path, "wb") as f:
        pickle.dump(vectorizer, f)

    print("Training completed. Model and vectorizer saved.")


# Script entry point: run the training pipeline only when this module is
# executed directly, not when it is imported.
if __name__ == "__main__":
    main()

In [None]:
# 11. Save model and vectorizer
# Handled inside main(): it pickles the fitted model and vectorizer to
# ../model/model.pkl and ../model/vectorizer.pkl. `model` and `vectorizer`
# are locals of main(), so there is nothing left to save at cell level.