In [4]:

# Import necessary libraries
from pyspark.sql import SparkSession  # For creating a Spark session
from pyspark.ml.feature import VectorAssembler, ChiSqSelector  # For classes for feature engineering
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, NaiveBayes, LinearSVC, GBTClassifier  # For classification algorithms
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator  # For classes for model evaluation
from pyspark.ml import Pipeline  # For Pipeline class for creating a sequence of stages
from pyspark.ml.feature import StringIndexer  # For StringIndexer for indexing categorical target variable

import matplotlib.pyplot as plt  # For visualization


In [5]:

# Creating a Spark session
spark_session = SparkSession.builder.appName("URLClassification").getOrCreate()

# Loading the data
data = spark_session.read.csv("dataset_phishing.csv", header=True, inferSchema=True)

In [1]:


# list of feature columns
feature_cols = ['length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens', 'nb_at', 'nb_qm', 
                'nb_and', 'nb_or', 'nb_eq', 'nb_underscore', 'nb_tilde', 'nb_percent', 'nb_slash', 
                'nb_star', 'nb_colon', 'nb_comma', 'nb_semicolumn', 'nb_dollar', 'nb_space', 
                'nb_www', 'nb_com', 'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url', 
                'ratio_digits_host', 'punycode', 'port', 'tld_in_path', 'tld_in_subdomain', 
                'abnormal_subdomain', 'nb_subdomains', 'prefix_suffix', 'random_domain', 
                'shortening_service', 'path_extension', 'nb_redirection', 'nb_external_redirection', 
                'length_words_raw', 'char_repeat', 'shortest_words_raw', 'shortest_word_host', 
                'shortest_word_path', 'longest_words_raw', 'longest_word_host', 'longest_word_path', 
                'avg_words_raw', 'avg_word_host', 'avg_word_path', 'phish_hints', 'domain_in_brand', 
                'brand_in_subdomain', 'brand_in_path', 'suspecious_tld', 'statistical_report', 
                'nb_hyperlinks', 'ratio_intHyperlinks', 'ratio_extHyperlinks', 'ratio_nullHyperlinks', 
                'nb_extCSS', 'ratio_intRedirection', 'ratio_extRedirection', 'ratio_intErrors', 
                'ratio_extErrors', 'login_form', 'external_favicon', 'links_in_tags', 'submit_email', 
                'ratio_intMedia', 'ratio_extMedia', 'sfh', 'iframe', 'popup_window', 'safe_anchor', 
                'onmouseover', 'right_clic', 'empty_title', 'domain_in_title', 'domain_with_copyright', 
                'whois_registered_domain', 'domain_registration_length', 'domain_age', 'web_traffic', 
                'dns_record', 'google_index', 'page_rank']

# Target indexer
target_indexer = StringIndexer(inputCol="status", outputCol="label")

# Creating a VectorAssembler for the feature columns
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# List of Classifiers
classifiers = {
    "DecisionTree": DecisionTreeClassifier(labelCol="label", featuresCol="features"),
    "RandomForest": RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100, maxDepth=10),
    "NaiveBayes": NaiveBayes(smoothing=1.0, modelType="gaussian"),
    "GBT": GBTClassifier(labelCol="label", featuresCol="features", maxIter=10),
    "LinearSVC": LinearSVC(labelCol="label", featuresCol="features")
}

# Empty list to store evaluation results
evaluation_results = []

# Everything loop for classifier
for classifier_name, classifier in classifiers.items():
    print(f"Processing {classifier_name} without ChiSqSelector...")

    # Creating a pipeline
    pipeline = Pipeline(stages=[target_indexer, assembler, classifier])

    # Spliting the data into training and testing sets
    train_data, test_data = data.randomSplit([0.7, 0.3], seed=123)

    # Training the model
    model = pipeline.fit(train_data)

    # Making predictions on the test data
    predictions = model.transform(test_data)

    # Evaluating the model using different metrics
    evaluator_roc = BinaryClassificationEvaluator(labelCol="label")
    auc = round(evaluator_roc.evaluate(predictions), 2)

    evaluator_pr = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderPR")
    auprc = round(evaluator_pr.evaluate(predictions), 2)

    evaluator_acc = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")
    accuracy = round(eval uator_acc.evaluate(predictions), 2)

    evaluator_f1 = MulticlassClassificationEvaluator(labelCol="label", metricName="f1")
    f1 = round(evaluator_f1.evaluate(predictions), 2)

    # Appending the evaluation results to the list
    evaluation_results.append((classifier_name, "Without ChiSqSelector", auc, auprc, accuracy, f1))

# Loop through each classifier again
for classifier_name, classifier in classifiers.items():
    print(f"Processing {classifier_name} with ChiSqSelector...")

    # Creating a pipeline with ChiSqSelector
    selector = ChiSqSelector(numTopFeatures=10, featuresCol="features", outputCol="selected_features")
    pipeline = Pipeline(stages=[target_indexer, assembler, selector, classifier])

    # Spliting the data into training and testing sets
    train_data, test_data = data.randomSplit([0.7, 0.3], seed=123)

    # Training the model
    model = pipeline.fit(train_data)

    # Save the RandomForest model
    # if classifier_name == "RandomForest":
    #     rf_classifier = classifier
    #     pipeline_rf = Pipeline(stages=[target_indexer, assembler, rf_classifier])
    #     model_rf = pipeline_rf.fit(train_data)
    #     model_rf.save("random_forest_model")

    # Making predictions on the test data
    predictions = model.transform(test_data)

    # Evaluating the model using different metrics
    evaluator_roc = BinaryClassificationEvaluator(labelCol="label")
    auc = round(evaluator_roc.evaluate(predictions), 2)

    evaluator_pr = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderPR")
    auprc = round(evaluator_pr.evaluate(predictions), 2)

    evaluator_acc = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")
    accuracy = round(evaluator_acc.evaluate(predictions), 2)

    evaluator_f1 = MulticlassClassificationEvaluator(labelCol="label", metricName="f1")
    f1 = round(evaluator_f1.evaluate(predictions), 2)

    # Appending the evaluation results to the list
    evaluation_results.append((classifier_name, "With ChiSqSelector", auc, auprc, accuracy, f1))

    # Creating a MulticlassClassificationEvaluator
    evaluator_multiclass = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

    # Calculating confusion matrix
    confusion_matrix = predictions.groupBy("label").pivot("prediction").count().na.fill(0).orderBy("label")
    print(f"Confusion matrix for {classifier_name}:\n{confusion_matrix}")

# Creating a DataFrame from the list of evaluation results
summary_schema = ["Classifier", "Feature Selection", "AUC", "AUPRC", "Accuracy", "F1-score"]
summary_df = spark_session.createDataFrame(evaluation_results, summary_schema)

# Showing the summary table
summary_df.show()




23/08/30 21:24:30 WARN Utils: Your hostname, MDUs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.18.2 instead (on interface en0)
23/08/30 21:24:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/30 21:24:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Processing DecisionTree without ChiSqSelector...


23/08/30 21:24:36 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Processing RandomForest without ChiSqSelector...




CodeCache: size=131072Kb used=42612Kb max_used=42863Kb free=88459Kb
 bounds [0x000000010421c000, 0x0000000106c4c000, 0x000000010c21c000]
 total_blobs=14970 nmethods=13911 adapters=970
 compilation: disabled (not enough contiguous free space left)


23/08/30 21:24:43 WARN DAGScheduler: Broadcasting large task binary with size 1048.8 KiB
23/08/30 21:24:43 WARN DAGScheduler: Broadcasting large task binary with size 1665.8 KiB
23/08/30 21:24:44 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
23/08/30 21:24:45 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB
23/08/30 21:24:46 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
23/08/30 21:24:47 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
23/08/30 21:24:48 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
23/08/30 21:24:48 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
23/08/30 21:24:49 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB


Processing NaiveBayes without ChiSqSelector...


23/08/30 21:24:51 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


Processing GBT without ChiSqSelector...
Processing LinearSVC without ChiSqSelector...
Processing DecisionTree with ChiSqSelector...
Confusion matrix for DecisionTree:
DataFrame[label: double, 0.0: bigint, 1.0: bigint]
Processing RandomForest with ChiSqSelector...


23/08/30 21:25:08 WARN DAGScheduler: Broadcasting large task binary with size 1048.8 KiB
23/08/30 21:25:09 WARN DAGScheduler: Broadcasting large task binary with size 1665.8 KiB
23/08/30 21:25:09 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
23/08/30 21:25:10 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB
23/08/30 21:25:11 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
23/08/30 21:25:12 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
23/08/30 21:25:13 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
23/08/30 21:25:13 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
23/08/30 21:25:14 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
23/08/30 21:25:14 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
23/08/30 21:25:14 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
23/08/30 21:25:14 WARN DAGScheduler: Broadcastin

Confusion matrix for RandomForest:
DataFrame[label: double, 0.0: bigint, 1.0: bigint]
Processing NaiveBayes with ChiSqSelector...
Confusion matrix for NaiveBayes:
DataFrame[label: double, 0.0: bigint, 1.0: bigint]
Processing GBT with ChiSqSelector...
Confusion matrix for GBT:
DataFrame[label: double, 0.0: bigint, 1.0: bigint]
Processing LinearSVC with ChiSqSelector...
Confusion matrix for LinearSVC:
DataFrame[label: double, 0.0: bigint, 1.0: bigint]
+------------+--------------------+----+-----+--------+--------+
|  Classifier|   Feature Selection| AUC|AUPRC|Accuracy|F1-score|
+------------+--------------------+----+-----+--------+--------+
|DecisionTree|Without ChiSqSele...| 0.9| 0.93|    0.92|    0.92|
|RandomForest|Without ChiSqSele...|0.99| 0.99|    0.96|    0.96|
|  NaiveBayes|Without ChiSqSele...|0.65| 0.68|    0.76|    0.76|
|         GBT|Without ChiSqSele...|0.98| 0.98|    0.93|    0.93|
|   LinearSVC|Without ChiSqSele...|0.98| 0.98|    0.94|    0.94|
|DecisionTree|  With ChiSq

 - RandomForest classifier is performing the best among the tested classifiers with or without feature selection.
 - With and without feature selection the result are same


In [3]:

# Print confusion matrices for all classifiers in a tabular format
print("\nConfusion Matrices:")
for classifier_name, classifier in classifiers.items():
    print(f"\nClassifier: {classifier_name}")
    confusion_matrix = predictions.groupBy("label").pivot("prediction").count().na.fill(0).orderBy("label")
    
    confusion_matrix.show()

# # Stop the Spark session
spark_session.stop()


Confusion Matrices:

Classifier: DecisionTree
+-----+----+----+
|label| 0.0| 1.0|
+-----+----+----+
|  0.0|1634| 110|
|  1.0|  95|1673|
+-----+----+----+


Classifier: RandomForest
+-----+----+----+
|label| 0.0| 1.0|
+-----+----+----+
|  0.0|1634| 110|
|  1.0|  95|1673|
+-----+----+----+


Classifier: NaiveBayes
+-----+----+----+
|label| 0.0| 1.0|
+-----+----+----+
|  0.0|1634| 110|
|  1.0|  95|1673|
+-----+----+----+


Classifier: GBT
+-----+----+----+
|label| 0.0| 1.0|
+-----+----+----+
|  0.0|1634| 110|
|  1.0|  95|1673|
+-----+----+----+


Classifier: LinearSVC
+-----+----+----+
|label| 0.0| 1.0|
+-----+----+----+
|  0.0|1634| 110|
|  1.0|  95|1673|
+-----+----+----+



- As all model have confusion matrix is also same