In [13]:
#Khai báo các thư viện để xử lý dữ liệu
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, max, min, sum, log, log10, log2
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, ChiSqSelector, PCA
from pyspark.ml.classification import LinearSVC, RandomForestClassifier
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import sqlite3
import pandas as pd
import ldata

In [14]:
#Load dữ liệu arff sang pandas Dataframe
file_path= '/root/NSLKDDProject/nslkdd/KDDTrain+.txt'
# Khởi tạo Spark session
spark = SparkSession.builder.appName("NSLKDDApp2").getOrCreate()
rdd = spark.sparkContext.textFile(file_path)
rdd_split = rdd.map(lambda line: line.split(","))
header = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
    'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate',
    'rerror_rate', 'srv_rerror_rate' ,
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'class', 'level'
]
spark_df = rdd_split.toDF(header)
#Load DataFrame
spark_df.show()
total_records = spark_df.count()
print(f"Tổng số lượng record: {total_records}")
#Đếm số lượng record của mỗi class

+--------+-------------+----------+----+---------+---------+----+--------------+------+---+-----------------+---------+---------------+----------+------------+--------+------------------+----------+----------------+-----------------+-------------+--------------+-----+---------+-----------+---------------+-----------+---------------+-------------+-------------+------------------+--------------+------------------+----------------------+----------------------+---------------------------+---------------------------+--------------------+------------------------+--------------------+------------------------+-----------+-----+
|duration|protocol_type|   service|flag|src_bytes|dst_bytes|land|wrong_fragment|urgent|hot|num_failed_logins|logged_in|num_compromised|root_shell|su_attempted|num_root|num_file_creations|num_shells|num_access_files|num_outbound_cmds|is_host_login|is_guest_login|count|srv_count|serror_rate|srv_serror_rate|rerror_rate|srv_rerror_rate|same_srv_rate|diff_srv_rate|srv_diff_h

                                                                                

In [5]:
# Mô tả dữ liệu (statstics cho mỗi cột)
# count: Kiểm tra giá trị không null
# mean: Giá trị trung bình
# stddev: Độ lệch chuẩn
# min: giá trị lớn nhất
# max: Giá trị nhỏ nhất
description = spark_df.describe()
description.show()
spark_df.printSchema()
column_types = spark_df.dtypes
# Print the data type    
type_counts = {}
for _, dtype in column_types:
    if dtype in type_counts:
        type_counts[dtype] += 1
    else:
        type_counts[dtype] = 1
print(type_counts)
# Đếm giá trị các trường dữ liệu
#for dtype, count in type_counts.items():
#    print(f"Data Type: {dtype}, Count: {count}")
#print("")  
#for column, dtype in column_types:
#    if (dtype == 'string'):
#        print(f"Column: {column}, Type: {dtype}")


                                                                                

+-------+------------------+-------------+-------+------+-----------------+------------------+--------------------+--------------------+--------------------+-------------------+--------------------+-------------------+------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+------------------+------------------+------------------+-------------------+-------------------+-------------------+------------------+-------------------+-------------------+------------------+------------------+----------------------+----------------------+---------------------------+---------------------------+--------------------+------------------------+--------------------+------------------------+-----------+------------------+
|summary|          duration|protocol_type|service|  flag|        src_bytes|         dst_bytes|                land|      wrong_fragment|  

In [None]:
#Xem xét một số trường dữ liệu 
#Ví dụ về protocol_type
protocol_counts = spark_df.groupBy("protocol_type").count()
protocol_counts.show()
service_counts = spark_df.groupBy("service").count()
service_counts.show()
flag_counts = spark_df.groupBy("flag").count()
flag_counts.show()
#service_percentage = service_counts.withColumn("Percentage", (col("service") / total_records) * 100)
#service_percentage.show()

#Kiểm tra giá trị null trên dữ liệu
null_counts = {}
for checkcol in spark_df.columns:
    null_count = spark_df.filter(spark_df[checkcol].isNull()).count()
    null_counts[checkcol] = null_count
print(null_counts)

In [None]:
protocol_type_df = spark_df.select("protocol_type")
indexer = StringIndexer(inputCol="protocol_type", outputCol="protocol_type_onehot")
protocol_type_df = indexer.fit(protocol_type_df).transform(protocol_type_df)
class_df = spark_df.select("class")
class_df = class_df.withColumn("classbinary", when(col("class")== "normal", "0").otherwise("1"))
class_df = class_df.drop("class").withColumnRenamed("classbinary", "class")
class_df = class_df.withColumn("class", col("class").cast(DoubleType()))
class_df.show()
column_types1 = class_df.dtypes
print(column_types1)

In [15]:
#drop columns with zero value and level tag
spark_df = spark_df.drop("num_outbound_cmds")
spark_df = spark_df.drop("level")
# THỰC HIỆN PREPROCESSING DATA BAO GỒM
#1. One hot encoding data string dựa trên tần suất xuất hiện của value trong data
translist = ["protocol_type", "service", "flag"]
translist_temp = ["protocol_type_trans", "service_trans", "flag_trans"]
for i in range(len(translist)):
    indexer = StringIndexer(inputCol=translist[i], outputCol=translist_temp[i])
    spark_df = indexer.fit(spark_df).transform(spark_df)
    spark_df = spark_df.drop(translist[i]).withColumnRenamed(translist_temp[i], translist[i])
#2 Chuyển đổi label data thành 0,1 
#class_name = {"normal": "0", "anomaly": "1"}
#spark_df = spark_df.replace(class_name, subset=["class"])
spark_df = spark_df.withColumn("classbinary", when(col("class")== "normal", "0").otherwise("1"))
spark_df = spark_df.drop("class").withColumnRenamed("classbinary", "class")
spark_df = spark_df.withColumn("class", col("class").cast(DoubleType()))
class_counts = spark_df.groupBy("class").count()
class_counts.show()
#3. Chuyển đổi các cột string cast về double
#castlistvalue = ["land", "logged_in", "is_host_login", "is_guest_login"]
column_types1 = spark_df.dtypes
inputColumns = []
#outputCols = []
for column, dtype in column_types1:
    if (column == "class"):
        continue
    else:
        inputColumns.append(column)
print(inputColumns)
for i in range (len(inputColumns)):
    spark_df = spark_df.withColumn(inputColumns[i], col(inputColumns[i]).cast(DoubleType()))

#4 Logarithmic scaling method dành cho các trường dữ liệu có độ chênh lệch lớn giữa max và min value
#loglist = ["duration", "src_bytes", "dst_bytes", "num_compromised","num_root"]
#loglist_temp = ["duration_trans", "src_bytes_trans", "dst_bytes_trans", "num_compromised_trans","num_root_trans"]
loglist = ["duration", "src_bytes", "dst_bytes"]
loglist_temp = ["duration_trans", "src_bytes_trans", "dst_bytes_trans"]
for i in range(len(loglist)):
    #spark_df = spark_df.withColumn(loglist_temp[i], log2(loglist[i]))
    spark_df = spark_df.withColumn(loglist_temp[i],when(col(loglist[i]) > 0, log(loglist[i])).otherwise(0))
    spark_df = spark_df.drop(loglist[i]).withColumnRenamed(loglist_temp[i], loglist[i])

#class_df = spark_df.select("class")
#spark_df = spark_df.drop("class")
#5 Normalize Data
print("Sau khi scalling")
spark_df.show()
columntypes2 = spark_df.dtypes
for column, dtype in columntypes2:
    sum_value = spark_df.agg(sum(col(column))).first()[0]
    if (column == "class"):
        continue
    elif (column in translist):
        continue
    else:
        min_value = spark_df.select(min(col(column))).first()[0]
        max_value = spark_df.select(max(col(column))).first()[0]
        spark_df = spark_df.withColumn(column,(col(column) - min_value) / (max_value - min_value))
spark_df.show()

                                                                                

+-----+-----+
|class|count|
+-----+-----+
|  0.0|67343|
|  1.0|58630|
+-----+-----+

['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'protocol_type', 'service', 'flag']
Sau khi scalling
+----+--------------+------+---+-----------------+---------+---------------+----------+------------+--------+------------------+----------+----------------+-------------+--------------+-----

                                                                                

+----+--------------+------+---+-----------------+---------+---------------+----------+------------+--------+------------------+----------+----------------+-------------+--------------+--------------------+--------------------+-----------+---------------+-----------+---------------+-------------+-------------+------------------+-------------------+--------------------+----------------------+----------------------+---------------------------+---------------------------+--------------------+------------------------+--------------------+------------------------+-------------+-------+----+-----+--------+-------------------+-------------------+
|land|wrong_fragment|urgent|hot|num_failed_logins|logged_in|num_compromised|root_shell|su_attempted|num_root|num_file_creations|num_shells|num_access_files|is_host_login|is_guest_login|               count|           srv_count|serror_rate|srv_serror_rate|rerror_rate|srv_rerror_rate|same_srv_rate|diff_srv_rate|srv_diff_host_rate|     dst_host_count|  

In [None]:
#Kiểm tra dữ liệu sau khi thực hiện tiền xử lý trước khi đưa vào học máy
#spark_df.show()
column_types = spark_df.dtypes
type_counts = {}
for _, dtype in column_types:
    if dtype in type_counts:
        type_counts[dtype] += 1
    else:
        type_counts[dtype] = 1
print("Ket qua")
print(type_counts)

null_counts = {}
for checkcol in spark_df.columns:
    null_count = spark_df.filter(spark_df[checkcol].isNull()).count()
    null_counts[checkcol] = null_count
print(null_counts)
description_after = spark_df.describe()
description_after.show()

In [16]:
#Training SVM model
#Gộp các cột thành 1 vector cột
column_types2 = spark_df.dtypes
inputColumns = []
#outputCols = []
for column, dtype in column_types2:
    if (column == "class"):
        continue
    else:
        inputColumns.append(column)
print(inputColumns)
#print(outputCols)

assembler = VectorAssembler(inputCols=inputColumns, outputCol= "features")
#df_assembled = assembler.transform(spark_df).select("class", "features")
df_assembled = assembler.transform(spark_df)
pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures")
pca_model = pca.fit(df_assembled)
result_df = pca_model.transform(df_assembled).select("class", "pcaFeatures")

#selector = ChiSqSelector(numTopFeatures=10, featuresCol="features", outputCol="selected_features", labelCol="class")
#selectorModel = selector.fit(df_assembled)
#df_selected = selectorModel.transform(df_assembled)
# Prepare DataFrame for model training
#df_final = df_selected.select("selected_features", "class")
#df_final.show()

train_data, test_data = result_df.randomSplit([0.8, 0.2], seed=1234)

svm = LinearSVC(maxIter=10, regParam=0.1, labelCol="class", featuresCol="pcaFeatures")
cvModel = svm.fit(train_data)
predictions = cvModel.transform(test_data)
#svm = LinearSVC(labelCol="class", featuresCol="selected_features")
#paramGrid = ParamGridBuilder() \
#   .addGrid(svm.regParam, [0.01, 0.1, 1.0]) \
#    .addGrid(svm.maxIter, [10, 50, 100]) \
#    .build()
#evaluator = BinaryClassificationEvaluator(labelCol="class", rawPredictionCol="rawPrediction")
#crossval = CrossValidator(estimator=svm,
#                          estimatorParamMaps=paramGrid,
#                          evaluator=evaluator,
#                          numFolds=5)
#cvModel = crossval.fit(train_data)
#cvModel = crossval.fit(df_assembled)
#predictions = cvModel.transform(test_data)

#predictions.select("class", "features", "prediction").show()
predictions.select("class", "pcaFeatures", "prediction").show()
evaluator = MulticlassClassificationEvaluator(labelCol="class", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Do chinh xac tren tap du lieu mau = {accuracy}")


['land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'protocol_type', 'service', 'flag', 'duration', 'src_bytes', 'dst_bytes']


                                                                                

+-----+--------------------+----------+
|class|         pcaFeatures|prediction|
+-----+--------------------+----------+
|  0.0|[-62.884751716133...|       0.0|
|  0.0|[-61.889847162798...|       0.0|
|  0.0|[-57.962509069847...|       1.0|
|  0.0|[-57.897852397967...|       0.0|
|  0.0|[-57.896147390043...|       0.0|
|  0.0|[-57.894841262267...|       0.0|
|  0.0|[-57.893258173343...|       0.0|
|  0.0|[-57.883017159515...|       0.0|
|  0.0|[-57.866889628091...|       0.0|
|  0.0|[-57.864866690619...|       0.0|
|  0.0|[-53.923648537702...|       0.0|
|  0.0|[-53.915730948309...|       0.0|
|  0.0|[-53.913845623127...|       0.0|
|  0.0|[-53.909229075963...|       0.0|
|  0.0|[-53.906412943750...|       0.0|
|  0.0|[-53.904903297825...|       0.0|
|  0.0|[-53.901846030751...|       0.0|
|  0.0|[-53.901762966557...|       0.0|
|  0.0|[-53.898787663242...|       0.0|
|  0.0|[-53.896946690694...|       0.0|
+-----+--------------------+----------+
only showing top 20 rows



[Stage 1280:>                                                       (0 + 2) / 2]

Do chinh xac tren tap du lieu mau = 0.8786338363780778


                                                                                

In [None]:
#Training SVM model
#Gộp các cột thành 1 vector cột
column_types2 = spark_df.dtypes
inputColumns = []
#outputCols = []
for column, dtype in column_types2:
    if (column == "class"):
        continue
    else:
        inputColumns.append(column)
print(inputColumns)
#print(outputCols)

assembler = VectorAssembler(inputCols=inputColumns, outputCol= "features")
#df_assembled = assembler.transform(spark_df).select("class", "features")
df_assembled = assembler.transform(spark_df)
selector = ChiSqSelector(numTopFeatures=10, featuresCol="features", outputCol="selected_features", labelCol="class")
selectorModel = selector.fit(df_assembled)
df_selected = selectorModel.transform(df_assembled)
# Prepare DataFrame for model training
df_final = df_selected.select("selected_features", "class")
df_final.show()

train_data, test_data = df_final.randomSplit([0.8, 0.2], seed=1234)

#svm = LinearSVC(maxIter=10, regParam=0.1, labelCol="class", featuresCol="selected_features")
#cvModel = svm.fit(train_data)
#predictions = cvModel.transform(test_data)
svm = LinearSVC(labelCol="class", featuresCol="selected_features")
paramGrid = ParamGridBuilder() \
    .addGrid(svm.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(svm.maxIter, [10, 50, 100]) \
    .build()
evaluator = BinaryClassificationEvaluator(labelCol="class", rawPredictionCol="rawPrediction")
crossval = CrossValidator(estimator=svm,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5)
cvModel = crossval.fit(train_data)
#cvModel = crossval.fit(df_assembled)
predictions = cvModel.transform(test_data)

#predictions.select("class", "features", "prediction").show()
predictions.select("class", "selected_features", "prediction").show()
evaluator = MulticlassClassificationEvaluator(labelCol="class", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Do chinh xac tren tap du lieu mau = {accuracy}")


In [19]:
#Sau khi training model xong, thuc hien tren data tu file Test
file_path= '/root/NSLKDDProject/nslkdd/KDDTest+.txt'
# Khởi tạo Spark session
rdd_test = spark.sparkContext.textFile(file_path)
rdd_split_test = rdd_test.map(lambda line: line.split(","))
header = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
    'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate',
    'rerror_rate', 'srv_rerror_rate' ,
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'class', 'level'
]
spark_df_test = rdd_split_test.toDF(header)
#Load DataFrame
spark_df_test.show()
total_records = spark_df_test.count()
print(f"Tổng số lượng record: {total_records}")
#Đếm số lượng record của mỗi class
class_counts = spark_df_test.groupBy("class").count()
class_counts.show()

+--------+-------------+--------+----+---------+---------+----+--------------+------+---+-----------------+---------+---------------+----------+------------+--------+------------------+----------+----------------+-----------------+-------------+--------------+-----+---------+-----------+---------------+-----------+---------------+-------------+-------------+------------------+--------------+------------------+----------------------+----------------------+---------------------------+---------------------------+--------------------+------------------------+--------------------+------------------------+------------+-----+
|duration|protocol_type| service|flag|src_bytes|dst_bytes|land|wrong_fragment|urgent|hot|num_failed_logins|logged_in|num_compromised|root_shell|su_attempted|num_root|num_file_creations|num_shells|num_access_files|num_outbound_cmds|is_host_login|is_guest_login|count|srv_count|serror_rate|srv_serror_rate|rerror_rate|srv_rerror_rate|same_srv_rate|diff_srv_rate|srv_diff_host

In [20]:
#drop columns with zero value and level tag
spark_df_test = spark_df_test.drop("num_outbound_cmds")
spark_df_test = spark_df_test.drop("level")
# THỰC HIỆN PREPROCESSING DATA BAO GỒM
#1. One hot encoding data string dựa trên tần suất xuất hiện của value trong data
translist = ["protocol_type", "service", "flag"]
translist_temp = ["protocol_type_trans", "service_trans", "flag_trans"]
for i in range(len(translist)):
    indexer = StringIndexer(inputCol=translist[i], outputCol=translist_temp[i])
    spark_df_test = indexer.fit(spark_df_test).transform(spark_df_test)
    spark_df_test = spark_df_test.drop(translist[i]).withColumnRenamed(translist_temp[i], translist[i])
#2 Chuyển đổi label data thành 0,1 
#class_name = {"normal": "0", "anomaly": "1"}
#spark_df = spark_df.replace(class_name, subset=["class"])
spark_df_test = spark_df_test.withColumn("classbinary", when(col("class")== "normal", "0").otherwise("1"))
spark_df_test = spark_df_test.drop("class").withColumnRenamed("classbinary", "class")
spark_df_test = spark_df_test.withColumn("class", col("class").cast(DoubleType()))
class_counts = spark_df_test.groupBy("class").count()
class_counts.show()
#3. Chuyển đổi các cột string cast về double
#castlistvalue = ["land", "logged_in", "is_host_login", "is_guest_login"]
column_types1 = spark_df_test.dtypes
inputColumns = []
#outputCols = []
for column, dtype in column_types1:
    if (column == "class"):
        continue
    elif (column in translist):
        continue
    else:
        inputColumns.append(column)
print(inputColumns)
for i in range (len(inputColumns)):
    spark_df_test = spark_df_test.withColumn(inputColumns[i], col(inputColumns[i]).cast(DoubleType()))

#4 Logarithmic scaling method dành cho các trường dữ liệu có độ chênh lệch lớn giữa max và min value
#loglist = ["duration", "src_bytes", "dst_bytes", "num_compromised","num_root"]
#loglist_temp = ["duration_trans", "src_bytes_trans", "dst_bytes_trans", "num_compromised_trans","num_root_trans"]
loglist = ["duration", "src_bytes", "dst_bytes"]
loglist_temp = ["duration_trans", "src_bytes_trans", "dst_bytes_trans"]
for i in range(len(loglist)):
    #spark_df = spark_df.withColumn(loglist_temp[i], log2(loglist[i]))
    spark_df_test = spark_df_test.withColumn(loglist_temp[i],when(col(loglist[i]) > 0, log(loglist[i])).otherwise(0))
    spark_df_test = spark_df_test.drop(loglist[i]).withColumnRenamed(loglist_temp[i], loglist[i])

#class_df = spark_df.select("class")
#spark_df = spark_df.drop("class")
#5 Normalize Data
print("Sau khi scalling")
spark_df_test.show()
columntypes2 = spark_df_test.dtypes
for column, dtype in columntypes2:
    sum_value = spark_df_test.agg(sum(col(column))).first()[0]
    if (sum_value == 0):
        print(column)
    elif (column == "class"):
        continue
    elif (column in translist):
        continue
    else:
        min_value = spark_df_test.select(min(col(column))).first()[0]
        max_value = spark_df_test.select(max(col(column))).first()[0]
        spark_df = spark_df_test.withColumn(column,(col(column) - min_value) / (max_value - min_value))
spark_df.show()

+-----+-----+
|class|count|
+-----+-----+
|  0.0| 9711|
|  1.0|12833|
+-----+-----+

['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']
Sau khi scalling
+----+--------------+------+---+-----------------+---------+---------------+----------+------------+--------+------------------+----------+----------------+-------------+--------------+-----+---------+-----------+-------------

In [21]:
des = spark_df_test.describe()
des.show()
column_types1 = spark_df_test.dtypes
inputColumns = []
#outputCols = []
for column, dtype in column_types1:
    if (column == "class"):
        continue
    else:
        inputColumns.append(column)
print(inputColumns)
#print(outputCols)

#df_test_assembled = assembler.transform(spark_df_test).select("class", "features")
df_test_assembled = assembler.transform(spark_df_test)
#df_test_selected = selectorModel.transform(df_test_assembled)

# Prepare DataFrame for making predictions
#df_test_final = df_test_selected.select("selected_features", "class")
df_assembled = assembler.transform(spark_df_test)
df_test_final = pca_model.transform(df_assembled).select("class", "pcaFeatures")

predictions = cvModel.transform(df_test_final)
#predictions = cvModel.transform(df_test_assembled)
#predictions.select("class", "features", "prediction").show()
evaluator = MulticlassClassificationEvaluator(labelCol="class", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Do chinh xac tren tap du lieu test = {accuracy}")
truep = predictions.filter((col("class") == 1) & (col("prediction") == 1)).count()
truen = predictions.filter((col("class") == 0) & (col("prediction") == 0)).count()
falsep = predictions.filter((col("class") == 0) & (col("prediction") == 1)).count()
falsen = predictions.filter((col("class") == 1) & (col("prediction") == 0)).count()
print(f"True Positives: {truep}")
print(f"True Negatives: {truen}")
print(f"False Positives: {falsep}")
print(f"False Negatives: {falsen}")

                                                                                

+-------+--------------------+--------------------+-------------------+-------------------+--------------------+------------------+-------------------+--------------------+--------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------+------------------+-------------------+-------------------+-------------------+------------------+-------------------+-------------------+-------------------+-----------------+------------------+----------------------+----------------------+---------------------------+---------------------------+--------------------+------------------------+--------------------+------------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+
|summary|                land|      wrong_fragment|             urgent|                hot|   num_failed_logins|         logged_in|    num_comp

In [None]:
des = spark_df_test.describe()
des.show()
column_types1 = spark_df_test.dtypes
inputColumns = []
#outputCols = []
for column, dtype in column_types1:
    if (column == "class"):
        continue
    else:
        inputColumns.append(column)
print(inputColumns)
#print(outputCols)

#df_test_assembled = assembler.transform(spark_df_test).select("class", "features")
df_test_assembled = assembler.transform(spark_df_test)
df_test_selected = selectorModel.transform(df_test_assembled)

# Prepare DataFrame for making predictions
df_test_final = df_test_selected.select("selected_features", "class")

predictions = cvModel.transform(df_test_final)
#predictions = cvModel.transform(df_test_assembled)
#predictions.select("class", "features", "prediction").show()
evaluator = MulticlassClassificationEvaluator(labelCol="class", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Do chinh xac tren tap du lieu test = {accuracy}")
truep = predictions.filter((col("class") == 1) & (col("prediction") == 1)).count()
truen = predictions.filter((col("class") == 0) & (col("prediction") == 0)).count()
falsep = predictions.filter((col("class") == 0) & (col("prediction") == 1)).count()
falsen = predictions.filter((col("class") == 1) & (col("prediction") == 0)).count()
print(f"True Positives: {truep}")
print(f"True Negatives: {truen}")
print(f"False Positives: {falsep}")
print(f"False Negatives: {falsen}")