# Intrusion Detection based Anomaly method using Classification algorithms 
- Decision Tree, 

- Random Forest Tree, 

- Gradient Boost Tree, 

- Naive Bayes 

- Logistic Regression

### Importing Packages and configuring spark engine

In [None]:
!pip install pyspark



In [None]:
import pyspark.sql.functions as funcs
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.types import *
from pyspark.sql import SparkSession

In [None]:
spark = SparkSession.builder\
.master("local[4]")\
.appName("ReadFromCsv")\
.config("spark.driver.memory","3g")\
.config("spark.executor.memory", "4g")\
.getOrCreate()

In [None]:
'''logger = spark.sparkContext._jvm.org.apache.log4j
logger.LogManager.getLogger("org"). setLevel(logger.Level.ERROR)
logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)'''

'logger = spark.sparkContext._jvm.org.apache.log4j\nlogger.LogManager.getLogger("org"). setLevel(logger.Level.ERROR)\nlogger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)'

# 1. Load Dataset

In [None]:
iris = spark.read \
.format("csv")\
.option("header", True)\
.option("sep", ",")\
.option("inferSchema", "True")\
.load("UNSW_NB15.csv")

In [None]:
iris.printSchema()

root
 |-- id: integer (nullable = true)
 |-- dur: double (nullable = true)
 |-- proto: string (nullable = true)
 |-- service: string (nullable = true)
 |-- state: string (nullable = true)
 |-- spkts: integer (nullable = true)
 |-- dpkts: integer (nullable = true)
 |-- sbytes: integer (nullable = true)
 |-- dbytes: integer (nullable = true)
 |-- rate: double (nullable = true)
 |-- sttl: integer (nullable = true)
 |-- dttl: integer (nullable = true)
 |-- sload: double (nullable = true)
 |-- dload: double (nullable = true)
 |-- sloss: integer (nullable = true)
 |-- dloss: integer (nullable = true)
 |-- sinpkt: double (nullable = true)
 |-- dinpkt: double (nullable = true)
 |-- sjit: double (nullable = true)
 |-- djit: double (nullable = true)
 |-- swin: integer (nullable = true)
 |-- stcpb: long (nullable = true)
 |-- dtcpb: long (nullable = true)
 |-- dwin: integer (nullable = true)
 |-- tcprtt: double (nullable = true)
 |-- synack: double (nullable = true)
 |-- ackdat: double (nullable 

# 2. Data Preparation 

In [None]:
#Just for UNSW_NB15 dataset
iris=iris.drop('proto','service','state','label')

In [None]:
iris.printSchema()

root
 |-- id: integer (nullable = true)
 |-- dur: double (nullable = true)
 |-- spkts: integer (nullable = true)
 |-- dpkts: integer (nullable = true)
 |-- sbytes: integer (nullable = true)
 |-- dbytes: integer (nullable = true)
 |-- rate: double (nullable = true)
 |-- sttl: integer (nullable = true)
 |-- dttl: integer (nullable = true)
 |-- sload: double (nullable = true)
 |-- dload: double (nullable = true)
 |-- sloss: integer (nullable = true)
 |-- dloss: integer (nullable = true)
 |-- sinpkt: double (nullable = true)
 |-- dinpkt: double (nullable = true)
 |-- sjit: double (nullable = true)
 |-- djit: double (nullable = true)
 |-- swin: integer (nullable = true)
 |-- stcpb: long (nullable = true)
 |-- dtcpb: long (nullable = true)
 |-- dwin: integer (nullable = true)
 |-- tcprtt: double (nullable = true)
 |-- synack: double (nullable = true)
 |-- ackdat: double (nullable = true)
 |-- smean: integer (nullable = true)
 |-- dmean: integer (nullable = true)
 |-- trans_depth: integer (nu

In [None]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml import Pipeline

In [None]:
feature_cols = iris.columns[:-1]
#feature_cols = iris.columns[:-1]


Pipelining and Vector assembler

In [None]:
# attack_cat for UNSW_NB15 and status for NSL KDD
label_indexer = StringIndexer(inputCol = "attack_cat", outputCol = "label")
assembler = VectorAssembler(inputCols = feature_cols, outputCol = 'features')
pipe = Pipeline(stages=[assembler, label_indexer])
pipe_model = pipe.fit(iris)

In [None]:
data1 = pipe_model.transform(iris)
data = data1.select("features","label")

In [None]:
train, test = data.randomSplit([0.70, 0.30])

In [None]:
train.head(3)

[Row(features=SparseVector(40, {0: 971.0, 1: 0.0632, 2: 4.0, 3: 4.0, 4: 2304.0, 5: 2304.0, 6: 110.8226, 7: 62.0, 8: 252.0, 9: 218858.8281, 10: 218858.8281, 13: 15.266, 14: 13.1533, 15: 21.5759, 16: 18.5903, 24: 576.0, 25: 576.0, 28: 2.0, 29: 3.0, 30: 2.0, 31: 2.0, 32: 2.0, 33: 2.0, 37: 2.0, 38: 2.0}), label=2.0),
 Row(features=SparseVector(40, {0: 3470.0, 1: 0.0632, 2: 4.0, 3: 4.0, 4: 2304.0, 5: 2304.0, 6: 110.8226, 7: 62.0, 8: 252.0, 9: 218858.8281, 10: 218858.8281, 13: 15.266, 14: 13.1533, 15: 21.5759, 16: 18.5903, 24: 576.0, 25: 576.0, 28: 2.0, 29: 3.0, 30: 2.0, 31: 2.0, 32: 2.0, 33: 2.0, 37: 2.0, 38: 2.0}), label=2.0),
 Row(features=SparseVector(40, {0: 11185.0, 1: 0.0591, 2: 4.0, 3: 4.0, 4: 2304.0, 5: 2304.0, 6: 118.5336, 7: 62.0, 8: 252.0, 9: 234086.8594, 10: 234086.8594, 13: 14.6873, 14: 11.1043, 15: 20.7611, 16: 15.6947, 24: 576.0, 25: 576.0, 28: 1.0, 29: 3.0, 30: 2.0, 31: 2.0, 32: 2.0, 33: 2.0, 37: 2.0, 38: 1.0}), label=4.0)]

# 3. Train Model

# Neural Network

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [None]:
#clf = MultilayerPerceptronClassifier(random_state=1, max_iter=300).fit(train)
#clf.predict_proba(test)
#clf.score(test)


### 3.1 Decision Tree Algorithm

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

#### 3.1.1 Training and Predicting of Model

In [None]:
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
modeldt = dt.fit(train)
predictiondt = modeldt.transform(test)
predictiondt.toPandas().head()

Unnamed: 0,features,label,rawPrediction,probability,prediction
0,"(5990.0, 0.127695, 4.0, 4.0, 1954.0, 2170.0, 5...",4.0,"[17.0, 27.0, 904.0, 201.0, 1023.0, 120.0, 81.0...","[0.007013201320132013, 0.011138613861386138, 0...",4.0
1,"(8454.0, 1.375599, 4.0, 4.0, 1676.0, 2776.0, 5...",2.0,"[17.0, 27.0, 904.0, 201.0, 1023.0, 120.0, 81.0...","[0.007013201320132013, 0.011138613861386138, 0...",4.0
2,"(8786.0, 17.625374, 250.0, 24.0, 248562.0, 165...",2.0,"[17.0, 27.0, 904.0, 201.0, 1023.0, 120.0, 81.0...","[0.007013201320132013, 0.011138613861386138, 0...",4.0
3,"(8788.0, 17.625374, 250.0, 24.0, 248562.0, 165...",2.0,"[17.0, 27.0, 904.0, 201.0, 1023.0, 120.0, 81.0...","[0.007013201320132013, 0.011138613861386138, 0...",4.0
4,"(8998.0, 17.625374, 250.0, 24.0, 248562.0, 165...",5.0,"[17.0, 27.0, 904.0, 201.0, 1023.0, 120.0, 81.0...","[0.007013201320132013, 0.011138613861386138, 0...",4.0


#### 3.1.2 Confusion Matrix of Decision Tree

In [None]:
predictiondt.select("prediction", "label")\
.groupBy("prediction", "label").count()\
.orderBy("prediction", "label", ascending=True).withColumn("attack_cat",
funcs.when(funcs.col("label").isin(1), "Anomaly")\
.otherwise("Normal")).toPandas().head()

Unnamed: 0,prediction,label,count,attack_cat
0,0.0,0.0,10731,Normal
1,0.0,1.0,4,Anomaly
2,0.0,2.0,6,Normal
3,1.0,1.0,5165,Anomaly
4,1.0,2.0,1,Normal


In [None]:
predictiondt.groupBy(["label","prediction"]).count().toPandas().head()

Unnamed: 0,label,prediction,count
0,8.0,3.0,71
1,2.0,0.0,6
2,7.0,3.0,61
3,3.0,5.0,10
4,9.0,5.0,1


#### 3.1.3 Calculation of Accuracy

In [None]:
evaluatordt = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
dt = evaluatordt.evaluate(predictiondt)

print("--- Decision Tree --- ")
print("Accuracy Rate =", round(dt,4))
print("  Error  Rate = %g " % round((1.0 - dt),4))

--- Decision Tree --- 
Accuracy Rate = 0.8291
  Error  Rate = 0.1709 


In [None]:
predictionAndLabel = predictiondt.select("prediction", "label").rdd

from pyspark.mllib.evaluation import MulticlassMetrics
metrics = MulticlassMetrics(predictionAndLabel)
cm = metrics.confusionMatrix()
rows = cm.toArray().tolist()

confusion_matrix = spark.createDataFrame(rows,["normal","anomaly"])
confusion_matrix.show()

+-------+-------+------+------+-----+-----+---+---+---+---+
| normal|anomaly|    _3|    _4|   _5|   _6| _7| _8| _9|_10|
+-------+-------+------+------+-----+-----+---+---+---+---+
|10731.0|    0.0|  47.0| 233.0| 10.0|  2.0|0.0|0.0|0.0|0.0|
|    4.0| 5165.0|  81.0| 352.0|  9.0|  0.0|0.0|0.0|0.0|0.0|
|    6.0|    1.0|2107.0| 533.0|615.0|107.0|0.0|0.0|0.0|0.0|
|    0.0|    0.0| 273.0|1420.0|108.0| 10.0|0.0|0.0|0.0|0.0|
|    0.0|    1.0| 340.0| 170.0|655.0| 55.0|0.0|0.0|0.0|0.0|
|    0.0|    0.0|  23.0| 593.0| 95.0|375.0|0.0|0.0|0.0|0.0|
|    0.0|    0.0| 129.0|  47.0| 43.0|  0.0|0.0|0.0|0.0|0.0|
|    0.0|    0.0| 118.0|  61.0| 17.0|  3.0|0.0|0.0|0.0|0.0|
|    0.0|    0.0|   4.0|  71.0| 13.0| 29.0|0.0|0.0|0.0|0.0|
|    0.0|    0.0|   3.0|   9.0|  0.0|  1.0|0.0|0.0|0.0|0.0|
+-------+-------+------+------+-----+-----+---+---+---+---+



In [None]:
predictiondt.withColumn("A", funcs.struct("prediction","label")).crosstab("prediction","label").show()

+----------------+-----+----+----+----+---+---+---+---+---+---+
|prediction_label|  0.0| 1.0| 2.0| 3.0|4.0|5.0|6.0|7.0|8.0|9.0|
+----------------+-----+----+----+----+---+---+---+---+---+---+
|             0.0|10731|   4|   6|   0|  0|  0|  0|  0|  0|  0|
|             5.0|    2|   0| 107|  10| 55|375|  0|  3| 29|  1|
|             1.0|    0|5165|   1|   0|  1|  0|  0|  0|  0|  0|
|             2.0|   47|  81|2107| 273|340| 23|129|118|  4|  3|
|             3.0|  233| 352| 533|1420|170|593| 47| 61| 71|  9|
|             4.0|   10|   9| 615| 108|655| 95| 43| 17| 13|  0|
+----------------+-----+----+----+----+---+---+---+---+---+---+



### 3.2 Random Forest Algorithm

In [None]:
from pyspark.ml.classification import RandomForestClassifier

#### 3.2.1 Training and Predicting of Model

In [None]:
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
modelrf = rf.fit(train)
predictionrf = modelrf.transform(test)
predictionrf.toPandas().head(3)

Unnamed: 0,features,label,rawPrediction,probability,prediction
0,"(5990.0, 0.127695, 4.0, 4.0, 1954.0, 2170.0, 5...",4.0,"[2.126080365437323, 0.3498441422312224, 4.4660...","[0.21260803654373225, 0.034984414223122234, 0....",2.0
1,"(8454.0, 1.375599, 4.0, 4.0, 1676.0, 2776.0, 5...",2.0,"[2.220944670743943, 0.3889904825653168, 4.7273...","[0.2220944670743943, 0.03889904825653168, 0.47...",2.0
2,"(8786.0, 17.625374, 250.0, 24.0, 248562.0, 165...",2.0,"[1.5329423830961466, 0.3972663621003929, 4.980...","[0.15329423830961467, 0.03972663621003929, 0.4...",2.0


#### 3.2.2 Calculation of Accuracy

In [None]:
evaluatorrf = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
rf = evaluatorrf.evaluate(predictionrf)

print("--- Random Forest Tree --- ")
print("Accuracy Rate =", round(rf,4))
print("  Error  Rate = %g " % round((1.0 - rf),4))

--- Random Forest Tree --- 
Accuracy Rate = 0.8052
  Error  Rate = 0.1948 


#### 3.2.3 Confusion Matrix of Decision Tree

In [None]:
predictionAndLabel = predictionrf.select("prediction", "label").rdd

from pyspark.mllib.evaluation import MulticlassMetrics
metrics = MulticlassMetrics(predictionAndLabel)
cm = metrics.confusionMatrix()
rows = cm.toArray().tolist()

confusion_matrix = spark.createDataFrame(rows,["normal","anomaly"])
confusion_matrix.show()

+-------+-------+------+-----+---+-----+---+---+---+---+
| normal|anomaly|    _3|   _4| _5|   _6| _7| _8| _9|_10|
+-------+-------+------+-----+---+-----+---+---+---+---+
|10905.0|    0.0| 110.0|  7.0|0.0|  1.0|0.0|0.0|0.0|0.0|
|   56.0| 5392.0| 142.0| 16.0|0.0|  5.0|0.0|0.0|0.0|0.0|
|  372.0|    3.0|2956.0| 24.0|0.0| 14.0|0.0|0.0|0.0|0.0|
| 1168.0|    0.0| 433.0|209.0|0.0|  1.0|0.0|0.0|0.0|0.0|
|   98.0|   10.0|1091.0|  5.0|3.0| 14.0|0.0|0.0|0.0|0.0|
|  202.0|    0.0| 482.0|  3.0|0.0|399.0|0.0|0.0|0.0|0.0|
|   20.0|    0.0| 199.0|  0.0|0.0|  0.0|0.0|0.0|0.0|0.0|
|   17.0|    4.0| 177.0|  0.0|0.0|  1.0|0.0|0.0|0.0|0.0|
|   28.0|    0.0|  67.0| 13.0|0.0|  9.0|0.0|0.0|0.0|0.0|
|    2.0|    0.0|  11.0|  0.0|0.0|  0.0|0.0|0.0|0.0|0.0|
+-------+-------+------+-----+---+-----+---+---+---+---+



In [None]:
predictionAndLabels = predictionrf.select("prediction", "label").rdd
metrics = MulticlassMetrics(predictionAndLabels)
metrics.confusionMatrix()

DenseMatrix(10, 10, [10905.0, 56.0, 372.0, 1168.0, 98.0, 202.0, 20.0, 17.0, ..., 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 0)

In [None]:
predictionrf.withColumn("A", funcs.struct("prediction","label")).crosstab("prediction","label").show()

+----------------+-----+----+----+----+----+---+---+---+---+---+
|prediction_label|  0.0| 1.0| 2.0| 3.0| 4.0|5.0|6.0|7.0|8.0|9.0|
+----------------+-----+----+----+----+----+---+---+---+---+---+
|             0.0|10905|  56| 372|1168|  98|202| 20| 17| 28|  2|
|             5.0|    1|   5|  14|   1|  14|399|  0|  1|  9|  0|
|             1.0|    0|5392|   3|   0|  10|  0|  0|  4|  0|  0|
|             2.0|  110| 142|2956| 433|1091|482|199|177| 67| 11|
|             3.0|    7|  16|  24| 209|   5|  3|  0|  0| 13|  0|
|             4.0|    0|   0|   0|   0|   3|  0|  0|  0|  0|  0|
+----------------+-----+----+----+----+----+---+---+---+---+---+



In [None]:
predictionrfevaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="label",metricName="accuracy")
predictionrfevaluator.evaluate(predictionrf)      

predictionrf.groupBy("label","prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  8.0|       3.0|   13|
|  2.0|       0.0|  372|
|  3.0|       5.0|    1|
|  8.0|       5.0|    9|
|  0.0|       5.0|    1|
|  5.0|       2.0|  482|
|  8.0|       0.0|   28|
|  1.0|       1.0| 5392|
|  9.0|       0.0|    2|
|  7.0|       1.0|    4|
|  3.0|       2.0|  433|
|  4.0|       5.0|   14|
|  4.0|       2.0| 1091|
|  9.0|       2.0|   11|
|  7.0|       2.0|  177|
|  2.0|       2.0| 2956|
|  1.0|       0.0|   56|
|  5.0|       3.0|    3|
|  6.0|       2.0|  199|
|  2.0|       3.0|   24|
+-----+----------+-----+
only showing top 20 rows



### 3.3 Naive Bayes Algorithm

In [None]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#### 3.3.1 Training and Predicting of Model

In [None]:
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

modelnb = nb.fit(train)
predictionnb = modelnb.transform(test)
predictionnb.toPandas().head(3)

Unnamed: 0,features,label,rawPrediction,probability,prediction
0,"(5990.0, 0.127695, 4.0, 4.0, 1954.0, 2170.0, 5...",4.0,"[-1328365.6312262518, -1272030.02668413, -1606...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0
1,"(8454.0, 1.375599, 4.0, 4.0, 1676.0, 2776.0, 5...",2.0,"[-313832.464825093, -301525.3775745601, -35010...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0
2,"(8786.0, 17.625374, 250.0, 24.0, 248562.0, 165...",2.0,"[-4013470.1830401034, -3401162.323599091, -347...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",4.0


#### 3.3.2 Calculation of Accuracy

In [None]:
evaluatornb = MulticlassClassificationEvaluator(labelCol="label", 
                                              predictionCol="prediction", 
                                              metricName="accuracy")
nb = evaluatornb.evaluate(predictionnb)

print("--- Naive Bayes --- ")
print("Accuracy Rate =", round(nb,4))
print("  Error  Rate = %g " % round((1.0 - nb),4))

--- Naive Bayes --- 
Accuracy Rate = 0.4712
  Error  Rate = 0.5288 


### 3.4 Gradient Boost Tree

In [None]:
from pyspark.ml.classification import GBTClassifier

#### 3.4.1 Training and Predicting of Model

In [None]:
#gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)

#modelgbt = gbt.fit(train)
#predictiongbt = modelgbt.transform(test)
#predictiongbt.toPandas().head(3)

Py4JJavaError: ignored

#### 3.4.2 Calculation of Accuracy

In [None]:
#evaluatorgbt = MulticlassClassificationEvaluator(
  #  labelCol="label", predictionCol="prediction", metricName="accuracy")
#gbt = evaluatorgbt.evaluate(predictiongbt)

print("--- Gradient Boost Tree --- ")
print("Accuracy Rate =", round(gbt,4))
print("  Error  Rate = %g " % round((1.0 - gbt),4))

### 3.5 Logistic Regression

In [None]:
from pyspark.ml.classification import LogisticRegression

#### 3.5.1 Training and Predicting of Model

In [None]:
lr = LogisticRegression(regParam=0.01)
modellr = lr.fit(train)
predictionlr = modellr.transform(test)
predictionlr.toPandas().head(3)

#### 3.5.2 Calculation of Accuracy

In [None]:
evaluatorlr = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
lr = evaluatorlr.evaluate(predictionlr)

In [None]:
print("--- Logistic Regression --- ")
print("Accuracy Rate =", round(lr,4))
print("  Error  Rate = %g " % round((1.0 - lr),4))

--- Logistic Regression --- 
Accuracy Rate = 0.7602
  Error  Rate = 0.2398 


## 3.6 Comparison of Accucary  Rate of Algorithms

In [None]:
#print("Gradient Boost Tree Accuracy =", round(gbt,5))
print("      Decision Tree Accuracy =", round(dt,5))
print(" Random Forest Tree Accuracy =", round(rf,5))
print("Logistic Regression Accuracy =", round(lr,5))
print("        Naive Bayes Accuracy =", round(nb,5))

      Decision Tree Accuracy = 0.8291
 Random Forest Tree Accuracy = 0.80522
Logistic Regression Accuracy = 0.76018
        Naive Bayes Accuracy = 0.4712
