In [1]:
pip install -q findspark

Note: you may need to restart the kernel to use updated packages.


In [2]:
import findspark


findspark.init('/home/bigdata/Documents/spark-3.0.0')

In [3]:
from pyspark import SparkConf, SparkContext, Row
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark import*
from pyspark.sql.functions import col, when
import pandas as pd
import numpy as np
import sys
import os

In [4]:
#DataFrame operations inside your streaming program
def getSparkSessionInstance(sparkConf):
    
    if ("sparkSessionSingletonInstance" not in globals()):
        globals()["sparkSessionSingletonInstance"] = SparkSession.builder.config(conf=sparkConf).getOrCreate()
    return globals()["sparkSessionSingletonInstance"]

In [5]:
sc = SparkContext(appName= "UNSW")
ssc = StreamingContext(sc, 2)
ssc.checkpoint("checkpoint")

In [6]:
# Load data to create a DStream data

unsw_list = list()
# read the entire folder and add them into the Queue
entries = os.listdir('UNSW_data')
for entry in entries:
    print(entry)
    path="UNSW_data/" + entry
    unsw_list += [sc.textFile(path, 5)]

UNSW-NB15-8.csv
UNSW-NB15-43.csv
UNSW-NB15-39.csv
UNSW-NB15-3.csv
UNSW-NB15-1.csv
UNSW-NB15-14.csv
UNSW-NB15-17.csv
UNSW-NB15-25.csv
UNSW-NB15-27.csv
UNSW-NB15-46.csv
UNSW-NB15-4.csv
UNSW-NB15-7.csv
UNSW-NB15-29.csv
UNSW-NB15-9.csv
UNSW-NB15-38.csv
UNSW-NB15-6.csv
UNSW-NB15-49.csv
UNSW-NB15-2.csv
UNSW-NB15-5.csv
UNSW-NB15-30.csv
.~lock.UNSW-NB15-1.csv#
UNSW-NB15-10.csv
UNSW-NB15-18.csv
UNSW-NB15-28.csv
UNSW-NB15-35.csv
UNSW-NB15-33.csv
UNSW-NB15-22.csv
UNSW-NB15-19.csv


In [7]:
unsw_list[2].take(3)

['"59.166.0.8","48111","149.171.126.7","5190","tcp","FIN","1.6395999E-2","1036","2366","31","29","4","4","-","463527.72","1071968.9","12","14","255","255","2472443027","327436606","86","169","0","0","127.15737","3.16449","1421947909","1421947909","1.453818","1.209846","7.8599999E-4","6.5900001E-4","1.2700001E-4","0","0","0","0","0","7","9","3","1","1","1","1","","0"',
 '"59.166.0.7","14767","149.171.126.5","80","tcp","FIN","1.1078809","1684","10168","31","29","3","4","http","11293.632","69350.414","14","18","255","255","1514550030","3665297448","120","565","1","3924","9043.1875","7918.4985","1421947908","1421947909","85.194153","65.13736","6.9000002E-4","0.000543","0.000147","0","0","1","0","0","2","1","2","1","1","1","1","","0"',
 '"59.166.0.1","18659","149.171.126.7","14562","tcp","FIN","0.012538","2542","23122","31","29","7","14","-","1581751.5","1.4402297E+7","40","42","255","255","2716648099","572382181","64","551","0","0","18.4088","17.361631","1421947910","1421947910","0.312846"

In [8]:
rdd_stream = ssc.queueStream(unsw_list)

In [9]:
# Make the Queue Stream of data 
def AddNormalLabel(line1): 
            line =line1.replace('"','').split(",")
            if line[47]== '':
                line[47]= 'Normal'
            elif line[47] == 'Backdoor':
                line[47] ='Backdoors'
                   
            return line

In [10]:
words = rdd_stream.map(AddNormalLabel)

In [12]:
################ Solution (Transformation and Action) ################
datamap= words.filter(lambda x : x[47])
filtline = datamap.map(lambda x: x[47])
#streamdata =filtline.reduceByKey(lambda a, b: a+b)
count= filtline.countByValue()
count.pprint()

In [13]:
def process(time, rdd):
    print("========%s =========" % str(time))


    try:

        # Get the singleton instant of SparkSession
        spark = getSparkSessionInstance(rdd.context.getConf())
        

        #Convert RDD[String] to RDD[Row] to DataFrame
        rowRdd = rdd.map(lambda w: Row(word=w))     
        wordsDataFrame = spark.createDataFrame(rowRdd)
        
                          
              
        # Creates a temporary view using the DataFrame
        wordsDataFrame.createOrReplaceTempView("words")


        # Do word count on table using SQL and print it
        wordCountsDataFrame = spark.sql("select trim(word[47]) as attack_cat, count(*) as Total from words group by trim(word[47]) order by Total DESC")

        wordCountsDataFrame.show()
              
        worddf= spark.sql("select percentile_approx(word[11], 0.25) as sloss25, percentile_approx(word[11], 0.50) as sloss50, percentile_approx(word[11], 0.75) as sloss75, MAX(word[11]) as slossmax, avg(word[11]) as averagesloss, percentile_approx(word[22], 0.25) as tcprtt25, percentile_approx(word[22], 0.50) as tcprtt50, percentile_approx(word[22], 0.75) as tcprt75, MAX(word[22]) as tcprttmax , avg(word[22]) as tcprttavg from words")
      
        worddf.show()
        
    except:

         pass



words.foreachRDD(process)

In [14]:
ssc.start()

-------------------------------------------
Time: 2020-05-19 01:43:34
-------------------------------------------
('Normal', 35674)
('Fuzzers ', 325)
('Backdoors', 12)
('DoS', 107)
('Reconnaissance ', 254)
('Shellcode ', 25)
('Worms', 3)
('Generic', 13828)
('Exploits', 537)
('Analysis', 30)

+--------------+-----+
|    attack_cat|Total|
+--------------+-----+
|        Normal|35674|
|       Generic|13828|
|      Exploits|  537|
|       Fuzzers|  325|
|Reconnaissance|  254|
|           DoS|  107|
|      Analysis|   30|
|     Shellcode|   25|
|     Backdoors|   12|
|         Worms|    3|
+--------------+-----+

+-------+-------+-------+--------+-----------------+--------+--------+-------+---------+-----------------+
|sloss25|sloss50|sloss75|slossmax|     averagesloss|tcprtt25|tcprtt50|tcprt75|tcprttmax|        tcprttavg|
+-------+-------+-------+--------+-----------------+--------+--------+-------+---------+-----------------+
|    0.0|    0.0|    3.0|       9|2.862210847524363|    57.0|  

+--------------+-----+
|    attack_cat|Total|
+--------------+-----+
|        Normal|42859|
|       Generic| 2287|
|      Exploits| 2221|
|           DoS| 1432|
|       Fuzzers|  910|
|Reconnaissance|  629|
|     Backdoors|  201|
|      Analysis|  180|
|     Shellcode|   68|
|         Worms|    8|
+--------------+-----+

+-------+-------+-------+--------+-----------------+--------+--------+-------+---------+------------------+
|sloss25|sloss50|sloss75|slossmax|     averagesloss|tcprtt25|tcprtt50|tcprt75|tcprttmax|         tcprttavg|
+-------+-------+-------+--------+-----------------+--------+--------+-------+---------+------------------+
|    0.0|    3.0|    7.0|      99|5.319027463333005|    61.0|    73.0|  130.0|      994|129.22687272369328|
+-------+-------+-------+--------+-----------------+--------+--------+-------+---------+------------------+

-------------------------------------------
Time: 2020-05-19 01:43:50
-------------------------------------------
('Normal', 38414)
('Fu

-------------------------------------------
Time: 2020-05-19 01:44:04
-------------------------------------------
('Normal', 44109)
('Fuzzers ', 728)
('DoS', 838)
('Backdoors', 98)
('Reconnaissance ', 632)
('Worms', 6)
('Shellcode ', 51)
('Exploits', 1754)
('Generic', 2497)
('Analysis', 82)

+--------------+-----+
|    attack_cat|Total|
+--------------+-----+
|        Normal|44109|
|       Generic| 2497|
|      Exploits| 1754|
|           DoS|  838|
|       Fuzzers|  728|
|Reconnaissance|  632|
|     Backdoors|   98|
|      Analysis|   82|
|     Shellcode|   51|
|         Worms|    6|
+--------------+-----+

+-------+-------+-------+--------+-----------------+--------+--------+-------+---------+------------------+
|sloss25|sloss50|sloss75|slossmax|     averagesloss|tcprtt25|tcprtt50|tcprt75|tcprttmax|         tcprttavg|
+-------+-------+-------+--------+-----------------+--------+--------+-------+---------+------------------+
|    0.0|    3.0|    7.0|      94|5.528713456048823|    61.0

+--------------+-----+
|    attack_cat|Total|
+--------------+-----+
|        Normal|42630|
|       Generic| 5479|
|      Exploits| 1053|
|       Fuzzers|  898|
|Reconnaissance|  396|
|           DoS|  218|
|     Shellcode|   51|
|      Analysis|   40|
|     Backdoors|   26|
|         Worms|    4|
+--------------+-----+

+-------+-------+-------+--------+-----------------+--------+--------+-------+---------+-----------------+
|sloss25|sloss50|sloss75|slossmax|     averagesloss|tcprtt25|tcprtt50|tcprt75|tcprttmax|        tcprttavg|
+-------+-------+-------+--------+-----------------+--------+--------+-------+---------+-----------------+
|    0.0|    2.0|    7.0|      97|4.560271680283493|    59.0|    73.0|  132.0|      994|122.4197263510188|
+-------+-------+-------+--------+-----------------+--------+--------+-------+---------+-----------------+

-------------------------------------------
Time: 2020-05-19 01:44:22
-------------------------------------------
('Normal', 50795)

+-------

In [17]:
ssc.stop(stopSparkContext=True, stopGraceFully=True)