# Import the libraries

In [1]:
from pyspark.sql import SparkSession
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import io
import numpy as np
import os


import scipy.stats as stats
import numpy as np
from sklearn.ensemble import IsolationForest
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler,OneHotEncoder
%matplotlib inline
import pickle

# initialize spark and spark context

In [2]:
# Build (or reuse) a local, Hive-enabled SparkSession for this training run.
spark = (
    SparkSession.builder
    .appName("itd-anomaly-event-wgtraffic-sklearn-isolationforest-batch-train")
    .master("local[*]")
    .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
    .enableHiveSupport()
    .getOrCreate()
)
# Keep a handle on the SparkContext; later cells use it to write pickled RDDs.
sc = spark.sparkContext
# Last expression of the cell: shows the Spark version string.
sc.version

'2.1.1'

In [3]:
# Identifiers that select which tenant / data source / model this notebook trains.
data_source = "wgtraffic"
tenant_name = "demo"
anomaly_type = "event"
model_type = "sklearn"
model_name = "isolationforest"

# Root directory under which each tenant's artifacts live. The default is the
# path this notebook was originally written against; set SSTECH_BASE_DIR to run
# on another machine without editing the code. An unset or empty variable falls
# back to the default, and a trailing "/" on the root is tolerated.
_base_root = os.environ.get("SSTECH_BASE_DIR") or "/Users/tuhinsharma/Documents/sstech/"
BASE_PATH = _base_root.rstrip("/") + "/" + tenant_name
ANOMALY_DATA_REPOSITORY = BASE_PATH + "/models_data/data"
ANOMALY_MODEL_REPOSITORY = BASE_PATH + "/models_data/model"

# Path template with named placeholders; it is filled in with str.format()
# when the model output location is computed.
PROFILE_ANOMALY_MODEL_PATH = ANOMALY_MODEL_REPOSITORY + "/{data_source}/{anomaly_type}/{model_type}/{model_name}"


# Let's load the Event data for wgtraffic with 4 attributes

In [4]:
# Select the four raw columns from the <tenant>.<data_source> table via Spark SQL.
event_query = "select dst_port, info_2, info_1, src_port from {}.{}".format(tenant_name, data_source)
event_sdf = spark.sql(event_query)
# Preview the first rows as a table, then display one raw Row so the
# unparsed string format of each column is visible.
event_sdf.show(3)
event_sdf.rdd.take(1)

+--------+----------------+--------------------+--------+
|dst_port|          info_2|              info_1|src_port|
+--------+----------------+--------------------+--------+
|     443| sent_bytes=7947|rcvd_bytes=18659;...|   56098|
|     443|sent_bytes=10897|rcvd_bytes=127098...|   23554|
|     443|sent_bytes=16704|rcvd_bytes=75672;...|   28491|
+--------+----------------+--------------------+--------+
only showing top 3 rows



[Row(dst_port='443', info_2='sent_bytes=7947', info_1='rcvd_bytes=18659; app_id=0', src_port='56098')]

In [5]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import *

def pre_process_info_1(value):
    """Extract the received-byte count from a raw ``info_1`` string.

    The raw value is a ``;``-separated list of ``key=value`` pairs such as
    ``"rcvd_bytes=18659; app_id=0"``. Only the ``rcvd_bytes`` pair is used, so
    the result does not depend on which ``app_id`` (or other field) follows it.

    Args:
        value: Raw column value; may be ``None`` (SQL NULL) or an empty string.

    Returns:
        int: The ``rcvd_bytes`` value, or 0 when ``value`` is ``None`` or empty.

    Raises:
        ValueError: If ``value`` is non-empty but no integer can be parsed
            from it.
    """
    if not value:
        # NULL columns arrive as None; treat them like the empty string.
        return 0
    for field in value.split(";"):
        key, sep, raw = field.partition("=")
        if sep and key.strip() == "rcvd_bytes":
            return int(raw)
    # No labelled field present: interpret the whole string as a bare number.
    return int(value)

def pre_process_info_2(value):
    """Extract the sent-byte count from a raw ``info_2`` string.

    The raw value looks like ``"sent_bytes=7947"``. It is parsed as a
    ``;``-separated list of ``key=value`` pairs and only the ``sent_bytes``
    pair is used, so extra trailing fields do not break the conversion.

    Args:
        value: Raw column value; may be ``None`` (SQL NULL) or an empty string.

    Returns:
        int: The ``sent_bytes`` value, or 0 when ``value`` is ``None`` or empty.

    Raises:
        ValueError: If ``value`` is non-empty but no integer can be parsed
            from it.
    """
    if not value:
        # NULL columns arrive as None; treat them like the empty string.
        return 0
    for field in value.split(";"):
        key, sep, raw = field.partition("=")
        if sep and key.strip() == "sent_bytes":
            return int(raw)
    # No labelled field present: interpret the whole string as a bare number.
    return int(value)


# Wrap the plain-Python parsers as Spark UDFs that produce integer columns.
udf_pre_process_info_1 = udf(pre_process_info_1, returnType=IntegerType())
udf_pre_process_info_2 = udf(pre_process_info_2, returnType=IntegerType())

# Replace the raw string columns with their parsed integer values, in place
# under the same column names.
event_sdf = (
    event_sdf
    .withColumn("info_1", udf_pre_process_info_1(col("info_1")))
    .withColumn("info_2", udf_pre_process_info_2(col("info_2")))
)

# Preview the parsed result.
event_sdf.show(3)

+--------+------+------+--------+
|dst_port|info_2|info_1|src_port|
+--------+------+------+--------+
|     443|  7947| 18659|   56098|
|     443| 10897|127098|   23554|
|     443| 16704| 75672|   28491|
+--------+------+------+--------+
only showing top 3 rows



## Define the columns on which Model shall be trained

In [7]:
# Feature columns, grouped by how they are encoded in the following cells:
# categorical columns are one-hot encoded, numeric columns are standardised.
cat_colnames = ["dst_port", "src_port"]
num_colnames = ["info_2", "info_1"]

# Collect the Spark DataFrame into a pandas DataFrame.
event_df = event_sdf.toPandas()

# Numeric features as a float64 matrix.
num_data = event_df.loc[:, num_colnames].values.astype(np.float64)
# Categorical features keep their raw values; extracted only when configured.
if cat_colnames:
    cat_data = event_df.loc[:, cat_colnames].values

# Batch Training

### IsolationForest Pipelinemodel

In [8]:
# Standardise the numeric features when any are configured. The scaler stays
# None otherwise, and that value is what gets stored in the saved model bundle.
standard_scaler = StandardScaler() if num_colnames else None
if standard_scaler is not None:
    num_data_normalized = standard_scaler.fit_transform(num_data)

In [9]:
# One-hot encode the categorical features when any are configured. The encoder
# stays None otherwise, and that value is what gets stored in the saved bundle.
one_hot_encoder = None
if len(cat_colnames) > 0:
    # handle_unknown='ignore': a category that was not present at fit time is
    # encoded as all zeros for that column instead of raising, so the pickled
    # encoder can transform values it has not seen before. The fit_transform
    # output on the training data is unchanged by this setting.
    one_hot_encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
    # NOTE(review): .toarray() densifies the sparse encoding; with many distinct
    # category values this matrix may get large - confirm it fits in memory.
    cat_data_encoded = one_hot_encoder.fit_transform(cat_data).toarray()

In [10]:
# Assemble the final feature matrix from whichever feature groups are configured:
# scaled numeric columns first, then the one-hot encoded categorical columns.
if len(num_colnames) > 0 and len(cat_colnames) > 0:
    data = np.concatenate((num_data_normalized, cat_data_encoded), axis=1)
elif len(cat_colnames):
    data = cat_data_encoded
elif len(num_colnames):
    data = num_data_normalized
else:
    # Without this branch `data` would stay undefined and the failure would
    # surface later as a NameError when the model is fitted.
    raise ValueError("No feature columns configured: both num_colnames and cat_colnames are empty")

In [11]:
# Number of configured source columns. This counts column names, not the width
# of `data` after one-hot encoding.
n_selected_columns = len(num_colnames + cat_colnames)

# NOTE(review): the `behaviour` argument was deprecated in scikit-learn 0.22 and
# removed in 0.24; it will need to be dropped when the environment is upgraded.
isolation_forest = IsolationForest(
    behaviour='new',
    n_estimators=1000,
    max_samples=0.3,
    # At most two features are drawn per tree.
    max_features=min(2, n_selected_columns),
    bootstrap=True,
    contamination="auto",
    # Fixed seed so repeated runs build the same forest.
    random_state=42,
)
isolation_forest_model = isolation_forest.fit(data)

In [12]:
# Negate the decision function so that the stored score has the opposite sign
# of what the model returns, then attach it to the event frame.
raw_decision = isolation_forest_model.decision_function(data)
event_df["score"] = -1 * raw_decision.reshape(-1, 1)

### scoring pipelinemodel

In [13]:
# Scores as a single-column 2-D array, the input shape the scaler is fitted on.
score = event_df.loc[:, "score"].values.reshape(-1, 1)

# Rescale the scores fitted here onto the 0-100 range.
minmax_scaler = MinMaxScaler(feature_range=(0, 100))
scoring_pipeline = Pipeline([("MinMaxScaler", minmax_scaler)])
scoring_pipeline_model = scoring_pipeline.fit(score)

In [15]:
# Resolve the output directory for this data source / anomaly type / model.
model_path = PROFILE_ANOMALY_MODEL_PATH.format(
    data_source=data_source,
    anomaly_type=anomaly_type,
    model_type=model_type,
    model_name=model_name,
)

# Remove any previously saved model at that location. The exit status is only
# displayed as the cell output; it is not checked.
os.system("hdfs dfs -rm -r {}".format(model_path))

256

### Save the IsolationForest Pipelinemodel

In [16]:
# Everything needed to reproduce the feature encoding and apply the model:
# the column lists, the fitted scaler and encoder (either may be None) and
# the fitted forest.
if_pipeline_bundle = {
    "cat_colnames": cat_colnames,
    "num_colnames": num_colnames,
    "standard_scaler": standard_scaler,
    "one_hot_encoder": one_hot_encoder,
    "isolation_forest_model": isolation_forest_model,
}
# Wrap the bundle in a one-element RDD so it can be written as a pickle file.
isolation_forest_model_rdd = sc.parallelize([if_pipeline_bundle])
isolation_forest_model_rdd.saveAsPickleFile("{}/if_pipeline_model".format(model_path))

### Save the Scoring Pipelinemodel

In [17]:
# Write the fitted score-scaling pipeline next to the model bundle, again as a
# one-element pickled RDD.
scoring_pipeline_model_rdd = sc.parallelize([scoring_pipeline_model])
scoring_pipeline_model_rdd.saveAsPickleFile("{}/scoring_pipeline_model".format(model_path))