# Import the libraries

In [1]:
from pyspark.sql import SparkSession
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import io
import numpy as np
import os
from pyspark.sql.functions import udf, col
from pyspark.sql.types import *
import scipy.stats as stats
import numpy as np
from sklearn.ensemble import IsolationForest
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import pickle

from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 unused import
import matplotlib.pyplot as plt
import numpy as np

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go



import numpy as np
import plotly
init_notebook_mode(connected=True)
%matplotlib inline



# Initialize the Spark session and the Spark context

In [2]:
# Reuse the active Spark session if one exists, otherwise create it under this
# application name, then expose its SparkContext and display the Spark version.
app_name = 'itd-anomaly-profile-windowsos-ip-sklearn-isolationforest-batch-score'
session_builder = SparkSession.builder.appName(app_name)
spark = session_builder.getOrCreate()
sc = spark.sparkContext
sc.version

'2.1.1'

# Let's load the IP profile data for the `windowsos` data source (4 profile attributes per `src_ip`)

In [3]:
# Settings that identify the profile being scored. They are substituted into
# the path templates defined below.
data_source = "windowsos"
tenant_name = "itd"
time_window = "day"
entity_type = "ip"
anomaly_type = "profile"
model_type = "sklearn"
model_name = "isolationforest"

# NOTE(review): this is a machine-specific absolute path; it has to be adjusted
# before the notebook can run on another machine.
BASE_PATH = "/Users/tuhinsharma/Documents/sstech/" + tenant_name
ANOMALY_DATA_REPOSITORY = BASE_PATH + "/models_data/data"
ANOMALY_MODEL_REPOSITORY = BASE_PATH + "/models_data/model"

# Path templates; the {placeholders} are filled with str.format().
USER_PROFILE_DATA_PATH = ANOMALY_DATA_REPOSITORY + "/{data_source}/{entity_type}/{anomaly_type}/{time_window}.json"
PROFILE_ANOMALY_MODEL_PATH = ANOMALY_MODEL_REPOSITORY + "/{data_source}/{entity_type}/{anomaly_type}/{time_window}/{model_type}/{model_name}"

# Use the anomaly_type setting rather than a repeated "profile" literal so the
# data path and the model path are always derived from the same value.
data_path = USER_PROFILE_DATA_PATH.format(
    data_source=data_source,
    entity_type=entity_type,
    anomaly_type=anomaly_type,
    time_window=time_window,
)



In [4]:
# Read the JSON profile records and mark the DataFrame for caching, because it
# is used again when it is converted to pandas; then preview three rows.
ip_profile_sdf = spark.read.json(data_path)
ip_profile_sdf.persist()
ip_profile_sdf.show(n=3)

+---------------+-----------------+----------------+-----------+---------------------+
|dns_error_count|increase_activity|lateral_movement|     src_ip|upload_download_ratio|
+---------------+-----------------+----------------+-----------+---------------------+
|           1426|            False|            True|  54.85.1.3|                  619|
|           1446|             True|            True| 54.85.5.15|                  655|
|           1441|             True|            True|54.85.10.16|                  614|
+---------------+-----------------+----------------+-----------+---------------------+
only showing top 3 rows



In [5]:
# Collect the Spark DataFrame to the driver as a pandas DataFrame; the scoring
# cell below indexes it by column name and reads the values as NumPy arrays.
ip_profile_df = ip_profile_sdf.toPandas()

In [6]:
# Fill the model-path template with the settings of the profile being scored.
model_path_fields = dict(
    data_source=data_source,
    entity_type=entity_type,
    anomaly_type=anomaly_type,
    time_window=time_window,
    model_type=model_type,
    model_name=model_name,
)
model_path = PROFILE_ANOMALY_MODEL_PATH.format(**model_path_fields)

### Load the Isolation Forest (IF) pipeline model

In [7]:
# The first record of the pickled RDD is a dict; unpack the entries used for
# scoring from it.
# NOTE(review): unpickling can execute arbitrary code, so only load model files
# from trusted storage.
if_pipeline_path = model_path + "/if_pipeline_model"
model_dict = sc.pickleFile(if_pipeline_path).collect()[0]
standard_scaler, one_hot_encoder, isolation_forest_model = (
    model_dict[key]
    for key in ("standard_scaler", "one_hot_encoder", "isolation_forest_model")
)
cat_colnames, num_colnames = model_dict["cat_colnames"], model_dict["num_colnames"]


### Load the scoring pipeline model

In [8]:
# Read the first record of the pickled RDD stored under
# <model_path>/scoring_pipeline_model; it is used below to turn raw scores into PAS.
# NOTE(review): unpickling can execute arbitrary code, so only load model files
# from trusted storage.
scoring_pipeline_model = sc.pickleFile(model_path+"/scoring_pipeline_model").collect()[0]  

In [9]:
# Build the feature matrix: scaled numeric columns first, followed by the
# one-hot encoded categorical columns. Either group may be absent.
feature_parts = []

if len(num_colnames) > 0:
    num_data = ip_profile_df[num_colnames].values.astype(np.float64)
    num_data_normalized = standard_scaler.transform(num_data)
    feature_parts.append(num_data_normalized)

if len(cat_colnames) > 0:
    cat_data = ip_profile_df[cat_colnames].values
    # The encoder output is converted to a dense array with .toarray().
    cat_data_encoded = one_hot_encoder.transform(cat_data).toarray()
    feature_parts.append(cat_data_encoded)

# Fail with a clear message instead of a NameError on `data` further down.
if not feature_parts:
    raise ValueError(
        "No feature columns to score: both num_colnames and cat_colnames are empty"
    )

if len(feature_parts) == 1:
    data = feature_parts[0]
else:
    data = np.concatenate(feature_parts, axis=1)

# scikit-learn's decision_function is lower for more abnormal samples, so the
# sign is flipped to make a larger score mean "more anomalous". The reshape
# gives the single-column 2-D layout passed to the scoring pipeline.
score = isolation_forest_model.decision_function(data).reshape(-1, 1) * -1

# Convert the raw scores to PAS with the loaded scoring pipeline and attach
# them to the profile frame.
pas = scoring_pipeline_model.transform(score)
ip_profile_df["PAS"] = pas

In [10]:
# Convert the scored pandas DataFrame (which now carries the PAS column) back
# into a Spark DataFrame.
result_score_sdf = spark.createDataFrame(ip_profile_df)

## Summary statistics for PAS

In [11]:
# Count, mean, standard deviation, min and max of the PAS column.
pas_summary_sdf = result_score_sdf.select("PAS").describe()
pas_summary_sdf.show()

+-------+------------------+
|summary|               PAS|
+-------+------------------+
|  count|              1059|
|   mean|14.280604080090486|
| stddev| 14.54744239878091|
|    min|               0.0|
|    max|             100.0|
+-------+------------------+



# Get a binary response: normal (0) or anomaly (1)

In [12]:
def is_anomaly(value, threshold=60):
    """Flag a PAS value as anomalous.

    Args:
        value: The score to classify. May be None when the score is missing.
        threshold: Cut-off for the score; only values strictly greater than
            it are flagged. Defaults to 60.

    Returns:
        1 if ``value`` is greater than ``threshold``, otherwise 0. A missing
        (None) or NaN score is never greater than the threshold, so it
        yields 0.
    """
    # A null column value reaches a Python UDF as None, and comparing None
    # with a number raises TypeError on Python 3.
    if value is None:
        return 0
    return 1 if value > threshold else 0
# Wrap is_anomaly as an integer-typed Spark UDF, apply it to the PAS column to
# produce the "anomaly" flag, and preview four rows.
udf_is_anomaly = udf(is_anomaly, IntegerType())
anomaly_flag = udf_is_anomaly(col("PAS"))
result_score_sdf = result_score_sdf.withColumn("anomaly", anomaly_flag)
result_score_sdf.show(n=4)

+---------------+-----------------+----------------+-----------+---------------------+------------------+-------+
|dns_error_count|increase_activity|lateral_movement|     src_ip|upload_download_ratio|               PAS|anomaly|
+---------------+-----------------+----------------+-----------+---------------------+------------------+-------+
|           1426|            False|            True|  54.85.1.3|                  619|2.4259972754160124|      0|
|           1446|             True|            True| 54.85.5.15|                  655|20.587261697130163|      0|
|           1441|             True|            True|54.85.10.16|                  614| 3.972240503350349|      0|
|           1421|            False|            True|54.85.20.40|                  615| 3.656995395352329|      0|
+---------------+-----------------+----------------+-----------+---------------------+------------------+-------+
only showing top 4 rows



# get the normal records

In [13]:
# Keep the rows flagged 0, confirm that only that flag value remains, and
# report how many rows there are.
normal_sdf = result_score_sdf.filter(col("anomaly") == 0)
normal_sdf.select("anomaly").distinct().show()
normal_count = normal_sdf.count()
print("normal record count {count}".format(count=normal_count))

+-------+
|anomaly|
+-------+
|      0|
+-------+

normal record count 1040


# get the anomaly records

In [14]:
# Keep the rows flagged 1, confirm that only that flag value remains, and
# report how many rows there are.
anomaly_sdf = result_score_sdf.filter(col("anomaly") == 1)
anomaly_sdf.select("anomaly").distinct().show()
anomaly_count = anomaly_sdf.count()
print("anomaly record count {count}".format(count=anomaly_count))

+-------+
|anomaly|
+-------+
|      1|
+-------+

anomaly record count 19
