# Agenda for Today's Demo

### 1. **Batch Model Training**

### 2. **Batch Scoring**

### 3. **Real-time Scoring**

# Import the libraries

In [1]:
from pyspark.sql import SparkSession
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import io
import numpy as np
import scipy.stats as stats
import pylab as pl
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline,PipelineModel
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import udf, col
from pyspark.sql.types import *
import numpy as np
from pyspark.ml.feature import MinMaxScaler
%matplotlib inline



# Initialize the Spark session and Spark context

In [2]:
# Name under which this training job is registered with Spark.
APP_NAME = 'itd-anomaly-profile-windowsos-ip-pyspark-kmeans-batch-train'

# Reuse an already-running session if there is one, otherwise start a new one.
session_builder = SparkSession.builder.appName(APP_NAME)
spark = session_builder.getOrCreate()

# Keep a handle on the underlying SparkContext and display its version.
sc = spark.sparkContext
sc.version

'2.1.1'

In [3]:
# ---------------------------------------------------------------------------
# Run configuration: these values select both the input data file and the
# directory the trained models are written to.
# ---------------------------------------------------------------------------
data_source = "windowsos"
tenant_name = "itd"
time_window = "day"
entity_type = "ip"
anomaly_type = "profile"
model_type = "pyspark"
model_name = "kmeans"

# NOTE(review): absolute, machine-specific path -- adjust before running on
# another machine.
BASE_PATH = "/Users/tuhinsharma/Documents/sstech/" + tenant_name

# Input data location.
ANOMALY_DATA_REPOSITORY = BASE_PATH + "/models_data/data"
USER_PROFILE_DATA_PATH = ANOMALY_DATA_REPOSITORY + "/{data_source}/{entity_type}/{anomaly_type}/{time_window}.json"

# Use the `anomaly_type` variable (not a repeated "profile" literal) so the
# data path always follows the configuration above.
data_path = USER_PROFILE_DATA_PATH.format(
    data_source=data_source,
    entity_type=entity_type,
    anomaly_type=anomaly_type,
    time_window=time_window,
)

# Model output location (template; filled in when the models are saved).
ANOMALY_MODEL_REPOSITORY = BASE_PATH + "/models_data/model"
PROFILE_ANOMALY_MODEL_PATH = ANOMALY_MODEL_REPOSITORY + "/{data_source}/{entity_type}/{anomaly_type}/{time_window}/{model_type}/{model_name}"


# Let's load the IP profile data for Windows OS (4 attributes per source IP)

In [4]:
# Read the JSON profile data and keep it cached, since the same DataFrame is
# used for both fitting and transforming below.
ip_profile_sdf = spark.read.json(data_path)
ip_profile_sdf = ip_profile_sdf.persist()
ip_profile_sdf.show(3)

+---------------+-----------------+----------------+-----------+---------------------+
|dns_error_count|increase_activity|lateral_movement|     src_ip|upload_download_ratio|
+---------------+-----------------+----------------+-----------+---------------------+
|           1426|            False|            True|  54.85.1.3|                  619|
|           1446|             True|            True| 54.85.5.15|                  655|
|           1441|             True|            True|54.85.10.16|                  614|
+---------------+-----------------+----------------+-----------+---------------------+
only showing top 3 rows



## Define the columns on which the model will be trained

In [5]:
# Categorical columns: string-indexed and one-hot encoded before clustering.
cat_colnames = [
    "increase_activity",
    "lateral_movement",
]

# Numerical columns: assembled into one vector and standardised before clustering.
num_colnames = [
    "upload_download_ratio",
    "dns_error_count",
]

# Batch Training

### Clustering PipelineModel

In [6]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# One StringIndexer per categorical column: <col> -> <col>_index
string_indexer_list = [
    StringIndexer(inputCol=name, outputCol=name + "_index")
    for name in cat_colnames
]

# One OneHotEncoder per categorical column: <col>_index -> <col>_vec
one_hot_encoder_list = [
    OneHotEncoder(inputCol=name + "_index", outputCol=name + "_vec")
    for name in cat_colnames
]


In [7]:
# Hyper-parameters of the clustering stage, named so they are easy to find and tune.
KMEANS_K = 4       # number of clusters
KMEANS_SEED = 42   # fixed seed so that repeated runs start from the same initialisation

# Optional numerical branch: assemble the numerical columns into one vector and
# standardise it (zero mean, unit standard deviation). Skipped when there are
# no numerical columns.
out = []
pipe = []
if len(num_colnames) > 0:
    assembler = VectorAssembler(inputCols=num_colnames, outputCol="features_vec")
    standard_scaler = StandardScaler(
        inputCol="features_vec",
        outputCol="features_zs",
        withMean=True,
        withStd=True,
    )
    out = [standard_scaler.getOutputCol()]
    pipe = [assembler, standard_scaler]

# Final feature vector = one-hot encoded categorical columns followed by the
# standardised numerical vector (if any).
assembler_2 = VectorAssembler(
    inputCols=[x.getOutputCol() for x in one_hot_encoder_list] + out,
    outputCol="features",
)

# Passing an explicit seed makes the clustering reproducible across runs.
estimator = KMeans(
    featuresCol="features",
    predictionCol="cluster_id",
    k=KMEANS_K,
    seed=KMEANS_SEED,
)

# Stage order: index -> one-hot encode -> (assemble + scale numerics) -> assemble all -> KMeans.
clustering_pipeline = Pipeline(
    stages=string_indexer_list + one_hot_encoder_list + pipe + [assembler_2] + [estimator]
)
clustering_pipeline_model = clustering_pipeline.fit(ip_profile_sdf)


In [8]:
# Run the fitted pipeline over the training data to attach a cluster_id to each
# row; the result is cached because the following cells reuse it.
result_cluster_sdf = clustering_pipeline_model.transform(ip_profile_sdf)
result_cluster_sdf = result_cluster_sdf.persist()
result_cluster_sdf.show(3)

+---------------+-----------------+----------------+-----------+---------------------+-----------------------+----------------------+---------------------+--------------------+--------------+--------------------+--------------------+----------+
|dns_error_count|increase_activity|lateral_movement|     src_ip|upload_download_ratio|increase_activity_index|lateral_movement_index|increase_activity_vec|lateral_movement_vec|  features_vec|         features_zs|            features|cluster_id|
+---------------+-----------------+----------------+-----------+---------------------+-----------------------+----------------------+---------------------+--------------------+--------------+--------------------+--------------------+----------+
|           1426|            False|            True|  54.85.1.3|                  619|                    0.0|                   0.0|        (1,[0],[1.0])|       (1,[0],[1.0])|[619.0,1426.0]|[-0.9821795807138...|[1.0,1.0,-0.98217...|         1|
|           1446|   

In [9]:
# List the distinct cluster ids that were actually assigned.
(
    result_cluster_sdf.rdd
    .map(lambda row: row["cluster_id"])
    .distinct()
    .collect()
)

[1, 2, 0, 3]

### For each data point, calculate the sum of Euclidean distances (SED) to all the cluster centroids

In [10]:
def calculate_SED_from_all_centroids(value,centroids):
    """Return the sum of Euclidean distances from one point to every centroid.

    Parameters
    ----------
    value : vector-like
        The point's feature vector. Objects exposing ``toArray()`` (such as
        Spark ML dense or sparse vectors) are converted through it; anything
        else is passed to ``np.asarray``.
    centroids : array-like
        One centroid per row. A single 1-D centroid is also accepted and is
        treated as one row.

    Returns
    -------
    float
        Sum over all centroids of the Euclidean distance between ``value``
        and that centroid.
    """
    # Convert explicitly instead of relying on implicit numpy arithmetic with
    # the incoming object: a sparse vector does not broadcast against an ndarray.
    raw_point = value.toArray() if hasattr(value, "toArray") else value
    single_point = np.asarray(raw_point, dtype=float)
    points = np.atleast_2d(np.asarray(centroids, dtype=float))

    # Row-wise Euclidean distance to each centroid, then the total.
    squared_diff = (points - single_point) ** 2
    distances = np.sqrt(np.sum(squared_diff, axis=1))
    return float(np.sum(distances))

def udf_calculate_SED(centroid_list):
    """Build a Spark UDF that computes the SED of a feature vector.

    The returned UDF takes one column value and returns, as a FloatType,
    the result of ``calculate_SED_from_all_centroids`` for that value
    against ``centroid_list``.
    """
    def _sed_for_features(features):
        # centroid_list is captured by the closure and used for every row.
        return calculate_SED_from_all_centroids(features, centroid_list)

    return udf(_sed_for_features, FloatType())

# The KMeans model is the last stage of the fitted pipeline; collect its
# cluster centres into a numpy array (one centroid per row).
kmeans_stage = clustering_pipeline_model.stages[-1]
centroids = np.array(kmeans_stage.clusterCenters())
print("number of centroids : ", len(centroids))
print(centroids)

# Add the "sed" column: sum of distances from each row's features to all centroids.
sed_udf = udf_calculate_SED(centroids)
result_score_sdf = result_cluster_sdf.withColumn("sed", sed_udf(col("features")))
result_score_sdf.show(3)

number of centroids :  4
[[ 0.          0.          0.95540729 -0.94494604]
 [ 0.5         0.5        -0.9752894   0.97091796]
 [ 0.50364964  1.          0.96149744 -0.95994225]
 [ 1.          0.          0.9716975  -0.96771848]]
+---------------+-----------------+----------------+-----------+---------------------+-----------------------+----------------------+---------------------+--------------------+--------------+--------------------+--------------------+----------+--------+
|dns_error_count|increase_activity|lateral_movement|     src_ip|upload_download_ratio|increase_activity_index|lateral_movement_index|increase_activity_vec|lateral_movement_vec|  features_vec|         features_zs|            features|cluster_id|     sed|
+---------------+-----------------+----------------+-----------+---------------------+-----------------------+----------------------+---------------------+--------------------+--------------+--------------------+--------------------+----------+--------+
|       

### Calculate the anomaly score based on the SED

In [11]:
# Stage 1: put the scalar "sed" column into the vector column "sed_vec",
# which is what the scaler reads.
assembler = VectorAssembler(
    inputCols=["sed"],
    outputCol="sed_vec",
)

# Stage 2: min-max scale "sed_vec" into "scaled_sed".
scaler = MinMaxScaler(
    inputCol="sed_vec",
    outputCol="scaled_sed",
)

# Fit the two-stage scoring pipeline on the SED values, then apply it to the same data.
scoring_stages = [assembler, scaler]
scoring_pipeline = Pipeline(stages=scoring_stages)
scoring_pipeline_model = scoring_pipeline.fit(result_score_sdf)

result_scaler_sdf = scoring_pipeline_model.transform(result_score_sdf)
result_scaler_sdf.show(3)

+---------------+-----------------+----------------+-----------+---------------------+-----------------------+----------------------+---------------------+--------------------+--------------+--------------------+--------------------+----------+--------+-------------------+--------------------+
|dns_error_count|increase_activity|lateral_movement|     src_ip|upload_download_ratio|increase_activity_index|lateral_movement_index|increase_activity_vec|lateral_movement_vec|  features_vec|         features_zs|            features|cluster_id|     sed|            sed_vec|          scaled_sed|
+---------------+-----------------+----------------+-----------+---------------------+-----------------------+----------------------+---------------------+--------------------+--------------+--------------------+--------------------+----------+--------+-------------------+--------------------+
|           1426|            False|            True|  54.85.1.3|                  619|                    0.0|     

In [12]:
# Fill the model-path template with the current run configuration.
model_path_fields = {
    "data_source": data_source,
    "entity_type": entity_type,
    "anomaly_type": anomaly_type,
    "time_window": time_window,
    "model_type": model_type,
    "model_name": model_name,
}
model_path = PROFILE_ANOMALY_MODEL_PATH.format(**model_path_fields)

### Save the Clustering Pipelinemodel

In [13]:

# Persist the fitted clustering pipeline, replacing any model already saved at this path.
clustering_model_writer = clustering_pipeline_model.write().overwrite()
clustering_model_writer.save(model_path + "/clustering_pipeline_model")


### Save the Scoring Pipelinemodel

In [14]:

# Persist the fitted scoring pipeline, replacing any model already saved at this path.
scoring_model_writer = scoring_pipeline_model.write().overwrite()
scoring_model_writer.save(model_path + "/scoring_pipeline_model")
