# Import the libraries

In [1]:
from pyspark.sql import SparkSession
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import io
import numpy as np
import os


import scipy.stats as stats
import numpy as np
from sklearn.svm import OneClassSVM
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler,OneHotEncoder
%matplotlib inline
import pickle

# initialize spark and spark context

In [2]:
# Create the Spark session for this training job (or reuse one that already
# exists) and keep a handle on its SparkContext, which the save cells use.
spark = (
    SparkSession.builder
    .appName('itd-anomaly-profile-windowsos-ip-sklearn-oneclasssvm-batch-train')
    .getOrCreate()
)
sc = spark.sparkContext
# Last expression of the cell: displays the Spark version string.
sc.version

'2.1.1'

In [3]:
# Identifiers that select which profile data is read and where the trained
# model is written. Every path below is derived from these values.
data_source = "windowsos"
tenant_name = "itd"
time_window = "day"
entity_type = "ip"
anomaly_type = "profile"
model_type = "sklearn"
model_name = "oneclasssvm"

# NOTE(review): absolute, machine-specific root directory -- adjust before
# running this notebook on another machine.
BASE_PATH = "/Users/tuhinsharma/Documents/sstech/" + tenant_name
ANOMALY_DATA_REPOSITORY = BASE_PATH + "/models_data/data"

# Template for the input JSON file, and the concrete path for this run.
# The path is built from the `anomaly_type` variable (not a repeated literal)
# so that changing the configuration above changes the data path as well.
USER_PROFILE_DATA_PATH = ANOMALY_DATA_REPOSITORY + "/{data_source}/{entity_type}/{anomaly_type}/{time_window}.json"
data_path = USER_PROFILE_DATA_PATH.format(
    data_source=data_source,
    entity_type=entity_type,
    anomaly_type=anomaly_type,
    time_window=time_window,
)

# Template for the directory under which the trained models are saved; it is
# filled in later, once the model type and name are known to be final.
ANOMALY_MODEL_REPOSITORY = BASE_PATH + "/models_data/model"
PROFILE_ANOMALY_MODEL_PATH = ANOMALY_MODEL_REPOSITORY + "/{data_source}/{entity_type}/{anomaly_type}/{time_window}/{model_type}/{model_name}"


# Let's load the IP profile data for the windowsos data source (4 attributes plus src_ip)

In [4]:
# Read the profile JSON at `data_path` into a Spark DataFrame and persist it,
# then print the first three rows as a quick sanity check.
ip_profile_sdf = (
    spark.read
    .json(data_path)
    .persist()
)
ip_profile_sdf.show(n=3)

+---------------+-----------------+----------------+-----------+---------------------+
|dns_error_count|increase_activity|lateral_movement|     src_ip|upload_download_ratio|
+---------------+-----------------+----------------+-----------+---------------------+
|           1426|            False|            True|  54.85.1.3|                  619|
|           1446|             True|            True| 54.85.5.15|                  655|
|           1441|             True|            True|54.85.10.16|                  614|
+---------------+-----------------+----------------+-----------+---------------------+
only showing top 3 rows



## Define the columns on which the model will be trained

In [5]:
# Feature columns, split by kind: categorical columns are one-hot encoded
# later, numerical columns are standardised.
cat_colnames = ["increase_activity", "lateral_movement"]
num_colnames = ["upload_download_ratio", "dns_error_count"]

# Bring the Spark DataFrame into pandas and pull out the raw feature arrays.
ip_profile_df = ip_profile_sdf.toPandas()
num_data = ip_profile_df[num_colnames].values.astype(np.float64)
# `cat_data` is only defined when there is at least one categorical column.
if cat_colnames:
    cat_data = ip_profile_df[cat_colnames].values

# Batch Training

### OneclassSVM Pipelinemodel

In [6]:
# Standardise the numerical features. When there are no numerical columns the
# scaler stays None and `num_data_normalized` is not defined.
if num_colnames:
    standard_scaler = StandardScaler()
    num_data_normalized = standard_scaler.fit_transform(num_data)
else:
    standard_scaler = None

In [7]:
# One-hot encode the categorical features and densify the result. When there
# are no categorical columns the encoder stays None and `cat_data_encoded` is
# not defined.
if cat_colnames:
    one_hot_encoder = OneHotEncoder()
    cat_data_encoded = one_hot_encoder.fit_transform(cat_data).toarray()
else:
    one_hot_encoder = None

In [8]:
# Assemble the training matrix from whichever feature groups are present:
# numerical columns first, then the one-hot encoded categorical columns.
if num_colnames and cat_colnames:
    data = np.concatenate((num_data_normalized, cat_data_encoded), axis=1)
elif cat_colnames:
    data = cat_data_encoded
elif num_colnames:
    data = num_data_normalized
else:
    # Without this branch `data` would simply stay undefined and the failure
    # would only surface as a NameError in the model-fitting cell.
    raise ValueError(
        "No feature columns configured: both num_colnames and cat_colnames are empty"
    )

In [9]:
# One-class SVM with an RBF kernel, fitted on the assembled feature matrix.
svm = OneClassSVM(
    kernel="rbf",
    gamma=0.1,
)
svm_model = svm.fit(data)

In [10]:
# Raw anomaly score per row: the SVM decision value with its sign flipped.
# scikit-learn's decision_function is positive for inliers and negative for
# outliers, so after negation a larger score means a more anomalous row.
ip_profile_df["score"] = -1 * svm_model.decision_function(data).reshape(-1, 1)

### scoring pipelinemodel

In [11]:
# Fit a one-step pipeline that rescales the raw scores of the training data
# to the range [0, 100]. The scores are reshaped to a single-column 2-D array
# before fitting.
score = ip_profile_df["score"].values.reshape(-1, 1)
minmax_scaler = MinMaxScaler(feature_range=(0, 100))
scoring_pipeline = Pipeline([("MinMaxScaler", minmax_scaler)])
scoring_pipeline_model = scoring_pipeline.fit(score)

In [12]:
# Imported in this cell because it is only needed for the cleanup command below.
import subprocess

# Directory under which both pickled models of this run are stored.
model_path = PROFILE_ANOMALY_MODEL_PATH.format(
    data_source=data_source,
    entity_type=entity_type,
    anomaly_type=anomaly_type,
    time_window=time_window,
    model_type=model_type,
    model_name=model_name,
)

# Remove any previously saved model so the save cells can write to a clean
# location. The command is passed as an argument list, so `model_path` is
# never interpreted by a shell (spaces or metacharacters in the path are safe).
# The cleanup stays best-effort: a failure is reported through the displayed
# status instead of raising.
try:
    cleanup_status = subprocess.call(["hdfs", "dfs", "-rm", "-r", model_path])
except OSError:
    # The `hdfs` executable could not be started; use the conventional
    # "command not found" exit status.
    cleanup_status = 127
# Last expression of the cell: displays the exit status (0 on success).
cleanup_status

0

### Save the OneClassSVM Pipelinemodel

In [13]:
# Bundle everything needed to score new rows (column lists, fitted
# preprocessors and the fitted SVM) into one dict, wrap it in a single-element
# RDD and write it out as a pickle file under the model directory.
svm_pipeline_model_rdd = sc.parallelize([
    {
        "cat_colnames": cat_colnames,
        "num_colnames": num_colnames,
        "standard_scaler": standard_scaler,
        "one_hot_encoder": one_hot_encoder,
        "svm_model": svm_model,
    }
])
svm_pipeline_model_rdd.saveAsPickleFile(model_path + "/svm_pipeline_model")


### Save the Scoring Pipelinemodel

In [14]:
# Write the fitted score-scaling pipeline next to the SVM bundle, using the
# same single-element-RDD pickle approach.
scoring_pipeline_model_rdd = sc.parallelize(
    [scoring_pipeline_model]
)
scoring_pipeline_model_rdd.saveAsPickleFile(
    model_path + "/scoring_pipeline_model"
)