In [1]:
 # Copyright 2015 and onwards Sanford Ryza, Uri Laserson, Sean Owen and Joshua Wills
#
# See LICENSE file for further information.

# This block is only needed if running via RStudio.
# Use SPARK_HOME from the environment when it is already set; otherwise fall
# back to a default Spark distro home directory. Set JAVA_HOME too if required.
if (nchar(Sys.getenv("SPARK_HOME")) < 1) {
  Sys.setenv(SPARK_HOME = "/usr/local/spark")
}
# Sys.setenv(JAVA_HOME = "/path/to/java")
# Load SparkR from the R library bundled inside the Spark distribution.
library(SparkR, lib.loc = c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib")))
# Set the master and driver memory appropriately for your cluster
sparkR.session(master = "local[*]", sparkConfig = list(spark.driver.memory = "4g"))

# Names for the 42 columns of the header-less KDD Cup CSV, in file order;
# the final column is the label.
kdd_column_names <- c(
  "duration", "protocol_type", "service", "flag",
  "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
  "hot", "num_failed_logins", "logged_in", "num_compromised",
  "root_shell", "su_attempted", "num_root", "num_file_creations",
  "num_shells", "num_access_files", "num_outbound_cmds",
  "is_host_login", "is_guest_login", "count", "srv_count",
  "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
  "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
  "dst_host_count", "dst_host_srv_count",
  "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
  "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
  "dst_host_serror_rate", "dst_host_srv_serror_rate",
  "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
  "label"
)

# Read the CSV from HDFS as a SparkDataFrame, letting Spark infer column
# types, then attach the column names.
clusters_data <- read.df(
  path = "hdfs://localhost:9000/ds/kddcup.data",
  source = "csv",
  inferSchema = "true",
  header = "false"
)
colnames(clusters_data) <- kdd_column_names

# Drop the columns that are not used as numeric features and cache the
# result, since it is used both to fit the model and to score it.
non_numeric_cols <- c("protocol_type", "service", "flag", "label")
numeric_only <- cache(drop(clusters_data, non_numeric_cols))

# Fit k-means on all remaining columns (formula `~ .`).
kmeans_model <- spark.kmeans(
  numeric_only, ~ .,
  k = 100,
  maxIter = 40,
  initMode = "k-means||"
)

# Assign every row to a cluster, then pull a 1% sample (without replacement)
# back to the driver as a local data.frame.
clustering <- predict(kmeans_model, numeric_only)
clustering_sample <- collect(
  sample(clustering, withReplacement = FALSE, fraction = 0.01)
)

# Show the structure of the collected sample.
str(clustering_sample)

# Split the local sample into the cluster assignments (kept as a one-column
# data.frame) and a numeric matrix holding every other column.
is_feature <- names(clustering_sample) != "prediction"
clusters <- clustering_sample[, "prediction", drop = FALSE]
data <- data.matrix(clustering_sample[, is_feature, drop = FALSE])

# Count the sampled points per cluster id. Example output from one run
# (the sample is random, so the counts differ between runs):
table(clusters)
# clusters
# 0    11    14    23    25    28    31    33    36    48    64    83    89
# 47146     1     1     4   278   109    42  1190    13     1     2     1     2


# install.packages("rgl") # First time only
library(rgl)

# Make a random 3D projection and normalize.
# NOTE(review): rowSums() scales each row of the projection matrix (one row
# per input column) to unit length; confirm that per-row rather than
# per-column normalization is what was intended.
random_projection <- matrix(data = rnorm(3 * ncol(data)), ncol = 3)
random_projection_norm <-
  random_projection / sqrt(rowSums(random_projection * random_projection))

# Project and make a new data frame
projected_data <- data.frame(data %*% random_projection_norm)

# Pick one color per point from its cluster id. Cluster ids are 0-based
# (cluster 0 appears in the table above), while R vectors are 1-based and
# silently drop a 0 index. Indexing the palette with the raw ids would
# therefore lose every cluster-0 point and misalign the remaining colors,
# so shift the ids by one and size the palette to max id + 1.
cluster_ids <- unlist(clusters, use.names = FALSE)
num_clusters <- max(cluster_ids) + 1
palette <- rainbow(num_clusters)
colors <- palette[cluster_ids + 1]
# The plot itself is drawn in the next cell:
# plot3d(projected_data, col = colors, size = 10)

# Release the cached SparkDataFrame now that the sample has been collected.
unpersist(numeric_only, blocking = TRUE)


Attaching package: ‘SparkR’


The following objects are masked from ‘package:stats’:

    cov, filter, lag, na.omit, predict, sd, var, window


The following objects are masked from ‘package:base’:

    as.data.frame, colnames, colnames<-, drop, endsWith, intersect,
    rank, rbind, sample, startsWith, subset, summary, transform, union


Spark package found in SPARK_HOME: /usr/local/spark



Launching java with spark-submit command /usr/local/spark/bin/spark-submit   --driver-memory "4g" sparkr-shell /tmp/RtmpTmefaY/backend_port6c1356a8e286 


Java ref type org.apache.spark.sql.SparkSession id 1 

'data.frame':	48981 obs. of  39 variables:
 $ duration                   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ src_bytes                  : int  278 212 208 200 180 255 236 240 231 224 ...
 $ dst_bytes                  : int  957 294 4106 1061 559 16123 2063 2118 8766 1658 ...
 $ land                       : int  0 0 0 0 0 0 0 0 0 0 ...
 $ wrong_fragment             : int  0 0 0 0 0 0 0 0 0 0 ...
 $ urgent                     : int  0 0 0 0 0 0 0 0 0 0 ...
 $ hot                        : int  0 0 0 0 0 0 0 0 0 0 ...
 $ num_failed_logins          : int  0 0 0 0 0 0 0 0 0 0 ...
 $ logged_in                  : int  1 1 1 1 1 1 1 1 1 1 ...
 $ num_compromised            : int  0 0 0 0 0 0 0 0 0 0 ...
 $ root_shell                 : int  0 0 0 0 0 0 0 0 0 0 ...
 $ su_attempted               : int  0 0 0 0 0 0 0 0 0 0 ...
 $ num_root                   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ num_file_creations         : int  0 0 0 0 0 0 0 0 0 0 ...
 $ num_shells                 : int  0 0 0 0 0 0 0 0 0 

clusters
    0    16    28    29    36    37    40    49    55    60    64    97 
47647  1109     6    17     1   193     2     1     1     2     1     1 

Installing package into ‘/home/mrowacz/R/x86_64-pc-linux-gnu-library/3.4’
(as ‘lib’ is unspecified)



SparkDataFrame[duration:int, src_bytes:int, dst_bytes:int, land:int, wrong_fragment:int, urgent:int, hot:int, num_failed_logins:int, logged_in:int, num_compromised:int, root_shell:int, su_attempted:int, num_root:int, num_file_creations:int, num_shells:int, num_access_files:int, num_outbound_cmds:int, is_host_login:int, is_guest_login:int, count:int, srv_count:int, serror_rate:double, srv_serror_rate:double, rerror_rate:double, srv_rerror_rate:double, same_srv_rate:double, diff_srv_rate:double, srv_diff_host_rate:double, dst_host_count:int, dst_host_srv_count:int, dst_host_same_srv_rate:double, dst_host_diff_srv_rate:double, dst_host_same_src_port_rate:double, dst_host_srv_diff_host_rate:double, dst_host_serror_rate:double, dst_host_srv_serror_rate:double, dst_host_rerror_rate:double, dst_host_srv_rerror_rate:double]

In [2]:
# Plot the projected sample in 3D, colored by cluster assignment.
plot3d(x = projected_data, col = colors, size = 10)