# Read the merged data

In [1]:

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext


# Reuse the SparkContext injected by the PySpark kernel when there is one,
# otherwise create it, so this cell does not depend on a pre-defined `sc`.
sc = SparkContext.getOrCreate()
sqlCtx = SQLContext(sc)

# Load the merged CSV; the first row is treated as the header and the column
# types are inferred from the data.
data_merged = sqlCtx.read.load('file:///home/lajotadeladerrota/Escritorio/Universidad/4Curso/MachineLearning/LAB/MachineLearningLAB/milestone3_spark/data/data_merged.csv',
                   format='com.databricks.spark.csv', header='true',
                    inferSchema='true')

## Check columns

In [2]:
# Display the column names of the loaded DataFrame.
data_merged.columns

['GyroscopeStat_x_MEAN',
 'GyroscopeStat_z_MEAN',
 'GyroscopeStat_COV_z_x',
 'GyroscopeStat_COV_z_y',
 'MagneticField_x_MEAN',
 'MagneticField_z_MEAN',
 'MagneticField_COV_z_x',
 'MagneticField_COV_z_y',
 'Pressure_MEAN',
 'LinearAcceleration_COV_z_x',
 'LinearAcceleration_COV_z_y',
 'LinearAcceleration_x_MEAN',
 'LinearAcceleration_z_MEAN',
 'attack']

## PrintSchema

In [3]:
# Print each column's name, inferred type and nullability.
data_merged.printSchema()

root
 |-- GyroscopeStat_x_MEAN: double (nullable = true)
 |-- GyroscopeStat_z_MEAN: double (nullable = true)
 |-- GyroscopeStat_COV_z_x: double (nullable = true)
 |-- GyroscopeStat_COV_z_y: double (nullable = true)
 |-- MagneticField_x_MEAN: double (nullable = true)
 |-- MagneticField_z_MEAN: double (nullable = true)
 |-- MagneticField_COV_z_x: double (nullable = true)
 |-- MagneticField_COV_z_y: double (nullable = true)
 |-- Pressure_MEAN: double (nullable = true)
 |-- LinearAcceleration_COV_z_x: double (nullable = true)
 |-- LinearAcceleration_COV_z_y: double (nullable = true)
 |-- LinearAcceleration_x_MEAN: double (nullable = true)
 |-- LinearAcceleration_z_MEAN: double (nullable = true)
 |-- attack: double (nullable = true)



## Aggregate the predictor features into a single vector column

In [5]:

# Predictor columns, in the order they are assembled into the feature vector.
# 'attack' is the label column and is therefore not listed.
# NOTE(review): 'LinearAcceleration_x_MEAN' exists in data_merged but is not
# in this list -- confirm that leaving it out is intentional.
features = [
    # Gyroscope
    'GyroscopeStat_x_MEAN',
    'GyroscopeStat_z_MEAN',
    'GyroscopeStat_COV_z_x',
    'GyroscopeStat_COV_z_y',
    # Magnetic field
    'MagneticField_x_MEAN',
    'MagneticField_z_MEAN',
    'MagneticField_COV_z_x',
    'MagneticField_COV_z_y',
    # Pressure
    'Pressure_MEAN',
    # Linear acceleration
    'LinearAcceleration_COV_z_x',
    'LinearAcceleration_COV_z_y',
    'LinearAcceleration_z_MEAN',
]

## We use VectorAssembler

In [6]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler


# Pack the selected predictor columns into one vector column named "features".
assembler = VectorAssembler(outputCol="features", inputCols=features)
assembled = assembler.transform(data_merged)

# Split with weights 0.67 / 0.33 for train / test; the seed is fixed so the
# split can be reproduced.
trainingData, testData = assembled.randomSplit(weights=[0.67, 0.33], seed=13234)

# Show the row count of each split.
(trainingData.count(), testData.count())


(56, 16)

In [13]:
from pyspark.ml.classification import DecisionTreeClassifier

# Decision tree limited to depth 1, using Gini impurity, with a minimum of
# 20 instances per node; it learns "attack" from the assembled "features" column.
d_tree = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="attack",
    impurity="gini",
    maxDepth=1,
    minInstancesPerNode=20,
)


In [14]:
from pyspark.ml import Pipeline

# Single-stage pipeline around the decision tree, fitted on the training split.
pipeline = Pipeline().setStages([d_tree])
model = pipeline.fit(trainingData)

# Apply the fitted model to the held-out test split.
predictions = model.transform(testData)

In [15]:
# Show the predicted label next to the true label for the test rows.
predictions.select("prediction","attack").show()

+----------+------+
|prediction|attack|
+----------+------+
|       1.0|   1.0|
|       1.0|   1.0|
|       1.0|   1.0|
|       1.0|   1.0|
|       1.0|   1.0|
|       1.0|   1.0|
|       1.0|   1.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       1.0|   1.0|
+----------+------+



In [152]:
# Persist the predicted and true labels as CSV with a header row.
# mode="overwrite" replaces the output of a previous run; with the default
# save mode this cell raises once the target directory exists, which prevents
# the notebook from being re-run top to bottom.
predictions.select("prediction","attack").write.save(path="file:///home/lajotadeladerrota/Escritorio/Universidad/4Curso/MachineLearning/LAB/MachineLearningLAB/milestone3_spark/data/predictions_test",
                                                      format="com.databricks.spark.csv",header="true",
                                                      mode="overwrite")

In [119]:
# Read the saved predictions back from disk

In [10]:
# Read through sqlCtx, the SQLContext this notebook creates itself, rather than
# `sqlContext`, which is never defined here and only exists when the kernel
# happens to inject it.
# NOTE(review): this loads .../data/predictions, while the save cell writes
# .../data/predictions_test -- confirm which file the evaluation should use.
predictions = sqlCtx.read.load('file:///home/lajotadeladerrota/Escritorio/Universidad/4Curso/MachineLearning/LAB/MachineLearningLAB/milestone3_spark/data/predictions',
                                                      format="com.databricks.spark.csv",header="true", inferSchema="true")

In [11]:
# Print up to 1000 rows of the predictions that were read back.
predictions.show(1000)

+----------+------+
|prediction|attack|
+----------+------+
|       1.0|   1.0|
|       1.0|   1.0|
|       1.0|   1.0|
|       1.0|   1.0|
|       1.0|   1.0|
|       1.0|   1.0|
|       1.0|   1.0|
|       1.0|   1.0|
|       1.0|   1.0|
|       1.0|   1.0|
|       1.0|   1.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       0.0|   0.0|
|       1.0|   1.0|
|       1.0|   1.0|
+----------+------+



In [12]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy evaluator comparing the "prediction" column with the "attack" label.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="attack",
    predictionCol="prediction",
)

# Evaluate the loaded predictions and report the score.
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % accuracy)

Accuracy = 1
