In [1]:
## Spark Lib
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.util import MLUtils

from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.ml.linalg import Vectors
from pyspark.mllib.util import MLUtils

#import pyarrow

## SKLearn Lib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score

# Record the wall-clock time (seconds since the epoch) at which the notebook run
# starts; presumably used further down to report total elapsed time -- TODO confirm.
import time
start_time = time.time()
%matplotlib inline

## Configure parameters

In [2]:
# Path to dataset file
# `path` is not defined in this notebook: it is restored from IPython's %store
# database, so it must have been saved beforehand with `%store path`
# (e.g. from another notebook). On a fresh environment without that stored
# value, the CSV load below has no `path` to read.
#data_path='/data/biodata/Iris/'
%store -r path

# Relative weights of the training and test splits (70% / 30%)
train_sample, test_sample = 0.7, 0.3

## Neural network and SVM using Spark

In [3]:
# Start (or reuse) a Spark session running locally with master "local[8]"
spark = (
    SparkSession.builder
    .master("local[8]")
    .appName("MachineLearningIris")
    .getOrCreate()
)

# Arrow-based columnar data transfers are left disabled; uncomment to enable:
#spark.conf.set("spark.sql.execution.arrow.enabled", "true")

## Reading Data

In [4]:
# Read the Iris CSV file (comma separated, first row is the header) into a
# Spark DataFrame, letting Spark infer the column types.
csv_reader = spark.read.format("csv").options(sep=',', header='true', inferschema='true')
orig_data = csv_reader.load(path)

# Preview the first rows of the raw data
print("Original Dataframe read from CSV file")
#orig_data.dtypes
orig_data.show(5)

Original Dataframe read from CSV file
+------------+-----------+------------+-----------+-----------+
|sepal length|sepal width|petal length|petal width|      class|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 5 rows



### Create Classifier Matrix

In [5]:
# Spark ML estimators do not accept string columns: everything must be numeric.
# Encode the string column "class" into a numeric column "label".
indexer = StringIndexer(inputCol="class", outputCol="label").fit(orig_data)
label_data = indexer.transform(orig_data)

# Inverse map from the numeric "label" back to the string class name, used later
# to make results readable.
#  - `labels` is passed explicitly so the decoder does not depend on the "label"
#    column metadata surviving every downstream transformation.
#  - `outputCol` is set so the decoded column is called "class_name" instead of
#    an auto-generated "IndexToString_<uid>__output".
# NOTE(review): this decodes the true "label" column, not "prediction"; to show
# the predicted class name, a second IndexToString with inputCol="prediction"
# and the same labels would be needed -- confirm which one is intended.
labelReverse = IndexToString(
    inputCol="label",
    outputCol="class_name",
    labels=indexer.labels,
)

# Show the labeled dataframe with the numeric label
print("Dataframe with numeric lable")
label_data.show(5)

Dataframe with numeric lable
+------------+-----------+------------+-----------+-----------+-----+
|sepal length|sepal width|petal length|petal width|      class|label|
+------------+-----------+------------+-----------+-----------+-----+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|  0.0|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|  0.0|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|  0.0|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|  0.0|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|  0.0|
+------------+-----------+------------+-----------+-----------+-----+
only showing top 5 rows



In [6]:
# Drop the string column "class"; only numeric columns remain afterwards
label_data = label_data.drop("class")

# Most Spark ML estimators take two columns as input:
#   "label"    - the target the algorithm is trained to predict
#   "features" - a single vector column holding all input parameters

# Columns that are NOT part of the feature vector
ignore = ['label']
# Every remaining column is a feature. The variable is named `feature_cols`
# (not `list`) so the built-in `list` type is not shadowed for the rest of the
# kernel session.
feature_cols = [x for x in label_data.columns if x not in ignore]

# VectorAssembler packs the feature columns into one vector column
assembler = VectorAssembler(
            inputCols=feature_cols,
            outputCol='features')

# Final dataframe: the label plus the assembled feature vector
data = (assembler.transform(label_data).select("label","features"))

print("Final Dataframe suitable to classifier input format")
#data.printSchema()
data.show(5)

Final Dataframe suitable to classifier input format
+-----+-----------------+
|label|         features|
+-----+-----------------+
|  0.0|[5.1,3.5,1.4,0.2]|
|  0.0|[4.9,3.0,1.4,0.2]|
|  0.0|[4.7,3.2,1.3,0.2]|
|  0.0|[4.6,3.1,1.5,0.2]|
|  0.0|[5.0,3.6,1.4,0.2]|
+-----+-----------------+
only showing top 5 rows



### Create Train and Test Dataset

In [7]:
# Randomly split the dataset into a training group and a test group.
# The weights are relative: Spark normalizes them when they do not sum to 1.
#   [0.7, 0.3]  => about 70% for training and 30% for testing
#   [1.0, 0.2]  => normalized to about 83% / 17%, NOT 100% / 20%
#   [0.1, 0.02] => same 83% / 17% proportions after normalization; it does not
#                  subsample a big dataset
# The second argument (1234) is the random seed.
split_weights = [train_sample, test_sample]
train, test = data.randomSplit(split_weights, 1234)

## Run Perceptron

In [8]:
# Start the stopwatch for the perceptron section
start_time_pr = time.time()

# Network topology, one entry per layer:
#   1st entry      - input layer, should equal the number of features
#   2nd/3rd entry  - number of perceptrons in the two intermediate layers
#   4th entry      - output layer, should equal the number of categories (labels)
layers = [4, 5, 5, 3]

# Configure the trainer:
#   featuresCol / labelCol - names of the feature-vector and label columns
#   maxIter                - maximum number of iterations
#   layers                 - layer sizes defined above
trainer = MultilayerPerceptronClassifier(
    featuresCol='features',
    labelCol='label',
    maxIter=100,
    layers=layers,
    blockSize=128,
    seed=1234,
)

# Fit on the training split, then predict on the test split
model = trainer.fit(train)
result_pr = model.transform(test)

print("Perceptron Final Result")
result_pr.show(5)

Perceptron Final Result
+-----+-----------------+--------------------+--------------------+----------+
|label|         features|       rawPrediction|         probability|prediction|
+-----+-----------------+--------------------+--------------------+----------+
|  0.0|[4.3,3.0,1.1,0.1]|[125.163773163649...|[1.0,7.0597849525...|       0.0|
|  0.0|[4.4,2.9,1.4,0.2]|[125.156306784322...|[1.0,7.1231710288...|       0.0|
|  0.0|[4.4,3.0,1.3,0.2]|[125.163112010735...|[1.0,7.0653749965...|       0.0|
|  0.0|[4.8,3.1,1.6,0.2]|[125.153911586500...|[1.0,7.1436254650...|       0.0|
|  0.0|[5.0,3.5,1.6,0.6]|[125.164148267820...|[1.0,7.0566154156...|       0.0|
+-----+-----------------+--------------------+--------------------+----------+
only showing top 5 rows



In [9]:
# Evaluator that compares the "prediction" column against the true "label"
# column and reports the accuracy metric
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

# Accuracy on the test set, expressed as a percentage
accuracy_pr = evaluator.evaluate(result_pr) * 100
# Seconds elapsed since start_time_pr was set (this evaluation is included)
time_pr = time.time() - start_time_pr

print("Multilayer Perceptron: accuracy = %3.1f %%" % accuracy_pr)
print("Multilayer Perceptron: time = %3.3f s" % time_pr)

Multilayer Perceptron: accuracy = 97.5 %
Multilayer Perceptron: time = 6.031 s


In [10]:
# Run the perceptron result through labelReverse so a column with the string
# class name is appended, then display it
print("Perceptron final result with name of class")
result_pr_named = labelReverse.transform(result_pr)
result_pr_named.show()

Perceptron final result with name of class
+-----+-----------------+--------------------+--------------------+----------+----------------------------------+
|label|         features|       rawPrediction|         probability|prediction|IndexToString_7f4f1ca1ab75__output|
+-----+-----------------+--------------------+--------------------+----------+----------------------------------+
|  0.0|[4.3,3.0,1.1,0.1]|[125.163773163649...|[1.0,7.0597849525...|       0.0|                       Iris-setosa|
|  0.0|[4.4,2.9,1.4,0.2]|[125.156306784322...|[1.0,7.1231710288...|       0.0|                       Iris-setosa|
|  0.0|[4.4,3.0,1.3,0.2]|[125.163112010735...|[1.0,7.0653749965...|       0.0|                       Iris-setosa|
|  0.0|[4.8,3.1,1.6,0.2]|[125.153911586500...|[1.0,7.1436254650...|       0.0|                       Iris-setosa|
|  0.0|[5.0,3.5,1.6,0.6]|[125.164148267820...|[1.0,7.0566154156...|       0.0|                       Iris-setosa|
|  0.0|[5.0,3.6,1.4,0.2]|[125.164150423534...