In [1]:
import findspark
# Must run before the first `pyspark` import in the next cell.
# NOTE(review): presumably this locates the local Spark install and puts
# pyspark on sys.path — confirm against the findspark docs.
findspark.init()

In [2]:
from pyspark.sql import SparkSession, SQLContext
# Create the notebook's SparkSession, or reuse one that already exists in
# this process (`getOrCreate`). The master is "local" and the application
# name is "Test Spark".
# NOTE(review): "spark.some.config.option" looks like a placeholder key —
# confirm it is not needed and drop it if so.
_session_builder = SparkSession.builder.master("local").appName("Test Spark")
_session_builder = _session_builder.config("spark.some.config.option", "some-value")
spark = _session_builder.getOrCreate()

In [3]:
# SparkContext behind the session; the SQLContext cell below is built from it.
sc = spark.sparkContext

In [4]:
# Bare expression: lets Jupyter render the session object as the cell output.
spark

In [5]:
# SQL entry point used by the CSV-loading cell (`sqlcontext.read`).
# NOTE(review): SQLContext is the legacy entry point; `spark` (the
# SparkSession) appears to expose the same `.read` API — confirm and
# consider using it directly.
sqlcontext = SQLContext(sc)



In [6]:
#numeric_cols = ["{}".format(x) for x in range(2,32)]

In [7]:
#numeric_cols

In [8]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [9]:
# Load wpbc.csv with Spark's built-in CSV reader on the SparkSession. The first
# line is treated as the header and column types are inferred from the data.
wpbc_df = spark.read.csv('wpbc.csv', header=True, inferSchema=True)
# Original variable name, kept as an alias so any cell that still refers to it
# keeps working. It does not describe the wpbc.csv data loaded here.
letter_recognition_df = wpbc_df

# Numeric input columns, in the order they are packed into the feature vector.
feature_columns = [
    'Time',
    'Mean Radius', 'Mean Texture', 'Mean Perimeter', 'Mean Area',
    'Mean Smoothness', 'Mean Compactness', 'Mean Concavity',
    'Mean Concave Points', 'Mean Symmetry', 'Mean Fractal Dimension',
    'Radius SE', 'Texture SE', 'Perimeter SE', 'SE Area',
    'SE Smoothness', 'SE Compactness', 'SE Concavity',
    'SE Concave Points', 'SE Symmetry', 'SE Fractal Dimension',
    'Worst Radius', 'Worst Texture', 'Worst Perimeter', 'Worst Area',
    'Worst Smoothness', 'Worst Compactness', 'Worst Concavity',
    'Worst Concave Points', 'Worst Symmetry', 'Worst Fractal Dimension',
    'Tumor size', 'Lymph node status',
]

# Fail early with a readable message if the file does not have the expected
# header. Otherwise the rename below is silently skipped and the error only
# surfaces later, from the assembler or the select.
missing_columns = [
    name for name in feature_columns + ['Outcome'] if name not in wpbc_df.columns
]
if missing_columns:
    raise ValueError("wpbc.csv is missing expected column(s): %s" % missing_columns)

# Pack the inputs into a single vector column named 'features', and rename the
# target column 'Outcome' to 'label'.
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
vectorised_df = (
    vector_assembler.transform(wpbc_df)
    .withColumnRenamed('Outcome', 'label')
    .select('label', 'features')
)
# Show one row without truncating the vector, as a sanity check.
vectorised_df.show(1, truncate=False)

+-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                                                        |
+-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0    |[31.0,18.02,27.6,117.5,1013.0,0.09489,0.1036,0.1086,0.07055,0.1865,0.06333,0.6249,1.89,3.972,71.55,0.004433,0.01421,0.03233,0.009854,0.01694,0.003495,21.63,37.08,139.7,1436.0,0.1195,0.1926,0.314,0.117,0.2677,0.08113,5.0,5.0]|
+-----+-------------------------------------------------------------

In [10]:
#letter_recognition_df.printSchema()

In [11]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures', withMean=True, withStd=True)
#scaler = StandardScaler(inputCol='features', outputCol='features', withMean=True, withStd=True)
scaler_model = scaler.fit(vectorised_df)
vectorised_df = scaler_model.transform(vectorised_df)

In [12]:
vectorised_df = vectorised_df.select("scaledFeatures","label")
vectorised_df = vectorised_df.withColumnRenamed("scaledFeatures", "features")

In [13]:
#vectorised_df.write.mode("overwrite").csv("wdbc_normalized.csv", header=True)

In [14]:
train_df, test_df = vectorised_df.randomSplit([0.8, 0.2], seed=12345)
train_df.count(), test_df.count()

(167, 27)

In [15]:
# Print the training frame's column names and types (a vector `features`
# column and an integer `label` column, per the recorded output).
train_df.printSchema()

root
 |-- features: vector (nullable = true)
 |-- label: integer (nullable = true)



In [16]:
#from pyspark.ml.feature import VectorAssembler
#feature_columns = ['Mean Radius','Mean Texture','Mean Perimeter']
#assembler = VectorAssembler(inputCols = feature_columns, outputCol='features')
#vectorised_df = assembler.transform(train_df).withColumnRenamed('Outcome', 'label').select('label', 'features')
#train_df = assembler.transform(train_df)
#test_df = assembler.transform(test_df)

In [17]:
# Preview a single training row; long values are truncated in the display.
train_df.show(1)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[-1.3306283967537...|    0|
+--------------------+-----+
only showing top 1 row



In [18]:
# Take the input width from an actual feature vector rather than the column's
# `ml_attr` metadata. That metadata exists only when the upstream transformer
# attaches it, so indexing into it can raise KeyError.
first_train_row = train_df.first()
if first_train_row is None:
    raise ValueError("train_df is empty; cannot infer the number of input features")
num_features = len(first_train_row["features"])

# Network shape: input layer, four hidden layers (128/64/32/16 units), and a
# 2-unit output layer for the two label values (0 and 1).
layers = [num_features, 128, 64, 32, 16, 2]

In [19]:
# Configure the estimator with the layer sizes built above and a fixed seed.
# No column names are passed, so the estimator's default feature/label column
# names apply; train_df's columns are named `features` and `label`.
mlp = MultilayerPerceptronClassifier(layers=layers, seed=42)

In [20]:
# Train the network on the training split; `model` is the fitted transformer
# used to score the test split below.
model = mlp.fit(train_df)

In [21]:
# Score the test split with the fitted model, then list each row's true label
# next to the predicted class and its probability vector.
test_predictions_df = model.transform(test_df)

print("TEST DATASET PREDICTIONS AGAINST ACTUAL LABEL: ")
_columns_to_show = ["label", "features", "probability", "prediction"]
test_predictions_df.select(*_columns_to_show).show()

TEST DATASET PREDICTIONS AGAINST ACTUAL LABEL: 
+-----+--------------------+--------------------+----------+
|label|            features|         probability|prediction|
+-----+--------------------+--------------------+----------+
|    0|[-1.2726970886365...|[0.99999999994896...|       0.0|
|    1|[-1.0989031642849...|[1.79426719706482...|       1.0|
|    1|[-1.0989031642849...|[0.99999999996569...|       0.0|
|    1|[-1.0699375102263...|[0.28493391318038...|       1.0|
|    1|[-1.0409718561677...|[1.85770919155627...|       1.0|
|    1|[-0.9540748939919...|[0.99877739773863...|       0.0|
|    0|[-0.9251092399333...|[0.99999999988153...|       0.0|
|    1|[-0.6064870452887...|[1.36717813813749...|       1.0|
|    0|[-0.2588991965856...|[2.04538004831459...|       1.0|
|    1|[-0.2299335425270...|[0.99999988151933...|       0.0|
|    1|[-0.2009678884684...|[0.99999999994181...|       0.0|
|    0|[0.17558561429335...|[0.99999999991079...|       0.0|
|    0|[0.26248257646914...|[0.999999

In [22]:
# Only the predicted class and the true label are needed for scoring.
prediction_and_labels = test_predictions_df.select("prediction", "label")

# One evaluator per metric. The precision and recall figures are the
# *weighted* variants, even though the printed labels say just
# "Precision" / "Recall".
accuracy_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
precision_evaluator = MulticlassClassificationEvaluator(metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(metricName="weightedRecall")

_report = (
    ("Accuracy", accuracy_evaluator),
    ("Precision", precision_evaluator),
    ("Recall", recall_evaluator),
)
for _metric_title, _evaluator in _report:
    _score = _evaluator.evaluate(prediction_and_labels)
    print("%s on Test Dataset = %g" % (_metric_title, _score))

Accuracy on Test Dataset = 0.777778
Precision on Test Dataset = 0.781818
Recall on Test Dataset = 0.777778
