In [1]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=9370c3dbe1ba754de95b4bc5083fb8c735d9ae24805d417f8d873b91c77e7f7c
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
# Create (or reuse) the local-mode Spark session used by every cell below.
spark = (
    SparkSession.builder
    .master('local')
    .appName('dl')
    .getOrCreate()
)

In [44]:
# Load the banknote-authentication CSV; its first line supplies the column names.
csv_path = "data_banknote_authentication.csv"
dataset = spark.read.csv(path=csv_path, header=True)

In [45]:
# Preview the first five rows of the freshly loaded frame.
dataset.show(5)

+---------+---------+---------+---------+------+
|feature_1|feature_2|feature_3|feature_4|Class |
+---------+---------+---------+---------+------+
|   3.6216|   8.6661|  -2.8073| -0.44699|     0|
|   4.5459|   8.1674|  -2.4586|  -1.4621|     0|
|    3.866|  -2.6383|   1.9242|  0.10645|     0|
|   3.4566|   9.5228|  -4.0112|  -3.5944|     0|
|  0.32924|  -4.4552|   4.5718|  -0.9888|     0|
+---------+---------+---------+---------+------+
only showing top 5 rows



In [46]:
# Inspect the column types; the CSV was read without a schema, so every
# column (including the label) shows up as string here.
dataset.printSchema()

root
 |-- feature_1: string (nullable = true)
 |-- feature_2: string (nullable = true)
 |-- feature_3: string (nullable = true)
 |-- feature_4: string (nullable = true)
 |-- Class : string (nullable = true)



In [47]:
# Cast every column (features and label) to double in one projection.
# A single select keeps the query plan flat, whereas calling withColumn in a
# loop adds one projection per column; it also avoids binding a global named
# `col`, which would shadow pyspark.sql.functions.col if that is imported later.
# Column names and order are unchanged.
dataset = dataset.select(
    [dataset[name].cast('double').alias(name) for name in dataset.columns]
)

In [48]:
# Re-check the schema to confirm every column is now double.
dataset.printSchema()

root
 |-- feature_1: double (nullable = true)
 |-- feature_2: double (nullable = true)
 |-- feature_3: double (nullable = true)
 |-- feature_4: double (nullable = true)
 |-- Class : double (nullable = true)



In [49]:
# Feature columns: every column except the trailing one (the 'Class ' label).
feature_count = len(dataset.columns) - 1
input_cols = dataset.columns[:feature_count]

In [50]:
# Show only the feature columns (default row limit) to spot missing values.
dataset.select(*input_cols).show()

+---------+---------+---------+---------+
|feature_1|feature_2|feature_3|feature_4|
+---------+---------+---------+---------+
|   3.6216|   8.6661|  -2.8073| -0.44699|
|   4.5459|   8.1674|  -2.4586|  -1.4621|
|    3.866|  -2.6383|   1.9242|  0.10645|
|   3.4566|   9.5228|  -4.0112|  -3.5944|
|  0.32924|  -4.4552|   4.5718|  -0.9888|
|     null|   9.6718|  -3.9606|  -3.1625|
|   3.5912|   3.0129|  0.72888|  0.56421|
|   2.0922|    -6.81|   8.4636| -0.60216|
|   3.2032|   5.7588| -0.75345| -0.61251|
|   1.5356|   9.1772|  -2.2718| -0.73535|
|   1.2247|   8.7779|  -2.2135| -0.80647|
|   3.9899|  -2.7066|   2.3946|  0.86291|
|   1.8993|   7.6625|  0.15394|  -3.1108|
|  -1.5768|   10.843|   2.5462|  -2.9362|
|    3.404|   8.7261|  -2.9915| -0.57242|
|   4.6765|  -3.3895|   3.4896|   1.4771|
|   2.6719|   3.0646|  0.37158|  0.58619|
|  0.80355|   2.8473|   4.3439|   0.6017|
|   1.4479|     null|   8.3428|  -2.1086|
|   5.2423|  11.0272|   -4.353|  -4.1013|
+---------+---------+---------+---

In [51]:
from pyspark.ml.feature import Imputer, MinMaxScaler

In [52]:
# Names for the imputed output columns: f_1 through f_4.
imputed_col = [f'f_{idx}' for idx in range(1, 5)]

In [53]:
# Display the generated output-column names.
imputed_col

['f_1', 'f_2', 'f_3', 'f_4']

In [54]:
# Fit a mean imputer that reads the raw feature columns and writes the
# filled-in values to the f_* columns.
# NOTE(review): missingValue=None is passed explicitly — confirm which
# placeholder value this resolves to inside Spark.
# NOTE(review): the imputer is fitted on the whole dataset, before the
# train/test split further down, so test rows contribute to the means.
imputer = Imputer(
    inputCols=input_cols,
    outputCols=imputed_col,
    strategy='mean',
    missingValue=None,
)
model = imputer.fit(dataset)

In [55]:
# Apply the fitted imputer; the f_* columns are appended next to the originals.
impute_data = model.transform(dataset)

In [56]:
# Inspect the imputed frame (default row limit) to check that nulls were filled.
impute_data.show()

+---------+---------+---------+---------+------+------------------+------------------+--------+--------+
|feature_1|feature_2|feature_3|feature_4|Class |               f_1|               f_2|     f_3|     f_4|
+---------+---------+---------+---------+------+------------------+------------------+--------+--------+
|   3.6216|   8.6661|  -2.8073| -0.44699|   0.0|            3.6216|            8.6661| -2.8073|-0.44699|
|   4.5459|   8.1674|  -2.4586|  -1.4621|   0.0|            4.5459|            8.1674| -2.4586| -1.4621|
|    3.866|  -2.6383|   1.9242|  0.10645|   0.0|             3.866|           -2.6383|  1.9242| 0.10645|
|   3.4566|   9.5228|  -4.0112|  -3.5944|   0.0|            3.4566|            9.5228| -4.0112| -3.5944|
|  0.32924|  -4.4552|   4.5718|  -0.9888|   0.0|           0.32924|           -4.4552|  4.5718| -0.9888|
|     null|   9.6718|  -3.9606|  -3.1625|   0.0|0.4308653338439095|            9.6718| -3.9606| -3.1625|
|   3.5912|   3.0129|  0.72888|  0.56421|   0.0|       

In [57]:
# Combine the imputed f_* columns into a single vector column.
assemble = VectorAssembler(
    outputCol='assembled_features',
    inputCols=imputed_col,
)

In [58]:
# Add the 'assembled_features' vector column to the imputed frame.
a_data = assemble.transform(impute_data)

In [59]:
# Preview the first five rows, including the assembled vector column.
a_data.show(5)

+---------+---------+---------+---------+------+-------+-------+-------+--------+--------------------+
|feature_1|feature_2|feature_3|feature_4|Class |    f_1|    f_2|    f_3|     f_4|  assembled_features|
+---------+---------+---------+---------+------+-------+-------+-------+--------+--------------------+
|   3.6216|   8.6661|  -2.8073| -0.44699|   0.0| 3.6216| 8.6661|-2.8073|-0.44699|[3.6216,8.6661,-2...|
|   4.5459|   8.1674|  -2.4586|  -1.4621|   0.0| 4.5459| 8.1674|-2.4586| -1.4621|[4.5459,8.1674,-2...|
|    3.866|  -2.6383|   1.9242|  0.10645|   0.0|  3.866|-2.6383| 1.9242| 0.10645|[3.866,-2.6383,1....|
|   3.4566|   9.5228|  -4.0112|  -3.5944|   0.0| 3.4566| 9.5228|-4.0112| -3.5944|[3.4566,9.5228,-4...|
|  0.32924|  -4.4552|   4.5718|  -0.9888|   0.0|0.32924|-4.4552| 4.5718| -0.9888|[0.32924,-4.4552,...|
+---------+---------+---------+---------+------+-------+-------+-------+--------+--------------------+
only showing top 5 rows



In [60]:
# Rescale each component of the assembled vector into the [0.0, 1.0] range.
scaler = MinMaxScaler(
    inputCol='assembled_features',
    outputCol='features',
    min=0.0,
    max=1.0,
)

In [61]:
# Fit the scaler and apply it to the same frame it was fitted on.
# NOTE(review): this runs on the full dataset, before the train/test split
# further down, so test rows influence the scaling statistics.
scaler_model = scaler.fit(a_data)
s_data = scaler_model.transform(a_data)

In [62]:
# Preview the scaled feature vectors.
s_data.select('features').show(5)

+--------------------+
|            features|
+--------------------+
|[0.76900388695382...|
|[0.83565901535310...|
|[0.78662859038429...|
|[0.75710504871312...|
|[0.53157807440740...|
+--------------------+
only showing top 5 rows



In [63]:
# NOTE(review): this rename has no visible effect. The real column is named
# 'Class ' (with a trailing space), not 'Class', and s_data still exposes
# 'Class ' afterwards — the later cells select and train on 'Class ' directly.
# Fixing the name here would require updating those cells to use 'label' too.
s_data = s_data.withColumnRenamed('Class','label')

In [67]:
# List the column names; note the trailing space in 'Class '.
dataset.columns

['feature_1', 'feature_2', 'feature_3', 'feature_4', 'Class ']

In [68]:
# Preview the label next to the scaled feature vector.
preview_cols = ['Class ', 'features']
s_data.select(preview_cols[0], preview_cols[1]).show(5)

+------+--------------------+
|Class |            features|
+------+--------------------+
|   0.0|[0.76900388695382...|
|   0.0|[0.83565901535310...|
|   0.0|[0.78662859038429...|
|   0.0|[0.75710504871312...|
|   0.0|[0.53157807440740...|
+------+--------------------+
only showing top 5 rows



In [75]:
# Hold-out split (not k-fold cross-validation): 75% train / 25% test,
# seeded so the partition is repeatable.
labelled = s_data.select('Class ', 'features')
train_df, test_df = labelled.randomSplit([0.75, 0.25], seed=0)

In [76]:
# Number of rows in the training split.
train_df.count()

1013

In [78]:
# Number of rows in the held-out test split.
test_df.count()

359

In [105]:
# Network shape: 4 inputs (one per feature column), one hidden layer of 32
# units, 2 outputs (presumably the two banknote classes — confirm).
layer_sizes = [4, 32, 2]

# Multilayer perceptron trained with the 'gd' solver; the seed is fixed
# for repeatable runs.
mlpc = MultilayerPerceptronClassifier(
    labelCol='Class ',
    featuresCol='features',
    layers=layer_sizes,
    solver='gd',
    maxIter=600,
    blockSize=8,
    seed=0,
)

In [106]:
# Train the network on the training split.
ann = mlpc.fit(train_df)

In [107]:
# Score the held-out split with the trained network.
pred = ann.transform(test_df)

In [108]:
# Accuracy on the held-out split: compares the 'prediction' column
# against the 'Class ' label.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    predictionCol='prediction',
    labelCol='Class ',
)
accuracy = evaluator.evaluate(pred)
accuracy

0.8245125348189415