In [2]:
import time
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook. The "local[4]" master
# runs Spark inside this process with 4 worker threads.
spark = (
    SparkSession.builder
    .appName("Classification")
    .master("local[4]")
    .config("spark.executor.cores", 4)
    .getOrCreate()
)

In [3]:
# Directory that holds the MNIST CSV files. The trailing slash matters because
# the file names are appended with plain string concatenation.
path = "data/"

# No header option is passed, so Spark auto-names the columns _c0, _c1, ...;
# inferSchema=True asks Spark to derive the column types from the data.
# The training file is read first, then the test file.
train_mnist, test_mnist = (
    spark.read.csv(path + csv_name, inferSchema=True)
    for csv_name in ("mnist_train.csv", "mnist_test.csv")
)

In [4]:
# Print the inferred schema as a tree, one line per column. The output is long
# because each pixel value is stored in its own column.
train_mnist.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: integer (nullable = true)
 |-- _c4: integer (nullable = true)
 |-- _c5: integer (nullable = true)
 |-- _c6: integer (nullable = true)
 |-- _c7: integer (nullable = true)
 |-- _c8: integer (nullable = true)
 |-- _c9: integer (nullable = true)
 |-- _c10: integer (nullable = true)
 |-- _c11: integer (nullable = true)
 |-- _c12: integer (nullable = true)
 |-- _c13: integer (nullable = true)
 |-- _c14: integer (nullable = true)
 |-- _c15: integer (nullable = true)
 |-- _c16: integer (nullable = true)
 |-- _c17: integer (nullable = true)
 |-- _c18: integer (nullable = true)
 |-- _c19: integer (nullable = true)
 |-- _c20: integer (nullable = true)
 |-- _c21: integer (nullable = true)
 |-- _c22: integer (nullable = true)
 |-- _c23: integer (nullable = true)
 |-- _c24: integer (nullable = true)
 |-- _c25: integer (nullable = true)
 |-- _c26: integer (nullable = true)
 |-- _

In [5]:
def _make_pixel_assembler(frame):
    """Build a VectorAssembler that packs the pixel columns of ``frame``.

    Columns at positions 1..784 are combined into a single vector column
    named "features"; column 0 is left out of the vector.
    """
    pixel_columns = frame.columns[1:785]
    return VectorAssembler(outputCol="features", inputCols=pixel_columns)


# One assembler per split, each configured from its own DataFrame's column list.
train_assembler = _make_pixel_assembler(train_mnist)
test_assembler = _make_pixel_assembler(test_mnist)

In [6]:
def _to_labeled_features(assembler, frame):
    """Return ``frame`` reduced to the two columns "label" and "features".

    The assembler adds the "features" vector column; "_c0" (the first CSV
    column) is kept alongside it and renamed to "label".
    """
    assembled = assembler.transform(frame)
    return assembled.select("_c0", "features").withColumnRenamed("_c0", "label")


# Collapse the per-pixel columns (_c1, _c2, ...) into one "features" column.
train_final_data = _to_labeled_features(train_assembler, train_mnist)
test_final_data = _to_labeled_features(test_assembler, test_mnist)

In [7]:
# Preview the first 20 rows; long cell values are truncated for display.
# (n=20 and truncate=True are show()'s defaults, written out for clarity.)
train_final_data.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    5|(784,[152,153,154...|
|    0|(784,[127,128,129...|
|    4|(784,[160,161,162...|
|    1|(784,[158,159,160...|
|    9|(784,[208,209,210...|
|    2|(784,[155,156,157...|
|    1|(784,[124,125,126...|
|    3|(784,[151,152,153...|
|    1|(784,[152,153,154...|
|    4|(784,[134,135,161...|
|    3|(784,[123,124,125...|
|    5|(784,[216,217,218...|
|    3|(784,[143,144,145...|
|    6|(784,[72,73,74,99...|
|    1|(784,[151,152,153...|
|    7|(784,[211,212,213...|
|    2|(784,[151,152,153...|
|    8|(784,[159,160,161...|
|    6|(784,[100,101,102...|
|    9|(784,[209,210,211...|
+-----+--------------------+
only showing top 20 rows



In [8]:
# Scores a predictions DataFrame by accuracy. labelCol and predictionCol are
# spelled out here; "label" and "prediction" are also the evaluator's defaults.
MC_evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

In [9]:
# Train a single decision tree on the training split, report how long the fit
# took, then score the fitted model on the test split.
#
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; unlike time.time() it cannot jump if the system wall clock is
# adjusted while the fit is running.
start_time = time.perf_counter()

# Add parameters of your choice here:
# NOTE(review): a binary tree with maxDepth=3 has at most 2**3 = 8 leaves, so it
# can predict at most 8 distinct labels -- fewer than the 10 digit classes.
# Raising maxDepth (and maxBins) is the first thing to try for better accuracy.
classifier = DecisionTreeClassifier(labelCol='label',featuresCol='features',maxDepth=3,maxBins=4)

fitModel = classifier.fit(train_final_data)

print("--- %s seconds ---" % (time.perf_counter() - start_time))
print(" ")

# Adds the model's output columns (including "prediction") to the test rows.
predictions = fitModel.transform(test_final_data)

# The evaluator returns accuracy as a fraction; scale it to a percentage.
accuracy = (MC_evaluator.evaluate(predictions))*100
print(" ")
print("Accuracy: ",accuracy)

--- 30.27317190170288 seconds ---
 
 
Accuracy:  45.81


In [10]:
# Display the number of label classes the fitted model was trained with.
fitModel.numClasses

10