## Create Session

In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session: reuse an existing one if present,
# otherwise start a new session against the "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("AppName")
    .getOrCreate()
)

## Step 1. Load Data

In [2]:
# Read the sample classification file using the "libsvm" data source,
# then display the column names and types of the resulting DataFrame.
data = spark.read.load("sample_libsvm_classification_data.txt", format="libsvm")
data.dtypes

[('label', 'double'), ('features', 'vector')]

## Step 2. Data Preparation

In [3]:
# Label indexing: map each raw "label" value to an index in "indexedLabel".
# The indexer is fitted on the full dataset so that every label is covered.
labelIndexer = StringIndexer(
    inputCol="label",
    outputCol="indexedLabel",
).fit(data)

# Feature indexing: detect categorical features automatically and index them.
# With maxCategories=4, any feature with more than 4 distinct values is
# treated as continuous.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(data)

# Reverse mapping: turn the numeric "prediction" back into the original label
# text, using the label ordering learned by labelIndexer.
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelIndexer.labels,
)

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=0)


## Step 3. Training Pipeline 

In [4]:
# Train a RandomForest model on the indexed label/feature columns.
# An explicit seed pins the forest's random sampling so that re-running the
# notebook yields the same model, consistent with the seeded randomSplit above.
rf = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    numTrees=10,
    seed=0,
)

# Chain indexers and forest in a Pipeline.
# labelIndexer and featureIndexer were already fitted on the full dataset,
# so within the pipeline only the forest still needs training.
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])

# Train model. This also runs the indexers on the training data.
model = pipeline.fit(trainingData)


## Step 4. Prediction

In [5]:
# Apply the fitted pipeline to the held-out test rows.
predictions = model.transform(testData)

# Print the first five prediction rows for a quick visual check.
predictions.show(n=5)


+-----+--------------------+------------+--------------------+-------------+-----------+----------+--------------+
|label|            features|indexedLabel|     indexedFeatures|rawPrediction|probability|prediction|predictedLabel|
+-----+--------------------+------------+--------------------+-------------+-----------+----------+--------------+
|  0.0|(692,[95,96,97,12...|         1.0|(692,[95,96,97,12...|   [0.0,10.0]|  [0.0,1.0]|       1.0|           0.0|
|  0.0|(692,[121,122,123...|         1.0|(692,[121,122,123...|   [0.0,10.0]|  [0.0,1.0]|       1.0|           0.0|
|  0.0|(692,[122,123,124...|         1.0|(692,[122,123,124...|   [0.0,10.0]|  [0.0,1.0]|       1.0|           0.0|
|  0.0|(692,[122,123,148...|         1.0|(692,[122,123,148...|    [2.0,8.0]|  [0.2,0.8]|       1.0|           0.0|
|  0.0|(692,[123,124,125...|         1.0|(692,[123,124,125...|   [0.0,10.0]|  [0.0,1.0]|       1.0|           0.0|
+-----+--------------------+------------+--------------------+-------------+-----------+----------+--------------+

## Step 5. Evaluation

In [6]:
# Compare the predicted index against the true indexed label and score the
# predictions by accuracy; the test error is reported as 1 - accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)

accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

# Stage 2 of the fitted pipeline is the random forest (after the two indexers).
rfModel = model.stages[2]
# Printing the model gives a one-line summary only, not the full trees.
print(rfModel)

Test Error = 0
RandomForestClassificationModel (uid=RandomForestClassifier_95be9fd266d3) with 10 trees
