### Step 1: Create Spark Session 

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Start (or reuse) the notebook's SparkSession against a local master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("appName")
    .getOrCreate()
)

### Step 2: Load dataset


Sparse training data is very common in practice. MLlib supports reading training examples stored in LIBSVM format, which is the default format used by LIBSVM and LIBLINEAR. It is a text format in which each line represents a labeled sparse feature vector using the following format:

`label index1:value1 index2:value2 ...`

The indices are one-based and in ascending order. After loading, the feature indices are converted to zero-based.


In [2]:
# Read the LIBSVM-formatted sample file into a DataFrame, then preview five rows.
libsvm_reader = spark.read.format("libsvm")
data = libsvm_reader.load("sample_libsvm_linear_regression_data.txt")
data.show(5)

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
+-------------------+--------------------+
only showing top 5 rows



### Step 3: Data Preparation

In [3]:
# Detect categorical features and index them. Note that the indexer is fit on
# the full dataset, i.e. before the train/test split below.
# maxCategories=4: any feature with more than 4 distinct values stays continuous.
indexer_stage = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
)
featureIndexer = indexer_stage.fit(data)

# 70% of the rows go to training, 30% are held out for testing; a fixed seed
# is passed to the split.
split_weights = [0.7, 0.3]
trainingData, testData = data.randomSplit(split_weights, seed=0)


### Step 4: Train the Model

In [4]:
# Decision-tree regressor that reads the indexed feature column.
dt = DecisionTreeRegressor(featuresCol="indexedFeatures")

# Two-stage pipeline: the feature indexer first, then the tree.
pipeline_stages = [featureIndexer, dt]
pipeline = Pipeline(stages=pipeline_stages)

# Fit the whole pipeline on the training split to obtain the trained model.
model = pipeline.fit(trainingData)


### Step 4: Make Predictions

In [5]:
# Score the held-out test split with the fitted pipeline.
predictions = model.transform(testData)

# Preview the first five predictions next to their labels and features.
preview_columns = ["prediction", "label", "features"]
predictions.select(*preview_columns).show(5)



+--------------------+-------------------+--------------------+
|          prediction|              label|            features|
+--------------------+-------------------+--------------------+
|-0.17221742429203743|-28.571478869743427|(10,[0,1,2,3,4,5,...|
|-0.17221742429203743|-26.736207182601724|(10,[0,1,2,3,4,5,...|
| -3.6938082137917143| -23.51088409032297|(10,[0,1,2,3,4,5,...|
|  13.167841814201045|-23.487440120936512|(10,[0,1,2,3,4,5,...|
|  13.167841814201045|-22.837460416919342|(10,[0,1,2,3,4,5,...|
+--------------------+-------------------+--------------------+
only showing top 5 rows



### Step 5: Evaluation

In [6]:
# Measure test error: RMSE between the predicted values and the true labels.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="label",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
rmse_report = "Root Mean Squared Error (RMSE) on test data = %g" % rmse
print(rmse_report)


Root Mean Squared Error (RMSE) on test data = 13.0772


In [7]:
# The pipeline's second stage (index 1) is the fitted decision tree.
fitted_stages = model.stages
treeModel = fitted_stages[1]
# Printing the model gives only a one-line summary (depth and node count).
print(treeModel)

DecisionTreeRegressionModel (uid=DecisionTreeRegressor_91ba126a8d85) of depth 5 with 63 nodes
