In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.master("spark://spark-master:7077")\
                            .appName("convtype")\
                            .config("spark.executor.memory", "6g")\
                            .getOrCreate()

In [3]:
data = spark.sparkContext.textFile("./data/covtype.data.gz")

In [4]:
print(data.first())
data.count()

2596,51,3,258,0,510,221,232,148,6279,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5


581012

In [5]:
raw = data.map(lambda x: x.replace("'","").split(','))
raw.first()
len(raw.first())

55

In [6]:
rawDF = spark.createDataFrame(data=raw)

In [7]:
rawDF.show(5)

+----+---+---+---+---+----+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|  _1| _2| _3| _4| _5|  _6| _7| _8| _9| _10|_11|_12|_13|_14|_15|_16|_17|_18|_19|_20|_21|_22|_23|_24|_25|_26|_27|_28|_29|_30|_31|_32|_33|_34|_35|_36|_37|_38|_39|_40|_41|_42|_43|_44|_45|_46|_47|_48|_49|_50|_51|_52|_53|_54|_55|
+----+---+---+---+---+----+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|2596| 51|  3|258|  0| 510|221|232|148|6279|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  5|
|2590| 56|  2|212| -6| 390|220|235|151|6225|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|

In [8]:
from pyspark.sql.types import StructType, StructField, DoubleType, StringType

In [9]:
schema = StructType([StructField("Elevation", StringType(), True),
                    StructField("Aspect", StringType(), True),
                    StructField("Slope", StringType(), True),
                    StructField("Horizontal_Distance_To_Hydrology", StringType(), True),
                    StructField("Vertical_Distance_To_Hydrology", StringType(), True),                     
                    StructField("Horizontal_Distance_To_RoadWays", StringType(), True),
                    StructField("Hillshade_9am", StringType(), True),
                    StructField("Hillshade_Noon", StringType(), True),
                    StructField("Hillshade_3pm", StringType(), True),
                    StructField("Horizontal_Distance_To_ire_Points", StringType(), True),
                    *[StructField("Wilderness_Area_" + str(i), StringType(), True) for i in range(1,5)],                     
])
for i in range(1, 41):
    schema.add(StructField(f"Soil_Type_{i}", StringType(), True))

schema.add(StructField("Cover_Type", StringType(), True))

StructType(List(StructField(Elevation,StringType,true),StructField(Aspect,StringType,true),StructField(Slope,StringType,true),StructField(Horizontal_Distance_To_Hydrology,StringType,true),StructField(Vertical_Distance_To_Hydrology,StringType,true),StructField(Horizontal_Distance_To_RoadWays,StringType,true),StructField(Hillshade_9am,StringType,true),StructField(Hillshade_Noon,StringType,true),StructField(Hillshade_3pm,StringType,true),StructField(Horizontal_Distance_To_ire_Points,StringType,true),StructField(Wilderness_Area_1,StringType,true),StructField(Wilderness_Area_2,StringType,true),StructField(Wilderness_Area_3,StringType,true),StructField(Wilderness_Area_4,StringType,true),StructField(Soil_Type_1,StringType,true),StructField(Soil_Type_2,StringType,true),StructField(Soil_Type_3,StringType,true),StructField(Soil_Type_4,StringType,true),StructField(Soil_Type_5,StringType,true),StructField(Soil_Type_6,StringType,true),StructField(Soil_Type_7,StringType,true),StructField(Soil_Type_8

In [10]:
schemedDF = spark.createDataFrame(data=raw, schema=schema)

In [11]:
schemedDF.printSchema()

root
 |-- Elevation: string (nullable = true)
 |-- Aspect: string (nullable = true)
 |-- Slope: string (nullable = true)
 |-- Horizontal_Distance_To_Hydrology: string (nullable = true)
 |-- Vertical_Distance_To_Hydrology: string (nullable = true)
 |-- Horizontal_Distance_To_RoadWays: string (nullable = true)
 |-- Hillshade_9am: string (nullable = true)
 |-- Hillshade_Noon: string (nullable = true)
 |-- Hillshade_3pm: string (nullable = true)
 |-- Horizontal_Distance_To_ire_Points: string (nullable = true)
 |-- Wilderness_Area_1: string (nullable = true)
 |-- Wilderness_Area_2: string (nullable = true)
 |-- Wilderness_Area_3: string (nullable = true)
 |-- Wilderness_Area_4: string (nullable = true)
 |-- Soil_Type_1: string (nullable = true)
 |-- Soil_Type_2: string (nullable = true)
 |-- Soil_Type_3: string (nullable = true)
 |-- Soil_Type_4: string (nullable = true)
 |-- Soil_Type_5: string (nullable = true)
 |-- Soil_Type_6: string (nullable = true)
 |-- Soil_Type_7: string (nullable 

In [12]:
schemedDF.head

<bound method DataFrame.head of DataFrame[Elevation: string, Aspect: string, Slope: string, Horizontal_Distance_To_Hydrology: string, Vertical_Distance_To_Hydrology: string, Horizontal_Distance_To_RoadWays: string, Hillshade_9am: string, Hillshade_Noon: string, Hillshade_3pm: string, Horizontal_Distance_To_ire_Points: string, Wilderness_Area_1: string, Wilderness_Area_2: string, Wilderness_Area_3: string, Wilderness_Area_4: string, Soil_Type_1: string, Soil_Type_2: string, Soil_Type_3: string, Soil_Type_4: string, Soil_Type_5: string, Soil_Type_6: string, Soil_Type_7: string, Soil_Type_8: string, Soil_Type_9: string, Soil_Type_10: string, Soil_Type_11: string, Soil_Type_12: string, Soil_Type_13: string, Soil_Type_14: string, Soil_Type_15: string, Soil_Type_16: string, Soil_Type_17: string, Soil_Type_18: string, Soil_Type_19: string, Soil_Type_20: string, Soil_Type_21: string, Soil_Type_22: string, Soil_Type_23: string, Soil_Type_24: string, Soil_Type_25: string, Soil_Type_26: string, S

In [15]:
from pyspark.sql.functions import col

for colName in schemedDF.columns:
    schemedDF = schemedDF.withColumn(colName, col(colName).cast('double'))

In [16]:
schemedDF.head()

Row(Elevation=2596.0, Aspect=51.0, Slope=3.0, Horizontal_Distance_To_Hydrology=258.0, Vertical_Distance_To_Hydrology=0.0, Horizontal_Distance_To_RoadWays=510.0, Hillshade_9am=221.0, Hillshade_Noon=232.0, Hillshade_3pm=148.0, Horizontal_Distance_To_ire_Points=6279.0, Wilderness_Area_1=1.0, Wilderness_Area_2=0.0, Wilderness_Area_3=0.0, Wilderness_Area_4=0.0, Soil_Type_1=0.0, Soil_Type_2=0.0, Soil_Type_3=0.0, Soil_Type_4=0.0, Soil_Type_5=0.0, Soil_Type_6=0.0, Soil_Type_7=0.0, Soil_Type_8=0.0, Soil_Type_9=0.0, Soil_Type_10=0.0, Soil_Type_11=0.0, Soil_Type_12=0.0, Soil_Type_13=0.0, Soil_Type_14=0.0, Soil_Type_15=0.0, Soil_Type_16=0.0, Soil_Type_17=0.0, Soil_Type_18=0.0, Soil_Type_19=0.0, Soil_Type_20=0.0, Soil_Type_21=0.0, Soil_Type_22=0.0, Soil_Type_23=0.0, Soil_Type_24=0.0, Soil_Type_25=0.0, Soil_Type_26=0.0, Soil_Type_27=0.0, Soil_Type_28=0.0, Soil_Type_29=1.0, Soil_Type_30=0.0, Soil_Type_31=0.0, Soil_Type_32=0.0, Soil_Type_33=0.0, Soil_Type_34=0.0, Soil_Type_35=0.0, Soil_Type_36=0.0, So

In [17]:
trainData, testData = schemedDF.randomSplit(weights=[0.9, 0.1], seed=13)

In [18]:
print('Train Data Count - ', trainData.count())
print('Test Data Count - ', testData.count())

Train Data Count -  523063
Test Data Count -  57949


In [19]:
trainData.cache()
testData.cache()

DataFrame[Elevation: double, Aspect: double, Slope: double, Horizontal_Distance_To_Hydrology: double, Vertical_Distance_To_Hydrology: double, Horizontal_Distance_To_RoadWays: double, Hillshade_9am: double, Hillshade_Noon: double, Hillshade_3pm: double, Horizontal_Distance_To_ire_Points: double, Wilderness_Area_1: double, Wilderness_Area_2: double, Wilderness_Area_3: double, Wilderness_Area_4: double, Soil_Type_1: double, Soil_Type_2: double, Soil_Type_3: double, Soil_Type_4: double, Soil_Type_5: double, Soil_Type_6: double, Soil_Type_7: double, Soil_Type_8: double, Soil_Type_9: double, Soil_Type_10: double, Soil_Type_11: double, Soil_Type_12: double, Soil_Type_13: double, Soil_Type_14: double, Soil_Type_15: double, Soil_Type_16: double, Soil_Type_17: double, Soil_Type_18: double, Soil_Type_19: double, Soil_Type_20: double, Soil_Type_21: double, Soil_Type_22: double, Soil_Type_23: double, Soil_Type_24: double, Soil_Type_25: double, Soil_Type_26: double, Soil_Type_27: double, Soil_Type_2

In [20]:
trainInputCols = trainData.columns
trainInputCols.remove('Cover_Type')
print(trainInputCols)

['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_RoadWays', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_ire_Points', 'Wilderness_Area_1', 'Wilderness_Area_2', 'Wilderness_Area_3', 'Wilderness_Area_4', 'Soil_Type_1', 'Soil_Type_2', 'Soil_Type_3', 'Soil_Type_4', 'Soil_Type_5', 'Soil_Type_6', 'Soil_Type_7', 'Soil_Type_8', 'Soil_Type_9', 'Soil_Type_10', 'Soil_Type_11', 'Soil_Type_12', 'Soil_Type_13', 'Soil_Type_14', 'Soil_Type_15', 'Soil_Type_16', 'Soil_Type_17', 'Soil_Type_18', 'Soil_Type_19', 'Soil_Type_20', 'Soil_Type_21', 'Soil_Type_22', 'Soil_Type_23', 'Soil_Type_24', 'Soil_Type_25', 'Soil_Type_26', 'Soil_Type_27', 'Soil_Type_28', 'Soil_Type_29', 'Soil_Type_30', 'Soil_Type_31', 'Soil_Type_32', 'Soil_Type_33', 'Soil_Type_34', 'Soil_Type_35', 'Soil_Type_36', 'Soil_Type_37', 'Soil_Type_38', 'Soil_Type_39', 'Soil_Type_40']


In [21]:
trainData.select("Cover_Type").distinct().show()

+----------+
|Cover_Type|
+----------+
|       7.0|
|       1.0|
|       4.0|
|       3.0|
|       2.0|
|       6.0|
|       5.0|
+----------+



In [22]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [23]:
assembler = VectorAssembler(inputCols=trainInputCols,
                            outputCol="trainFeatureVector")

assembledTrainData = assembler.transform(trainData)
assembledTrainData.select("trainFeatureVector").show(5, truncate=False)

+----------------------------------------------------------------------------------------------------+
|trainFeatureVector                                                                                  |
+----------------------------------------------------------------------------------------------------+
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1859.0,18.0,12.0,67.0,11.0,90.0,211.0,215.0,139.0,792.0,1.0,1.0])  |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1861.0,35.0,14.0,60.0,11.0,85.0,218.0,209.0,124.0,832.0,1.0,1.0])  |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1863.0,37.0,17.0,120.0,18.0,90.0,217.0,202.0,115.0,769.0,1.0,1.0]) |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1866.0,23.0,14.0,85.0,16.0,108.0,212.0,210.0,133.0,819.0,1.0,1.0]) |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1867.0,20.0,15.0,108.0,19.0,120.0,208.0,206.0,132.0,808.0,1.0,1.0])|
+----------------------------------------------------------------------------------------------------+
only showing top 5 rows



In [24]:
from pyspark.ml.classification import DecisionTreeClassifier

In [25]:
classifier = DecisionTreeClassifier(seed=13,
                                    featuresCol="trainFeatureVector",
                                    labelCol="Cover_Type",
                                    predictionCol="prediction")

In [26]:
model = classifier.fit(assembledTrainData)

In [27]:
import pprint
pprint.pprint(model.toDebugString)

('DecisionTreeClassificationModel: uid=DecisionTreeClassifier_5c168896ad23, '
 'depth=5, numNodes=43, numClasses=8, numFeatures=54\n'
 '  If (feature 0 <= 3052.5)\n'
 '   If (feature 0 <= 2540.5)\n'
 '    If (feature 10 <= 0.5)\n'
 '     If (feature 0 <= 2429.5)\n'
 '      If (feature 3 <= 15.0)\n'
 '       Predict: 4.0\n'
 '      Else (feature 3 > 15.0)\n'
 '       Predict: 3.0\n'
 '     Else (feature 0 > 2429.5)\n'
 '      Predict: 3.0\n'
 '    Else (feature 10 > 0.5)\n'
 '     If (feature 9 <= 4633.0)\n'
 '      Predict: 2.0\n'
 '     Else (feature 9 > 4633.0)\n'
 '      If (feature 5 <= 877.0)\n'
 '       Predict: 5.0\n'
 '      Else (feature 5 > 877.0)\n'
 '       Predict: 2.0\n'
 '   Else (feature 0 > 2540.5)\n'
 '    If (feature 0 <= 2957.5)\n'
 '     If (feature 15 <= 0.5)\n'
 '      If (feature 17 <= 0.5)\n'
 '       Predict: 2.0\n'
 '      Else (feature 17 > 0.5)\n'
 '       Predict: 3.0\n'
 '     Else (feature 15 > 0.5)\n'
 '      Predict: 3.0\n'
 '    Else (feature 0 > 2957

In [28]:
for c, i  in zip(trainInputCols, model.featureImportances):
    print(c, i)

Elevation 0.8089191402843596
Aspect 0.0
Slope 0.0
Horizontal_Distance_To_Hydrology 0.03784006969010683
Vertical_Distance_To_Hydrology 0.0
Horizontal_Distance_To_RoadWays 0.006506837799972758
Hillshade_9am 0.0
Hillshade_Noon 0.025836882606185087
Hillshade_3pm 0.0
Horizontal_Distance_To_ire_Points 0.0004282934147880387
Wilderness_Area_1 0.019998876874341355
Wilderness_Area_2 0.0
Wilderness_Area_3 0.011843193242447668
Wilderness_Area_4 0.0
Soil_Type_1 0.0
Soil_Type_2 0.027341994298035327
Soil_Type_3 0.0
Soil_Type_4 0.026801367923066442
Soil_Type_5 0.0
Soil_Type_6 0.0
Soil_Type_7 0.0
Soil_Type_8 0.0
Soil_Type_9 0.0
Soil_Type_10 0.0
Soil_Type_11 0.0
Soil_Type_12 0.0
Soil_Type_13 0.0
Soil_Type_14 0.0
Soil_Type_15 0.0
Soil_Type_16 0.0
Soil_Type_17 0.0
Soil_Type_18 0.0
Soil_Type_19 0.0
Soil_Type_20 0.0
Soil_Type_21 0.0
Soil_Type_22 0.0
Soil_Type_23 0.005626083224378807
Soil_Type_24 0.0
Soil_Type_25 0.0
Soil_Type_26 0.0
Soil_Type_27 0.0
Soil_Type_28 0.0
Soil_Type_29 0.0
Soil_Type_30 0.0
Soil_Ty

In [29]:
prediction = model.transform(assembledTrainData)
prediction.select("Cover_Type", "prediction", "probability").show(truncate=False)

+----------+----------+-----------------------------------------------------------------------------------------------+
|Cover_Type|prediction|probability                                                                                    |
+----------+----------+-----------------------------------------------------------------------------------------------+
|3.0       |3.0       |[0.0,0.0,0.02713077181082995,0.639734655088883,0.05485782432079902,0.0,0.27827674877948794,0.0]|
|3.0       |3.0       |[0.0,0.0,0.02713077181082995,0.639734655088883,0.05485782432079902,0.0,0.27827674877948794,0.0]|
|6.0       |3.0       |[0.0,0.0,0.02713077181082995,0.639734655088883,0.05485782432079902,0.0,0.27827674877948794,0.0]|
|3.0       |3.0       |[0.0,0.0,0.02713077181082995,0.639734655088883,0.05485782432079902,0.0,0.27827674877948794,0.0]|
|3.0       |3.0       |[0.0,0.0,0.02713077181082995,0.639734655088883,0.05485782432079902,0.0,0.27827674877948794,0.0]|
|3.0       |3.0       |[0.0,0.0,0.027130

In [30]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [31]:
evaluator = MulticlassClassificationEvaluator(labelCol="Cover_Type",
                                              predictionCol="prediction")

In [32]:
accuracy = evaluator.setMetricName("accuracy").evaluate(prediction)
f1 = evaluator.setMetricName("f1").evaluate(prediction)

print("ACC - ", accuracy)
print("F1 - ", f1)


ACC -  0.6998870116983996
F1 -  0.6818200208761772


In [33]:
from pyspark.mllib.evaluation import MulticlassMetrics

In [34]:
predictionRDD = prediction.select("prediction", "Cover_Type").rdd

In [35]:
predictionRDD.map(lambda row : [double(c) for c in row])

PythonRDD[124] at RDD at PythonRDD.scala:53

In [36]:
predictionRDD.first()

Row(prediction=3.0, Cover_Type=3.0)

In [37]:
metrics = MulticlassMetrics(predictionRDD)
metrics.confusionMatrix().toArray()

array([[1.26587e+05, 6.00300e+04, 2.03000e+02, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 3.82100e+03],
       [4.62740e+04, 2.00853e+05, 7.28400e+03, 7.80000e+01, 1.05000e+02,
        0.00000e+00, 4.57000e+02],
       [0.00000e+00, 2.57900e+03, 2.90340e+04, 6.07000e+02, 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [0.00000e+00, 0.00000e+00, 1.50700e+03, 9.81000e+02, 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [3.00000e+00, 7.71900e+03, 7.70000e+02, 0.00000e+00, 1.14000e+02,
        0.00000e+00, 0.00000e+00],
       [0.00000e+00, 3.38400e+03, 1.17930e+04, 4.63000e+02, 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [9.75200e+03, 9.20000e+01, 5.70000e+01, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 8.51600e+03]])

In [38]:
confusionMatrix = prediction.groupBy("Cover_Type") \
                            .pivot("prediction", range(1,8)) \
                            .count() \
                            .na.fill(0.0) \
                            .orderBy("Cover_Type")

In [39]:
confusionMatrix.show()

+----------+------+------+-----+---+---+---+----+
|Cover_Type|     1|     2|    3|  4|  5|  6|   7|
+----------+------+------+-----+---+---+---+----+
|       1.0|126587| 60030|  203|  0|  0|  0|3821|
|       2.0| 46274|200853| 7284| 78|105|  0| 457|
|       3.0|     0|  2579|29034|607|  0|  0|   0|
|       4.0|     0|     0| 1507|981|  0|  0|   0|
|       5.0|     3|  7719|  770|  0|114|  0|   0|
|       6.0|     0|  3384|11793|463|  0|  0|   0|
|       7.0|  9752|    92|   57|  0|  0|  0|8516|
+----------+------+------+-----+---+---+---+----+



In [40]:
def classProbabilities(df):
    total = df.count()
    res = df.groupBy("Cover_Type") \
            .count() \
            .orderBy("Cover_Type") \
            .withColumn("prob", col("count") / total)
    return res.select("Cover_Type", "prob")

In [41]:
trainPriorProbabilities = classProbabilities(trainData)
testPriorProbabilities = classProbabilities(testData)

In [42]:
priorProbabilities = trainPriorProbabilities.select(col("Cover_Type"), col("prob").alias("train_prob")).join(testPriorProbabilities, on='Cover_Type')
priorProbabilities.show()

+----------+--------------------+--------------------+
|Cover_Type|          train_prob|                prob|
+----------+--------------------+--------------------+
|       7.0|0.035209907793133904| 0.03611796579751161|
|       1.0|  0.3644704366395635|  0.3658216707794785|
|       4.0|0.004756597197660...|0.004469447272601771|
|       3.0| 0.06159869843594366| 0.06098465892422648|
|       2.0| 0.48761047904363336|  0.4874976272239383|
|       6.0|0.029900795888831745|0.029802067335070494|
|       5.0| 0.01645308500123312| 0.01530656266717286|
+----------+--------------------+--------------------+



In [43]:
multipliedProb = priorProbabilities.withColumn("multipliedProb", col("train_prob") * col("prob"))

In [44]:
multipliedProb.show()

+----------+--------------------+--------------------+--------------------+
|Cover_Type|          train_prob|                prob|      multipliedProb|
+----------+--------------------+--------------------+--------------------+
|       7.0|0.035209907793133904| 0.03611796579751161|0.001271710245405...|
|       1.0|  0.3644704366395635|  0.3658216707794785|  0.1333311840812112|
|       4.0|0.004756597197660...|0.004469447272601771|2.125936037194985...|
|       3.0| 0.06159869843594366| 0.06098465892422648|0.003756575614292...|
|       2.0| 0.48761047904363336|  0.4874976272239383| 0.23770895154329916|
|       6.0|0.029900795888831745|0.029802067335070494|8.911055324511627E-4|
|       5.0| 0.01645308500123312| 0.01530656266717286|2.518401766396966E-4|
+----------+--------------------+--------------------+--------------------+



In [45]:
multipliedProb.select("multipliedProb").groupBy().sum().show()

+-------------------+
|sum(multipliedProb)|
+-------------------+
|0.37723262655367135|
+-------------------+



In [46]:
from pyspark.ml import Pipeline, PipelineModel
    
pipeline = Pipeline(stages=[assembler, classifier])

In [47]:
from pyspark.ml.tuning import ParamGridBuilder

In [48]:
paramGrid = ParamGridBuilder().addGrid(classifier.impurity, ["gini", "entropy"]) \
                            .addGrid(classifier.maxDepth, [1, 20]) \
                            .addGrid(classifier.maxBins, [40, 300]) \
                            .addGrid(classifier.minInfoGain, [0.0, 0.05]) \
                            .build()

In [49]:
multiclassEval = MulticlassClassificationEvaluator(labelCol="Cover_Type", predictionCol="prediction", metricName="accuracy")

In [50]:
from pyspark.ml.tuning import TrainValidationSplitModel, TrainValidationSplit

In [51]:
validator = TrainValidationSplit(estimator=pipeline, 
                                estimatorParamMaps=paramGrid,
                                evaluator=multiclassEval,
                                trainRatio=0.9,
                                parallelism=4)

In [52]:
validatorModel = validator.fit(trainData)

In [53]:
pprint.pprint(validatorModel.bestModel.stages[-1].extractParamMap())

{Param(parent='DecisionTreeClassifier_5c168896ad23', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.'): False,
 Param(parent='DecisionTreeClassifier_5c168896ad23', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.'): 10,
 Param(parent='DecisionTreeClassifier_5c168896ad23', name='predictionCol', doc='prediction column name.'): 'prediction',
 Param(parent='DecisionTreeClassifier_5c168896ad23', name='leafCol', doc='Leaf indices column name. Predicted leaf index of each instance in each tree by preorder.'): '',
 Param

In [54]:
paramsAndMetrics = zip( validatorModel.validationMetrics, validatorModel.getEstimatorParamMaps())

In [55]:
for metric, param in paramsAndMetrics:
    print(metric, param)

0.6330914808570292 {Param(parent='DecisionTreeClassifier_5c168896ad23', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'gini', Param(parent='DecisionTreeClassifier_5c168896ad23', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 1, Param(parent='DecisionTreeClassifier_5c168896ad23', name='maxBins', doc='Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.'): 40, Param(parent='DecisionTreeClassifier_5c168896ad23', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0}
0.6330914808570292 {Param(parent='DecisionTreeClassifier_5c168896ad23', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'gini', Param(parent='DecisionTreeClassif

In [56]:
max(validatorModel.validationMetrics)

0.9128573322977248

In [57]:
multiclassEval.evaluate(validatorModel.bestModel.transform(testData))

0.9133375899497834

In [58]:
from pyspark.sql.functions import pandas_udf, PandasUDFType

In [59]:
wildernessCols = [f'Wilderness_Area_{x}' for x in range(1,5) ]
wildernessCols

['Wilderness_Area_1',
 'Wilderness_Area_2',
 'Wilderness_Area_3',
 'Wilderness_Area_4']

In [60]:
trainData.select(wildernessCols).show(5)

+-----------------+-----------------+-----------------+-----------------+
|Wilderness_Area_1|Wilderness_Area_2|Wilderness_Area_3|Wilderness_Area_4|
+-----------------+-----------------+-----------------+-----------------+
|              0.0|              0.0|              0.0|              1.0|
|              0.0|              0.0|              0.0|              1.0|
|              0.0|              0.0|              0.0|              1.0|
|              0.0|              0.0|              0.0|              1.0|
|              0.0|              0.0|              0.0|              1.0|
+-----------------+-----------------+-----------------+-----------------+
only showing top 5 rows



In [160]:
wildernessAssembler = VectorAssembler(inputCols=wildernessCols,
                            outputCol="wilderness")
wildernessVector = wildernessAssembler.transform(trainData)
wildernessVector.select("wilderness").show(5, truncate=False)

+-------------+
|wilderness   |
+-------------+
|(4,[3],[1.0])|
|(4,[3],[1.0])|
|(4,[3],[1.0])|
|(4,[3],[1.0])|
|(4,[3],[1.0])|
+-------------+
only showing top 5 rows



https://issues.apache.org/jira/browse/SPARK-29952

In [172]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, IntegerType

@udf(returnType=IntegerType())
def vector_to_array(v):
    # convert column of vectors into column of arrays
    a = v.toArray().tolist().index(1.0)
    return int(a)

wildernessUDF = wildernessVector.withColumn('features_array',vector_to_array('wilderness'))
wildernessUDF.select('features_array').show()

+--------------+
|features_array|
+--------------+
|             3|
|             3|
|             3|
|             3|
|             3|
|             3|
|             3|
|             3|
|             3|
|             3|
|             3|
|             3|
|             3|
|             3|
|             3|
|             3|
|             3|
|             3|
|             3|
|             3|
+--------------+
only showing top 20 rows

