In [1]:
from pyspark.sql import SparkSession
# Start a Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('mytree')
    .getOrCreate()
)

In [3]:
from pyspark.ml import Pipeline

In [4]:
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier

In [5]:
# The same tree-based models are also available for regression problems:
#from pyspark.ml.regression import RandomForestRegressor, GBTRegressor, DecisionTreeRegressor

In [6]:
# Read the sample dataset, stored in LIBSVM format, into a DataFrame.
data = (
    spark.read
    .format('libsvm')
    .load('sample_libsvm_data.txt')
)

In [7]:
# Preview the first 20 rows (the default), with long cell values truncated.
data.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [8]:
# Split the rows roughly 70/30 into training and test sets. randomSplit
# samples rows at random, so the seed is pinned to keep the split (and every
# metric computed from it) repeatable across Restart-and-Run-All executions.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [9]:
# One estimator per tree-based algorithm, left at library defaults apart from
# the forest size. The seed is pinned on the two ensembles so that any random
# sampling they perform does not change the fitted models between runs.
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier(numTrees=100, seed=42)
gbt = GBTClassifier(seed=42)

In [10]:
# Fit every estimator on the same training split (decision tree, then random
# forest, then gradient-boosted trees) and keep the resulting models.
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train_data) for estimator in (dtc, rfc, gbt)
)

In [12]:
# Apply each fitted model to the held-out test split to get its predictions.
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test_data)
    for fitted in (dtc_model, rfc_model, gbt_model)
)

In [13]:
# Show the first 20 random-forest predictions (default row count, truncated cells).
rfc_preds.show(n=20, truncate=True)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[95,96,97,12...|  [100.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[100,101,102...|  [70.0,30.0]|  [0.7,0.3]|       0.0|
|  0.0|(692,[121,122,123...|   [99.0,1.0]|[0.99,0.01]|       0.0|
|  0.0|(692,[123,124,125...|   [99.0,1.0]|[0.99,0.01]|       0.0|
|  0.0|(692,[124,125,126...|   [96.0,4.0]|[0.96,0.04]|       0.0|
|  0.0|(692,[126,127,128...|  [89.0,11.0]|[0.89,0.11]|       0.0|
|  0.0|(692,[126,127,128...|   [99.0,1.0]|[0.99,0.01]|       0.0|
|  0.0|(692,[126,127,128...|   [92.0,8.0]|[0.92,0.08]|       0.0|
|  0.0|(692,[127,128,129...|   [99.0,1.0]|[0.99,0.01]|       0.0|
|  0.0|(692,[128,129,130...|   [98.0,2.0]|[0.98,0.02]|       0.0|
|  0.0|(692,[150,151,152...|  [89.0,11.0]|[0.89,0.11]|       0.0|
|  0.0|(692,[151,152,153...|   [95.0,5.0]|[0.95,0.05]|       0.0|
|  0.0|(69

In [14]:
# Show the first 20 gradient-boosted-tree predictions (default row count, truncated cells).
gbt_preds.show(n=20, truncate=True)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[95,96,97,12...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[100,101,102...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[121,122,123...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[123,124,125...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[124,125,126...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.16456169933188...|[0.91126047608590...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[127,128,129...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[128

### Use an evaluator

In [15]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [16]:
# Accuracy evaluator. The column names are the evaluator's defaults, written
# out here so it is clear which columns of the prediction frames are compared.
acc_eval = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='accuracy',
)

In [17]:
# Print a label, then leave the score as the cell's last expression so the
# notebook displays it.
print('DTC ACCURACY')
dtc_accuracy = acc_eval.evaluate(dtc_preds)
dtc_accuracy

DTC ACCURACY


0.9696969696969697

In [18]:
# Print a label, then leave the score as the cell's last expression so the
# notebook displays it.
print('RFC ACCURACY')
rfc_accuracy = acc_eval.evaluate(rfc_preds)
rfc_accuracy

RFC ACCURACY


1.0

In [19]:
# Print a label, then leave the score as the cell's last expression so the
# notebook displays it.
print('GBT ACCURACY')
gbt_accuracy = acc_eval.evaluate(gbt_preds)
gbt_accuracy

GBT ACCURACY


0.9696969696969697

## Grab feature importance

In [20]:
# Display the fitted random forest's importance score for each feature.
# The higher the number, the more important that feature was to the model.
rfc_model.featureImportances

SparseVector(692, {121: 0.0006, 147: 0.0007, 186: 0.0006, 190: 0.0008, 205: 0.002, 207: 0.0006, 214: 0.0021, 215: 0.0006, 217: 0.0006, 235: 0.0061, 237: 0.0006, 242: 0.0009, 243: 0.0012, 244: 0.0214, 263: 0.0073, 271: 0.0071, 272: 0.0263, 273: 0.0083, 274: 0.0003, 287: 0.0014, 289: 0.0076, 290: 0.0075, 291: 0.0017, 294: 0.0007, 299: 0.001, 300: 0.0102, 301: 0.0093, 302: 0.0038, 314: 0.0029, 317: 0.0085, 322: 0.0052, 324: 0.0016, 327: 0.0047, 328: 0.0082, 329: 0.0005, 341: 0.0005, 344: 0.0004, 345: 0.0077, 350: 0.0258, 351: 0.0177, 352: 0.0005, 353: 0.001, 356: 0.002, 360: 0.0027, 369: 0.0006, 370: 0.0033, 378: 0.0207, 379: 0.0217, 380: 0.0015, 381: 0.0006, 385: 0.0178, 386: 0.0053, 400: 0.0072, 401: 0.0062, 402: 0.0016, 405: 0.0244, 406: 0.0691, 407: 0.0214, 408: 0.0005, 412: 0.0021, 413: 0.0018, 414: 0.0241, 415: 0.0076, 416: 0.005, 428: 0.0017, 429: 0.0106, 433: 0.0215, 434: 0.053, 435: 0.0523, 436: 0.0006, 438: 0.0006, 440: 0.0216, 453: 0.0011, 454: 0.0086, 455: 0.0003, 457: 0.0011,

## General process for any model
1. Create an instance of the model.
2. Fit it on the training data.
3. Transform the test data. If we instead have new data that does not already have labels, we transform it in the same way to get predictions for it.
4. We get our predictions out.
5. Then, when the data has labels, we can call our evaluator on the predictions.