<a href="https://colab.research.google.com/github/manojmanivannan/ApacheSparkEssentials/blob/master/ML_SparkByPluralsight.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [112]:
!apt-get update
!apt-get install -y openjdk-8-jdk-headless scala jq  > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
!tar xf spark-2.3.1-bin-hadoop2.7.tgz
!wget -q https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip
!pip install -q pyspark findspark

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Waiting for headers] [C                                                                               Get:2 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [Connecting to archive.ubuntu.com (185.125.190.39)] [2 InRelease 14.2 kB/88.0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (185.125.190.39                                                                               Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 3,626 B] [Waiting for headers] [2 InRelease 51.8 kB/88.7 k                                                                               Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:5 https://developer.download.nvidia.com/compute/machine-le

In [113]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

In [114]:
!python --version

Python 3.8.15


In [115]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

# Spark session & context
conf = SparkConf().set('spark.ui.port', '4050')
try:
  sc = SparkContext(conf=conf)
except ValueError:
  sc.stop()
  sc = SparkContext(conf=conf)

spark = SparkSession.builder.master('local[2]').getOrCreate()

In [116]:
sc

In [117]:
# create new token from ngrok
get_ipython().system_raw('./ngrok authtoken 2IaHtxKm7l0XqBXP6s9xWSyVHqE_4wYXtkZorquhudnHW23ci')
# revoke the token once you close the notebook
get_ipython().system_raw('./ngrok http 4050 &')

In [118]:
!sleep 10 && curl -s http://localhost:4040/api/tunnels | python3 -c "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

https://e030-34-123-69-186.ngrok.io


In [119]:
# Dataset
# https://goo.gl/pSKLMJ or https://archive.ics.uci.edu/ml/datasets/wine


In [120]:
rawData = sc.textFile('wine.data')

In [121]:
rawData.take(5)

['1,14.23,1.71,2.43,15.6,127,2.8,3.06,.28,2.29,5.64,1.04,3.92,1065',
 '1,13.2,1.78,2.14,11.2,100,2.65,2.76,.26,1.28,4.38,1.05,3.4,1050',
 '1,13.16,2.36,2.67,18.6,101,2.8,3.24,.3,2.81,5.68,1.03,3.17,1185',
 '1,14.37,1.95,2.5,16.8,113,3.85,3.49,.24,2.18,7.8,.86,3.45,1480',
 '1,13.24,2.59,2.87,21,118,2.8,2.69,.39,1.82,4.32,1.04,2.93,735']

In [122]:
# Column # 
# 0) - Wine maker 1,2 or 3
# 1) Alcohol
# 2) Malic acid
# 3) Ash
# 4) Alcalinity of ash
# 5) Magnesium
# 6) Total phenols
# 7) Flavanoids
# 8) Nonflavanoid phenols
# 9) Proanthocyanins
# 10)Color intensity
# 11)Hue
# 12)OD280/OD315 of diluted wines
# 13)Proline

# Using the older Spark mllib library

In [123]:
from pyspark.mllib.regression import LabeledPoint

def parsePoint(line):
  values = [float(x) for x in line.split(',')]
  return LabeledPoint(values[0],values[1:])


In [124]:
parsedData = rawData.map(parsePoint)
parsedData.take(5)

[LabeledPoint(1.0, [14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0]),
 LabeledPoint(1.0, [13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0]),
 LabeledPoint(1.0, [13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0]),
 LabeledPoint(1.0, [14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0]),
 LabeledPoint(1.0, [13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0])]

In [125]:
(trainingData, testData) = parsedData.randomSplit([0.7,0.3])

In [126]:
from pyspark.mllib.tree import DecisionTree

model = DecisionTree.trainClassifier(trainingData,
                                     numClasses=4, # there are classes 1,2,3 as wine makers , so should 1+greatest value
                                     categoricalFeaturesInfo={},
                                     impurity='gini',
                                     maxDepth=3,
                                     maxBins=32)

In [127]:
predictions = model.predict(testData.map(lambda x: x.features))
predictions.take(5)

[1.0, 1.0, 1.0, 1.0, 1.0]

In [128]:
# compare predictions with actual labels
labelsAndPredictions = testData.map(lambda x: x.label).zip(predictions)
labelsAndPredictions.take(5)

[(1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0)]

In [129]:
testAcc = labelsAndPredictions\
                .filter(lambda x: x[0] == x[1])\
                .count() / float(testData.count())
print('Test Accuracy = ', testAcc)                

Test Accuracy =  0.9583333333333334


In [130]:
# we have a built-in function for evaluating metrics
from pyspark.mllib.evaluation import MulticlassMetrics

metrics = MulticlassMetrics(labelsAndPredictions)

In [131]:
metrics.accuracy

0.9583333333333334

In [132]:
metrics.precision(1.0)

1.0

In [133]:
metrics.precision(3.0)

1.0

In [134]:
metrics.confusionMatrix().toArray()

array([[21.,  1.,  0.],
       [ 0., 12.,  0.],
       [ 0.,  1., 13.]])

In [135]:
# to view the decision tree
print(model.toDebugString())

DecisionTreeModel classifier of depth 3 with 9 nodes
  If (feature 9 <= 3.3899999999999997)
   Predict: 2.0
  Else (feature 9 > 3.3899999999999997)
   If (feature 6 <= 1.5950000000000002)
    If (feature 3 <= 17.05)
     Predict: 2.0
    Else (feature 3 > 17.05)
     Predict: 3.0
   Else (feature 6 > 1.5950000000000002)
    If (feature 12 <= 679.0)
     Predict: 2.0
    Else (feature 12 > 679.0)
     Predict: 1.0



# LIBSVM data format

Each row in a dataset, has a label and features.
Features are referred as index.
i.e, index1 => feature1
so a record is represented as

| label | index1:value1 | index2:value2 | index3:value3 | ... |
|-------|---------------|---------------|---------------|-----|
| 0     | 1:324         | 2:332         | 3:12          | ... |
| 1     | 1:455         | 2:213         | 3:85          | ... |
| 0     | 1:344         | 2:165         |               | ... |

missing feature can be represented by simply omitting them.


In [136]:
from pyspark.mllib.util import MLUtils
# dataset https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/wine.scale
libsvmData = MLUtils.loadLibSVMFile(sc, 'wine.scale')

In [137]:
libsvmData.take(5)

[LabeledPoint(1.0, (13,[0,1,2,3,4,5,6,7,8,9,10,11,12],[0.68421,-0.616601,0.144385,-0.484536,0.23913,0.255172,0.147679,-0.433962,0.18612,-0.255973,-0.089431,0.941392,0.122682])),
 LabeledPoint(1.0, (13,[0,1,2,3,4,5,6,7,8,9,10,11,12],[0.142105,-0.588933,-0.165775,-0.938144,-0.347826,0.151724,0.0210971,-0.509434,-0.451104,-0.47099,-0.0731708,0.56044,0.101284])),
 LabeledPoint(1.0, (13,[0,1,2,3,4,5,6,7,8,9,10,11,12],[0.121053,-0.359684,0.40107,-0.175258,-0.326087,0.255172,0.223629,-0.358491,0.514196,-0.249147,-0.105691,0.391941,0.293866])),
 LabeledPoint(1.0, (13,[0,1,2,3,4,5,6,7,8,9,10,11,12],[0.757895,-0.521739,0.219251,-0.360825,-0.0652174,0.97931,0.329114,-0.584906,0.116719,0.112628,-0.382114,0.59707,0.714693])),
 LabeledPoint(1.0, (13,[0,1,2,3,4,5,6,7,8,9,10,11,12],[0.163158,-0.268775,0.614973,0.0721649,0.0434783,0.255172,-0.00843878,-0.018868,-0.11041,-0.481229,-0.089431,0.216117,-0.348074]))]

In [138]:
(trainingData, testData) = libsvmData.randomSplit([0.8,0.2])

In [139]:
libsvmmodel = DecisionTree.trainClassifier(trainingData,
                                     numClasses=4, # there are classes 1,2,3 as wine makers , so should 1+greatest value
                                     categoricalFeaturesInfo={},
                                     impurity='gini',
                                     maxDepth=5,
                                     maxBins=32)

In [140]:
predictions = libsvmmodel.predict(testData.map(lambda x: x.features))

In [141]:
labelsAndPredictions = testData.map(lambda x: x.label).zip(predictions)
labelsAndPredictions.take(5)

[(1.0, 1.0), (1.0, 1.0), (1.0, 2.0), (1.0, 1.0), (1.0, 1.0)]

In [142]:
metrics = MulticlassMetrics(labelsAndPredictions)

In [143]:
metrics.accuracy

0.9459459459459459

In [144]:
metrics.confusionMatrix().toArray()

array([[ 9.,  0.,  0.],
       [ 1., 11.,  0.],
       [ 0.,  1., 15.]])

In [145]:
print(model.toDebugString())

DecisionTreeModel classifier of depth 3 with 9 nodes
  If (feature 9 <= 3.3899999999999997)
   Predict: 2.0
  Else (feature 9 > 3.3899999999999997)
   If (feature 6 <= 1.5950000000000002)
    If (feature 3 <= 17.05)
     Predict: 2.0
    Else (feature 3 > 17.05)
     Predict: 3.0
   Else (feature 6 > 1.5950000000000002)
    If (feature 12 <= 679.0)
     Predict: 2.0
    Else (feature 12 > 679.0)
     Predict: 1.0



# Using the newer Spark ml library