<a href="https://colab.research.google.com/github/mridul-eecs/signal-processing-apachesparkml-apachesystemml/blob/master/HyperparameterTuningGridSearch_sparkML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Preprocessing to work with Spark

In [1]:

# spark dependencies:
# citation: http://medium.com/@rmache/big-data-with-spark-in-google-colab-7c046e24b3
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://apachemirror.wuchna.com/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz

!pip install -q findspark
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/87/21/f05c186f4ddb01d15d0ddc36ef4b7e3cedbeb6412274a41f26b55a650ee5/pyspark-2.4.4.tar.gz (215.7MB)
[K     |████████████████████████████████| 215.7MB 51kB/s 
[?25hCollecting py4j==0.10.7
[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
[K     |████████████████████████████████| 204kB 47.5MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-2.4.4-py2.py3-none-any.whl size=216130387 sha256=ca8ba23a9d9a57631985373676095332dae7e7104682d09bd5d205886084834c
  Stored in directory: /root/.cache/pip/wheels/ab/09/4d/0d184230058e654eb1b04467dbc1292f00eaa186544604b471
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.7 pyspark-2.4.4


In [2]:
# Clone the project repository; a later cell reads df.parquet from inside this checkout
# (that cell uses a /content/... path, so this presumably runs with /content as the working directory).
!git clone https://github.com/mridul-eecs/signal-processing-apachesparkml-apachesystemml.git

Cloning into 'signal-processing-apachesparkml-apachesystemml'...
remote: Enumerating objects: 37, done.[K
remote: Counting objects: 100% (37/37), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 37 (delta 9), reused 16 (delta 2), pack-reused 0[K
Unpacking objects: 100% (37/37), done.


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType
from pyspark.sql.functions import lit
import os
from tqdm import tqdm_notebook as tqdm

# Point JAVA_HOME / SPARK_HOME at the JDK and the unpacked Spark distribution
# before findspark.init() runs.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-2.4.4-bin-hadoop2.7",
)

# Imported here, after the environment variables are in place.
import findspark

findspark.init()

# Settings for the Spark session created in the next cell.
APP_NAME = "Human Motion Premitives"
SPARK_URL = "local[*]"

# Experiment constants.
RANDOM_SEED = 141109
TRAINING_DATA_RATIO = 0.7
# The RF_* names suggest random-forest settings; they are not referenced in the visible cells.
RF_NUM_TREES = 8
RF_MAX_DEPTH = 4
RF_NUM_BINS = 32

In [0]:
# Create the Spark session (or reuse an existing one) with the configured app name and master URL.
spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .master(SPARK_URL)
    .getOrCreate()
)

### Main program

In [0]:
# Load the dataset from the cloned repository.
df = spark.read.parquet('/content/signal-processing-apachesparkml-apachesystemml/df.parquet')

# Register the DataFrame as the SQL view 'df' for the spark.sql(...) queries below.
# createOrReplaceTempView returns None, so its result must not be assigned back to `df`;
# otherwise the DataFrame handle is lost and `df` becomes None.
df.createOrReplaceTempView('df')

In [11]:
# The linear SVC used below is treated as a binary classifier, so list the classes
# present in the data and keep only two of them.
classes = spark.sql("""select df.class from df""")
# Per-class row counts; not referenced again in the visible cells.
classcounts = spark.sql("""select class, count(*) from df group by class""")

# Distinct class names as plain Python values (first column of each collected row).
class_list = [row[0] for row in classes.distinct().collect()]
print(class_list)

# Keep only the two classes used for the binary problem and preview the result.
newdf = spark.sql("""select * from df where df.class= 'Use_telephone' or df.class = 'Standup_chair'""")
newdf.show()

['Use_telephone', 'Standup_chair', 'Eat_meat', 'Getup_bed', 'Drink_glass', 'Pour_water', 'Comb_hair', 'Walk', 'Climb_stairs', 'Sitdown_chair', 'Liedown_bed', 'Descend_stairs', 'Brush_teeth', 'Eat_soup']
+---+---+---+-------------+
|  x|  y|  z|        class|
+---+---+---+-------------+
| 14| 46| 31|Standup_chair|
| 49| 24| 40|Use_telephone|
|  7| 30| 17|Standup_chair|
| 16| 41| 44|Standup_chair|
| 34| 43| 44|Use_telephone|
| 14| 40| 33|Standup_chair|
| 14| 40| 33|Standup_chair|
| 14| 40| 33|Standup_chair|
| 14| 40| 33|Standup_chair|
| 14| 40| 33|Standup_chair|
| 14| 40| 33|Standup_chair|
| 14| 40| 33|Standup_chair|
| 44| 31| 50|Use_telephone|
| 44| 31| 50|Use_telephone|
| 12| 30| 33|Standup_chair|
| 29| 41| 51|Standup_chair|
| 29| 41| 51|Standup_chair|
| 29| 41| 51|Standup_chair|
| 29| 41| 51|Standup_chair|
| 25| 36| 44|Use_telephone|
+---+---+---+-------------+
only showing top 20 rows



In [0]:
# 80/20 train/test split of the two-class data. The split is seeded with RANDOM_SEED
# (defined in the configuration cell) so that re-running the notebook does not draw
# a fresh random split, and therefore different scores, every time.
train_df, test_df = newdf.randomSplit([0.8, 0.2], seed=RANDOM_SEED)

In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Normalizer
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline

from pyspark.ml.classification import LinearSVC

In [0]:
# Feature stages: index the 'class' string column into 'label', assemble x/y/z into
# a single vector column, then apply Normalizer with p=1.0 to produce 'features'.
indexer = StringIndexer(inputCol='class', outputCol='label')
vectorizer = VectorAssembler(inputCols=['x', 'y', 'z'], outputCol='feat')
normalizer = Normalizer(inputCol='feat', outputCol='features', p=1.0)

# Baseline linear SVM with a fixed regularisation parameter.
svmc = LinearSVC(maxIter=100, regParam=0.1)

pipe = Pipeline(
    stages=[
        indexer,
        vectorizer,
        normalizer,
        svmc,
    ]
)

# Fit the whole pipeline on the training split and predict on that same split.
model = pipe.fit(train_df)
predx = model.transform(train_df)

In [15]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Binary-classification evaluator reading the 'rawPrediction' column; the cell's
# displayed value is the baseline model's score on the training predictions.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
evaluator.evaluate(predx)

0.9437265327007998

In [16]:
# Score the baseline pipeline on the held-out test split with the same evaluator.
predy = model.transform(test_df)
evaluator.evaluate(predy)

0.9441254175358915

In [0]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Candidate regularisation strengths for the LinearSVC stage. All values are written
# as floats so every candidate has the same type as the regParam of the baseline model.
params = ParamGridBuilder().addGrid(svmc.regParam, [0.1, 0.01, 1.0]).build()

# 4-fold cross-validation over the full pipeline, scored with a binary-classification
# evaluator. The seed fixes how rows are assigned to folds, so the grid search gives
# repeatable metrics instead of depending on a fresh random partition each run.
crossval = CrossValidator(
    estimator=pipe,
    estimatorParamMaps=params,
    evaluator=BinaryClassificationEvaluator(),
    numFolds=4,
    seed=RANDOM_SEED,
)

In [0]:
# Run the grid search with cross-validation on the training split.
cvModel = crossval.fit(train_df)

In [23]:
# Score the cross-validated model on the training split (rebinds `predx`).
predx = cvModel.transform(train_df)
evaluator.evaluate(predx)

0.9441569432465926

In [24]:
# Score the cross-validated model on the held-out test split (rebinds `predy`).
predy = cvModel.transform(test_df)
evaluator.evaluate(predy)

0.944655982767246

In [50]:
import numpy as np

# Position of the highest average cross-validation metric, then the parameter map
# at that position; the map is the cell's displayed value.
best_index = np.argmax(cvModel.avgMetrics)
cvModel.getEstimatorParamMaps()[best_index]


{Param(parent='LinearSVC_96bf344fc84d', name='regParam', doc='regularization parameter (>= 0).'): 0.01}