<a href="https://colab.research.google.com/github/nandini-mazumdar/learning-spark-again/blob/main/Intro_to_Spark_Chap_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup Spark Context

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [None]:
import os

# Tell PySpark/findspark where the JDK and the unpacked Spark distribution live.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
})

In [None]:
import sys

# Use the interpreter running this kernel for both the PySpark driver and workers.
os.environ.update(
    dict.fromkeys(("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"), sys.executable)
)

In [None]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Start (or reuse) a local Spark session that uses every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
# Eager evaluation makes a bare DataFrame expression render as a table in the notebook.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark

## DataFrame API

### Reading Data

In [None]:
# Load the server-utilization records from JSON and preview the first 10 rows.
# NOTE(review): the path looks like a mounted Google Drive folder, but no cell in
# view mounts Drive — confirm it is mounted before running this cell.
df = spark.read.format("json").load('/content/drive/MyDrive/Colab Notebooks/LinkedIn_Learining_Intro_to_Spark_Data/utilization.json')
df.show(n=10)

KeyboardInterrupt: ignored

In [None]:
# Number of rows in the utilization DataFrame.
df.count()

### Summary Stats

In [None]:
# Summary statistics for the columns of df, as computed by describe().

df.describe().show()

In [None]:
# Correlation between CPU utilization and free memory.
df.corr('cpu_utilization', 'free_memory')

In [None]:
# Correlation between session count and free memory.
df.corr('session_count', 'free_memory')

In [None]:
# Correlation between session count and CPU utilization.
df.corr('session_count', 'cpu_utilization')

In [None]:
# Frequently occurring values of server_id and session_count.
df.freqItems(['server_id','session_count']).show()

In [None]:
# Draw a ~5% sample without replacement. The seed is fixed (matching the seed=1
# used for the models in this notebook) so the sample and its row count are
# reproducible across kernel restarts.
df_samp = df.sample(withReplacement=False, fraction=0.05, seed=1)
df_samp.count()

# Spark SQL

## Reading Data

In [None]:
# Register df as the temporary view "utilization" so it can be queried with spark.sql().
df.createOrReplaceTempView("utilization")

In [None]:
# Minimum, maximum and standard deviation of cpu_utilization over the whole view.
spark.sql('Select min(cpu_utilization), max(cpu_utilization), stddev(cpu_utilization) \
            From utilization').show()

+--------------------+--------------------+-----------------------+
|min(cpu_utilization)|max(cpu_utilization)|stddev(cpu_utilization)|
+--------------------+--------------------+-----------------------+
|                0.22|                 1.0|     0.1587517387291305|
+--------------------+--------------------+-----------------------+



## Summary Stats

In [None]:
# Per-server min, max and standard deviation (rounded to 2 decimals) of cpu_utilization.
spark.sql('Select server_id, \
            min(cpu_utilization), \
            max(cpu_utilization), \
            round(stddev(cpu_utilization),2) \
            From utilization \
            Group By server_id').show()

+---------+--------------------+--------------------+---------------------------------+
|server_id|min(cpu_utilization)|max(cpu_utilization)|round(stddev(cpu_utilization), 2)|
+---------+--------------------+--------------------+---------------------------------+
|      112|                0.52|                0.92|                             0.12|
|      113|                0.58|                0.98|                             0.12|
|      130|                0.35|                0.75|                             0.12|
|      126|                0.48|                0.88|                             0.12|
|      149|                0.54|                0.94|                             0.12|
|      110|                0.35|                0.75|                             0.12|
|      136|                0.41|                 0.8|                             0.12|
|      144|                0.47|                0.87|                             0.11|
|      119|                0.22|

## Histograms or Binning

In [None]:
# Assign each reading to a bucket: floor(cpu_utilization*100/10), i.e. the tens digit of the percentage.
# The view is written as "Utilization" here; the recorded output shows it resolves to the "utilization" view.
spark.sql('Select server_id, floor(cpu_utilization*100/10) bucket From Utilization').show()

+---------+------+
|server_id|bucket|
+---------+------+
|      100|     5|
|      100|     4|
|      100|     5|
|      100|     5|
|      100|     3|
|      100|     4|
|      100|     5|
|      100|     4|
|      100|     5|
|      100|     5|
|      100|     3|
|      100|     6|
|      100|     6|
|      100|     5|
|      100|     2|
|      100|     4|
|      100|     4|
|      100|     6|
|      100|     4|
|      100|     5|
+---------+------+
only showing top 20 rows



In [None]:
# Histogram: number of readings in each cpu_utilization bucket, ordered by bucket.
spark.sql('Select floor(cpu_utilization*100/10) bucket, count(*) From Utilization Group by bucket Order by bucket').show()

+------+--------+
|bucket|count(1)|
+------+--------+
|     2|    8186|
|     3|   37029|
|     4|   68046|
|     5|  104910|
|     6|  116725|
|     7|   88242|
|     8|   56598|
|     9|   20207|
|    10|      57|
+------+--------+



# Time Series Analysis: Windowing

In [None]:
# Per-server min, max and (unrounded) standard deviation of cpu_utilization.
# This is the same aggregation as the earlier Summary Stats query, without the rounding.
spark.sql('Select server_id, \
            min(cpu_utilization), \
            max(cpu_utilization), \
            stddev(cpu_utilization) \
            From Utilization Group by server_id').show()

+---------+--------------------+--------------------+-----------------------+
|server_id|min(cpu_utilization)|max(cpu_utilization)|stddev(cpu_utilization)|
+---------+--------------------+--------------------+-----------------------+
|      112|                0.52|                0.92|    0.11528867845082576|
|      113|                0.58|                0.98|    0.11544345150353694|
|      130|                0.35|                0.75|    0.11568834774245991|
|      126|                0.48|                0.88|    0.11542612970702058|
|      149|                0.54|                0.94|    0.11543517500295467|
|      110|                0.35|                0.75|    0.11533251724450215|
|      136|                0.41|                 0.8|    0.11597405743182258|
|      144|                0.47|                0.87|    0.11478654960489501|
|      119|                0.22|                0.62|    0.11516031929842008|
|      116|                 0.3|                 0.7|    0.11506

## Average

In [None]:
# Window function: attach to every reading the average cpu_utilization of its server
# (one window per server_id, covering all of that server's rows).
spark.sql('Select event_datetime, server_id, cpu_utilization, \
            avg(cpu_utilization) Over (Partition By server_id) as avg_server_util \
            From utilization').show()

+-------------------+---------+---------------+------------------+
|     event_datetime|server_id|cpu_utilization|   avg_server_util|
+-------------------+---------+---------------+------------------+
|03/05/2019 08:06:34|      112|           0.71|0.7153870000000067|
|03/05/2019 08:11:34|      112|           0.78|0.7153870000000067|
|03/05/2019 08:16:34|      112|           0.87|0.7153870000000067|
|03/05/2019 08:21:34|      112|           0.82|0.7153870000000067|
|03/05/2019 08:26:34|      112|           0.62|0.7153870000000067|
|03/05/2019 08:31:34|      112|            0.9|0.7153870000000067|
|03/05/2019 08:36:34|      112|           0.89|0.7153870000000067|
|03/05/2019 08:41:34|      112|           0.81|0.7153870000000067|
|03/05/2019 08:46:34|      112|           0.88|0.7153870000000067|
|03/05/2019 08:51:34|      112|           0.89|0.7153870000000067|
|03/05/2019 08:56:34|      112|           0.84|0.7153870000000067|
|03/05/2019 09:01:34|      112|           0.71|0.7153870000000

## Delta Difference

In [None]:
# For every reading, show the server's average cpu_utilization (rounded to 2 decimals)
# and the reading's deviation from that average (rounded to 3 decimals).
# The delta is computed from the unrounded average, then rounded.
spark.sql('Select event_datetime, server_id, cpu_utilization, \
            round(avg(cpu_utilization) Over (Partition By server_id),2) as avg_server_util, \
            round(cpu_utilization - avg(cpu_utilization) Over (Partition By server_id),3) as delta_server_util \
            From utilization').show()

+-------------------+---------+---------------+---------------+-----------------+
|     event_datetime|server_id|cpu_utilization|avg_server_util|delta_server_util|
+-------------------+---------+---------------+---------------+-----------------+
|03/05/2019 08:06:34|      112|           0.71|           0.72|           -0.005|
|03/05/2019 08:11:34|      112|           0.78|           0.72|            0.065|
|03/05/2019 08:16:34|      112|           0.87|           0.72|            0.155|
|03/05/2019 08:21:34|      112|           0.82|           0.72|            0.105|
|03/05/2019 08:26:34|      112|           0.62|           0.72|           -0.095|
|03/05/2019 08:31:34|      112|            0.9|           0.72|            0.185|
|03/05/2019 08:36:34|      112|           0.89|           0.72|            0.175|
|03/05/2019 08:41:34|      112|           0.81|           0.72|            0.095|
|03/05/2019 08:46:34|      112|           0.88|           0.72|            0.165|
|03/05/2019 08:5

## Sliding Window

In [None]:
# Sliding-window average per server: each row is averaged with the row before and the
# row after it (ordered by event_datetime), rounded to 3 decimals. The first and last
# rows of a partition only have one neighbour, so they average two values.
# NOTE(review): event_datetime is displayed as text like '03/05/2019 08:06:34'; if the
# column is a string, ORDER BY sorts it lexicographically rather than chronologically — confirm.
spark.sql('Select event_datetime, server_id, cpu_utilization, \
            round(avg(cpu_utilization) Over (Partition By server_id Order by event_datetime \
                                        ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING),3) AS avg_server_util \
            From utilization').show()

+-------------------+---------+---------------+---------------+
|     event_datetime|server_id|cpu_utilization|avg_server_util|
+-------------------+---------+---------------+---------------+
|03/05/2019 08:06:34|      112|           0.71|          0.745|
|03/05/2019 08:11:34|      112|           0.78|          0.787|
|03/05/2019 08:16:34|      112|           0.87|          0.823|
|03/05/2019 08:21:34|      112|           0.82|           0.77|
|03/05/2019 08:26:34|      112|           0.62|           0.78|
|03/05/2019 08:31:34|      112|            0.9|          0.803|
|03/05/2019 08:36:34|      112|           0.89|          0.867|
|03/05/2019 08:41:34|      112|           0.81|           0.86|
|03/05/2019 08:46:34|      112|           0.88|           0.86|
|03/05/2019 08:51:34|      112|           0.89|           0.87|
|03/05/2019 08:56:34|      112|           0.84|          0.813|
|03/05/2019 09:01:34|      112|           0.71|            0.8|
|03/05/2019 09:06:34|      112|         

# Machine Learning

## Clustering

### K-Means Clustering

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

In [None]:
# Read the clustering CSV, using the first line as the header and letting Spark infer column types.
cluster_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/content/drive/MyDrive/Colab Notebooks/LinkedIn_Learining_Intro_to_Spark_Data/clustering_dataset.csv')
)
cluster_df.show(n=10)

+----+----+----+
|col1|col2|col3|
+----+----+----+
|   7|   4|   1|
|   7|   7|   9|
|   7|   9|   6|
|   1|   6|   5|
|   6|   7|   7|
|   7|   9|   4|
|   7|  10|   6|
|   7|   8|   2|
|   8|   3|   8|
|   4|  10|   5|
+----+----+----+
only showing top 10 rows



In [None]:
# Assembler that packs col1, col2 and col3 into a single "features" vector column.
vecAssembler = (
    VectorAssembler()
    .setInputCols(['col1', 'col2', 'col3'])
    .setOutputCol("features")
)

In [None]:
# Apply the assembler: vcluster_df is cluster_df plus the assembled "features" column.
vcluster_df = vecAssembler.transform(cluster_df)
vcluster_df.show()

+----+----+----+--------------+
|col1|col2|col3|      features|
+----+----+----+--------------+
|   7|   4|   1| [7.0,4.0,1.0]|
|   7|   7|   9| [7.0,7.0,9.0]|
|   7|   9|   6| [7.0,9.0,6.0]|
|   1|   6|   5| [1.0,6.0,5.0]|
|   6|   7|   7| [6.0,7.0,7.0]|
|   7|   9|   4| [7.0,9.0,4.0]|
|   7|  10|   6|[7.0,10.0,6.0]|
|   7|   8|   2| [7.0,8.0,2.0]|
|   8|   3|   8| [8.0,3.0,8.0]|
|   4|  10|   5|[4.0,10.0,5.0]|
|   7|   4|   5| [7.0,4.0,5.0]|
|   7|   8|   4| [7.0,8.0,4.0]|
|   2|   5|   1| [2.0,5.0,1.0]|
|   2|   6|   2| [2.0,6.0,2.0]|
|   2|   3|   8| [2.0,3.0,8.0]|
|   3|   9|   1| [3.0,9.0,1.0]|
|   4|   2|   9| [4.0,2.0,9.0]|
|   1|   7|   1| [1.0,7.0,1.0]|
|   6|   2|   3| [6.0,2.0,3.0]|
|   4|   1|   9| [4.0,1.0,9.0]|
+----+----+----+--------------+
only showing top 20 rows



In [None]:
# K-means estimator configured for 3 clusters, with a fixed seed for reproducibility.
kmeans_obj = KMeans(k=3, seed=1)

In [None]:
# Fit the K-means estimator on the vectorized data; it reads the "features" column.
kmodel = kmeans_obj.fit(vcluster_df)

In [None]:
# Retrieve the fitted cluster centers (one array per cluster) and display them.
centers = kmodel.clusterCenters()
centers

[array([35.88461538, 31.46153846, 34.42307692]),
 array([80.        , 79.20833333, 78.29166667]),
 array([5.12, 5.84, 4.84])]

### Hierarchical Clustering

In [None]:
# Display the vectorized frame (rendered as a table because eager evaluation is enabled in the setup cell).
vcluster_df

col1,col2,col3,features
7,4,1,"[7.0,4.0,1.0]"
7,7,9,"[7.0,7.0,9.0]"
7,9,6,"[7.0,9.0,6.0]"
1,6,5,"[1.0,6.0,5.0]"
6,7,7,"[6.0,7.0,7.0]"
7,9,4,"[7.0,9.0,4.0]"
7,10,6,"[7.0,10.0,6.0]"
7,8,2,"[7.0,8.0,2.0]"
8,3,8,"[8.0,3.0,8.0]"
4,10,5,"[4.0,10.0,5.0]"


In [None]:
from pyspark.ml.clustering import BisectingKMeans

In [None]:
# Bisecting K-means estimator: 3 clusters, fixed seed.
bk_means = BisectingKMeans(k=3, seed=1)

In [None]:
# Fit the bisecting K-means estimator on the same vectorized data used for K-means.
bk_model = bk_means.fit(vcluster_df)

In [None]:
# Retrieve the cluster centers found by bisecting K-means.
bk_centers = bk_model.clusterCenters()

In [None]:
# Display the bisecting K-means centers.
bk_centers

[array([5.12, 5.84, 4.84]),
 array([35.88461538, 31.46153846, 34.42307692]),
 array([80.        , 79.20833333, 78.29166667])]

In [None]:
# Display the plain K-means centers again for comparison; per the recorded outputs the two
# methods found the same three centers, listed in a different order.
centers

[array([35.88461538, 31.46153846, 34.42307692]),
 array([80.        , 79.20833333, 78.29166667]),
 array([5.12, 5.84, 4.84])]

In [None]:
# TODO: compute the difference between each feature vector and each cluster center

## Classification

### Naive Bayes Classifier
#### Used when the variables are not tightly correlated, i.e. the features are independent

In [None]:
# Import only the column helpers this notebook calls. A wildcard import of
# pyspark.sql.functions would shadow Python builtins such as min, max, sum and round.
from pyspark.sql.functions import col, count, isnan, when
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer

In [None]:
# Read the headerless iris file; Spark names the columns _c0.._c4 and infers their types.
iris_df = (
    spark.read
    .option("inferSchema", True)
    .csv('/content/drive/MyDrive/Colab Notebooks/LinkedIn_Learining_Intro_to_Spark_Data/iris.data')
)
iris_df.take(1)

[Row(_c0=5.1, _c1=3.5, _c2=1.4, _c3=0.2, _c4='Iris-setosa')]

In [None]:
# Give the five positional CSV columns (_c0.._c4) meaningful names.
# toDF renames by position, so re-running this cell on the already-renamed frame is
# harmless, whereas selecting col("_c0") a second time would fail because the
# column no longer exists.
iris_df = iris_df.toDF("sepal_len", "sepal_width", "petal_len", "petal_width", "species")
iris_df.take(2)

[Row(sepal_len=5.1, sepal_width=3.5, petal_len=1.4, petal_width=0.2, species='Iris-setosa'),
 Row(sepal_len=4.9, sepal_width=3.0, petal_len=1.4, petal_width=0.2, species='Iris-setosa')]

In [None]:
# Assembler that packs the four measurement columns into a single "features" vector.
v_asm = (
    VectorAssembler()
    .setInputCols(["sepal_len","sepal_width","petal_len","petal_width"])
    .setOutputCol("features")
)

In [None]:
# Add the assembled "features" column to the iris frame and peek at two rows.
vec_iris = v_asm.transform(iris_df)
vec_iris.take(2)

[Row(sepal_len=5.1, sepal_width=3.5, petal_len=1.4, petal_width=0.2, species='Iris-setosa', features=DenseVector([5.1, 3.5, 1.4, 0.2])),
 Row(sepal_len=4.9, sepal_width=3.0, petal_len=1.4, petal_width=0.2, species='Iris-setosa', features=DenseVector([4.9, 3.0, 1.4, 0.2]))]

In [None]:
# Indexer that maps each species name to a numeric class index in a new "label" column.
ind = StringIndexer(inputCol="species",outputCol="label")

# Fit the indexer on vec_iris and apply it; the result carries both "features" and "label".
ind_vec_iris = ind.fit(vec_iris).transform(vec_iris)
ind_vec_iris.take(1)

[Row(sepal_len=5.1, sepal_width=3.5, petal_len=1.4, petal_width=0.2, species='Iris-setosa', features=DenseVector([5.1, 3.5, 1.4, 0.2]), label=0.0)]

In [None]:
# Preview the first 3 rows of the indexed, vectorized iris frame.
ind_vec_iris.show(3)

+---------+-----------+---------+-----------+-----------+-----------------+-----+
|sepal_len|sepal_width|petal_len|petal_width|    species|         features|label|
+---------+-----------+---------+-----------+-----------+-----------------+-----+
|      5.1|        3.5|      1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|  0.0|
|      4.9|        3.0|      1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|  0.0|
|      4.7|        3.2|      1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|  0.0|
+---------+-----------+---------+-----------+-----------+-----------------+-----+
only showing top 3 rows



In [None]:
# Preview the first 4 rows of the indexed, vectorized iris frame.
ind_vec_iris.show(4)

+---------+-----------+---------+-----------+-----------+-----------------+-----+
|sepal_len|sepal_width|petal_len|petal_width|    species|         features|label|
+---------+-----------+---------+-----------+-----------+-----------------+-----+
|      5.1|        3.5|      1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|  0.0|
|      4.9|        3.0|      1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|  0.0|
|      4.7|        3.2|      1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|  0.0|
|      4.6|        3.1|      1.5|        0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|  0.0|
+---------+-----------+---------+-----------+-----------+-----------------+-----+
only showing top 4 rows



In [None]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Randomly split the iris data roughly 60/40; the fixed seed makes the split repeatable.
splits = ind_vec_iris.randomSplit(weights=[0.6, 0.4], seed=1)

In [None]:
# First split (weight .6) is the training set.
train_df = splits[0]

In [None]:
# Second split (weight .4) is the test set.
test_df = splits[1]

In [None]:
# Number of training rows.
train_df.count()

98

In [None]:
# Number of test rows.
test_df.count()

52

In [None]:
# Total rows before splitting; should equal the train count plus the test count.
ind_vec_iris.count()

150

In [None]:
# Create a multinomial Naive Bayes classifier and fit it on the training split.
# No column names are passed, so the estimator's default feature/label columns are used.
nb = NaiveBayes(modelType="multinomial")
nb_model = nb.fit(train_df)

In [None]:
# Score the test split; the model appends rawPrediction, probability and prediction columns.
pred_df = nb_model.transform(test_df)
pred_df.take(1)

[Row(sepal_len=4.3, sepal_width=3.0, petal_len=1.1, petal_width=0.1, species='Iris-setosa', features=DenseVector([4.3, 3.0, 1.1, 0.1]), label=0.0, rawPrediction=DenseVector([-9.9894, -11.3476, -11.902]), probability=DenseVector([0.7118, 0.183, 0.1051]), prediction=0.0)]

In [None]:
# Accuracy evaluator comparing the "prediction" column against "label", applied to the test predictions.
evaluator = MulticlassClassificationEvaluator(labelCol="label",predictionCol="prediction",metricName="accuracy")
nb_acc = evaluator.evaluate(pred_df)

In [None]:
# Naive Bayes accuracy on the test split.
nb_acc

0.9807692307692307

### Multilayer Perceptron Classifier
#### Non-linear relationship between dependent and independent variables

In [None]:
# Peek at one row of the indexed, vectorized iris data.
ind_vec_iris.take(1)

[Row(sepal_len=5.1, sepal_width=3.5, petal_len=1.4, petal_width=0.2, species='Iris-setosa', features=DenseVector([5.1, 3.5, 1.4, 0.2]), label=0.0)]

In [None]:
# Inspect the schema (column names, types and nullability).
ind_vec_iris.schema

StructType(List(StructField(sepal_len,DoubleType,true),StructField(sepal_width,DoubleType,true),StructField(petal_len,DoubleType,true),StructField(petal_width,DoubleType,true),StructField(species,StringType,true),StructField(features,VectorUDT,true),StructField(label,DoubleType,false)))

In [None]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [None]:
# Layer sizes for the network: the first entry is the input layer and must equal the
# number of input features (4 here), followed by two hidden layers of 5 nodes each.
# The last entry is the output layer — presumably one node per species class (3); confirm.
layers = [4, 5, 5, 3]

In [None]:
# Train a multilayer perceptron (fixed seed) on the iris training split and score the test split.
mlp = MultilayerPerceptronClassifier(layers=layers, seed=1)
mlp_model = mlp.fit(train_df)
mlp_pred = mlp_model.transform(test_df)

In [None]:
# Accuracy of the MLP on the test split. The label/prediction columns are spelled out
# (they are the evaluator's defaults) to match the other evaluators in this notebook.
mlp_eval = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
mlp_acc = mlp_eval.evaluate(mlp_pred)
mlp_acc

0.6923076923076923

### Decision Tree Classifier

In [None]:
# Column names and types of the prepared iris frame.
ind_vec_iris.dtypes

[('sepal_len', 'double'),
 ('sepal_width', 'double'),
 ('petal_len', 'double'),
 ('petal_width', 'double'),
 ('species', 'string'),
 ('features', 'vector'),
 ('label', 'double')]

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

In [None]:
# Train a decision-tree classifier on the iris training split and score the test split.
dt_obj = DecisionTreeClassifier(labelCol="label",featuresCol="features")
dt_model = dt_obj.fit(train_df)
dt_pred = dt_model.transform(test_df)

In [None]:
# Accuracy of the decision tree's predictions on the test split.
dt_eval = MulticlassClassificationEvaluator(labelCol="label",predictionCol="prediction",metricName="accuracy")
dt_acc = dt_eval.evaluate(dt_pred)

In [None]:
# Decision-tree accuracy on the test split.
dt_acc

0.9423076923076923

## Regression

### Linear Regression

In [None]:
# Read the power-plant CSV (with header row), letting Spark infer column types, then list the types.
ccpp = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv('/content/drive/MyDrive/Colab Notebooks/LinkedIn_Learining_Intro_to_Spark_Data/Folds5x2_pp.csv')
)
ccpp.dtypes

[('AT', 'double'),
 ('V', 'double'),
 ('AP', 'double'),
 ('RH', 'double'),
 ('PE', 'double')]

In [None]:
# Peek at the first row of the raw power-plant data.
ccpp.take(1)

[Row(AT=14.96, V=41.76, AP=1024.07, RH=73.17, PE=463.26)]

In [None]:
# Count null/NaN values per column of the freshly loaded frame. This runs on
# `ccpp` because `ccpp1` is only created in a later cell, so referencing it here
# fails on a fresh kernel (and, once defined, it would always report zero).
ccpp.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in ccpp.columns]
   ).show()

+---+---+---+---+---+
| AT|  V| AP| RH| PE|
+---+---+---+---+---+
|  0|  0|  0|  0|  0|
+---+---+---+---+---+



In [None]:
# Drop rows that contain a null/NaN in any column instead of imputing a -999
# sentinel: sentinel rows would be treated as real observations (features and the
# PE label alike) and distort every model fit on this frame.
ccpp1 = ccpp.na.drop()
ccpp1.show()

+-----+-----+-------+-----+------+
|   AT|    V|     AP|   RH|    PE|
+-----+-----+-------+-----+------+
|14.96|41.76|1024.07|73.17|463.26|
|25.18|62.96|1020.04|59.08|444.37|
| 5.11| 39.4|1012.16|92.14|488.56|
|20.86|57.32|1010.24|76.64|446.48|
|10.82| 37.5|1009.23|96.62| 473.9|
|26.27|59.44|1012.23|58.77|443.67|
|15.89|43.96|1014.02|75.24|467.35|
| 9.48|44.71|1019.12|66.43|478.42|
|14.64| 45.0|1021.78|41.25|475.98|
|11.74|43.56|1015.14|70.72| 477.5|
|17.99|43.72|1008.64|75.04|453.02|
|20.14|46.93|1014.66|64.22|453.99|
|24.34| 73.5|1011.31|84.15|440.29|
|25.71|58.59|1012.77|61.83|451.28|
|26.19|69.34|1009.48|87.59|433.99|
|21.42|43.79|1015.76|43.08|462.19|
|18.21| 45.0|1022.86|48.84|467.54|
|11.04|41.74| 1022.6|77.51| 477.2|
|14.45|52.75|1023.97|63.59|459.85|
|13.97|38.47|1015.15|55.28| 464.3|
+-----+-----+-------+-----+------+
only showing top 20 rows



In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import col,isnan, when, count

In [None]:
# Assemble the four predictor columns (AT, V, AP, RH) into a "features" vector; PE stays as the target column.
vec_asm = VectorAssembler(inputCols=["AT","V","AP","RH"], outputCol="features")
vec_pp_df = vec_asm.transform(ccpp1)
vec_pp_df.take(1)

[Row(AT=14.96, V=41.76, AP=1024.07, RH=73.17, PE=463.26, features=DenseVector([14.96, 41.76, 1024.07, 73.17]))]

In [None]:
# Number of rows in the vectorized power-plant frame.
vec_pp_df.count()

47844

In [None]:
# Column types after assembling; "features" is a vector column.
vec_pp_df.dtypes

[('AT', 'double'),
 ('V', 'double'),
 ('AP', 'double'),
 ('RH', 'double'),
 ('PE', 'double'),
 ('features', 'vector')]

In [None]:
# Fit a linear regression that predicts PE from the assembled features.
# The model is fit on the whole of vec_pp_df; this cell does no train/test split.
lin_reg = LinearRegression(featuresCol="features",labelCol="PE")
lr_model = lin_reg.fit(vec_pp_df)

In [None]:
# Fitted coefficients, in the order of the assembler inputs (AT, V, AP, RH).
lr_model.coefficients

DenseVector([-0.6037, -0.5266, 1.1661, 0.1478])

In [None]:
# Fitted intercept term.
lr_model.intercept

-697.5963706368208

In [None]:
# RMSE from the model's training summary, i.e. measured on the same data the model was fit on.
lr_model.summary.rootMeanSquaredError

7.605251517642699

In [None]:
# Save the fitted model. overwrite() lets this cell be re-run: a plain save()
# raises an error when "lr1_cpp.model" already exists.
lr_model.write().overwrite().save("lr1_cpp.model")

### Decision Tree Regression

In [None]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col,isnan, when, count

In [None]:
# Column types of the cleaned power-plant frame.
ccpp1.dtypes

[('AT', 'double'),
 ('V', 'double'),
 ('AP', 'double'),
 ('RH', 'double'),
 ('PE', 'double')]

In [None]:
# Number of rows in the cleaned power-plant frame.
ccpp1.count()

47844

In [None]:
# Count the null/NaN values remaining in each column of ccpp1: for every column c,
# when(...) yields c only for null/NaN rows, and count() counts those non-null results.
ccpp1.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in ccpp1.columns]
   ).show()

+---+---+---+---+---+
| AT|  V| AP| RH| PE|
+---+---+---+---+---+
|  0|  0|  0|  0|  0|
+---+---+---+---+---+



In [None]:
# Peek at the first row of the cleaned frame.
ccpp1.take(1)

[Row(AT=14.96, V=41.76, AP=1024.07, RH=73.17, PE=463.26)]

In [None]:
# Assembler for the tree-based regressors: AT, V, AP and RH become the "features" vector.
vec_asm_dtreg = (
    VectorAssembler()
    .setInputCols(["AT","V","AP","RH"])
    .setOutputCol("features")
)

In [None]:
# Build the vectorized power-plant frame (ccpp1 plus the "features" column) and list its column types.
vpp_df = vec_asm_dtreg.transform(ccpp1)
vpp_df.dtypes

[('AT', 'double'),
 ('V', 'double'),
 ('AP', 'double'),
 ('RH', 'double'),
 ('PE', 'double'),
 ('features', 'vector')]

In [None]:
# Peek at one vectorized row.
vpp_df.take(1)

[Row(AT=14.96, V=41.76, AP=1024.07, RH=73.17, PE=463.26, features=DenseVector([14.96, 41.76, 1024.07, 73.17]))]

In [None]:
# Split the power-plant data roughly 70/30 into train and test sets. The seed is
# fixed (as for the iris split) so the split, and every metric computed from it,
# is reproducible across runs.
# Note: this reuses the name `splits`, replacing the iris splits created earlier.
splits = vpp_df.randomSplit(weights=[0.7, 0.3], seed=1)

In [None]:
# Unpack the two splits: 70% training, 30% test.
# These names replace the iris train/test frames defined earlier in the notebook.
train_df, test_df = splits

In [None]:
# Number of training rows.
train_df.count()

33589

In [None]:
# Number of test rows.
test_df.count()

14255

In [None]:
# Total rows before splitting; should equal the train count plus the test count.
vpp_df.count()

47844

In [None]:
# Create a decision-tree regressor that predicts PE from "features" and fit it on the training split.
dt_reg = DecisionTreeRegressor(featuresCol="features", labelCol="PE")
dt_reg_model = dt_reg.fit(train_df)

In [None]:
# Score the test split (adds a "prediction" column) and look at three rows.
dt_reg_pred = dt_reg_model.transform(test_df)
dt_reg_pred.take(3)

[Row(AT=-999.0, V=-999.0, AP=-999.0, RH=-999.0, PE=-999.0, features=DenseVector([-999.0, -999.0, -999.0, -999.0]), prediction=-999.0),
 Row(AT=-999.0, V=-999.0, AP=-999.0, RH=-999.0, PE=-999.0, features=DenseVector([-999.0, -999.0, -999.0, -999.0]), prediction=-999.0),
 Row(AT=1.81, V=39.42, AP=1026.92, RH=76.97, PE=490.55, features=DenseVector([1.81, 39.42, 1026.92, 76.97]), prediction=485.9494732061756)]

In [None]:
# Evaluator that computes RMSE between the PE column and the "prediction" column.
dt_evaluator = RegressionEvaluator(labelCol="PE",predictionCol="prediction",metricName="rmse")

In [None]:
# RMSE of the decision-tree regressor on the test split.
rmse_val = dt_evaluator.evaluate(dt_reg_pred)
rmse_val

4.523204515709961

### Gradient Boosting Regression Tree

In [None]:
from pyspark.ml.regression import GBTRegressor

In [None]:
# Gradient-boosted tree regressor predicting PE from the "features" vector.
gbt = GBTRegressor().setFeaturesCol("features").setLabelCol("PE")

In [None]:
# Fit the gradient-boosted trees on the same training split used for the decision tree.
gbt_model = gbt.fit(train_df)

In [None]:
# Score the test split with the fitted GBT model.
gbt_pred = gbt_model.transform(test_df)

In [None]:
# RMSE evaluator for the GBT predictions. It is configured identically to
# dt_evaluator, which could be reused here instead of creating a new object.
gbt_eval = RegressionEvaluator(labelCol="PE",predictionCol="prediction",metricName="rmse")

In [None]:
# RMSE of the gradient-boosted trees on the test split.
rmse_gbt = gbt_eval.evaluate(gbt_pred)
rmse_gbt

3.9288721099324726

## Recommendation Systems

### Collaborative Filtering







Alternating Least Squares (ALS): build a DataFrame of user–item ratings (the user-item matrix).

Modelling ALS object:

*   userCol
*   itemCol
*   ratingCol

Evaluate:

*   Create predictions on test_data
*   Use RegressionEvaluator
*   Use rmse metricName


### *(incomplete)*

In [None]:
# vector_Assembler = VectorAssembler(inputCols=["cpu_utilization"],outputCol="features")

In [None]:
# vlinear_df = vector_Assembler.transform(df)
# vlinear_df.show()

In [None]:
# linreg_model = LinearRegression(featuresCol="features",labelCol="session_count")

In [None]:
# lrmodel = linreg_model.fit(vlinear_df)

In [None]:
# lrmodel.coefficients

In [None]:
# lrmodel.intercept

In [None]:
# RMSE

# lrmodel.summary.rootMeanSquaredError