In [1]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{PCA,StandardScaler,VectorAssembler}
import org.apache.spark.ml.linalg.Vectors

// Obtain the SparkSession used by every later cell; the application is
// registered under the name "PCA_Example".
val spark = (SparkSession
  .builder()
  .appName("PCA_Example")
  .getOrCreate())

Intitializing Scala interpreter ...

Spark Web UI available at http://MSI:4040
SparkContext available as 'sc' (version = 3.0.0, master = local[*], app id = local-1605307422953)
SparkSession available as 'spark'


import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{PCA, StandardScaler, VectorAssembler}
import org.apache.spark.ml.linalg.Vectors
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@6b8c244b


In [2]:
// Read the cancer dataset as CSV, taking column names from the header line
// and letting Spark infer the column types.
// NOTE(review): the recorded output shows `mean radius` inferred as an integer
// whose first values are 0, 1, 2, ... — this looks like a row index sitting
// under a shifted header. Confirm the CSV layout before trusting the features.
val df = (spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("../../data/ml_scala/Cancer_Data"))

df: org.apache.spark.sql.DataFrame = [mean radius: int, mean texture: double ... 28 more fields]


In [3]:
// Print the inferred schema (column names, types, nullability) as a tree.
println(df.schema.treeString)

root
 |-- mean radius: integer (nullable = true)
 |-- mean texture: double (nullable = true)
 |-- mean perimeter: double (nullable = true)
 |-- mean area: double (nullable = true)
 |-- mean smoothness: double (nullable = true)
 |-- mean compactness: double (nullable = true)
 |-- mean concavity: double (nullable = true)
 |-- mean concave points: double (nullable = true)
 |-- mean symmetry: double (nullable = true)
 |-- mean fractal dimension: double (nullable = true)
 |-- radius error: double (nullable = true)
 |-- texture error: double (nullable = true)
 |-- perimeter error: double (nullable = true)
 |-- area error: double (nullable = true)
 |-- smoothness error: double (nullable = true)
 |-- compactness error: double (nullable = true)
 |-- concavity error: double (nullable = true)
 |-- concave points error: double (nullable = true)
 |-- symmetry error: double (nullable = true)
 |-- fractal dimension error: double (nullable = true)
 |-- worst radius: double (nullable = true)
 |-- worst

In [4]:
// Names of the 30 input columns fed to the VectorAssembler.
// Each of the ten base measurements appears three times, in this order:
// "mean <m>", "<m> error", "worst <m>".
val colnames: Array[String] = {
  val measurements = Array(
    "radius",
    "texture",
    "perimeter",
    "area",
    "smoothness",
    "compactness",
    "concavity",
    "concave points",
    "symmetry",
    "fractal dimension"
  )

  val means  = measurements.map(m => s"mean $m")
  val errors = measurements.map(m => s"$m error")
  val worsts = measurements.map(m => s"worst $m")

  means ++ errors ++ worsts
}

colnames: Array[String] = Array(mean radius, mean texture, mean perimeter, mean area, mean smoothness, mean compactness, mean concavity, mean concave points, mean symmetry, mean fractal dimension, radius error, texture error, perimeter error, area error, smoothness error, compactness error, concavity error, concave points error, symmetry error, fractal dimension error, worst radius, worst texture, worst perimeter, worst area, worst smoothness, worst compactness, worst concavity, worst concave points, worst symmetry, worst fractal dimension)


In [5]:
// Assembler that packs the columns listed in `colnames` into a single
// vector column named "features".
val assembler = (new VectorAssembler()
  .setInputCols(colnames)
  .setOutputCol("features"))

assembler: org.apache.spark.ml.feature.VectorAssembler = VectorAssembler: uid=vecAssembler_87668d021cf6, handleInvalid=error, numInputCols=30


In [6]:
// Run the assembler over the raw DataFrame and keep only the resulting
// "features" vector column.
val output = {
  val assembled = assembler.transform(df)
  assembled.select("features")
}

output: org.apache.spark.sql.DataFrame = [features: vector]


In [7]:
// Scaler configuration: reads "features" and writes "scaledFeatures",
// scaling by standard deviation (withStd = true) without mean-centring
// (withMean = false).
// NOTE(review): the scaled vectors are later fed to PCA un-centred; confirm
// that leaving withMean off is intended.
val scaler = (new StandardScaler()
  .setWithMean(false)
  .setWithStd(true)
  .setInputCol("features")
  .setOutputCol("scaledFeatures"))

// Fit the scaler on the assembled feature vectors.
val scalerModel = scaler.fit(output)

// Apply the fitted scaler, appending the "scaledFeatures" column.
val scaledData = scalerModel.transform(output)

// Preview the first five rows of raw and scaled vectors.
scaledData.show(numRows = 5)

+--------------------+--------------------+
|            features|      scaledFeatures|
+--------------------+--------------------+
|[0.0,17.99,10.38,...|[0.0,5.1049235941...|
|[1.0,20.57,17.77,...|[0.00608270930682...|
|[2.0,19.69,21.25,...|[0.01216541861364...|
|[3.0,11.42,20.38,...|[0.01824812792047...|
|[4.0,20.29,14.34,...|[0.02433083722729...|
+--------------------+--------------------+
only showing top 5 rows



scaler: org.apache.spark.ml.feature.StandardScaler = stdScal_5254a6b3ac1a
scalerModel: org.apache.spark.ml.feature.StandardScalerModel = StandardScalerModel: uid=stdScal_5254a6b3ac1a, numFeatures=30, withMean=false, withStd=true
scaledData: org.apache.spark.sql.DataFrame = [features: vector, scaledFeatures: vector]


In [8]:
// Fit a PCA model on "scaledFeatures" that keeps k = 4 principal components
// and writes the projection to "pcaFeatures".
val pca = {
  val estimator = (new PCA()
    .setK(4)
    .setInputCol("scaledFeatures")
    .setOutputCol("pcaFeatures"))
  estimator.fit(scaledData)
}

// Project the scaled data with the fitted model.
val pcaDF = pca.transform(scaledData)

// Keep only the projected vectors, then preview them: a five-row table
// followed by the first row as an Array[Row].
val result = pcaDF.select("pcaFeatures")
result.show(numRows = 5)
result.head(1)

+--------------------+
|         pcaFeatures|
+--------------------+
|[21.6219973823649...|
|[15.1217370347583...|
|[18.4325856097778...|
|[18.9549565028940...|
|[16.7333072691963...|
+--------------------+
only showing top 5 rows



pca: org.apache.spark.ml.feature.PCAModel = PCAModel: uid=pca_ad8b7eeecd91, k=4
pcaDF: org.apache.spark.sql.DataFrame = [features: vector, scaledFeatures: vector ... 1 more field]
result: org.apache.spark.sql.DataFrame = [pcaFeatures: vector]
res2: Array[org.apache.spark.sql.Row] = Array([[21.62199738236499,8.516595739466716,-3.731847417579699,-0.41812449701477833]])


In [9]:
// Stop the SparkSession now that the notebook is finished; no cell after
// this one can use `spark`.
spark.stop()