# Standardization
One of the most popular techniques for scaling numerical data prior to modeling is standardization.

Standardizing a dataset involves rescaling the distribution of each feature's observed values so that its mean is 0 and its standard deviation is 1.

In general, many machine learning algorithms perform better (that is, you can build a more realistic model) when numerical input variables, also called features, are scaled to a standard range.


A value is standardized as follows:

$y = (x - \text{mean}) / \text{std}$

In [1]:
from pyspark.sql import SparkSession
# Build (or reuse, via getOrCreate) the Spark session for this notebook,
# running against the "local[*]" master.
spark = (
    SparkSession.builder
    .appName("pyspark-ml-standardization")
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# Four (name, age) records used as the toy dataset for the cells below.
features = [
    ('alex', 1),
    ('bob', 3),
    ('ali', 6),
    ('dave', 10),
]
columns = ("name", "age")
samples = spark.createDataFrame(features, schema=columns)

In [4]:
# Preview the raw rows before any scaling is applied.
samples.show()

+----+---+
|name|age|
+----+---+
|alex|  1|
| bob|  3|
| ali|  6|
|dave| 10|
+----+---+



In [6]:
from pyspark.sql.functions import stddev, mean, col

# Approach 1: stay entirely inside Spark.
# The aggregate select yields a single-row DataFrame holding the mean and the
# standard deviation of "age"; cross-joining it with the original rows puts
# both statistics on every row so the z-score can be computed column-wise.
(
    samples
    .select(
        mean("age").alias("mean_age"),
        stddev("age").alias("stddev_age"),
    )
    .crossJoin(samples)
    .withColumn(
        "age_scaled",
        (col("age") - col("mean_age")) / col("stddev_age"),
    )
    .show(truncate=False)
)

+--------+------------------+----+---+-------------------+
|mean_age|stddev_age        |name|age|age_scaled         |
+--------+------------------+----+---+-------------------+
|5.0     |3.9157800414902435|alex|1  |-1.0215078369104984|
|5.0     |3.9157800414902435|bob |3  |-0.5107539184552492|
|5.0     |3.9157800414902435|ali |6  |0.2553769592276246 |
|5.0     |3.9157800414902435|dave|10 |1.276884796138123  |
+--------+------------------+----+---+-------------------+



In [8]:
# Approach 2: collect the two statistics to the driver first.
# first() returns the single aggregate row, which unpacks into two plain
# Python values; they are then used as constants in the column expression,
# so no join is needed.
mean_age, stddev_age = samples.select(mean("age"), stddev("age")).first()
samples.withColumn("age_scaled", (col("age") - mean_age) / stddev_age).show(truncate=False)

+----+---+-------------------+
|name|age|age_scaled         |
+----+---+-------------------+
|alex|1  |-1.0215078369104984|
|bob |3  |-0.5107539184552492|
|ali |6  |0.2553769592276246 |
|dave|10 |1.276884796138123  |
+----+---+-------------------+



# Vector Assembler


In [10]:
from pyspark.ml.feature import VectorAssembler
# Pack the numeric "age" column into a one-element vector column
# ("age_vector"); the StandardScaler cell below reads that column as its input.
vecAssembler = VectorAssembler(
    inputCols=["age"],
    outputCol="age_vector",
)
samples2 = vecAssembler.transform(samples)
samples2.show()

+----+---+----------+
|name|age|age_vector|
+----+---+----------+
|alex|  1|     [1.0]|
| bob|  3|     [3.0]|
| ali|  6|     [6.0]|
|dave| 10|    [10.0]|
+----+---+----------+



In [11]:
from pyspark.ml.feature import StandardScaler

# Approach 3: let Spark ML do the standardization.
# withMean centers the values and withStd scales them by the standard
# deviation; fit() learns the statistics from samples2 and transform()
# writes the scaled vector into "age_scaled".
scaler = StandardScaler(
    inputCol="age_vector",
    outputCol="age_scaled",
    withStd=True,
    withMean=True,
)
scalerModel = scaler.fit(samples2)
scaledData = scalerModel.transform(samples2)
scaledData.show(truncate=False)

+----+---+----------+---------------------+
|name|age|age_vector|age_scaled           |
+----+---+----------+---------------------+
|alex|1  |[1.0]     |[-1.0215078369104984]|
|bob |3  |[3.0]     |[-0.5107539184552492]|
|ali |6  |[6.0]     |[0.2553769592276246] |
|dave|10 |[10.0]    |[1.276884796138123]  |
+----+---+----------+---------------------+

