In [1]:
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

In [2]:
sc = SparkContext("local", "sqlContext")
sqc = SQLContext(sc)

In [4]:
ds = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),), (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),), (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
df = sqc.createDataFrame(ds, ["features"])
df.toPandas()

Unnamed: 0,features
0,"(0.0, 1.0, 0.0, 7.0, 0.0)"
1,"[2.0, 0.0, 3.0, 4.0, 5.0]"
2,"[4.0, 0.0, 0.0, 6.0, 7.0]"


In [5]:
pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(df)

In [6]:
res = model.transform(df).select("pcaFeatures")
res.show(truncate=False)

+-----------------------------------------------------------+
|pcaFeatures                                                |
+-----------------------------------------------------------+
|[1.6485728230883807,-4.013282700516296,-5.524543751369388] |
|[-4.645104331781534,-1.1167972663619026,-5.524543751369387]|
|[-6.428880535676489,-5.337951427775355,-5.524543751369389] |
+-----------------------------------------------------------+



In [7]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

In [8]:
df = sqc.createDataFrame([(0, "a"),(1, "b"),(2, "c"),(3, "a"),(4, "a"),(5, "c")], ["id", "category"])
df.toPandas()

Unnamed: 0,id,category
0,0,a
1,1,b
2,2,c
3,3,a
4,4,a
5,5,c


In [9]:
from pyspark.ml.feature import OneHotEncoder

In [12]:
df = sqc.createDataFrame([(0.0, 1.0),(1.0, 0.0),(2.0, 1.0),(0.0, 2.0),(0.0, 1.0),(2.0, 0.0)], ["categoryIndex1", "categoryIndex2"])
df.toPandas()

Unnamed: 0,categoryIndex1,categoryIndex2
0,0.0,1.0
1,1.0,0.0
2,2.0,1.0
3,0.0,2.0
4,0.0,1.0
5,2.0,0.0


In [13]:
encoder = OneHotEncoder(inputCols=["categoryIndex1", "categoryIndex2"], outputCols=["categoryVec1", "categoryVec2"])
model = encoder.fit(df)
encoded = model.transform(df)
encoded.show()

+--------------+--------------+-------------+-------------+
|categoryIndex1|categoryIndex2| categoryVec1| categoryVec2|
+--------------+--------------+-------------+-------------+
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           1.0|           0.0|(2,[1],[1.0])|(2,[0],[1.0])|
|           2.0|           1.0|    (2,[],[])|(2,[1],[1.0])|
|           0.0|           2.0|(2,[0],[1.0])|    (2,[],[])|
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           2.0|           0.0|    (2,[],[])|(2,[0],[1.0])|
+--------------+--------------+-------------+-------------+



## Normalizer

In [27]:
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

In [29]:
dataFrame = sqc.createDataFrame([(0, Vectors.dense([1.0, 0.5, -1.0]),),(1, Vectors.dense([2.0, 1.0, 1.0]),),(2, Vectors.dense([4.0, 10.0, 2.0]),)], ["id", "features"])
df.toPandas()

Unnamed: 0,id,category
0,0,a
1,1,b
2,2,c
3,3,a
4,4,a
5,5,c


In [30]:
# Normalize each Vector using $L^1$ norm.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
l1NormData = normalizer.transform(dataFrame)
print("Normalized using L^1 norm")
l1NormData.show()

# Normalize each Vector using $L^\infty$ norm.
lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
print("Normalized using L^inf norm")
lInfNormData.show()

Normalized using L^1 norm
+---+--------------+------------------+
| id|      features|      normFeatures|
+---+--------------+------------------+
|  0|[1.0,0.5,-1.0]|    [0.4,0.2,-0.4]|
|  1| [2.0,1.0,1.0]|   [0.5,0.25,0.25]|
|  2|[4.0,10.0,2.0]|[0.25,0.625,0.125]|
+---+--------------+------------------+

Normalized using L^inf norm
+---+--------------+--------------+
| id|      features|  normFeatures|
+---+--------------+--------------+
|  0|[1.0,0.5,-1.0]|[1.0,0.5,-1.0]|
|  1| [2.0,1.0,1.0]| [1.0,0.5,0.5]|
|  2|[4.0,10.0,2.0]| [0.4,1.0,0.2]|
+---+--------------+--------------+



## Scalers

In [36]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.linalg import Vectors

In [34]:
df = sqc.createDataFrame([(0, Vectors.dense([1.0, 0.1, -1.0]),),(1, Vectors.dense([2.0, 1.1, 1.0]),),(2, Vectors.dense([3.0, 10.1, 3.0]),)], ["id", "features"])
df.toPandas()

Unnamed: 0,id,features
0,0,"[1.0, 0.1, -1.0]"
1,1,"[2.0, 1.1, 1.0]"
2,2,"[3.0, 10.1, 3.0]"


#### MinMax Scaler

In [38]:
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

# Compute summary statistics and generate MinMaxScalerModel
scalerModel = scaler.fit(df)

# rescale each feature to range [min, max].
scaledData = scalerModel.transform(df)
print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
scaledData.select("features", "scaledFeatures").show()

Features scaled to range: [0.000000, 1.000000]
+--------------+--------------+
|      features|scaledFeatures|
+--------------+--------------+
|[1.0,0.1,-1.0]|     (3,[],[])|
| [2.0,1.1,1.0]| [0.5,0.1,0.5]|
|[3.0,10.1,3.0]| [1.0,1.0,1.0]|
+--------------+--------------+



#### Standard Scaler

In [39]:
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)

# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(df)

# Normalize each feature to have unit standard deviation.
scaledData = scalerModel.transform(df)
scaledData.show()

+---+--------------+--------------------+
| id|      features|      scaledFeatures|
+---+--------------+--------------------+
|  0|[1.0,0.1,-1.0]|[1.0,0.0181568259...|
|  1| [2.0,1.1,1.0]|[2.0,0.1997250857...|
|  2|[3.0,10.1,3.0]|[3.0,1.8338394239...|
+---+--------------+--------------------+



## Bucketizer

In [40]:
from pyspark.ml.feature import Bucketizer

In [42]:
splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

data = [(-999.9,), (-0.5,), (-0.3,), (0.0,), (0.2,), (999.9,)]
df = sqc.createDataFrame(data, ["features"])
df.toPandas()

Unnamed: 0,features
0,-999.9
1,-0.5
2,-0.3
3,0.0
4,0.2
5,999.9


In [43]:
bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

# Transform original data into its bucket index.
bucketedData = bucketizer.transform(dataFrame)

print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits())-1))
bucketedData.show()

IllegalArgumentException: requirement failed: Column features must be of type numeric but was actually of type struct<type:tinyint,size:int,indices:array<int>,values:array<double>>.

In [16]:
sc.stop()

## Credits & Links

http://spark.apache.org/docs/2.2.0/ml-features.html