In [3]:
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

In [4]:
spark = SparkContext("local", "sqlContext")
sql = SQLContext(spark)

## PCA

In [4]:
data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),), (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),), (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
df = sql.createDataFrame(data, ["features"])
df.toPandas()

Unnamed: 0,features
0,"(0.0, 1.0, 0.0, 7.0, 0.0)"
1,"[2.0, 0.0, 3.0, 4.0, 5.0]"
2,"[4.0, 0.0, 0.0, 6.0, 7.0]"


In [5]:
pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(df)

In [6]:
res = model.transform(df).select("pcaFeatures")
res.show(truncate=False)

+-----------------------------------------------------------+
|pcaFeatures                                                |
+-----------------------------------------------------------+
|[1.6485728230883807,-4.013282700516296,-5.524543751369388] |
|[-4.645104331781534,-1.1167972663619026,-5.524543751369387]|
|[-6.428880535676489,-5.337951427775355,-5.524543751369389] |
+-----------------------------------------------------------+



In [7]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

In [9]:
df = sql.createDataFrame([(0, "a"),(1, "b"),(2, "c"),(3, "a"),(4, "a"),(5, "c")], ["id", "category"])
df.toPandas()

Unnamed: 0,id,category
0,0,a
1,1,b
2,2,c
3,3,a
4,4,a
5,5,c


In [11]:
df = sql.createDataFrame([(0.0, 1.0),(1.0, 0.0),(2.0, 1.0),(0.0, 2.0),(0.0, 1.0),(2.0, 0.0)], ["categoryIndex1", "categoryIndex2"])
df.toPandas()

Unnamed: 0,categoryIndex1,categoryIndex2
0,0.0,1.0
1,1.0,0.0
2,2.0,1.0
3,0.0,2.0
4,0.0,1.0
5,2.0,0.0


In [12]:
encoder = OneHotEncoder(inputCols=["categoryIndex1", "categoryIndex2"], outputCols=["categoryVec1", "categoryVec2"])
model = encoder.fit(df)
encoded = model.transform(df)
encoded.show()

+--------------+--------------+-------------+-------------+
|categoryIndex1|categoryIndex2| categoryVec1| categoryVec2|
+--------------+--------------+-------------+-------------+
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           1.0|           0.0|(2,[1],[1.0])|(2,[0],[1.0])|
|           2.0|           1.0|    (2,[],[])|(2,[1],[1.0])|
|           0.0|           2.0|(2,[0],[1.0])|    (2,[],[])|
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           2.0|           0.0|    (2,[],[])|(2,[0],[1.0])|
+--------------+--------------+-------------+-------------+



## Normalizer

In [13]:
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

In [14]:
dataFrame = sql.createDataFrame([(0, Vectors.dense([1.0, 0.5, -1.0]),),(1, Vectors.dense([2.0, 1.0, 1.0]),),(2, Vectors.dense([4.0, 10.0, 2.0]),)], ["id", "features"])
df.toPandas()

Unnamed: 0,categoryIndex1,categoryIndex2
0,0.0,1.0
1,1.0,0.0
2,2.0,1.0
3,0.0,2.0
4,0.0,1.0
5,2.0,0.0


#### Normalize each Vector using $L^1$ norm.

In [17]:
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
l1NormData = normalizer.transform(dataFrame)
print("Normalized using L^1 norm")
l1NormData.show()

Normalized using L^1 norm
+---+--------------+------------------+
| id|      features|      normFeatures|
+---+--------------+------------------+
|  0|[1.0,0.5,-1.0]|    [0.4,0.2,-0.4]|
|  1| [2.0,1.0,1.0]|   [0.5,0.25,0.25]|
|  2|[4.0,10.0,2.0]|[0.25,0.625,0.125]|
+---+--------------+------------------+



#### Normalize each Vector using $L^\infty$ norm.

In [18]:
lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
print("Normalized using L^inf norm")
lInfNormData.show()

Normalized using L^inf norm
+---+--------------+--------------+
| id|      features|  normFeatures|
+---+--------------+--------------+
|  0|[1.0,0.5,-1.0]|[1.0,0.5,-1.0]|
|  1| [2.0,1.0,1.0]| [1.0,0.5,0.5]|
|  2|[4.0,10.0,2.0]| [0.4,1.0,0.2]|
+---+--------------+--------------+



## Scalers

In [19]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.linalg import Vectors

In [20]:
df = sql.createDataFrame([(0, Vectors.dense([1.0, 0.1, -1.0]),),(1, Vectors.dense([2.0, 1.1, 1.0]),),(2, Vectors.dense([3.0, 10.1, 3.0]),)], ["id", "features"])
df.toPandas()

Unnamed: 0,id,features
0,0,"[1.0, 0.1, -1.0]"
1,1,"[2.0, 1.1, 1.0]"
2,2,"[3.0, 10.1, 3.0]"


#### MinMax Scaler

In [21]:
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

# Compute summary statistics and generate MinMaxScalerModel
scalerModel = scaler.fit(df)

# rescale each feature to range [min, max].
scaledData = scalerModel.transform(df)
print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
scaledData.select("features", "scaledFeatures").show()

Features scaled to range: [0.000000, 1.000000]
+--------------+--------------+
|      features|scaledFeatures|
+--------------+--------------+
|[1.0,0.1,-1.0]|     (3,[],[])|
| [2.0,1.1,1.0]| [0.5,0.1,0.5]|
|[3.0,10.1,3.0]| [1.0,1.0,1.0]|
+--------------+--------------+



#### Standard Scaler

In [22]:
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)

# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(df)

# Normalize each feature to have unit standard deviation.
scaledData = scalerModel.transform(df)
scaledData.show()

+---+--------------+--------------------+
| id|      features|      scaledFeatures|
+---+--------------+--------------------+
|  0|[1.0,0.1,-1.0]|[1.0,0.0181568259...|
|  1| [2.0,1.1,1.0]|[2.0,0.1997250857...|
|  2|[3.0,10.1,3.0]|[3.0,1.8338394239...|
+---+--------------+--------------------+



## Binarizer

In [5]:
from pyspark.ml.feature import Binarizer

df = sql.createDataFrame([(0, 0.1),(1, 0.8),(2, 0.2)], ["id", "feature"])
df.toPandas()

Unnamed: 0,id,feature
0,0,0.1
1,1,0.8
2,2,0.2


In [7]:
binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")

df_bin = binarizer.transform(df)

print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
df_bin.show()

Binarizer output with Threshold = 0.500000
+---+-------+-----------------+
| id|feature|binarized_feature|
+---+-------+-----------------+
|  0|    0.1|              0.0|
|  1|    0.8|              1.0|
|  2|    0.2|              0.0|
+---+-------+-----------------+



## Bucketizer

In [23]:
from pyspark.ml.feature import Bucketizer

In [25]:
splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

data = [(-999.9,), (-0.5,), (-0.3,), (0.0,), (0.2,), (999.9,)]
df = sql.createDataFrame(data, ["features"])
df.toPandas()

Unnamed: 0,features
0,-999.9
1,-0.5
2,-0.3
3,0.0
4,0.2
5,999.9


In [27]:
bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

# Transform original data into its bucket index.
bucketedData = bucketizer.transform(df)

print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits())-1))
bucketedData.show()

Bucketizer output with 4 buckets
+--------+----------------+
|features|bucketedFeatures|
+--------+----------------+
|  -999.9|             0.0|
|    -0.5|             1.0|
|    -0.3|             1.0|
|     0.0|             2.0|
|     0.2|             2.0|
|   999.9|             3.0|
+--------+----------------+



In [28]:
spark.stop()

## Credits & Links

http://spark.apache.org/docs/2.2.0/ml-features.html