In [4]:
import pyspark
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.sql import SparkSession

In [3]:
# Reuse the active SparkSession (or start one) so this cell also works on a
# fresh kernel that does not pre-define `spark`.
spark = SparkSession.builder.getOrCreate()

# Toy data: a single categorical column containing 3 'a', 2 'c' and 1 'b'.
df = spark.createDataFrame([['a'],['b'],['c'],['a'],['a'],['c']],['cate_col'])
df.show()

+--------+
|cate_col|
+--------+
|       a|
|       b|
|       c|
|       a|
|       a|
|       c|
+--------+



### StringIndexer

- Encodes a categorical (string) column as numeric indices (0, 1, 2, ...)
- Indices are assigned by descending frequency of the value: the most frequent value gets index 0
    (e.g. there are 3 'a's, 2 'c's and 1 'b', so 'a' is labelled 0, 'c' is labelled 1 and 'b' is labelled 2)

In [5]:
# Configure the indexer: read strings from 'cate_col', write indices to 'cate_col_idx'.
stridx = StringIndexer(
    inputCol='cate_col',
    outputCol='cate_col_idx',
)

# Fitting scans `df` to learn the label -> index mapping.
stridxModel = stridx.fit(df)

# Applying the fitted model appends the index column; `df` itself is left unchanged.
df_idx = stridxModel.transform(df)

In [7]:
# Print the indexed frame (these are show()'s defaults: first 20 rows, long cells truncated).
df_idx.show(n=20, truncate=True)

+--------+------------+
|cate_col|cate_col_idx|
+--------+------------+
|       a|         0.0|
|       b|         2.0|
|       c|         1.0|
|       a|         0.0|
|       a|         0.0|
|       c|         1.0|
+--------+------------+



### OneHotEncoder
Maps a column of label indices to a column of binary vectors (dummy variables in vector form).

By default (`dropLast=True`) the last label gets no position of its own: it is encoded as the all-zero vector.

    0 -> [1,0]

    1 -> [0,1]

    2 -> [0,0]    (last label: all-zero vector)

##### Sparse Vector: 
`( length-of-vector, positions-of-non-zeros, values )`

`(5, [0,3], [1,9])` means a vector of length 5 with the values `1, 9` at positions `0, 3` respectively: `[1,0,0,9,0]`

In [8]:
# One-hot encode the index column produced by StringIndexer.
OHE = OneHotEncoder(inputCol='cate_col_idx', outputCol='cate_col_encoded')

# In Spark 2.x OneHotEncoder is a Transformer (no `fit`); in Spark 3.x it is an
# Estimator that must be fitted first. Fit when possible, otherwise use it as-is,
# so the cell runs on both versions.
OHE_model = OHE.fit(df_idx) if hasattr(OHE, 'fit') else OHE
df_encoded = OHE_model.transform(df_idx)

In [9]:
# Print the encoded frame (these are show()'s defaults: first 20 rows, long cells truncated).
df_encoded.show(n=20, truncate=True)

+--------+------------+----------------+
|cate_col|cate_col_idx|cate_col_encoded|
+--------+------------+----------------+
|       a|         0.0|   (2,[0],[1.0])|
|       b|         2.0|       (2,[],[])|
|       c|         1.0|   (2,[1],[1.0])|
|       a|         0.0|   (2,[0],[1.0])|
|       a|         0.0|   (2,[0],[1.0])|
|       c|         1.0|   (2,[1],[1.0])|
+--------+------------+----------------+

