In [1]:
from pyspark.sql import SparkSession

spark = SparkSession\
        .builder\
        .appName("Tutorial2_DataFrames")\
        .getOrCreate()

22/11/17 20:15:55 WARN Utils: Your hostname, bigdata-vmware resolves to a loopback address: 127.0.1.1; using 192.168.10.135 instead (on interface ens33)
22/11/17 20:15:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/17 20:15:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


A vector can be represented in dense and sparse formats.

In [5]:

# A dense vector is a regular vector that has each elements printed.

from pyspark.ml.linalg import Vectors, DenseVector

dv = DenseVector([1.0,0.,0.,0.,4.5,0])

# Alternative
#dv = Vectors.dense([1.0,0.,0.,0.,4.5,0])

dv

DenseVector([1.0, 0.0, 0.0, 0.0, 4.5, 0.0])

In [7]:
# A sparse vector is a vector where most elements are 0.

from pyspark.ml.linalg import Vectors, SparseVector

#v = Vectors.sparse(6, {0:1.0, 4:4.5}) 

# The first argument is the vector size, the second argument is a dictionary. The keys are indices of active elements and the values are values of active elements.

# In the example above, all the elements are 0 except the elements with indices 0 and 4, which have values 1.0 and 4.5, respectively.

#Alternative
v = Vectors.sparse(6, [(0, 1.0), (4, 4.5)])

v

SparseVector(6, {0: 1.0, 4: 4.5})

Basic statistics

In [2]:
# Correlation computes the correlation matrix for the input Dataset of Vectors using the specified method. The output will be a DataFrame that contains the correlation matrix of the column of vectors.

from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation

data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
        (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
        (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
        (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
        
df = spark.createDataFrame(data, ["features"])

df.show()

r1 = Correlation.corr(df, "features").head()

print("Pearson correlation matrix:\n" + str(r1[0]))

r2 = Correlation.corr(df, "features", "spearman").head()

print("Spearman correlation matrix:\n" + str(r2[0]))

                                                                                

+--------------------+
|            features|
+--------------------+
|(4,[0,3],[1.0,-2.0])|
|   [4.0,5.0,0.0,3.0]|
|   [6.0,7.0,0.0,8.0]|
| (4,[0,3],[9.0,1.0])|
+--------------------+



                                                                                

22/11/17 20:19:19 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.
Pearson correlation matrix:
DenseMatrix([[1.        , 0.05564149,        nan, 0.40047142],
             [0.05564149, 1.        ,        nan, 0.91359586],
             [       nan,        nan, 1.        ,        nan],
             [0.40047142, 0.91359586,        nan, 1.        ]])
22/11/17 20:19:22 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.
Spearman correlation matrix:
DenseMatrix([[1.        , 0.10540926,        nan, 0.4       ],
             [0.10540926, 1.        ,        nan, 0.9486833 ],
             [       nan,        nan, 1.        ,        nan],
             [0.4       , 0.9486833 ,        nan, 1.        ]])


In [34]:
# ChiSquareTest conducts Pearson’s independence test for every feature against the label. For each feature, the (feature, label) pairs are converted into a contingency matrix for which the Chi-squared statistic is computed. All label and feature values must be categorical. 

from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import ChiSquareTest

data = [(0.0, Vectors.dense(0.5, 10.0)),
        (0.0, Vectors.dense(0.5, 20.0)),
        (1.0, Vectors.dense(1.5, 30.0)),
        (0.0, Vectors.dense(0.5, 30.0)),
        (0.0, Vectors.dense(0.5, 40.0)),
        (1.0, Vectors.dense(3.5, 40.0))]

df = spark.createDataFrame(data, ["label", "features"])

#df.show()

r = ChiSquareTest.test(df, "features", "label").head()

print("pValues: " + str(r.pValues))
print("degreesOfFreedom: " + str(r.degreesOfFreedom))
print("statistics: " + str(r.statistics))

# Alternative:
#r = ChiSquareTest.test(df, "features", "label")
#r.show(truncate=False)

#----------------------

# p < 0,05 is the usual test for dependence.
# In this case, we get
# pValues: [0.04978706836786395,0.6822703303362126]
# which means that 
# - the first feature and the label are dependent
# - the second feature and label are independent

# When doing feature selection, we aim to select the features which are highly dependent on the label.


pValues: [0.04978706836786395,0.6822703303362126]
degreesOfFreedom: [2, 3]
statistics: [6.000000000000001,1.5]


In [3]:
#Summarize

from pyspark.ml.stat import Summarizer
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

sc = spark.sparkContext

df = sc.parallelize([Row(weight=1.0, features=Vectors.dense(1.0, 3.0, 1.0)),
                     Row(weight=0.5, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF()

df.show()

# create summarizer for multiple metrics "mean" and "count"
summarizer = Summarizer.metrics("mean", "count", "max")

# compute statistics for multiple metrics with weight
df.select(summarizer.summary(df.features, df.weight)).show(truncate=False)

# compute statistics for multiple metrics without weight
df.select(summarizer.summary(df.features)).show(truncate=False)

# compute statistics for single metric "mean" with weight
df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False)

# compute statistics for single metric "mean" without weight
df.select(Summarizer.mean(df.features)).show(truncate=False)

#See
# https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.stat.Summarizer.html?highlight=summary
# for other metrics

                                                                                

+------+-------------+
|weight|     features|
+------+-------------+
|   1.0|[1.0,3.0,1.0]|
|   0.5|[1.0,2.0,3.0]|
+------+-------------+



                                                                                

+---------------------------------------------------------------+
|aggregate_metrics(features, weight)                            |
+---------------------------------------------------------------+
|{[1.0,2.6666666666666665,1.6666666666666665], 2, [1.0,3.0,3.0]}|
+---------------------------------------------------------------+

+---------------------------------+
|aggregate_metrics(features, 1.0) |
+---------------------------------+
|{[1.0,2.5,2.0], 2, [1.0,3.0,3.0]}|
+---------------------------------+

+-------------------------------------------+
|mean(features)                             |
+-------------------------------------------+
|[1.0,2.6666666666666665,1.6666666666666665]|
+-------------------------------------------+

+--------------+
|mean(features)|
+--------------+
|[1.0,2.5,2.0] |
+--------------+



Fitting a model

In [20]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

# Prepare training data from a list of (label, features) tuples.
training = spark.createDataFrame([
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)

# Print out the parameters, documentation, and any default values.
#print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

# Learn a LogisticRegression model. This uses the parameters stored in lr.
model = lr.fit(training)
#model

# Prepare test data
test = spark.createDataFrame([
    (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
    (0.0, Vectors.dense([3.0, 2.0, -0.1])),
    (1.0, Vectors.dense([0.0, 2.2, -1.5]))], ["label", "features"])

# Make predictions on test data using the Transformer.transform() method.
# LogisticRegression.transform will only use the 'features' column.

prediction = model.transform(test) #prediction is a DataFrame


prediction.show(truncate=False)

#Alternative
#result = prediction.select("features", "label", "probability", "prediction").collect()
#for row in result:
#    print("features=%s, label=%s -> prob=%s, prediction=%s"
#          % (row.features, row.label, row.probability, row.prediction))

+-----+--------------+--------------------------------------+------------------------------------------+----------+
|label|features      |rawPrediction                         |probability                               |prediction|
+-----+--------------+--------------------------------------+------------------------------------------+----------+
|1.0  |[-1.0,1.5,1.3]|[-6.24352818212715,6.24352818212715]  |[0.0019392203169556112,0.9980607796830444]|1.0       |
|0.0  |[3.0,2.0,-0.1]|[5.452313886389296,-5.452313886389296]|[0.995731919571047,0.004268080428952992]  |0.0       |
|1.0  |[0.0,2.2,-1.5]|[-4.410385582866056,4.410385582866056]|[0.012004630236370896,0.9879953697636291] |1.0       |
+-----+--------------+--------------------------------------+------------------------------------------+----------+



Pipelines

In [64]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Prepare training documents from a list of (id, text, label) tuples.
training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0)
], ["id", "text", "label"])

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001, )
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training documents.
model = pipeline.fit(training)

# Prepare test documents, which are unlabeled (id, text) tuples.
test = spark.createDataFrame([
    (4, "spark i j k", 1.0),
    (5, "l m n", 0.0),
    (6, "spark hadoop spark", 1.0),
    (7, "apache hadoop", 0.0)
], ["id", "text", "label"])

# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
selected.show()

#for row in selected.collect():
#    rid, text, prob, prediction = row
#    print(
#        "(%d, %s) --> prob=%s, prediction=%f" % (
#            rid, text, str(prob), prediction
#        )
#    )

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(prediction)

print(accuracy)

+---+------------------+--------------------+----------+
| id|              text|         probability|prediction|
+---+------------------+--------------------+----------+
|  4|       spark i j k|[0.62920984896684...|       0.0|
|  5|             l m n|[0.98477000676230...|       0.0|
|  6|spark hadoop spark|[0.13412348342566...|       1.0|
|  7|     apache hadoop|[0.99557321143985...|       0.0|
+---+------------------+--------------------+----------+

0.7333333333333334


In [59]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.sql.functions import *


# Prepare training documents from a list of (id, text, label) tuples.
training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0)
], ["id", "text", "label"])

tokenizer = Tokenizer(inputCol="text", outputCol="words")
tokenized = tokenizer.transform(training)
#tokenized.show()

hashingTF = HashingTF(inputCol="words", outputCol="features")
featureVectors = hashingTF.transform(tokenized)
#featureVectors.show() 

lr = LogisticRegression(
                labelCol="label",
                featuresCol="features",
                maxIter=10, 
                regParam=0.001)

# Fit the pipeline to training documents.
model = lr.fit(featureVectors)

# Prepare test documents, which are unlabeled (id, text) tuples.
test = spark.createDataFrame([
    (4, "spark i j k", 1.0),
    (5, "l m n", 0.0),
    (6, "spark hadoop spark", 1.0),
    (7, "apache hadoop", 0.0)
], ["id", "text", "label"])

tokenizer_test = Tokenizer(inputCol="text", outputCol="words")
tokenized_test = tokenizer.transform(test)

hashingTF_test = HashingTF(inputCol="words", outputCol="features")
featureVectors_test = hashingTF.transform(tokenized_test)

# Make predictions on test documents and print columns of interest.
prediction = model.transform(featureVectors)

selected = prediction.select("id", "text", "probability", "prediction")
selected.show(truncate=False)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(prediction)

print(accuracy)

                                                                                

+---+----------------+------------------------------------------+----------+
|id |text            |probability                               |prediction|
+---+----------------+------------------------------------------+----------+
|0  |a b c d e spark |[0.0026282134969420287,0.997371786503058] |1.0       |
|1  |b d             |[0.9963902711801113,0.0036097288198887467]|0.0       |
|2  |spark f g h     |[0.0022081050570269918,0.997791894942973] |1.0       |
|3  |hadoop mapreduce|[0.9987232337063715,0.0012767662936284951]|0.0       |
+---+----------------+------------------------------------------+----------+

1.0


Feature extraction

In [9]:
from pyspark.ml.feature import FeatureHasher
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

dataset = spark.createDataFrame([
    (1.0, 2.2, True, "1", "foo"),
    (0.0, 3.3, False, "2", "bar"),
    (0.0, 4.4, False, "3", "baz"),
    (1.0, 5.5, False, "4", "foo")
], ["label", "real", "bool", "stringNum", "string"])

hasher = FeatureHasher(inputCols=["real", "bool", "stringNum", "string"],
                       outputCol="features")

featurized = hasher.transform(dataset)
featurized.show(truncate=False)

lr = LogisticRegression(
                labelCol="label",
                featuresCol="features",
                maxIter=10, 
                regParam=0.001)

model = lr.fit(featurized)


test = spark.createDataFrame([
    (1.0, 2.2, True, "2", "foo"),
    (1.0, 3.3, False, "2", "foo"),
    (0.0, 4.4, False, "3", "baz"),
    (1.0, 5.5, False, "4", "foo")
], ["label", "real", "bool", "stringNum", "string"])

featurized_test = hasher.transform(test)

# Make predictions on test documents and print columns of interest.
prediction = model.transform(featurized_test)

selected = prediction.select("label", "real", "bool", "stringNum", "string", "probability", "prediction")
selected.show(truncate=False)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(prediction)

print(accuracy)

+-----+----+-----+---------+------+--------------------------------------------------------+
|label|real|bool |stringNum|string|features                                                |
+-----+----+-----+---------+------+--------------------------------------------------------+
|1.0  |2.2 |true |1        |foo   |(262144,[174475,247670,257907,262126],[2.2,1.0,1.0,1.0])|
|0.0  |3.3 |false|2        |bar   |(262144,[70644,89673,173866,174475],[1.0,1.0,1.0,3.3])  |
|0.0  |4.4 |false|3        |baz   |(262144,[22406,70644,174475,187923],[1.0,1.0,4.4,1.0])  |
|1.0  |5.5 |false|4        |foo   |(262144,[70644,101499,174475,257907],[1.0,1.0,5.5,1.0]) |
+-----+----+-----+---------+------+--------------------------------------------------------+



                                                                                

+-----+----+-----+---------+------+------------------------------------------+----------+
|label|real|bool |stringNum|string|probability                               |prediction|
+-----+----+-----+---------+------+------------------------------------------+----------+
|1.0  |2.2 |true |2        |foo   |[0.07135878673268206,0.928641213267318]   |1.0       |
|1.0  |3.3 |false|2        |foo   |[0.5698424077277565,0.4301575922722435]   |0.0       |
|0.0  |4.4 |false|3        |baz   |[0.997587266615691,0.0024127333843090293] |0.0       |
|1.0  |5.5 |false|4        |foo   |[0.0031188832293638107,0.9968811167706362]|1.0       |
+-----+----+-----+---------+------+------------------------------------------+----------+

0.75


In [4]:
# Binarization is the process of thresholding numerical features to binary (0/1) features.

from pyspark.ml.feature import Binarizer

continuousDataFrame = spark.createDataFrame([
    (0, 0.1),
    (1, 0.8),
    (2, 0.2)
], ["id", "feature"])
continuousDataFrame.show()

binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")

binarizedDataFrame = binarizer.transform(continuousDataFrame)

print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
binarizedDataFrame.show()

+---+-------+
| id|feature|
+---+-------+
|  0|    0.1|
|  1|    0.8|
|  2|    0.2|
+---+-------+

Binarizer output with Threshold = 0.500000
+---+-------+-----------------+
| id|feature|binarized_feature|
+---+-------+-----------------+
|  0|    0.1|              0.0|
|  1|    0.8|              1.0|
|  2|    0.2|              0.0|
+---+-------+-----------------+



In [6]:
# One hot encoding

from pyspark.ml.feature import OneHotEncoder

df = spark.createDataFrame([
    (0.0, 1.0),
    (1.0, 0.0),
    (2.0, 1.0),
    (0.0, 2.0),
    (0.0, 1.0),
    (2.0, 0.0)
], ["categoryIndex1", "categoryIndex2"])

encoder = OneHotEncoder(inputCols=["categoryIndex1", "categoryIndex2"],
                        outputCols=["categoryVec1", "categoryVec2"])
model = encoder.fit(df)
encoded = model.transform(df)
encoded.show()

                                                                                

+--------------+--------------+-------------+-------------+
|categoryIndex1|categoryIndex2| categoryVec1| categoryVec2|
+--------------+--------------+-------------+-------------+
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           1.0|           0.0|(2,[1],[1.0])|(2,[0],[1.0])|
|           2.0|           1.0|    (2,[],[])|(2,[1],[1.0])|
|           0.0|           2.0|(2,[0],[1.0])|    (2,[],[])|
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           2.0|           0.0|    (2,[],[])|(2,[0],[1.0])|
+--------------+--------------+-------------+-------------+



In [31]:
# StandardScaler: Standardizes features by removing the mean and scaling to unit variance using column summary statistics on the samples in the training set.

from pyspark.ml.feature import StandardScaler

df = spark.createDataFrame([
    (0, Vectors.dense([1.0, 0.5, -1.0]),),
    (1, Vectors.dense([2.0, 1.0, 1.0]),),
    (2, Vectors.dense([4.0, 10.0, 2.0]),)
], ["id", "features"])

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)

# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(df)

# Normalize each feature to have unit standard deviation.
scaledData = scalerModel.transform(df)
scaledData.show(truncate=False)

+---+--------------+------------------------------------------------------------+
|id |features      |scaledFeatures                                              |
+---+--------------+------------------------------------------------------------+
|0  |[1.0,0.5,-1.0]|[0.6546536707079771,0.09352195295828246,-0.6546536707079772]|
|1  |[2.0,1.0,1.0] |[1.3093073414159542,0.18704390591656492,0.6546536707079772] |
|2  |[4.0,10.0,2.0]|[2.6186146828319083,1.8704390591656492,1.3093073414159544]  |
+---+--------------+------------------------------------------------------------+



In [28]:
# RobustScaler: transforms a dataset of Vector rows, removing the median and scaling the data according to a specific quantile range (by default the IQR: Interquartile Range, quantile range between the 1st quartile and the 3rd quartile). Its behavior is quite similar to StandardScaler, however the median and the quantile range are used instead of mean and standard deviation, which make it robust to outliers.

from pyspark.ml.feature import RobustScaler

df = spark.createDataFrame([
    (0, Vectors.dense([1.0, 0.5, -1.0]),),
    (1, Vectors.dense([2.0, 1.0, 1.0]),),
    (2, Vectors.dense([4.0, 10.0, 2.0]),)
], ["id", "features"])

scaler = RobustScaler(inputCol="features", outputCol="scaledFeatures")

# Compute summary statistics by fitting the RobustScaler
scalerModel = scaler.fit(df)

# Transform each feature to have unit quantile range.
scaledData = scalerModel.transform(df)
scaledData.show(truncate=False)

#See also MinMaxScaler and other normalizers

+---+--------------+------------------------------------------------------------+
|id |features      |scaledFeatures                                              |
+---+--------------+------------------------------------------------------------+
|0  |[1.0,0.5,-1.0]|[0.3333333333333333,0.05263157894736842,-0.3333333333333333]|
|1  |[2.0,1.0,1.0] |[0.6666666666666666,0.10526315789473684,0.3333333333333333] |
|2  |[4.0,10.0,2.0]|[1.3333333333333333,1.0526315789473684,0.6666666666666666]  |
+---+--------------+------------------------------------------------------------+



In [36]:
# Bucketizer transforms a column of continuous features to a column of feature buckets, where the buckets are specified by users.

from pyspark.ml.feature import Bucketizer

splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

data = [(-999.9,), (-0.5,), (-0.3,), (0.0,), (0.2,), (999.9,)]

df = spark.createDataFrame(data, ["features"])
df.printSchema()

bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

# Transform original data into its bucket index.
bucketedData = bucketizer.transform(df)

print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits()) - 1))
bucketedData.show()

# See also QuantileDiscretizer

root
 |-- features: double (nullable = true)

Bucketizer output with 4 buckets
+--------+----------------+
|features|bucketedFeatures|
+--------+----------------+
|  -999.9|             0.0|
|    -0.5|             1.0|
|    -0.3|             1.0|
|     0.0|             2.0|
|     0.2|             2.0|
|   999.9|             3.0|
+--------+----------------+



In [None]:
# VectorAssembler is a transformer that combines a given list of columns into a single vector column. It is useful for combining raw features and features generated by different feature transformers into a single feature vector, in order to train ML models like logistic regression and decision trees. VectorAssembler accepts the following input column types: all numeric types, boolean type, and vector type. In each row, the values of the input columns will be concatenated into a vector in the specified order.

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

df = spark.createDataFrame(
    [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
    ["id", "hour", "mobile", "userFeatures", "clicked"])

assembler = VectorAssembler(
    inputCols=["hour", "mobile", "userFeatures"],
    outputCol="features")

output = assembler.transform(df)
print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
output.select("features", "clicked").show(truncate=False)

In [3]:
# PCA: projects vectors to a lower dimensional space of the top k principal components

from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
df = spark.createDataFrame(data, ["features"])

pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(df)

result = model.transform(df).select("pcaFeatures")
result.show(truncate=False)

                                                                                

22/11/16 19:13:54 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
22/11/16 19:13:54 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK
+------------------------------------------------------------+
|pcaFeatures                                                 |
+------------------------------------------------------------+
|[1.6485728230883814,-4.0132827005162985,-1.0091435193998504]|
|[-4.645104331781533,-1.1167972663619048,-1.0091435193998504]|
|[-6.428880535676488,-5.337951427775359,-1.0091435193998508] |
+------------------------------------------------------------+



In [5]:
# Perform feature expansion in a polynomial space. Take a 2-variable feature vector as an example: (x, y), if we want to expand it with degree 2, then we get (x, x * x, y, x * y, y * y).

from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([
    (Vectors.dense([2.0, 1.0]),),
    (Vectors.dense([0.0, 0.0]),),
    (Vectors.dense([3.0, -1.0]),)
], ["features"])

polyExpansion = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
polyDF = polyExpansion.transform(df)

polyDF.show(truncate=False)

+----------+------------------------------------------+
|features  |polyFeatures                              |
+----------+------------------------------------------+
|[2.0,1.0] |[2.0,4.0,8.0,1.0,2.0,4.0,1.0,2.0,1.0]     |
|[0.0,0.0] |[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]     |
|[3.0,-1.0]|[3.0,9.0,27.0,-1.0,-3.0,-9.0,1.0,3.0,-1.0]|
+----------+------------------------------------------+



Feature selectors

In [38]:
# VectorSlicer is a transformer that takes a feature vector and outputs a new feature vector with a sub-array of the original features. It is useful for extracting features from a vector column.

from pyspark.ml.feature import VectorSlicer
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import Row

df = spark.createDataFrame([
    Row(userFeatures=Vectors.sparse(3, {0: -2.0, 1: 2.3})),
    Row(userFeatures=Vectors.dense([-2.0, 2.3, 0.0]))])

slicer = VectorSlicer(inputCol="userFeatures", outputCol="features", indices=[1, 2])

output = slicer.transform(df)

output.select("userFeatures", "features").show()

+--------------------+-------------+
|        userFeatures|     features|
+--------------------+-------------+
|(3,[0,1],[-2.0,2.3])|(2,[0],[2.3])|
|      [-2.0,2.3,0.0]|    [2.3,0.0]|
+--------------------+-------------+



In [41]:
#ChiSqSelector stands for Chi-Squared feature selection. It operates on labeled data with categorical features. ChiSqSelector uses the Chi-Squared test of independence to decide which features to choose. It supports five selection methods: numTopFeatures, percentile, fpr, fdr, fwe:
#- numTopFeatures chooses a fixed number of top features according to a chi-squared test. This is akin to yielding the features with the most predictive power.
#- percentile is similar to numTopFeatures but chooses a fraction of all features instead of a fixed number.
#- fpr chooses all features whose p-values are below a threshold, thus controlling the false positive rate of selection.
#- fdr uses the Benjamini-Hochberg procedure to choose all features whose false discovery rate is below a threshold.
#- fwe chooses all features whose p-values are below a threshold. The threshold is scaled by 1/numFeatures, thus controlling the family-wise error rate of selection. By default, the selection method is numTopFeatures, with the default number of top features set to 50. The user can choose a selection method using setSelectorType.

from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([
    (7, Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0,),
    (8, Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0,),
    (9, Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0,)], ["id", "features", "clicked"])

selector = ChiSqSelector(numTopFeatures=2, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="clicked")

result = selector.fit(df).transform(df)

print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
result.show()

# See also UnivariateFeatureSelector and VarianceThresholdSelector

ChiSqSelector output with top 2 features selected
+---+------------------+-------+----------------+
| id|          features|clicked|selectedFeatures|
+---+------------------+-------+----------------+
|  7|[0.0,0.0,18.0,1.0]|    1.0|      [18.0,1.0]|
|  8|[0.0,1.0,12.0,0.0]|    0.0|      [12.0,0.0]|
|  9|[1.0,0.0,15.0,0.1]|    0.0|      [15.0,0.1]|
+---+------------------+-------+----------------+



In [43]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Load the data stored in LIBSVM format as a DataFrame.
data = spark.read.format("libsvm").load("data/sample_libsvm_data.txt")
data.show(5, truncate=False)
data.printSchema()

22/11/17 00:41:09 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.
+-----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Bibliography:
- https://spark.apache.org/docs/latest/ml-guide.html
- https://spark.apache.org/docs/3.1.3/api/python/ 
- https://sparkbyexamples.com/