In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, Tokenizer

In [2]:
# Entry points for the rest of the notebook.
# NOTE: despite its name, `spark` is a SparkContext (not a SparkSession);
# the name is kept because later cells reference it.
#
# getOrCreate() reuses the already-running context when this cell is executed
# again; calling the SparkContext constructor a second time would fail because
# only one SparkContext may be active per process. The configuration (local
# master, app name "sqlContext") is only applied when a new context is created.
spark = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("sqlContext")
)
sql = SQLContext(spark)

#### Load data

In [4]:
# Training set: each row is (label, dense feature vector with three entries).
training_rows = [
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5])),
]
df = sql.createDataFrame(training_rows, ["label", "features"])

# Last expression of the cell: render the (tiny) frame as a pandas table.
df.toPandas()

Unnamed: 0,label,features
0,1.0,"[0.0, 1.1, 0.1]"
1,0.0,"[2.0, 1.0, -1.0]"
2,0.0,"[2.0, 1.3, 1.0]"
3,1.0,"[0.0, 1.2, -0.5]"


#### Estimator : Logistic regression

In [5]:
# Estimator configured with at most 10 iterations and regParam = 0.01.
lr = LogisticRegression(regParam=0.01, maxIter=10)

# explainParams() returns a text description of the estimator's params;
# print it between the same header and trailing newline as before.
param_docs = lr.explainParams()
print("LogisticRegression parameters:\n" + param_docs + "\n")

# Fitting the estimator on the training frame yields the model `lr_fit`.
lr_fit = lr.fit(df)

LogisticRegression parameters:
aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The bou

Since lr_fit is a Model (i.e., a Transformer produced by an Estimator), we can view the parameters it used during fit().

In [7]:
# Show the param map of the fitted model `lr_fit`; as the last expression of
# the cell, the returned dict of Param -> value is rendered as the cell output.
lr_fit.extractParamMap()

{Param(parent='LogisticRegression_7b7d83cb630b', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 2,
 Param(parent='LogisticRegression_7b7d83cb630b', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0,
 Param(parent='LogisticRegression_7b7d83cb630b', name='featuresCol', doc='features column name.'): 'features',
 Param(parent='LogisticRegression_7b7d83cb630b', name='fitIntercept', doc='whether to fit an intercept term.'): True,
 Param(parent='LogisticRegression_7b7d83cb630b', name='labelCol', doc='label column name.'): 'label',
 Param(parent='LogisticRegression_7b7d83cb630b', name='predictionCol', doc='prediction column name.'): 'prediction',
 Param(parent='LogisticRegression_7b7d83cb630b', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These

Alternatively, we may specify parameters by passing a Python dictionary as a paramMap.

In [8]:
# A paramMap is a plain dict keyed by the estimator's Param objects.
paramMap = {lr.maxIter: 20}

# Specify 1 Param, overwriting the original maxIter.
paramMap[lr.maxIter] = 30

# Specify multiple Params.
paramMap.update(
    {
        lr.regParam: 0.1,
        lr.threshold: 0.55,
    }
)

You can combine paramMaps, since they are ordinary Python dictionaries.

In [11]:
# Second map: change the name of the output probability column.
paramMap2 = {lr.probabilityCol: "myProbability"}

# Merge into a fresh dict so that `paramMap` itself is left untouched;
# entries from paramMap2 win on any key present in both.
paramMapCombined = dict(paramMap)
paramMapCombined.update(paramMap2)

Learn a new model using the paramMapCombined parameters. paramMapCombined overrides all parameters set earlier via lr.set* methods.

In [12]:
# Fit a second model, passing the merged param map alongside the data.
lr_fit2 = lr.fit(df, params=paramMapCombined)

# Header line followed by the param map on its own line.
print("Model 2 was fit using parameters: ", lr_fit2.extractParamMap(), sep="\n")

Model 2 was fit using parameters: 
{Param(parent='LogisticRegression_7b7d83cb630b', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 2, Param(parent='LogisticRegression_7b7d83cb630b', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0, Param(parent='LogisticRegression_7b7d83cb630b', name='featuresCol', doc='features column name.'): 'features', Param(parent='LogisticRegression_7b7d83cb630b', name='fitIntercept', doc='whether to fit an intercept term.'): True, Param(parent='LogisticRegression_7b7d83cb630b', name='labelCol', doc='label column name.'): 'label', Param(parent='LogisticRegression_7b7d83cb630b', name='predictionCol', doc='prediction column name.'): 'prediction', Param(parent='LogisticRegression_7b7d83cb630b', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated

#### Test

In [18]:
# Held-out rows, in the same (label, features) layout as the training frame.
test_rows = [
    (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
    (0.0, Vectors.dense([3.0, 2.0, -0.1])),
    (1.0, Vectors.dense([0.0, 2.2, -1.5])),
]
test = sql.createDataFrame(test_rows, ["label", "features"])

# Score the test rows with the second model; its probability column is
# selected under the name "myProbability".
pred = lr_fit2.transform(test)
res = (
    pred
    .select("features", "label", "myProbability", "prediction")
    .collect()
)

# Tab-separated report: one header line, then one line per collected row.
print('features \tlabel \tmyProb                                      \tpred')
for record in res:
    print(
        record.features,
        record.label,
        record.myProbability,
        record.prediction,
        sep=' \t ',
    )

features 	label 	myProb                                      	pred
[-1.0,1.5,1.3] 	 1.0 	 [0.057073041710340625,0.9429269582896593] 	 1.0
[3.0,2.0,-0.1] 	 0.0 	 [0.9238522311704118,0.07614776882958811] 	 0.0
[0.0,2.2,-1.5] 	 1.0 	 [0.10972776114779748,0.8902722388522026] 	 1.0


In [9]:
# Shut down the SparkContext bound to `spark`. Cells that use `spark` or `sql`
# need the context to be created again before they are re-run.
spark.stop()

## Credits & Links

http://spark.apache.org/docs/2.2.0/ml-pipeline.html