In [1]:
# Import SparkSession explicitly so the notebook also runs on a plain Python
# kernel, not only inside a PySpark shell that may pre-populate the name.
from pyspark.sql import SparkSession

# Bare expression: displays the class to confirm it resolved.
SparkSession

pyspark.sql.session.SparkSession

In [2]:
# Get the active Spark session, or create one, using the "local" master and
# the application name "Word Count".
spark = (
    SparkSession.builder
    .master("local")
    .appName("Word Count")
    .getOrCreate()
)

In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

In [4]:
# Labeled training set: one (id, text, label) tuple per document.
# In this toy data, every text containing "spark" carries label 1.0 and
# every other text carries label 0.0.
training = spark.createDataFrame(
    [
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0),
    ],
    schema=["id", "text", "label"],
)

In [5]:
# Print the column names and types of the training DataFrame as a sanity check.
training.printSchema()

root
 |-- id: long (nullable = true)
 |-- text: string (nullable = true)
 |-- label: double (nullable = true)



In [6]:
# Stage 1: read the "text" column and write the result to "words".
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)

# Stage 2: read whatever column the tokenizer writes and emit "features".
hashingtf = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol="features",
)

# Stage 3: logistic regression estimator configured with maxIter=10 and
# regParam=0.001.
lr = LogisticRegression(
    maxIter=10,
    regParam=0.001,
)

In [7]:
# Chain the three stages in order: tokenizer -> hashingtf -> lr.
LRpipeline = Pipeline(
    stages=[
        tokenizer,
        hashingtf,
        lr,
    ]
)

In [8]:
# Fit the whole pipeline on the labeled training DataFrame; the fitted result
# is what later cells use to transform new documents.
model = LRpipeline.fit(training)

In [9]:
    # Unlabeled documents to score: (id, text) tuples with no label column.
    pretest = spark.createDataFrame(
        [
            (4, "spark i j k"),
            (5, "l m n"),
            (6, "spark hadoop spark"),
            (7, "apache hadoop"),
        ],
        schema=["id", "text"],
    )

In [10]:
# Score the unlabeled documents with the fitted pipeline. The DataFrame built
# in the previous cell is named `pretest`; the original reference to `test`
# was never defined and raises NameError on a fresh kernel.
prediction = model.transform(pretest)

In [11]:
# Keep only the columns needed for reporting.
selected = prediction.select(
    "id",
    "text",
    "probability",
    "prediction",
)

In [12]:
# Print one line per scored document.
# The unpacked label is named `predicted_label` rather than `prediction` so the
# loop does not overwrite the `prediction` DataFrame with a float, which would
# break re-running the earlier `prediction.select(...)` cell.
# Format specifiers: %d int rid, %s str text, %s str(prob), %f float label.
for row in selected.collect():
    rid, text, prob, predicted_label = row
    print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), predicted_label))

(4, spark i j k) --> prob=[0.159640773879,0.840359226121], prediction=1.000000
(5, l m n) --> prob=[0.837832568548,0.162167431452], prediction=0.000000
(6, spark hadoop spark) --> prob=[0.0692663313298,0.93073366867], prediction=1.000000
(7, apache hadoop) --> prob=[0.982157533344,0.0178424666556], prediction=0.000000
