In [2]:
import os

import findspark

# Locate the Spark installation and make pyspark importable. This must run
# before any `pyspark` import in the cells below.
# Prefer the SPARK_HOME environment variable so the notebook is not tied to
# one machine's directory layout; fall back to the original Windows install
# path when the variable is unset or empty.
findspark.init(os.environ.get("SPARK_HOME") or "C:\\spark")

In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SparkSession

In [16]:
# Obtain the SparkSession for this notebook: getOrCreate() hands back an
# already-running session if there is one, otherwise it starts a new one
# registered under the application name "first_example".
spark = (
    SparkSession.builder
    .appName("first_example")
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

In [5]:
# Labelled training set: each tuple is (id, text, label), where `text` is a
# string of space-separated characters and `label` is 1.0 or 0.0.
training = spark.createDataFrame(
    data=[
        (0, "s p a r k", 1.0),
        (1, "h a d o o p", 0.0),
        (2, "s p a", 1.0),
        (3, "h a d", 0.0),
        (4, "h p a", 0.0),
    ],
    schema=["id", "text", "label"],
)

In [7]:
# Stage 1: turn the "text" column into a "words" column of tokens.
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Stage 2: hash the tokens into the "features" vector column. The input
# column is read from the tokenizer so the two stages stay in sync.
hashingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol="features",
)

# Stage 3: logistic regression capped at 10 iterations with regParam 0.01.
lr = LogisticRegression(maxIter=10, regParam=0.01)

# Chain the three stages, in order, into a single estimator.
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

In [9]:
# Print every LogisticRegression parameter with its description, default and
# (where set) current value.
print("".join(["LogisticRegression parameters: \n", lr.explainParams(), "\n"]))

LogisticRegression parameters: 
aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
maxIter: max number of iterations (>= 0). (default: 100, current: 10)
predictionCol: prediction column name. (default: prediction)
probabilityCol: Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities. (default: probability)
rawPredictionCol: raw pred

In [10]:
# Fit the pipeline on the labelled training DataFrame. The fitted result is
# kept in `model` and is used below to transform the test data.
model = pipeline.fit(training)

In [11]:
# Unlabelled test set: each tuple is (id, text); there is no "label" column.
test = spark.createDataFrame(
    data=[
        (5, "s r k"),
        (6, "h d p"),
        (7, "s p"),
        (8, "h o"),
        (9, "h d p"),
        (10, "p a"),
    ],
    schema=["id", "text"],
)

In [13]:
# Run the fitted pipeline over the test rows; the result keeps the input
# columns and adds the intermediate and prediction columns.
prediction = model.transform(test)
# Print the resulting DataFrame as a text table.
prediction.show()

+---+-----+---------+--------------------+--------------------+--------------------+----------+
| id| text|    words|            features|       rawPrediction|         probability|prediction|
+---+-----+---------+--------------------+--------------------+--------------------+----------+
|  5|s r k|[s, r, k]|(262144,[94533,18...|[-4.1961909513595...|[0.01482957788594...|       1.0|
|  6|h d p|[h, d, p]|(262144,[15554,27...|[4.27395562512471...|[0.98626470012802...|       0.0|
|  7|  s p|   [s, p]|(262144,[94533,21...|[-3.1856083796668...|[0.03971091129109...|       1.0|
|  8|  h o|   [h, o]|(262144,[15554,25...|[4.23296628247586...|[0.98569822062977...|       0.0|
|  9|h d p|[h, d, p]|(262144,[15554,27...|[4.27395562512471...|[0.98626470012802...|       0.0|
| 10|  p a|   [p, a]|(262144,[213268,2...|[0.00540714716231...|[0.50135178349704...|       0.0|
+---+-----+---------+--------------------+--------------------+--------------------+----------+



In [15]:
# Narrow the result to the identifying columns plus the predicted label.
selected = prediction.select("id", "text", "prediction")

# Pull the rows back to the driver and print them one per line.
collected_rows = selected.collect()
for row in collected_rows:
    print(row)

Row(id=5, text='s r k', prediction=1.0)
Row(id=6, text='h d p', prediction=0.0)
Row(id=7, text='s p', prediction=1.0)
Row(id=8, text='h o', prediction=0.0)
Row(id=9, text='h d p', prediction=0.0)
Row(id=10, text='p a', prediction=0.0)
