In [1]:
import findspark

# Must run before any `pyspark` import: points findspark at the Spark 2
# installation shipped in the Cloudera SPARK2 parcel.
findspark.init(spark_home='/opt/cloudera/parcels/SPARK2/lib/spark2/')

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SparkSession

In [3]:
# Start a Spark session named 'emre', or reuse the one already running.
spark = (
    SparkSession.builder
    .appName('emre')
    .getOrCreate()
)

In [5]:
# Echo the session object as the cell output to confirm it was created.
spark

In [6]:
# Toy labelled corpus of space-separated letters: every row whose text starts
# with "s" is labelled 1.0, every row starting with "h" is labelled 0.0.
training = spark.createDataFrame(
    data=[
        (0, "s p a r k", 1.0),
        (1, "h a d o o p", 0.0),
        (2, "s p a ", 1.0),
        (3, "h a d", 0.0),
        (4, "h p a", 0.0),
    ],
    schema=["id", "text", "label"],
)

In [7]:
# Stage 1: split the "text" column into a "words" token column.
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Stage 2: hash the tokens produced by stage 1 into a "features" vector.
hashingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol="features",
)

# Stage 3: regularised logistic regression; its input/label columns are left
# at their defaults.
lr = LogisticRegression(maxIter=10, regParam=0.01)

# Chain the three stages so they are fitted and applied as a single unit.
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

In [9]:
# Fit every pipeline stage on the labelled training frame.
model = pipeline.fit(dataset=training)

In [21]:
# Unlabelled rows to score with the fitted model. Every row carries its own
# unique id so each prediction can be traced back to its input text.
test = spark.createDataFrame(
    [
        (5, "s r k"),
        (6, "h d p"),
        (7, "s p"),
        (8, "h o"),
        (9, "p a"),
        (10, "h"),
        (11, "b a"),
        (12, "a"),
        (13, "k"),
    ],
    ["id", "text"],
)

In [22]:
# Run the unlabelled rows through the fitted pipeline; the result keeps the
# input columns and adds the intermediate and prediction columns.
prediction = model.transform(test)

# Print up to 20 rows, truncating long cell values (the defaults, made explicit).
prediction.show(n=20, truncate=True)

+---+-----+---------+--------------------+--------------------+--------------------+----------+
| id| text|    words|            features|       rawPrediction|         probability|prediction|
+---+-----+---------+--------------------+--------------------+--------------------+----------+
|  5|s r k|[s, r, k]|(262144,[94533,18...|[-4.1608207512036...|[0.01535529127490...|       1.0|
|  6|h d p|[h, d, p]|(262144,[15554,27...|[4.25643402140950...|[0.98602530771388...|       0.0|
|  7|  s p|   [s, p]|(262144,[94533,21...|[-3.1757954008034...|[0.04008681384656...|       1.0|
|  8|  h o|   [h, o]|(262144,[15554,25...|[4.25344479429933...|[0.98598405805862...|       0.0|
|  9|  p a|   [p, a]|(262144,[213268,2...|[0.02378038883240...|[0.50594481705782...|       0.0|
| 10|    h|      [h]|(262144,[15554],[...|[3.91481408071616...|[0.98044573887328...|       0.0|
| 11|  b a|   [b, a]|(262144,[30913,22...|[0.62478957358150...|[0.65130707705009...|       0.0|
| 11|    a|      [a]|(262144,[227410],..