In [1]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [2]:
# Build (or reuse) a local Spark session running on one worker thread, with
# Hive support enabled so tables can be queried through spark.sql().
# SparkSession is provided by pyspark.sql.
spark = (
    SparkSession.builder
    .master('local[1]')
    .appName('Sentiment Prediction')
    .enableHiveSupport()
    .getOrCreate()
)

# Raise the log threshold to ERROR to keep Spark's log chatter out of cell output.
spark.sparkContext.setLogLevel('ERROR')

In [3]:
# Pull every row of the sentiment table into a DataFrame.
df = spark.sql(
    'SELECT * FROM sentiment_table'
)
# Last expression of the cell: the row count becomes the cell output.
df.count()

ivysettings.xml file not found in HIVE_HOME or HIVE_CONF_DIR,/etc/hive/conf.dist/ivysettings.xml will be used
                                                                                

6733

In [4]:
# Clean the rows: drop any row that contains a null, then replace the value
# -1 with 0 (presumably folding a -1 sentiment label into 0 — verify against
# the table's label encoding).
df = df.na.drop().na.replace(-1, 0)
# Cast the label column to double and confirm the resulting schema.
df = df.withColumn("sentiment", df["sentiment"].cast('double'))
df.printSchema()

root
 |-- text: string (nullable = true)
 |-- sentiment: double (nullable = true)



In [5]:
# Preview the first 20 rows, with long strings truncated for display.
df.show(n=20, truncate=True)

+--------------------+---------+
|                text|sentiment|
+--------------------+---------+
|  everyone portfolio|      1.0|
|one convinced inv...|      0.0|
|think dependable ...|      1.0|
|bear market nothi...|      1.0|
|posted regulation...|      1.0|
|value way much le...|      0.0|
|could imagine one...|      0.0|
|going trust horri...|      0.0|
|hope love many pe...|      1.0|
|longer trust cent...|      1.0|
|        short invest|      1.0|
|feel sorry anyone...|      0.0|
|pretty stable w w...|      1.0|
|even know people ...|      0.0|
|funny people thin...|      0.0|
|     lose money time|      0.0|
|             buy sol|      1.0|
|good time accumulate|      1.0|
|plain simple paym...|      1.0|
|great way bring m...|      1.0|
+--------------------+---------+
only showing top 20 rows



In [6]:
# 70/30 train/test split. A fixed seed keeps the split, and therefore the
# metrics computed from it, repeatable across kernel restarts.
train, test = df.randomSplit([0.7, 0.3], seed=42)

In [7]:
# Text-classification pipeline: split each text into words, hash the words
# into a term-frequency vector, then fit a logistic regression on the
# "sentiment" label column.
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)
hashingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol="features",
)
lr = LogisticRegression(
    labelCol='sentiment',
    maxIter=10,
    regParam=0.001,
)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit every stage on the training split only.
model = pipeline.fit(train)

                                                                                

In [8]:
# Persist the fitted pipeline. A plain save() raises if the target path
# already exists, so overwrite explicitly to keep this cell re-runnable.
model.write().overwrite().save("models/sentiment_model")

[Stage 34:>                                                         (0 + 1) / 1]                                                                                

In [9]:
# Read the persisted pipeline back from disk; this reloaded copy is the one
# used for scoring.
loaded_model = PipelineModel.load(path="models/sentiment_model")

In [10]:
# Score the test split with the reloaded pipeline, then print the schema to
# see which columns the pipeline stages added.
prediction = loaded_model.transform(test)
prediction.printSchema()

root
 |-- text: string (nullable = true)
 |-- sentiment: double (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [11]:
# Area under the ROC curve for the test predictions. The variable is named
# `evaluator` rather than `eval` so that Python's built-in eval() is not
# shadowed for the rest of the kernel session.
evaluator = BinaryClassificationEvaluator(
    labelCol="sentiment",
    metricName="areaUnderROC",  # the evaluator's default metric, made explicit
)
evaluator.evaluate(prediction)

                                                                                

0.7423746301365189