In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.linalg import Vector
from pyspark.sql import Row
from pyspark.sql.types import *

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
2,application_1481496952186_0007,pyspark,idle,Link,Link,✔


SparkContext available as 'sc'.
HiveContext available as 'sqlContext'.


In [2]:
trainingrdd = sc.textFile("wasb:///training-tweets.csv").zipWithIndex().filter(lambda line: line[1] > 0).map(lambda line: line[0].split(","))
testrdd = sc.textFile("wasb:///test-tweets.csv").zipWithIndex().filter(lambda line: line[1] > 0).map(lambda line: line[0].split(","))
fields = [StructField("id", StringType(), True), StructField("label", StringType(), True), StructField("source", StringType(), True), StructField("text", StringType(), True)]
schema = StructType(fields)
training = sqlContext.createDataFrame(trainingrdd, schema)
test = sqlContext.createDataFrame(testrdd, schema)

In [3]:
training = training.withColumn("label", training.label.cast(DoubleType()))
test = training.withColumn("label", training.label.cast(DoubleType()))
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="features")
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

In [4]:
model = pipeline.fit(training)

In [5]:
prediction = model.transform(test)
selected = prediction.select("id", "label", "text", "probability", "prediction")
selected.show(500)

+----+-----+--------------------+--------------------+----------+
|  id|label|                text|         probability|prediction|
+----+-----+--------------------+--------------------+----------+
| 942|  0.0|jam is awful #hateit|[0.89738698426540...|       0.0|
|1794|  0.0|cheese is terribl...|[0.89934123345215...|       0.0|
|1796|  0.0|this book is real...|[0.93256977705433...|       0.0|
| 271|  0.0|I loathe rock mus...|[0.89056217998557...|       0.0|
| 860|  1.0|this team is fant...|[0.10468963005486...|       1.0|
|1764|  0.0|skiing is awful #...|[0.90084827103736...|       0.0|
|1017|  1.0|this game is grea...|[0.10082277360322...|       1.0|
|1421|  1.0|rock music is fan...|[0.09478252531610...|       1.0|
|1105|  0.0|that movie is awf...|[0.89593414397013...|       0.0|
| 263|  0.0|We loathe this ga...|[0.89690447685286...|       0.0|
|1039|  1.0|this team is fant...|[0.10503006239280...|       1.0|
|1032|  1.0|this band is grea...|[0.11008763080233...|       1.0|
| 134|  0.