In [21]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql.functions import length
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

In [22]:
spark = SparkSession.builder.appName('prez').getOrCreate()
df = spark.read.format("csv").option("header", "true").load("President_condensed_data.csv")
df = df.withColumn("length", length(df.value))
df.show()

+--------------------+----------+----------------+------+
|               value|Presidents|Party_Affliation|length|
+--------------------+----------+----------------+------+
|mr chief justice ...|      bush|      Republican| 12107|
|i have many frien...|      bush|      Republican| 21418|
|my fellow citizen...|      bush|      Republican|  4657|
|thank you governo...|      bush|      Republican| 13952|
|mr speaker mr pre...|      bush|      Republican| 26964|
|mr president mr s...|      bush|      Republican| 20750|
|evan thank you so...|      bush|      Republican| 10314|
|in the life of a ...|      bush|      Republican|  8293|
|mr president and ...|      bush|      Republican| 18690|
|tonight i want to...|      bush|      Republican|  6120|
|just 2 hours ago ...|      bush|      Republican|  8467|
|mr president and ...|      bush|      Republican| 22595|
|kuwait is liberat...|      bush|      Republican|  4402|
|mr president and ...|      bush|      Republican| 15197|
|thank you all

In [23]:
review_data = Tokenizer(inputCol="value", outputCol="Words")
reviewed = review_data.transform(df)

stopwordList = ['mr', 'dr', 'mrs', 'ms', 'chief', 'justice', 'thank you', 'president', 'speaker', 'mayor', 'chairman', 'general',
                'secretary', 'vice', 'good morning', 'good evening', '<laughter>', '<the president>', 'senator', '<applause>']
remover = StopWordsRemover(inputCol="Words", outputCol="filtered", stopWords=stopwordList)
newFrame = remover.transform(reviewed)

hashing = HashingTF(inputCol="filtered", outputCol="hashedValues", numFeatures=pow(2,10))
hashed_df = hashing.transform(newFrame)

idf = IDF(inputCol="hashedValues", outputCol="feature")
idfModel = idf.fit(hashed_df)
rescaledData = idfModel.transform(hashed_df)

rescaledData.select("words", "feature").show(truncate=False)
indexer = StringIndexer(inputCol="Party_Affliation", outputCol="label")
indexed = indexer.fit(rescaledData).transform(rescaledData)
indexed.show()

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

+--------------------+----------+----------------+------+--------------------+--------------------+--------------------+--------------------+-----+
|               value|Presidents|Party_Affliation|length|               Words|            filtered|        hashedValues|             feature|label|
+--------------------+----------+----------------+------+--------------------+--------------------+--------------------+--------------------+-----+
|mr chief justice ...|      bush|      Republican| 12107|[mr, chief, justi...|[quayle, mitchell...|(1024,[1,2,4,5,8,...|(1024,[1,2,4,5,8,...|  1.0|
|i have many frien...|      bush|      Republican| 21418|[i, have, many, f...|[i, have, many, f...|(1024,[0,1,2,3,4,...|(1024,[0,1,2,3,4,...|  1.0|
|my fellow citizen...|      bush|      Republican|  4657|[my, fellow, citi...|[my, fellow, citi...|(1024,[2,4,5,14,1...|(1024,[2,4,5,14,1...|  1.0|
|thank you governo...|      bush|      Republican| 13952|[thank, you, gove...|[thank, you, gove...|(1024,[0,1,2,

In [24]:
assembler = VectorAssembler(
    inputCols=["feature", "length"],
    outputCol="features")

cleaned = assembler.transform(indexed)
print("Assembled 'features', 'length' to vector column 'features'")

Assembled 'features', 'length' to vector column 'features'


In [25]:
training, testing = cleaned.randomSplit([0.7, 0.3])

In [26]:
from pyspark.ml.classification import NaiveBayes

# Create a Naive Bayes model and fit training data
#nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
nb = NaiveBayes()

# train the model
model = nb.fit(training)

# select example rows to display.
predictions = model.transform(testing)
predictions.show()

+--------------------+-----------+----------------+------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+----------+
|               value| Presidents|Party_Affliation|length|               Words|            filtered|        hashedValues|             feature|label|            features|       rawPrediction|         probability|prediction|
+--------------------+-----------+----------------+------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+----------+
|by the president ...|    johnson|        Democrat|  8209|[by, the, preside...|[by, the, of, the...|(1024,[0,4,5,8,12...|(1024,[0,4,5,8,12...|  0.0|(1025,[0,4,5,8,12...|[-4185.1584353505...|[1.0,8.0203193440...|       0.0|
|fellow citizens f...|     gwbush|      Republican| 10397|[fellow, citizens...|[fellow, citizens...|(1024,[0

In [27]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(predictions)
print("Accuracy of model at predicting reviews was: {}".format(acc))

Accuracy of model at predicting reviews was: 0.7090909090909092


In [28]:
selected = predictions.select("label", "Party_Affliation", "probability", "prediction")
for row in selected.collect():
    rid, Party_Affliation, prob, prediction = row
    print("(%d, %s) --> prob=%s, prediction=%f" % (rid, Party_Affliation, str(prob), prediction))

(0, Democrat) --> prob=[1.0,8.020319344027191e-19], prediction=0.000000
(1, Republican) --> prob=[0.0015030161402615641,0.9984969838597385], prediction=1.000000
(0, Democrat) --> prob=[5.031490541061705e-11,0.9999999999496851], prediction=1.000000
(0, Democrat) --> prob=[0.490257125831583,0.509742874168417], prediction=1.000000
(1, Republican) --> prob=[0.9999620614536799,3.7938546320156514e-05], prediction=0.000000
(0, Democrat) --> prob=[0.9999999999999671,3.2867383309839863e-14], prediction=0.000000
(0, Democrat) --> prob=[0.3754678343898874,0.6245321656101127], prediction=1.000000
(0, Democrat) --> prob=[0.9999978596304235,2.1403695764432617e-06], prediction=0.000000
(0, Democrat) --> prob=[3.1977238028189695e-15,0.9999999999999969], prediction=1.000000
(1, Republican) --> prob=[0.0034209037717166566,0.9965790962282833], prediction=1.000000
(1, Republican) --> prob=[2.01105470282172e-15,0.999999999999998], prediction=1.000000
(0, Democrat) --> prob=[1.375801334645768e-12,0.99999999