In [38]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import length
from pyspark.ml.feature import Tokenizer,StopWordsRemover, CountVectorizer,IDF,StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
spark = SparkSession.builder.appName('nuveo_nlp').getOrCreate()

In [10]:
data_tr = spark.read.csv("TrainingSet/sms-hamspam-train.csv",inferSchema=True,sep='\t')
data_tr = data_tr.withColumnRenamed('_c0','class').withColumnRenamed('_c1','text')

In [11]:
data_tr.show(4)

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
| spam|SMSSERVICES. for ...|
| spam|25p 4 alfie Moon'...|
| spam|U have a secret a...|
+-----+--------------------+
only showing top 4 rows



In [17]:
data_tr = data_tr.withColumn('length',length(data_tr['text']))
data_tr.show(4)

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham|71.24658536585366|
| spam|         138.6416|
+-----+-----------------+

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
| spam|SMSSERVICES. for ...|   156|
| spam|25p 4 alfie Moon'...|   161|
| spam|U have a secret a...|   147|
+-----+--------------------+------+
only showing top 4 rows



In [18]:
# check avg lenght
data_tr.groupby('class').mean().show()

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham|71.24658536585366|
| spam|         138.6416|
+-----+-----------------+



In [22]:
tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens',outputCol='c_vec')
idf = IDF(inputCol="c_vec", outputCol="tf_idf")
ham_spam_to_num = StringIndexer(inputCol='class',outputCol='label')

In [24]:
clean_up = VectorAssembler(inputCols=['tf_idf','length'],outputCol='features')

In [26]:
# Use defaults
nb = NaiveBayes()

In [28]:
data_prep_pipe = Pipeline(stages=[ham_spam_to_num,tokenizer,stopremove,count_vec,idf,clean_up])

In [30]:
cleaner = data_prep_pipe.fit(data_tr)

In [31]:
clean_data = cleaner.transform(data_tr)

In [32]:
clean_data = clean_data.select(['label','features'])

In [33]:
clean_data.show(4)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(12089,[7,10,29,6...|
|  1.0|(12089,[20,49,203...|
|  1.0|(12089,[2,4,9,27,...|
|  1.0|(12089,[0,1,25,41...|
+-----+--------------------+
only showing top 4 rows



In [34]:
(training,testing) = clean_data.randomSplit([0.7,0.3])

In [35]:
spam_predictor = nb.fit(training)

In [36]:
test_results = spam_predictor.transform(testing)

In [37]:
test_results.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(12089,[0,1,2,14,...|[-615.36208587777...|[0.99999999685670...|       0.0|
|  0.0|(12089,[0,1,3,8,1...|[-562.10487080939...|[1.0,9.2669330051...|       0.0|
|  0.0|(12089,[0,1,7,15,...|[-650.14757948880...|[1.0,1.3561378945...|       0.0|
|  0.0|(12089,[0,1,8,12,...|[-543.84003938220...|[1.0,8.8246566033...|       0.0|
|  0.0|(12089,[0,1,8,12,...|[-543.84003938220...|[1.0,8.8246566033...|       0.0|
|  0.0|(12089,[0,1,11,34...|[-436.82793018813...|[1.0,3.2950826669...|       0.0|
|  0.0|(12089,[0,1,12,29...|[-219.41617000765...|[1.0,3.9530198678...|       0.0|
|  0.0|(12089,[0,1,24,29...|[-334.22512142821...|[1.0,5.2067137738...|       0.0|
|  0.0|(12089,[0,1,32,12...|[-604.79534853730...|[1.0,4.2349843279...|       0.0|
|  0.0|(12089,[0

In [39]:
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting spam was: {}".format(acc))

Accuracy of model at predicting spam was: 0.9139037298118139
