Dataset availability: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset

### 1. Import libraries

In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session that drives the rest of this notebook.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

In [2]:
# Read the SMS CSV using its header row; column types are inferred by Spark.
# NOTE(review): the preview printed further down contains U+FFFD replacement
# characters in the message text, which suggests the file is not UTF-8 --
# consider passing an explicit `encoding` option once the real encoding is confirmed.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('SMSSpamCollection.csv')
)
df.printSchema()

root
 |-- v1: string (nullable = true)
 |-- v2: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)



In [3]:
# Give the raw columns meaningful names: v1 -> class, v2 -> text.

df = (
    df
    .withColumnRenamed('v1', 'class')
    .withColumnRenamed('v2', 'text')
)
df.show()

+-----+--------------------+----+----+----+
|class|                text| _c2| _c3| _c4|
+-----+--------------------+----+----+----+
|  ham|Go until jurong p...|null|null|null|
|  ham|Ok lar... Joking ...|null|null|null|
| spam|Free entry in 2 a...|null|null|null|
|  ham|U dun say so earl...|null|null|null|
|  ham|Nah I don't think...|null|null|null|
| spam|FreeMsg Hey there...|null|null|null|
|  ham|Even my brother i...|null|null|null|
|  ham|As per your reque...|null|null|null|
| spam|WINNER!! As a val...|null|null|null|
| spam|Had your mobile 1...|null|null|null|
|  ham|I'm gonna be home...|null|null|null|
| spam|SIX chances to wi...|null|null|null|
| spam|URGENT! You have ...|null|null|null|
|  ham|I've been searchi...|null|null|null|
|  ham|I HAVE A DATE ON ...|null|null|null|
| spam|XXXMobileMovieClu...|null|null|null|
|  ham|Oh k...i'm watchi...|null|null|null|
|  ham|Eh u remember how...|null|null|null|
|  ham|Fine if that��s t...|null|null|null|
| spam|England v Macedon...|null

In [4]:
# Keep only the label and message columns; the unnamed extra columns are dropped.

df = df.select(['class', 'text'])
df.show(n=5)

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
+-----+--------------------+
only showing top 5 rows



In [5]:
# Add a `length` column holding the character count of each message.

from pyspark.sql.functions import length

df = df.withColumn(colName='length', col=length('text'))
df.show(n=3)

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
+-----+--------------------+------+
only showing top 3 rows



In [6]:
# Average message length per class (`length` is the only numeric column).

df.groupBy('class').avg('length').show()

+-----+------------------+
|class|       avg(length)|
+-----+------------------+
|  ham| 70.72927488053189|
| spam|138.29312762973353|
+-----+------------------+



In [7]:
# Importing text transformers - Notice their availability in pyspark.ml.feature

from pyspark.ml.feature import CountVectorizer, Tokenizer, StopWordsRemover, IDF, StringIndexer

In [8]:
# Set the JVM default locale to en-US for this Spark session, because the
# text-processing operations are locale-sensitive.
# NOTE(review): `sc` below is the SparkContext *class*, not an instance, and
# `_jvm` is a private attribute read through it -- presumably it is only
# populated once a context exists (e.g. via the SparkSession created at the top
# of the notebook). Confirm before reordering cells.

from pyspark.context import SparkContext as sc

# Handle to java.util.Locale inside the driver JVM.
locale = sc._jvm.java.util.Locale
locale.setDefault(locale.forLanguageTag("en-US"))

In [9]:
# Split each message in `text` into word tokens stored in `token_text`.
tokenizer = Tokenizer(
    inputCol='text',
    outputCol='token_text',
)

In [10]:
# Preview the tokenizer output on the first five messages.
(
    tokenizer
    .transform(df.select('text'))
    .show(n=5)
)

+--------------------+--------------------+
|                text|          token_text|
+--------------------+--------------------+
|Go until jurong p...|[go, until, juron...|
|Ok lar... Joking ...|[ok, lar..., joki...|
|Free entry in 2 a...|[free, entry, in,...|
|U dun say so earl...|[u, dun, say, so,...|
|Nah I don't think...|[nah, i, don't, t...|
+--------------------+--------------------+
only showing top 5 rows



In [11]:
# Drop common stop words from the token lists.

stop_remove = StopWordsRemover(
    inputCol='token_text',
    outputCol='stop_token',
)

In [12]:
# Term counts per message. This stage consumes the stop-word-filtered tokens
# produced by the previous transformer, which is why the steps chain into a pipeline.

count_vec = CountVectorizer(
    inputCol='stop_token',
    outputCol='c_vec',
)

In [13]:
# Re-weight the raw term counts by inverse document frequency.

idf = IDF(
    inputCol='c_vec',
    outputCol='tf_idf',
)

In [14]:
# Encode the string `class` column (ham / spam) as a numeric `label` column.

ham_spam_to_numeric = StringIndexer(
    inputCol='class',
    outputCol='label',
)

In [15]:
# Combine the TF-IDF vector and the message length into one `features` vector.

from pyspark.ml.feature import VectorAssembler

clean_up = VectorAssembler(
    inputCols=['tf_idf', 'length'],
    outputCol='features',
)

Naive Bayes methods are a set of supervised learning algorithms based on applying Bayes’ theorem with the “naive” assumption of conditional independence between every pair of features given the value of the class variable.

In [16]:
# Naive Bayes classifier. The column names are spelled out for readability;
# they match the estimator's defaults.

from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(featuresCol='features', labelCol='label')

In [17]:
# Chain label indexing and every text transform into a single feature pipeline.

from pyspark.ml import Pipeline

feature_stages = [
    ham_spam_to_numeric,  # class -> label
    tokenizer,            # text -> token_text
    stop_remove,          # token_text -> stop_token
    count_vec,            # stop_token -> c_vec
    idf,                  # c_vec -> tf_idf
    clean_up,             # tf_idf + length -> features
]
pipeline = Pipeline(stages=feature_stages)

In [18]:
# Fit the feature pipeline on the full dataset, then apply it to the same data.
# NOTE(review): the vocabulary and IDF weights are learned here, before the
# train/test split performed further down, so held-out messages influence the
# features -- consider splitting first and fitting on the training part only.

cleaner = pipeline.fit(dataset=df)
clean_df = cleaner.transform(dataset=df)

In [27]:
# Only the numeric label and the assembled feature vector are needed for training.
clean_df = clean_df.select(['label', 'features'])
clean_df.show(n=3)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(13223,[7,9,31,60...|
|  0.0|(13223,[0,21,287,...|
|  1.0|(13223,[3,13,23,2...|
+-----+--------------------+
only showing top 3 rows



In [28]:
# Inspect the sparse feature vector of the first row.
clean_df.select('features').head(1)

[Row(features=SparseVector(13223, {7: 3.1493, 9: 3.2013, 31: 3.8725, 60: 4.2107, 67: 4.3133, 326: 5.3985, 604: 5.9094, 746: 6.1325, 1381: 6.6715, 1846: 6.8256, 4392: 7.5188, 6944: 7.9243, 8681: 7.9243, 9094: 7.9243, 12055: 7.9243, 12784: 7.9243, 13222: 111.0}))]

In [29]:
# Print the schema of the pre-pipeline frame `df`, to compare against the
# schema of `clean_df` and see which column types the model actually receives.
df.printSchema()

root
 |-- class: string (nullable = true)
 |-- text: string (nullable = true)
 |-- length: integer (nullable = true)



In [30]:
# Schema after feature engineering: only `label` and `features` remain.
clean_df.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)



In [31]:
# Hold out roughly 30% of the rows for testing. A fixed seed pins the random
# split so the reported metrics can be reproduced when the notebook is re-run.
train, test = clean_df.randomSplit([0.7, 0.3], seed=42)

In [32]:
# Train on the training split, then score the held-out split.
spam_detector = nb.fit(dataset=train)
predictions = spam_detector.transform(dataset=test)

In [33]:
# Preview a handful of predictions only; printing thousands of rows floods the
# notebook output and bloats the saved file without adding information.
predictions.show(10)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13223,[0,1,5,15,...|[-1004.3774844303...|[1.0,9.2944375755...|       0.0|
|  0.0|(13223,[0,1,5,18,...|[-803.58206695129...|[1.0,1.2192701155...|       0.0|
|  0.0|(13223,[0,1,14,85...|[-692.51517249064...|[1.0,1.1401385612...|       0.0|
|  0.0|(13223,[0,1,17,19...|[-831.79552003251...|[1.0,7.4549125478...|       0.0|
|  0.0|(13223,[0,1,18,26...|[-969.78178866742...|[1.0,1.3545213934...|       0.0|
|  0.0|(13223,[0,1,26,84...|[-1541.2556992049...|[5.05431587239464...|       1.0|
|  0.0|(13223,[0,1,29,11...|[-616.53429764230...|[1.0,5.8982720714...|       0.0|
|  0.0|(13223,[0,1,31,41...|[-341.80998216389...|[1.0,8.8960733581...|       0.0|
|  0.0|(13223,[0,1,41,68...|[-614.06259297224...|[0.99999996050849...|       0.0|
|  0.0|(13223,[0

In [36]:
# Score the predictions. The metric is overridden for this call only, so the
# evaluator object itself keeps its default metric.

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator()

accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
print("Test Accuracy: " + str(accuracy))

Test Accuracy: 0.9105545617173524


In [37]:
# Evaluator for Multiclass Classification, which expects input columns: prediction, label, weight (optional) and probabilityCol