Dataset availability: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset

### 1. Import libraries

In [13]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('nlp').getOrCreate()

In [18]:
df = spark.read.csv('SMSSpamCollection.csv', inferSchema = True, header=True)
df.printSchema()

root
 |-- v1: string (nullable = true)
 |-- v2: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)



In [19]:
df = df.withColumnRenamed('v1', 'class').withColumnRenamed('v2', 'text')
df.show()

+-----+--------------------+----+----+----+
|class|                text| _c2| _c3| _c4|
+-----+--------------------+----+----+----+
|  ham|Go until jurong p...|null|null|null|
|  ham|Ok lar... Joking ...|null|null|null|
| spam|Free entry in 2 a...|null|null|null|
|  ham|U dun say so earl...|null|null|null|
|  ham|Nah I don't think...|null|null|null|
| spam|FreeMsg Hey there...|null|null|null|
|  ham|Even my brother i...|null|null|null|
|  ham|As per your reque...|null|null|null|
| spam|WINNER!! As a val...|null|null|null|
| spam|Had your mobile 1...|null|null|null|
|  ham|I'm gonna be home...|null|null|null|
| spam|SIX chances to wi...|null|null|null|
| spam|URGENT! You have ...|null|null|null|
|  ham|I've been searchi...|null|null|null|
|  ham|I HAVE A DATE ON ...|null|null|null|
| spam|XXXMobileMovieClu...|null|null|null|
|  ham|Oh k...i'm watchi...|null|null|null|
|  ham|Eh u remember how...|null|null|null|
|  ham|Fine if that��s t...|null|null|null|
| spam|England v Macedon...|null

In [20]:
df = df.select("class","text")
df.show(5)

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
+-----+--------------------+
only showing top 5 rows



In [21]:
from pyspark.sql.functions import length

df = df.withColumn('length', length(df['text']))
df.show(3)

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
+-----+--------------------+------+
only showing top 3 rows



In [22]:
df.groupBy('class').mean().show()

+-----+------------------+
|class|       avg(length)|
+-----+------------------+
|  ham| 71.07357512953368|
| spam|138.45917001338688|
+-----+------------------+



In [23]:
from pyspark.ml.feature import (CountVectorizer, Tokenizer, 
                                StopWordsRemover, IDF, StringIndexer)

In [24]:
tokenizer = Tokenizer(inputCol = 'text', outputCol = 'token_text')
stop_remove = StopWordsRemover(inputCol = 'token_text', outputCol = 'stop_token')
count_vec = CountVectorizer(inputCol = 'stop_token', outputCol = 'c_vec')
idf = IDF(inputCol = 'c_vec', outputCol = 'tf_idf')
ham_spam_to_numeric = StringIndexer(inputCol = 'class', outputCol = 'label')

In [25]:
from pyspark.ml.feature import VectorAssembler

clean_up = VectorAssembler(inputCols = ['tf_idf', 'length'], outputCol = 'features')

In [26]:
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes()

In [27]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[ham_spam_to_numeric, tokenizer, stop_remove, count_vec, idf, clean_up])

In [28]:
cleaner = pipeline.fit(df)
clean_df = cleaner.transform(df)
clean_df = clean_df.select('label', 'features')
clean_df.show(3)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(13377,[7,10,31,6...|
|  0.0|(13377,[0,23,293,...|
|  1.0|(13377,[2,13,19,2...|
+-----+--------------------+
only showing top 3 rows



In [33]:
clean_df.take(1)

[Row(label=0.0, features=SparseVector(13377, {7: 3.1204, 10: 3.2096, 31: 3.8466, 61: 4.2068, 67: 4.3216, 330: 5.4068, 617: 5.9176, 754: 6.1408, 1398: 6.6798, 1575: 6.8339, 4473: 7.5271, 5777: 7.9325, 8797: 7.9325, 9213: 7.9325, 10867: 7.9325, 12202: 7.9325, 13376: 111.0}))]

In [29]:
train, test = clean_df.randomSplit([0.7, 0.3])

In [30]:
df.printSchema()

root
 |-- class: string (nullable = true)
 |-- text: string (nullable = true)
 |-- length: integer (nullable = true)



In [31]:
spam_detector = nb.fit(train)
predictions = spam_detector.transform(test)
predictions.show(3)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13377,[0,1,2,7,8...|[-806.01404823700...|[1.0,2.6730650731...|       0.0|
|  0.0|(13377,[0,1,2,13,...|[-608.29090588755...|[1.0,1.2205753842...|       0.0|
|  0.0|(13377,[0,1,5,15,...|[-1005.1628823058...|[1.0,4.4213374448...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 3 rows



In [34]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator()
print("Test Accuracy: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})))

Test Accuracy: 0.9026284348864994
