# Build a spam classifier with logistic regression

In [1]:
# Import PySpark so that Spark functionality is available in this Python session
import pyspark
from pyspark.sql import SparkSession

# Bare expression in the middle of the cell: it is evaluated, but the notebook
# only renders the value of a cell's final expression, so nothing is shown here.
pyspark.__version__

# Get the active session, or create one running locally (master 'local[*]')
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('first_spark_application')
    .getOrCreate()
)

# Final expression of the cell: the notebook displays the session object
spark

In [15]:
# Simply read the SMS data from a CSV file

import os
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Relative location of sms.csv inside the sibling "datasets" directory
filename = os.path.join("..", "datasets", "sms.csv")

# Explicit schema: (column name, Spark type) pairs, in file order
schema = StructType([
    StructField(column, dtype)
    for column, dtype in (
        ("id", IntegerType()),
        ("text", StringType()),
        ("label", IntegerType()),
    )
])

# Load the file; fields are separated by ';'
sms = spark.read.csv(filename, schema=schema, sep=";")

# Report how many records were loaded
print("The data contain %d records." % (sms.count(),))

# Preview the first five records
sms.show(n=5)

# Final expression: list of (column, type) pairs for the loaded frame
sms.dtypes

The data contain 5574 records.
+---+--------------------+-----+
| id|                text|label|
+---+--------------------+-----+
|  1|Sorry, I'll call ...|    0|
|  2|Dont worry. I gue...|    0|
|  3|Call FREEPHONE 08...|    1|
|  4|Win a 1000 cash p...|    1|
|  5|Go until jurong p...|    0|
+---+--------------------+-----+
only showing top 5 rows



[('id', 'int'), ('text', 'string'), ('label', 'int')]

In [16]:
# Functions used for text clean-up and tokenisation
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer

# Clean-up passes, applied in this order; every match is replaced by one space:
#   1. punctuation (REGEX provided)
#   2. digits
#   3. runs of spaces (collapsed to a single space)
# `sms` is rebound on every pass so that sms['text'] always refers to the
# column produced by the previous pass.
for pattern in ('[_():;,.!?\\-]', '[0-9]', ' +'):
    sms = sms.withColumn('text', regexp_replace(sms['text'], pattern, ' '))

# Split the cleaned text into words, stored in a new 'words' column
word_splitter = Tokenizer(inputCol='text', outputCol="words")
sms = word_splitter.transform(sms)

# Show the first four rows without truncating long values
sms.show(4, truncate=False)

+---+----------------------------------+-----+------------------------------------------+
|id |text                              |label|words                                     |
+---+----------------------------------+-----+------------------------------------------+
|1  |Sorry I'll call later in meeting  |0    |[sorry, i'll, call, later, in, meeting]   |
|2  |Dont worry I guess he's busy      |0    |[dont, worry, i, guess, he's, busy]       |
|3  |Call FREEPHONE now                |1    |[call, freephone, now]                    |
|4  |Win a cash prize or a prize worth |1    |[win, a, cash, prize, or, a, prize, worth]|
+---+----------------------------------+-----+------------------------------------------+
only showing top 4 rows



The next steps are to remove stop words and then apply the hashing trick, converting the results into TF-IDF features.

A quick reminder about these concepts:

The hashing trick provides a fast and space-efficient way to map a very large (possibly infinite) set of items (in this case, all words contained in the SMS messages) onto a smaller, finite number of values.

The TF-IDF matrix reflects how important a word is to each document. It takes into account both the frequency of the word within each document and the frequency of the word across all of the documents in the collection.

In [18]:
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

# Step 1: remove stop words ('words' -> 'terms')
stop_word_filter = StopWordsRemover(inputCol='words', outputCol='terms')
sms = stop_word_filter.transform(sms)

# Step 2: apply the hashing trick ('terms' -> 'hash', 1024 buckets)
term_hasher = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024)
sms = term_hasher.transform(sms)

# Step 3: convert the hashed term counts to TF-IDF ('hash' -> 'features')
# NOTE(review): the IDF weights are fitted on the complete `sms` frame. If the
# data is split into training and testing sets afterwards, the test rows have
# already influenced these weights - confirm whether that is acceptable.
idf_model = IDF(inputCol='hash', outputCol='features').fit(sms)
tf_idf = idf_model.transform(sms)

# Inspect the remaining terms next to their TF-IDF vectors
tf_idf.select('terms', 'features').show(4, truncate=False)

+--------------------------------+----------------------------------------------------------------------------------------------------+
|terms                           |features                                                                                            |
+--------------------------------+----------------------------------------------------------------------------------------------------+
|[sorry, call, later, meeting]   |(1024,[138,344,378,1006],[2.2391682769656747,2.892706319430574,3.684405173719015,4.244020961654438])|
|[dont, worry, guess, busy]      |(1024,[53,233,329,858],[4.618714411095849,3.557143394108088,4.618714411095849,4.937168142214383])   |
|[call, freephone]               |(1024,[138,396],[2.2391682769656747,3.3843005812686773])                                            |
|[win, cash, prize, prize, worth]|(1024,[31,69,387,428],[3.7897656893768414,7.284881949239966,4.4671645129686475,3.898659777615979])  |
+--------------------------------+----------------------------------------------------------------------------------------------------+

In [21]:
# Logistic regression classifier from Spark ML
from pyspark.ml.classification import LogisticRegression

# 80% / 20% train/test split; the seed is fixed at 23
sms_train, sms_test = tf_idf.randomSplit(weights=[0.8, 0.2], seed=23)

# Configure the estimator (regParam=0.2), then fit it on the training rows.
# No column names are passed, so the estimator's default feature and label
# columns are used.
classifier = LogisticRegression(regParam=0.2)
logistic = classifier.fit(sms_train)

# Score the held-out rows with the fitted model
prediction = logistic.transform(sms_test)

# Confusion matrix: row counts for every (label, prediction) combination
confusion_counts = prediction.groupBy('label', 'prediction').count()
confusion_counts.show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|   35|
|    0|       0.0|  954|
|    1|       1.0|  105|
|    0|       1.0|    2|
+-----+----------+-----+

