# Importing Spark

In [1]:
import pyspark

from pyspark import SparkContext
from pyspark.sql import SparkSession

SparkContext.setSystemProperty('spark.executor.memory', '8g')
SparkContext.setSystemProperty('spark.driver.memory', '45G')

sc = SparkContext.getOrCreate()
spark = SparkSession.builder.appName("Python Spark").getOrCreate()

# Loading and preparing the data

In [2]:
from pyspark.sql.functions import monotonically_increasing_id, regexp_replace

# loading and constructing headers
df_spam = spark.read.option("header", True)\
    .csv('./data/spam.csv')\
    .withColumnRenamed("v1", "label")\
    .withColumnRenamed("v2", "sms")\
    .drop('_c2').drop('_c3').drop('_c4')\
    .withColumn("id", monotonically_increasing_id())

In [3]:
df_spam.take(5)

[Row(label='ham', sms='Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', id=0),
 Row(label='ham', sms='Ok lar... Joking wif u oni...', id=1),
 Row(label='spam', sms="Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", id=2),
 Row(label='ham', sms='U dun say so early hor... U c already then say...', id=3),
 Row(label='ham', sms="Nah I don't think he goes to usf, he lives around here though", id=4)]

In [4]:
from pyspark.ml.feature import HashingTF,Tokenizer,  StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

In [5]:
# creating the labels as 0 and 1 instead of strings
# and clean sms txt
from pyspark.sql.functions import udf

construct_labels = udf( lambda x: 0.0 if x== 'ham' else 1.0)
# Convert to binary label
_df_spam = df_spam.withColumn("label", construct_labels(df_spam["label"]).cast('float'))
# 0 if not SPAM and 1 if SPAM
## retirer les apostrophes et les guillemets
_df_spam = _df_spam.withColumn("sms", regexp_replace("sms", "[^0-9A-Za-z\\s]", ""))\
                 .na.fill("")
_df_spam.show()

+-----+--------------------+---+
|label|                 sms| id|
+-----+--------------------+---+
|  0.0|Go until jurong p...|  0|
|  0.0|Ok lar Joking wif...|  1|
|  1.0|Free entry in 2 a...|  2|
|  0.0|U dun say so earl...|  3|
|  0.0|Nah I dont think ...|  4|
|  1.0|FreeMsg Hey there...|  5|
|  0.0|Even my brother i...|  6|
|  0.0|As per your reque...|  7|
|  1.0|WINNER As a value...|  8|
|  1.0|Had your mobile 1...|  9|
|  0.0|Im gonna be home ...| 10|
|  1.0|SIX chances to wi...| 11|
|  1.0|URGENT You have w...| 12|
|  0.0|Ive been searchin...| 13|
|  0.0|I HAVE A DATE ON ...| 14|
|  1.0|XXXMobileMovieClu...| 15|
|  0.0|Oh kim watching here| 16|
|  0.0|Eh u remember how...| 17|
|  0.0|Fine if thats the...| 18|
|  1.0|England v Macedon...| 19|
+-----+--------------------+---+
only showing top 20 rows



In [6]:
_df_spam.printSchema()



# clean text messages

root
 |-- label: float (nullable = true)
 |-- sms: string (nullable = false)
 |-- id: long (nullable = false)



In [7]:
# Train test split
trainingData, testData = _df_spam.randomSplit([0.8, 0.2])
trainingData.show()
testData.show()



+-----+--------------------+----+
|label|                 sms|  id|
+-----+--------------------+----+
|  0.0|                    |3376|
|  0.0|                    |4824|
|  0.0|   and  picking t...|5486|
|  0.0| Am on a train ba...|2677|
|  0.0|        Am on my way|2470|
|  0.0|  Are you in the pub|5410|
|  0.0| Thought I didnt ...|3645|
|  0.0| Was a nice day a...|2667|
|  0.0| Will be septembe...|1836|
|  0.0| Will have two mo...|2218|
|  0.0|    all write or wat|4342|
|  0.0| and  picking the...|2766|
|  0.0| and dont worry w...|2572|
|  0.0| anyway many good...|4030|
|  0.0|   but your not here|4575|
|  0.0| came to look at ...|3673|
|  0.0| collecting ur la...|2898|
|  0.0| come lt 25 n pas...|3543|
|  0.0| comin to fetch u...|4152|
|  0.0| dun need to pick...|1174|
+-----+--------------------+----+
only showing top 20 rows

+-----+--------------------+----+
|label|                 sms|  id|
+-----+--------------------+----+
|  0.0| Was really good ...|5034|
|  0.0| Was thinking a

+-----+--------------------+----+
|label|                 sms|  id|
+-----+--------------------+----+
|  1.0|                    |3376|
|  1.0|   and  picking t...|5486|
|  1.0| Am on a train ba...|2677|
|  1.0|  Are you in the pub|5410|
|  1.0| FREE POLYPHONIC ...|4903|
|  1.0| Was a nice day a...|2667|
|  1.0| Was really good ...|5034|
|  1.0| Will be septembe...|1836|
|  1.0| Will have two mo...|2218|
|  1.0| You gonna ring t...|3248|
|  1.0|    all write or wat|4342|
|  1.0| and  picking the...|2766|
|  1.0| anyway many good...|4030|
|  1.0| bot notes oredi ...|4350|
|  1.0|   but your not here|4575|
|  1.0|    called dad oredi|4617|
|  1.0| came to look at ...|3673|
|  1.0| collecting ur la...|2898|
|  1.0| come lt 25 n pas...|3543|
|  1.0| dun need to pick...|1174|
+-----+--------------------+----+
only showing top 20 rows

+-----+--------------------+----+
|label|                 sms|  id|
+-----+--------------------+----+
|  1.0|                    |  99|
|  1.0|               

In [8]:
# Constructing pipeline

tokenizer = Tokenizer ( inputCol="sms" , outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
## Convert to binary label
# pas besoin car on a déjà fait
indexer = StringIndexer().setInputCol('label').setOutputCol('label')
# regression logistique avec pénalités
# fonction parcimonieuse => si elle nbe l'est s'adapte à  toutes les situations possibles
lr = LogisticRegression(maxIter=10, regParam=0.001, elasticNetParam=1.)
pipeline = Pipeline(stages=[tokenizer , hashingTF , lr ,indexer])

In [9]:
trainingData.printSchema()

root
 |-- label: float (nullable = true)
 |-- sms: string (nullable = false)
 |-- id: long (nullable = false)



In [11]:
# Fitting the model
_model = pipeline.fit(trainingData)

IllegalArgumentException: requirement failed: Output column label already exists.

In [41]:
# Evaluation of the model
predictions = _model.transform(testData)

NameError: name 'model' is not defined

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator().setLabelCol('label').setRawPredictionCol('prediction').setMetricName('areaUnderROC')
AUC = evaluator.evaluate(predictions)
print(AUC)

In [None]:
predictions[['label','sms','prediction','probability','prediction']].filter('label==1').show()

# probabilty :  probabilité qu ce n'est pas un spam