In [1]:
import pyspark

In [2]:
import pandas as pd



In [3]:
import numpy as np

In [4]:
from pyspark.sql import SparkSession

In [5]:
# Start (or reuse) a Spark session running locally with the 'local[*]' master.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

In [6]:
from pyspark.ml.classification import DecisionTreeClassifier

In [7]:
from pyspark.ml.classification import LogisticRegression

In [36]:
sms= spark.read.csv('../Datasets/SMS.csv', header=True, inferSchema=True, nullValue='NA', sep=';')

In [9]:
sms.show(5)

+-----+--------------------+-----+
|Index|                Text|Label|
+-----+--------------------+-----+
|    1|Sorry, I'll call ...|    0|
|    2|Dont worry. I gue...|    0|
|    3|Call FREEPHONE 08...|    1|
|    4|Win a 1000 cash p...|    1|
|    5|Go until jurong p...|    0|
+-----+--------------------+-----+
only showing top 5 rows



In [10]:
from pyspark.sql.functions import regexp_replace

In [11]:
sms = sms.withColumn('Text', regexp_replace(sms.Text, '[_():;,.]', ' '))

In [12]:
sms = sms.withColumn('Text', regexp_replace(sms.Text, '[0-9]', ' '))

In [13]:
sms.show(5, truncate=False)

+-----+---------------------------------------------------------------------------------------------------------------+-----+
|Index|Text                                                                                                           |Label|
+-----+---------------------------------------------------------------------------------------------------------------+-----+
|1    |Sorry  I'll call later in meeting                                                                              |0    |
|2    |Dont worry  I guess he's busy                                                                                  |0    |
|3    |Call FREEPHONE               now!                                                                              |1    |
|4    |Win a      cash prize or a prize worth                                                                         |1    |
|5    |Go until jurong point  crazy   Available only in bugis n great world la e buffet    Cine there got amore wat   

In [14]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover

In [15]:
sms = Tokenizer(inputCol='Text', outputCol='Text Tokens').transform(sms)

In [16]:
sms.show(3, truncate=False)

+-----+---------------------------------+-----+---------------------------------------------------+
|Index|Text                             |Label|Text Tokens                                        |
+-----+---------------------------------+-----+---------------------------------------------------+
|1    |Sorry  I'll call later in meeting|0    |[sorry, , i'll, call, later, in, meeting]          |
|2    |Dont worry  I guess he's busy    |0    |[dont, worry, , i, guess, he's, busy]              |
|3    |Call FREEPHONE               now!|1    |[call, freephone, , , , , , , , , , , , , , , now!]|
+-----+---------------------------------+-----+---------------------------------------------------+
only showing top 3 rows



In [17]:
stop= StopWordsRemover(inputCol='Text Tokens', outputCol= 'Tokens stop')

In [18]:
sms = stop.transform(sms)

In [19]:
sms.show(2)

+-----+--------------------+-----+--------------------+--------------------+
|Index|                Text|Label|         Text Tokens|         Tokens stop|
+-----+--------------------+-----+--------------------+--------------------+
|    1|Sorry  I'll call ...|    0|[sorry, , i'll, c...|[sorry, , call, l...|
|    2|Dont worry  I gue...|    0|[dont, worry, , i...|[dont, worry, , g...|
+-----+--------------------+-----+--------------------+--------------------+
only showing top 2 rows



In [88]:
from pyspark.ml.feature import HashingTF, IDF

In [21]:
hasher= HashingTF(inputCol='Tokens stop', outputCol='Tokens hash', numFeatures=50)

In [22]:
sms = hasher.transform(sms)

In [23]:
sms.show(1, truncate=False)

+-----+---------------------------------+-----+-----------------------------------------+-------------------------------+-------------------------------------------+
|Index|Text                             |Label|Text Tokens                              |Tokens stop                    |Tokens hash                                |
+-----+---------------------------------+-----+-----------------------------------------+-------------------------------+-------------------------------------------+
|1    |Sorry  I'll call later in meeting|0    |[sorry, , i'll, call, later, in, meeting]|[sorry, , call, later, meeting]|(50,[10,20,22,37,46],[1.0,1.0,1.0,1.0,1.0])|
+-----+---------------------------------+-----+-----------------------------------------+-------------------------------+-------------------------------------------+
only showing top 1 row



In [24]:
idf = IDF(inputCol='Tokens hash', outputCol='features')

In [25]:
sms = idf.fit(sms).transform(sms)

In [26]:
sms.show(4, truncate=False)

+-----+-------------------------------------------+-----+----------------------------------------------------+---------------------------------------------------+-------------------------------------------+---------------------------------------------------------------------------------------------------------------------+
|Index|Text                                       |Label|Text Tokens                                         |Tokens stop                                        |Tokens hash                                |features                                                                                                             |
+-----+-------------------------------------------+-----+----------------------------------------------------+---------------------------------------------------+-------------------------------------------+---------------------------------------------------------------------------------------------------------------------+
|1    |Sorry  I'll call l

In [27]:
# Split the rows 80/20 into training and test sets. A fixed seed makes the
# split reproducible; without one, the counts and metrics below change on
# every run.
sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=42)

In [28]:
logreg= LogisticRegression(labelCol='Label', regParam=0.2)

In [29]:
logreg =logreg.fit(sms_train)

In [30]:
sms_train_log= logreg.transform(sms_train)

In [31]:
sms_train_log.groupby('Label', 'prediction').count().show()

+-----+----------+-----+
|Label|prediction|count|
+-----+----------+-----+
|    1|       0.0|  323|
|    0|       0.0| 3789|
|    1|       1.0|  263|
|    0|       1.0|   29|
+-----+----------+-----+



In [32]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [33]:
binary_evaluator= BinaryClassificationEvaluator(labelCol='Label')

In [34]:
auc= binary_evaluator.evaluate(sms_train_log, {binary_evaluator.metricName: 'areaUnderROC'})

In [35]:
auc

0.9566707995358837

In [37]:
# Reload the raw messages so the pipeline section starts from the original
# three columns. Without this, `sms` still carries the columns added above
# (including 'features'), and the pipeline's IDF stage, which also writes to
# 'features', cannot be fitted on a top-to-bottom run.
sms = spark.read.csv('../Datasets/SMS.csv', header=True, inferSchema=True, nullValue='NA', sep=';')
sms.show(2)

+-----+--------------------+-----+
|Index|                Text|Label|
+-----+--------------------+-----+
|    1|Sorry, I'll call ...|    0|
|    2|Dont worry. I gue...|    0|
+-----+--------------------+-----+
only showing top 2 rows



In [81]:
from pyspark.ml import Pipeline

In [82]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [83]:
sms  = sms.withColumn('Text', regexp_replace(sms.Text, '[_,().-?;]', ' '))

In [84]:
sms = sms.withColumn('Text', regexp_replace(sms.Text, '[0-9]', ' '))

In [85]:
tokenizer= Tokenizer(inputCol='Text', outputCol='Text_tokens')

In [86]:
stop_words = StopWordsRemover(inputCol='Text_tokens', outputCol='Text_stop')

In [87]:
Hasher = HashingTF(inputCol='Text_stop', outputCol='Text_hash')

In [89]:
IDF = IDF(inputCol='Text_hash', outputCol='features')

In [90]:
logreg = LogisticRegression(labelCol='Label')

In [91]:
pipeline = Pipeline(stages=[tokenizer, stop_words, Hasher, IDF, logreg])

In [92]:
params = ParamGridBuilder()

In [93]:
params = params.addGrid(Hasher.numFeatures, [1024, 4096, 16384]).addGrid(Hasher.binary, [True, False])

In [94]:
params = params.addGrid(logreg.elasticNetParam, [0.0, 0.5, 1])

In [95]:
params = params.addGrid(logreg.regParam, [0.01, 0.1, 1 ,10]).build()

In [96]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [101]:
evaluator = BinaryClassificationEvaluator(labelCol='Label')

In [102]:
cv = CrossValidator(estimator=pipeline, estimatorParamMaps= params, evaluator=evaluator , numFolds=3)

In [103]:
# Split the cleaned messages 80/20 for the cross-validated pipeline. A fixed
# seed makes the split reproducible, so the cross-validation and test
# metrics below do not vary between runs because of a different split.
sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=42)

In [104]:
scores = cv.fit(sms_train)

In [105]:
scores.avgMetrics

[0.9751875438826991,
 0.9811754337589402,
 0.9826785095221335,
 0.980869218656339,
 0.9738757415256927,
 0.9398181230394758,
 0.5,
 0.5,
 0.9715581062563425,
 0.8781817678005299,
 0.5,
 0.5,
 0.9771470010424326,
 0.9825159727319974,
 0.9835908454194657,
 0.9815695366872039,
 0.977451722564183,
 0.9581489541227295,
 0.5,
 0.5,
 0.9744066323531408,
 0.9367849153275196,
 0.5,
 0.5,
 0.9865449527723238,
 0.9891755856386719,
 0.9907124890009102,
 0.9904277835561638,
 0.9768938926136608,
 0.9422826165856779,
 0.5,
 0.5,
 0.9750921239235215,
 0.8712171458088867,
 0.5,
 0.5,
 0.985703208502408,
 0.9885753688466101,
 0.9901752541338755,
 0.9900423766082309,
 0.9789621161055408,
 0.9611631093618633,
 0.5,
 0.5,
 0.9744118806099786,
 0.9365763653153372,
 0.5,
 0.5,
 0.9917557823075693,
 0.992382531488428,
 0.9922340286552813,
 0.9914650743494082,
 0.9766149130597428,
 0.941936846435304,
 0.5,
 0.5,
 0.9750774240762172,
 0.8920875443488225,
 0.5,
 0.5,
 0.9910008988282544,
 0.9916346461588723,
 0.

In [110]:
best_pipe = scores.bestModel

In [113]:
best_log= best_pipe.stages[4]

In [115]:
best_log.extractParamMap()

{Param(parent='LogisticRegression_dbec1ad5e05b', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 2,
 Param(parent='LogisticRegression_dbec1ad5e05b', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0,
 Param(parent='LogisticRegression_dbec1ad5e05b', name='family', doc='The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial'): 'auto',
 Param(parent='LogisticRegression_dbec1ad5e05b', name='featuresCol', doc='features column name.'): 'features',
 Param(parent='LogisticRegression_dbec1ad5e05b', name='fitIntercept', doc='whether to fit an intercept term.'): True,
 Param(parent='LogisticRegression_dbec1ad5e05b', name='labelCol', doc='label column name.'): 'Label',
 Param(parent='LogisticRegression_dbec1ad5e05b', name='maxBlockSizeInMB', doc='maximum memory in MB for s

In [116]:
pred_test = best_pipe.transform(sms_test)

In [117]:
evaluator.evaluate(pred_test)

0.9966444239914097

In [118]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()