In [1]:
import os

import findspark

# Make the Spark installation importable before pyspark is loaded.
# Prefer the SPARK_HOME environment variable so the notebook is not tied to one
# user's machine; fall back to the original local install path when it is unset.
findspark.init(os.environ.get('SPARK_HOME', '/home/ndane/spark'))
import pyspark

In [2]:
# Second findspark initialisation, this time without an explicit Spark path.
# NOTE(review): the first cell already calls findspark.init with a path, so this
# call looks redundant — confirm before removing.
findspark.init()

In [3]:
from pyspark.sql import SparkSession

In [5]:
# Obtain the notebook's Spark session (created on first run, reused afterwards),
# registered under the application name "Basics".
spark = (
    SparkSession.builder
    .appName("Basics")
    .getOrCreate()
)

In [6]:
# Load the SMS spam collection: tab-separated, no header row, so Spark assigns
# the default column names _c0 and _c1.
# NOTE(review): both columns appear to be plain strings in the preview below, so
# inferSchema may only add an extra pass over the file — confirm before dropping.
df = spark.read.csv(
    'SMSSpamCollection.csv',
    sep='\t',
    header=False,
    inferSchema=True,
)

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

In [7]:
# Preview the raw frame; the columns still carry the default names _c0 and _c1.
df.show()

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
|spam|FreeMsg Hey there...|
| ham|Even my brother i...|
| ham|As per your reque...|
|spam|WINNER!! As a val...|
|spam|Had your mobile 1...|
| ham|I'm gonna be home...|
|spam|SIX chances to wi...|
|spam|URGENT! You have ...|
| ham|I've been searchi...|
| ham|I HAVE A DATE ON ...|
|spam|XXXMobileMovieClu...|
| ham|Oh k...i'm watchi...|
| ham|Eh u remember how...|
| ham|Fine if thats th...|
|spam|England v Macedon...|
+----+--------------------+
only showing top 20 rows



In [8]:
# Replace the default column names with descriptive ones:
# _c0 -> class (ham/spam label), _c1 -> text (message body).
df2 = (
    df
    .withColumnRenamed("_c0", "class")
    .withColumnRenamed("_c1", "text")
)
df2.show()

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
| spam|FreeMsg Hey there...|
|  ham|Even my brother i...|
|  ham|As per your reque...|
| spam|WINNER!! As a val...|
| spam|Had your mobile 1...|
|  ham|I'm gonna be home...|
| spam|SIX chances to wi...|
| spam|URGENT! You have ...|
|  ham|I've been searchi...|
|  ham|I HAVE A DATE ON ...|
| spam|XXXMobileMovieClu...|
|  ham|Oh k...i'm watchi...|
|  ham|Eh u remember how...|
|  ham|Fine if thats th...|
| spam|England v Macedon...|
+-----+--------------------+
only showing top 20 rows



Clean and Prepare the Data

In [9]:
import pyspark.sql.functions as F

In [10]:
# Add the character length of every message as a new column.
# The DataFrame is assigned first and displayed in a separate statement:
# DataFrame.show() only prints and returns None, so chaining it onto the
# assignment would leave df3 bound to None instead of the DataFrame.
df3 = df2.withColumn('textLength', F.length('text'))
df3.show()

+-----+--------------------+----------+
|class|                text|textLength|
+-----+--------------------+----------+
|  ham|Go until jurong p...|       111|
|  ham|Ok lar... Joking ...|        29|
| spam|Free entry in 2 a...|       155|
|  ham|U dun say so earl...|        49|
|  ham|Nah I don't think...|        61|
| spam|FreeMsg Hey there...|       147|
|  ham|Even my brother i...|        77|
|  ham|As per your reque...|       160|
| spam|WINNER!! As a val...|       157|
| spam|Had your mobile 1...|       154|
|  ham|I'm gonna be home...|       109|
| spam|SIX chances to wi...|       136|
| spam|URGENT! You have ...|       155|
|  ham|I've been searchi...|       196|
|  ham|I HAVE A DATE ON ...|        35|
| spam|XXXMobileMovieClu...|       149|
|  ham|Oh k...i'm watchi...|        26|
|  ham|Eh u remember how...|        81|
|  ham|Fine if thats th...|        56|
| spam|England v Macedon...|       155|
+-----+--------------------+----------+
only showing top 20 rows



In [11]:
from pyspark.ml.feature import Tokenizer, CountVectorizer, StopWordsRemover, IDF, StringIndexer, NGram

In [13]:
# Tokenizer stage: reads the "text" column and writes the token list to "words".
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)

In [14]:
# Tokenize the renamed frame, producing the extra "words" column.
# Note: this transforms df2, so the textLength column computed earlier is not
# part of the tokenized frame.
tokenized = tokenizer.transform(df2)

# Print the default 20 rows without truncating the long text/token columns.
tokenized.show(n=20, truncate=False)

+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|class|text                                                                                                                                                                                                |words                                                                                                                                                                                                                                     |
+-----+-----------------------------------------------------------------------------------------------------------------

In [16]:
# Stop-word filter stage: reads the token lists in "words" and writes the
# filtered token lists to "c_vec".
remover = StopWordsRemover(
    inputCol="words",
    outputCol="c_vec",
)

In [17]:
# Drop stop words from the tokenized messages; the result gains the "c_vec" column.
tokenize_out_sw = remover.transform(tokenized)

# Inspect only the filtered tokens, untruncated (default 20 rows).
tokenize_out_sw.select("c_vec").show(n=20, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|c_vec                                                                                                                                                                  |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[go, jurong, point,, crazy.., available, bugis, n, great, world, la, e, buffet..., cine, got, amore, wat...]                                                           |
|[ok, lar..., joking, wif, u, oni...]                                                                                                                                   |
|[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005., text, fa, 87121, receive, entry, question(std, txt, rate)t&c's, apply, 0845

In [18]:
# Count-vectorizer stage: reads the filtered tokens in "c_vec" and writes the
# resulting count vectors to "c_vector".
cv = CountVectorizer(
    inputCol="c_vec",
    outputCol="c_vector",
)

In [19]:
# Fit the count vectorizer on the stop-word-filtered tokens.
model = cv.fit(tokenize_out_sw)

# Apply the fitted model, adding the "c_vector" column.
# The result is assigned before being displayed: DataFrame.show() only prints
# and returns None, so chaining it onto the assignment would leave countvector
# bound to None and unusable by any later stage.
countvector = model.transform(tokenize_out_sw)
countvector.show(truncate=False)

                                                                                

+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|class|text                                                                                                                                                                                                |w