## Step 1: Data Gathering

In [1]:
from pyspark import SparkConf, SparkContext
conf = SparkConf()
conf.setAppName("project")
conf.setMaster("local[*]")
sc = SparkContext(conf=conf)

In [2]:
rdd_data = sc.textFile("spam_utf.csv")
rdd_tuple = rdd_data.map(lambda x: x.split(','))
rdd_tuple.take(2)

[['v1', 'v2', '', '', ''],
 ['ham',
  '"Go until jurong point',
  ' crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."',
  '',
  '',
  '']]

In [3]:
#replacing comma separator with semicolon
import csv
text=[]
with open("spam_utf.csv", mode="r") as infile:
    reader = csv.reader(infile, delimiter=',')
    with open("spam_utf_semicolon.txt", mode="w") as outfile:
        for row in reader:
            writer = csv.writer(outfile, delimiter='<')
            text.append(row)
        writer.writerows(text)

In [4]:
#creating rdd_tuple by splitting by semicolon
rdd_data = sc.textFile("spam_utf_semicolon.txt")
rdd_tuple = rdd_data.map(lambda x: x.split('<'))
rdd_tuple.take(2)

[['v1', 'v2', '', '', ''],
 ['ham',
  'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  '',
  '',
  '']]

In [5]:
#removing 2-3-4 columns
rdd_tuple = rdd_tuple.map(lambda row: row[0:2])
rdd_tuple.take(10)

[['v1', 'v2'],
 ['ham',
  'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'],
 ['ham', 'Ok lar... Joking wif u oni...'],
 ['spam',
  "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"],
 ['ham', 'U dun say so early hor... U c already then say...'],
 ['ham', "Nah I don't think he goes to usf, he lives around here though"],
 ['spam',
  "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, Â£1.50 to rcv"],
 ['ham',
  'Even my brother is not like to speak with me. They treat me like aids patent.'],
 ['ham',
  "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune"],
 ['spam',
  'WINNER!! As a valued network customer you have been select

## Step 2: Data Exploration


In [6]:
#how many sms are there?
rdd_tuple.count()

5575

In [7]:
#grouping by class
list = rdd_tuple.map(lambda x: (x[0], 1)) \
                        .reduceByKey(lambda x,y: x+y) \
                        .takeOrdered(10, (lambda x: x[1])) 
list

[('v1', 1), ('ham"""', 2), ('spam', 747), ('ham', 4825)]

In [8]:
#replacing ham""" with ham
def replace(row):
    if row[0]=='ham"""':
        row[0]='ham'
    return row


list = rdd_tuple.map(lambda x: replace(x),1)\
                    .map(lambda x: (x[0], 1)) \
                    .reduceByKey(lambda x,y: x+y) 
list.collect()


[('v1', 1), ('ham', 4827), ('spam', 747)]

In [9]:
#removing header from rdd
header = rdd_tuple.take(1)
rdd_tuple = rdd_tuple.filter(lambda line : line not in header)
rdd_tuple.take(5)

[['ham',
  'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'],
 ['ham', 'Ok lar... Joking wif u oni...'],
 ['spam',
  "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"],
 ['ham', 'U dun say so early hor... U c already then say...'],
 ['ham', "Nah I don't think he goes to usf, he lives around here though"]]

In [10]:
# most common words in spam sms
common_word_spam = rdd_tuple.filter(lambda x: x[0] == "spam")\
                            .flatMap(lambda x: [(w,1) for w in x[1].split(' ')]) \
                            .reduceByKey(lambda x,y: x+y)\
                            .takeOrdered(50, (lambda x: -x[1]))
common_word_spam

[('to', 601),
 ('a', 358),
 ('or', 185),
 ('call', 183),
 ('your', 183),
 ('the', 177),
 ('2', 169),
 ('for', 168),
 ('you', 164),
 ('is', 140),
 ('Call', 134),
 ('on', 133),
 ('have', 125),
 ('and', 117),
 ('from', 111),
 ('ur', 107),
 ('with', 101),
 ('&', 98),
 ('of', 93),
 ('4', 92),
 ('FREE', 88),
 ('mobile', 81),
 ('are', 77),
 ('our', 75),
 ('claim', 73),
 ('To', 73),
 ('You', 72),
 ('U', 70),
 ('Your', 69),
 ('txt', 68),
 ('text', 68),
 ('in', 64),
 ('now', 64),
 ('Txt', 63),
 ('reply', 58),
 ('free', 56),
 ('contact', 56),
 ('-', 53),
 ('be', 48),
 ('now!', 48),
 ('u', 47),
 ('just', 46),
 ('send', 45),
 ('Nokia', 45),
 ('won', 45),
 ('get', 45),
 ('only', 44),
 ('this', 44),
 ('per', 44),
 ('prize', 43)]

In [11]:
# most common words in non-spam ham sms
common_word_spam = rdd_tuple.filter(lambda x: x[0] == "ham")\
                            .flatMap(lambda x: [(w,1) for w in x[1].split(' ')]) \
                            .reduceByKey(lambda x,y: x+y)\
                            .takeOrdered(10, (lambda x: -x[1]))
common_word_spam

[('to', 1530),
 ('you', 1458),
 ('I', 1435),
 ('the', 1019),
 ('a', 969),
 ('and', 738),
 ('i', 736),
 ('in', 734),
 ('u', 645),
 ('is', 638)]

In [12]:
# most common correlated words in spam sms
def line_to_bigram(line):
    words = line.lower().split(" ") #lower case text
    return [(words[i-1], words[i]) for i in range(1, len(words))]


common_word_spam = rdd_tuple.filter(lambda x: x[0] == "spam")\
                            .flatMap(lambda x: line_to_bigram(x[1]))\
                            .map(lambda x: (x, 1)) \
                            .reduceByKey(lambda x,y: x+y) \
                            .takeOrdered(10, (lambda x: -x[1])) 

common_word_spam

[(('you', 'have'), 69),
 (('have', 'won'), 52),
 (('your', 'mobile'), 46),
 (('please', 'call'), 42),
 (('won', 'a'), 40),
 (('to', 'claim'), 39),
 (('this', 'is'), 38),
 (('to', 'contact'), 37),
 (('you', 'are'), 35),
 (('cash', 'or'), 27)]

In [13]:
# most common correlated words in ham sms
def line_to_bigram(line):
    words = line.lower().split(" ")
    return [(words[i-1], words[i]) for i in range(1, len(words))]


common_word_spam = rdd_tuple.filter(lambda x: x[0] == "ham")\
                            .flatMap(lambda x: line_to_bigram(x[1]))\
                            .map(lambda x: (x, 1)) \
                            .reduceByKey(lambda x,y: x+y) \
                            .takeOrdered(10, (lambda x: -x[1])) 

common_word_spam

[(('', '&lt;#&gt;'), 221),
 (('are', 'you'), 149),
 (('&lt;#&gt;', ''), 132),
 (('i', 'am'), 127),
 (('have', 'a'), 108),
 (('i', 'will'), 97),
 (('do', 'you'), 89),
 (('you', 'are'), 88),
 (('in', 'the'), 86),
 (('i', 'have'), 82)]

In [14]:
# understanding special character ""&lt;#&gt"
# how many sms with special char?
common_word_spam = rdd_tuple.filter(lambda x: x[0] == "ham")\
                            .flatMap(lambda x: [(w,1) for w in x[1].split(' ')]) \
                            .filter(lambda x: x[0] == "&lt;#&gt;")\
                            .reduceByKey(lambda x,y: x+y) 
common_word_spam.collect()

[('&lt;#&gt;', 276)]

In [28]:
rdd_tuple.take(1)

[['ham',
  'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...']]

In [29]:
rdd_tuple_2 = rdd_tuple.map(lambda x: (x[0],x[1],x[1]))
rdd_tuple_2.take(1)

[('ham',
  'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...')]

In [35]:
import re

pattern = re.compile("[^a-z0-9 ]+")
clean_rdd = rdd_tuple.map(lambda x: (x[0],x[1],pattern.sub(' ', x[1].lower())))
clean_rdd.take(1)

[('ham',
  'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  'go until jurong point  crazy  available only in bugis n great world la e buffet  cine there got amore wat ')]

In [17]:
#removing STOPWORDS
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/hpsa10/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [79]:
STOPWORDS = set(stopwords.words('english'))
more_stops = {'ur','n','u','r'}
print(more_stops)
STOPWORDS = STOPWORDS.union(more_stops)
STOPWORDS

{'u', 'ur', 'n', 'r'}


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'n',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own

In [80]:
new_rdd = clean_rdd.map(lambda x: (x[0],x[1],[w for w in x[2].split() if w not in STOPWORDS]))
new_rdd.take(2)

[('ham',
  'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  ['go',
   'jurong',
   'point,',
   'crazy',
   'available',
   'bugis',
   'great',
   'world',
   'la',
   'e',
   'buffet',
   'cine',
   'got',
   'amore',
   'wat']),
 ('ham',
  'Ok lar... Joking wif u oni...',
  ['ok', 'lar', 'joking', 'wif', 'oni'])]

In [51]:
def punctF(line):
    letters = [chr(x) for x in range(65,91)]
    numbers = [chr(x) for x in range(48,58)]
    count=0
    for letter in line:
        if letter.upper() not in letters and letter.upper() not in numbers and letter.upper() != ' ':
            count+=1
    return count
line='I+++[sono]cccc     .'
print(punctF(line))

6


In [81]:
#Count of punctuation

new_rdd_2=new_rdd.map(lambda y: (y[0],y[1],punctF(y[1]),y[2]))

 
new_rdd_2.take(10)[9]

('spam',
 'Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030',
 2,
 ['mobile',
  '11',
  'months',
  'entitled',
  'update',
  'latest',
  'colour',
  'mobiles',
  'camera',
  'free',
  'call',
  'mobile',
  'update',
  'co',
  'free',
  '08002986030'])

In [96]:
def CountCaps(line):
    Caps = [chr(x) for x in range(65,91)]
    count=0
    words = line.split()
    for word in words:
        if len(word)<=1:
            for letter in word:
                if letter in Caps:
                    count+=1
        else:
            if word[1] in Caps:
                for letter in word:
                    if letter in Caps:
                        count+=1
    return count
line='Io SONO A SPARTA'
CountCaps(line)

11

In [98]:
#Aggiungiamo il count della maiuscole
new_rdd_3=new_rdd_2.map(lambda y: (y[0],y[1],y[2],CountCaps(y[1]),y[3]))

 
new_rdd_3.take(10)[8]

('spam',
 'WINNER!! As a valued network customer you have been selected to receivea Â£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.',
 8,
 8,
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  'receivea',
  'â£900',
  'prize',
  'reward',
  'claim',
  'call',
  '09061701461',
  'claim',
  'code',
  'kl341',
  'valid',
  '12',
  'hours'])

In [84]:
line='ciao sono'
len(line)

9

In [87]:
# Adding the len of message in characters

new_rdd_4=new_rdd_3.map(lambda y: (y[0],y[1],len(y[1]),y[2],y[3],y[4]))

 
new_rdd_4.take(10)[3]

('spam',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 155,
 6,
 10,
 ['free',
  'entry',
  '2',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  '21st',
  'may',
  '2005',
  'text',
  'fa',
  '87121',
  'receive',
  'entry',
  'question(std',
  'txt',
  "rate)t&c's",
  'apply',
  "08452810075over18's"])

In [99]:
Header = ['ham/spam','text','len_ch','punct_count','caps_count_ch','bag_of_words']

In [163]:
def CountCapsWords(line):
    Caps = [chr(x) for x in range(65,91)]
    count=0
    words = line.split()
    for word in words:
        if len(word)<=1:
            continue
        caps_count=0
        for letter in word:
            if letter in Caps:
                caps_count+=1
        if caps_count>=2:
            count+=1
    return count
line='Io Sono a Sparta!!!!!!!!!!!!!!!!!! A'
CountCapsWords(line)

0

In [164]:
#counting words in caps lock

#Header = ['ham/spam','text','len_ch','punct_count','caps_count_ch','bag_of_words']

new_rdd_5=new_rdd_4.map(lambda y: (y[0],y[1],y[2],y[3],y[4],CountCapsWords(y[1]),y[5]))

 
new_rdd_5.take(10)[3]

('ham',
 'U dun say so early hor... U c already then say...',
 49,
 6,
 2,
 0,
 ['dun', 'say', 'early', 'hor', 'c', 'already', 'say'])

In [None]:
#Header = ['ham/spam','text','len_ch','punct_count','caps_count_ch','caps_count_words','bag_of_words']

In [119]:
def isLink(line):
    words = line.lower().split(" ") #lower case text
    count=0
    for letter in words:
        for i in range (len(letter)):
            if len(letter)> 2:
                if letter[i-2]=="w" and letter[i-1] =="w" and letter[i] =="w":
                    count=1
            if len(letter)> 3:
                if letter[i-3]=="h" and letter[i-2] =="t" and letter[i-1] =="t" and letter[i] =="p":
                    count=1
    return count


line='Io sono SPARTA www.ciao.it a'
isLink(line)

1

In [165]:
# Boolean for link

#Header = ['ham/spam','text','len_ch','punct_count','caps_count_ch','caps_count_words','bag_of_words']

new_rdd_6=new_rdd_5.map(lambda y: (y[0],y[1],y[2],y[3],y[4],y[5],isLink(y[1]),y[6]))

 
new_rdd_6.take(10)[9]

('spam',
 'Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030',
 154,
 2,
 6,
 1,
 0,
 ['mobile',
  '11',
  'months',
  'entitled',
  'update',
  'latest',
  'colour',
  'mobiles',
  'camera',
  'free',
  'call',
  'mobile',
  'update',
  'co',
  'free',
  '08002986030'])

In [None]:
#Header = ['ham/spam','text','len_ch','punct_count','caps_count_ch','caps_count_words','contains_link','bag_of_words']

In [142]:
rdd_tuple.filter(lambda x: x[0]=='spam').take(30)

[['spam',
  "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"],
 ['spam',
  "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, Â£1.50 to rcv"],
 ['spam',
  'WINNER!! As a valued network customer you have been selected to receivea Â£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'],
 ['spam',
  'Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030'],
 ['spam',
  'SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info'],
 ['spam',
  'URGENT! You have won a 1 week FREE membership in our Â£100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A

In [138]:
def CountNum(line):
    numbers = [chr(x) for x in range(48,58)]
    count=0
    for char in line:
        if char in numbers:
            count+=1
    return count
line='0955587/ aaacks  a'
print(CountNum(line))

7


In [166]:
#Counting number of numbers in text

#Header = ['ham/spam','text','len_ch','punct_count','caps_count_ch','caps_count_words','contains_link','bag_of_words']

new_rdd_7=new_rdd_6.map(lambda y: (y[0],y[1],y[2],y[3],y[4],y[5],y[6],CountNum(y[1]),y[7]))

 
new_rdd_7.take(10)[9]

('spam',
 'Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030',
 154,
 2,
 6,
 1,
 0,
 13,
 ['mobile',
  '11',
  'months',
  'entitled',
  'update',
  'latest',
  'colour',
  'mobiles',
  'camera',
  'free',
  'call',
  'mobile',
  'update',
  'co',
  'free',
  '08002986030'])

In [None]:
#Header = ['ham/spam','text','len_ch','punct_count','caps_count_ch','caps_count_words','contains_link','num_count','bag_of_words']

In [145]:
def CountQuestionMarks(line):
    count=0
    for char in line:
        if char == '?':
            count+=1
    return count
line='0955587/ aaacks  a?'
print(CountQuestionMarks(line))

1


In [149]:
def CountExclMarks(line):
    count=0
    for char in line:
        if char == '!':
            count+=1
    return count
line='WINN!!!!!   !???'
print(CountExclMarks(line))

6


In [168]:
#counting ! and ?

#Header = ['ham/spam','text','len_ch','punct_count','caps_count_ch','caps_count_words','contains_link','num_count','bag_of_words']

new_rdd_8=new_rdd_7.map(lambda y: (y[0],y[1],y[2],y[3],CountQuestionMarks(y[1]),CountExclMarks(y[1]),y[4],y[5],y[6],y[7],y[8]))

 
new_rdd_8.take(10)[7]

('ham',
 "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune",
 160,
 6,
 0,
 0,
 2,
 0,
 0,
 1,
 ['per',
  'request',
  "'melle",
  'melle',
  '(oru',
  'minnaminunginte',
  'nurungu',
  "vettam)'",
  'set',
  'callertune',
  'callers',
  'press',
  '*9',
  'copy',
  'friends',
  'callertune'])

## Step 3: Data Preparation
