<a href="https://colab.research.google.com/github/maryamshahani/SMS_SpamFilterBeginners/blob/main/SMS_SpamFilterBeginners.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#connect to google drive:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#dataset file path:
file_url = '/content/drive/MyDrive/GoogleColabFiles/SMSspam.csv'

In [3]:
#Loading DataSet
import pandas as pd
data = pd.read_csv(file_url, encoding='latin-1')
from termcolor import colored
print(colored("\nDATASETS WERE SUCCESFULLY LOADED...", color="green", attrs=["bold", "dark"]))

[2m[1m[32m
DATASETS WERE SUCCESFULLY LOADED...[0m


In [4]:
#Look at DataSet
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
#rename DataSet Columns:
data.rename(columns = {"v1": "label", "v2": "sms"}, inplace = True)
#drop unnecessary columns
data.drop(["Unnamed: 2","Unnamed: 3", "Unnamed: 4"], axis = 1, inplace = True)
#Look at DataSet
data.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
data["sms"]

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: sms, Length: 5572, dtype: object

In [7]:
#Check whether there are duplicated values in dataset
print(data.duplicated())
print("***************************")
print(data.duplicated().sum())
print("There are {} duplicated values in the dataset".format(data.duplicated().sum()))

0       False
1       False
2       False
3       False
4       False
        ...  
5567    False
5568    False
5569    False
5570    False
5571    False
Length: 5572, dtype: bool
***************************
403
There are 403 duplicated values in the dataset


In [8]:
#Drop duplicated values from the dataset
data.drop_duplicates(inplace = True)

In [9]:
#Look class frequencies of 'label' variable
data.groupby("label").count()

Unnamed: 0_level_0,sms
label,Unnamed: 1_level_1
ham,4516
spam,653


In [10]:
#Check whether there are 'nan' values
data.isnull().sum()

label    0
sms      0
dtype: int64

In [11]:
#Delete numbers from texts
data["sms"] = data["sms"].str.replace('\d','')
#Remove URLs from texts if there is any
import re, string, unicodedata
data["sms"] = data["sms"].apply(lambda x: " ".join(re.sub(r'http\S+', '', x) for x in x.split()))
#Remove words less than 3 letters long
data["sms"] = data["sms"].apply(lambda x: ' '.join([x for x in x.split() if len(x) > 3]))
#Look at the latest condition of the dataset
data.head(n=10)

  


Unnamed: 0,label,sms
0,ham,"until jurong point, crazy.. Available only bug..."
1,ham,lar... Joking oni...
2,spam,Free entry wkly comp final tkts Text receive e...
3,ham,early hor... already then say...
4,ham,"don't think goes usf, lives around here though"
5,spam,FreeMsg there darling it's been week's word ba...
6,ham,Even brother like speak with They treat like a...
7,ham,your request 'Melle Melle (Oru Minnaminunginte...
8,spam,WINNER!! valued network customer have been sel...
9,spam,your mobile months more? entitled Update lates...


In [12]:
#removing stopwords and punctuations from our dataset
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')

stopwords = nltk.corpus.stopwords.words('english')
punctuation = string.punctuation
print(stopwords[:5])
print(punctuation)
print("******************************")

def pre_process(sms):
  lowercase_remove_punkt = "".join([char.lower() for char in sms if char not in punctuation])
  tokenize = nltk.tokenize.word_tokenize(lowercase_remove_punkt)
  remove_stopwords = [word for word in tokenize if word not in stopwords]
  return remove_stopwords

#adding a column to our data with our processed messages 
data["processed"] = data["sms"].apply(lambda x : pre_process(x))
data.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['i', 'me', 'my', 'myself', 'we']
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
******************************


Unnamed: 0,label,sms,processed
0,ham,"until jurong point, crazy.. Available only bug...","[jurong, point, crazy, available, bugis, great..."
1,ham,lar... Joking oni...,"[lar, joking, oni]"
2,spam,Free entry wkly comp final tkts Text receive e...,"[free, entry, wkly, comp, final, tkts, text, r..."
3,ham,early hor... already then say...,"[early, hor, already, say]"
4,ham,"don't think goes usf, lives around here though","[dont, think, goes, usf, lives, around, though]"


In [13]:
data["sms"]

0       until jurong point, crazy.. Available only bug...
1                                    lar... Joking oni...
2       Free entry wkly comp final tkts Text receive e...
3                        early hor... already then say...
4          don't think goes usf, lives around here though
                              ...                        
5567    This time have tried contact have Pound prize....
5568                           Will going esplanade home?
5569         Pity, mood that. So...any other suggestions?
5570    some bitching acted like interested buying som...
5571                                      Rofl. true name
Name: sms, Length: 5169, dtype: object

In [14]:
#tokenized data
data["processed"]

0       [jurong, point, crazy, available, bugis, great...
1                                      [lar, joking, oni]
2       [free, entry, wkly, comp, final, tkts, text, r...
3                              [early, hor, already, say]
4         [dont, think, goes, usf, lives, around, though]
                              ...                        
5567    [time, tried, contact, pound, prize, claim, ea...
5568                             [going, esplanade, home]
5569                     [pity, mood, soany, suggestions]
5570    [bitching, acted, like, interested, buying, so...
5571                                   [rofl, true, name]
Name: processed, Length: 5169, dtype: object

#Categorizing and Counting Tokens (Beginner Alternative for Neural Network)

In [15]:
#categorizing ham/spam associated words
def categorize_words():
  spam_words = []
  ham_words = []
  
  #spam associated words for x in dataProcessed where the label equals to spam
  for x in data['processed'][data['label'] == 'spam']:
    for word in x:
      spam_words.append(word)
  
  #ham associated words for x in dataProcessed where the label equals to ham
  for x in data['processed'][data['label'] == 'ham']:
    for word in x:
      ham_words.append(word)
  #return both lists
  return spam_words, ham_words

spam_words, ham_words = categorize_words()

print(spam_words[:6])
print(ham_words[:6])

['free', 'entry', 'wkly', 'comp', 'final', 'tkts']
['jurong', 'point', 'crazy', 'available', 'bugis', 'great']


#Building a Predict Function

In [16]:
#itterate from all the words from the user input and count their accurances in both spam_words and ham_words
def predict(user_input):
  spam_counter = 0
  ham_counter = 0

  for word in user_input:
    spam_counter += spam_words.count(word)
    ham_counter += ham_words.count(word)

  print('************RESULTS************')
  #if the message is ham
  if ham_counter > spam_counter:
      accuracy = round((ham_counter / (ham_counter + spam_counter) * 100))
      print('messege is not spam, with {}% certainty'.format(accuracy))
  #if the message could be equally spam and ham
  elif ham_counter == spam_counter:
      print('message could be spam')
  #if the message is spam
  else:
      accuracy = round((spam_counter / (ham_counter + spam_counter)* 100))
      print('message is spam, with {}% certainty'.format(accuracy))


#Collecting User Input & Results
user_input = input("Please type a spam or ham message to check if our function predicts accurately\n") 

#pre-processing the input before prediction
processed_input = pre_process(user_input)

predict(processed_input)  

Please type a spam or ham message to check if our function predicts accurately
CRA is looking 4 you
************RESULTS************
messege is not spam, with 74% certainty




#Note:

---



---



The simplest text encoding (called 'latin-1' or 'iso-8859-1')

---
 sum() method adds all values in each column and returns the sum for each column.

---
format function syntax: string.format(value1,value2,...)

---
In Python strings, the backslash " \ " is a special character, also called the "escape" character. "\t" is a tab, "\n" is a newline, and "\r" is a carriage return.

---
List Comprehension in Python:
https://www.w3schools.com/python/python_lists_comprehension.asp

newlist = [expression for item in oldlist if condition == True]

The return value is a new list, leaving the old list unchanged.

---


 


