In [1]:
#Goal: You need to create a model that can predict whether the given SMS is a Spam or HAM sms

In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('./Datasets/SMSSpamCollection',sep='\t',names=['label','sms'])

In [4]:
data.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   sms     5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:
#If there exists any missing value in your PURE STRING feature col, simply DELETE that record

In [7]:
#Seperate data as features and label
label = data.iloc[:,[0]].values
features = data.iloc[:,[1]].values


# Demo Steps

In [8]:
# Perform text preprocessing
# Your creativity can be applied considering the steps resonates with the dataset.
#
# Text Preprocessing:
# 1. Remove Punctuation
# 2. Extract words out of the sentence
# 3. Normalize words in lowercase
# 4. Remove stopwords
# ....

In [9]:
#Punctuation Removal
import string
text = "Welcome to Simplilearn! You are using NLP for feature generation !"

processedText = ''.join([char for char in text if char not in string.punctuation])
processedText

'Welcome to Simplilearn You are using NLP for feature generation '

In [10]:
#Seperate words from string and normalize it
processedWords = [word.lower() for word in processedText.split(" ")]
processedWords

['welcome',
 'to',
 'simplilearn',
 'you',
 'are',
 'using',
 'nlp',
 'for',
 'feature',
 'generation',
 '']

In [11]:
#Remove Stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oysterable/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
from nltk.corpus import stopwords
filteredWords = [word for word in processedWords if word not in stopwords.words('english')]
filteredWords

['welcome', 'simplilearn', 'using', 'nlp', 'feature', 'generation', '']

# Using Scikit Learn

In [13]:
import sklearn
sklearn.__version__

'1.2.2'

In [14]:
#Function for basic textPreprocessing

import string
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer


In [15]:
def textPreprocessing(document):
  #Remove Punctuations
  processedText = ''.join([char for char in document if char not in string.punctuation])
  #Seperate words from sentences and normalize it in lowercase
  processedWords = [word.lower() for word in processedText.split(" ")]
  #Generate Vocab by removing stopwords from above
  filteredWords = [word for word in processedWords if word not in stopwords.words('english')]
  #Return Vocab
  return filteredWords

In [16]:
# Test our preprocessing method 
prep_test = textPreprocessing("Welcome to Simplilearn! You are using NLP for feature generation !")
prep_test

['welcome', 'simplilearn', 'using', 'nlp', 'feature', 'generation', '']

In [17]:
#Create BOW using Sklearn
# CountVectorizer

wordVector = CountVectorizer(analyzer=textPreprocessing)

#Build Vocab from our dataset
finalWordVectorVocab = wordVector.fit(features)

In [18]:
len(finalWordVectorVocab.vocabulary_)

13431

In [19]:
finalWordVectorVocab.vocabulary_

{'go': 5384,
 'jurong': 6668,
 'point,': 9179,
 'crazy..': 3570,
 'available': 2054,
 'bugis': 2688,
 'n': 8083,
 'great': 5519,
 'world': 12981,
 'la': 6873,
 'e': 4291,
 'buffet...': 2686,
 'cine': 3173,
 'got': 5476,
 'amore': 1736,
 'wat...': 12614,
 'ok': 8536,
 'lar...': 6929,
 'joking': 6625,
 'wif': 12826,
 'u': 12149,
 'oni...': 8609,
 'free': 5076,
 'entry': 4493,
 '2': 749,
 'wkly': 12909,
 'comp': 3350,
 'win': 12846,
 'fa': 4686,
 'cup': 3640,
 'final': 4862,
 'tkts': 11813,
 '21st': 786,
 'may': 7565,
 '2005.': 775,
 'text': 11542,
 '87121': 1257,
 'receive': 9704,
 'question(std': 9541,
 'txt': 12125,
 "rate)t&c's": 9615,
 'apply': 1855,
 "08452810075over18's": 370,
 'dun': 4273,
 'say': 10193,
 'early': 4309,
 'hor...': 6016,
 'c': 2760,
 'already': 1690,
 'say...': 10197,
 'nah': 8095,
 'think': 11679,
 'goes': 5411,
 'usf,': 12350,
 'lives': 7184,
 'around': 1925,
 'though': 11716,
 'freemsg': 5095,
 'hey': 5857,
 'darling': 3734,
 '3': 880,
 "week's": 12688,
 'word':

In [20]:
#To create BOW
bagOfWords = finalWordVectorVocab.transform(features)

In [21]:
bagOfWords

<5572x13431 sparse matrix of type '<class 'numpy.int64'>'
	with 53461 stored elements in Compressed Sparse Row format>

In [22]:
#Apply TF IDF on BOW to create feature Col

from sklearn.feature_extraction.text import TfidfTransformer
tfIDFObject = TfidfTransformer().fit(bagOfWords)

In [23]:
processedFeatureCol = tfIDFObject.transform(bagOfWords)

In [24]:
# Creaete Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(processedFeatureCol,
                                                 label,
                                                 test_size=0.2,
                                                 random_state=6)


In [25]:
#Build Model using LogisticRegression
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [26]:
#Check Quality of the Model

print(f"Training Score is {model.score(X_train,y_train)} and Testing score is {model.score(X_test,y_test)}")

Training Score is 0.9584922593672874 and Testing score is 0.9587443946188341


In [27]:
#Classification report
from sklearn.metrics import classification_report
print(classification_report(label,model.predict(processedFeatureCol)))

              precision    recall  f1-score   support

         ham       0.95      1.00      0.98      4825
        spam       1.00      0.69      0.82       747

    accuracy                           0.96      5572
   macro avg       0.98      0.85      0.90      5572
weighted avg       0.96      0.96      0.96      5572



In [28]:
#Deploy the model

#input

smsInput = input("Enter SMS: ")

#preprocessing

preprocessedFeature = textPreprocessing(smsInput)

#BOW

bowFeature = finalWordVectorVocab.transform(preprocessedFeature)

#TFIDF

actualFeature = tfIDFObject.transform(bowFeature)

#Predict

predLabel = model.predict(actualFeature)

print(predLabel[0])

spam
