# Spam Detection using NLP

In [1]:
import pandas as pd

In [2]:
# Importing and reading the Dataset
Dataset = pd.read_csv('SMSSpamCollection', sep='\t', names=['label','messages'])

Cleaning the dataset and preprocessing

In [3]:
# Importing all the rquired libraries 
import re                  # This is regular expression library
import nltk                # This is the main library for NLP
nltk.download('stopwords')
from nltk.corpus import stopwords            # This is for removing all unwanted stopwords (Eg:in, a, etc)
from nltk.stem.porter import PorterStemmer   # This is for stemming the words by using its common sound
from nltk.stem import WordNetLemmatizer      # This is same as stemming but lemmatizer will give us an actual meaningful root word


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mufis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Making instances of stemmer and lemmetizer
PS = PorterStemmer()                      # We will be not using PorterStemmer in this implementation            
Lem = WordNetLemmatizer()

In [5]:
# Cleaning & Preprocessing (Using WordNetLemmatizer)
Corpus = []                                                    # Putting the cleaned and preprocessed dataset inside corpus 
for i in range(0, len(Dataset)):                               # specifying the entire length of the dataset from 0 record to 5572 record
    review = re.sub('[^a-zA-Z]', ' ', Dataset['messages'][i])  # here ^ means except any charcter between a-z and A-Z replace it with ' '(space) and do it in the messages column of the dataset with 'i' representing each record
    review = review.lower()                                    # this will get all the words to lowercase to avoid duplicates
    review = review.split()                                    # this will split all the words and will give us a list of words to be put in review
    review = [Lem.lemmatize(word) for word in review if not word in stopwords.words('english')]    # here we do lemmatizing on the words present in review but omitting those words that are stopwords.words(english), so this will result in getting the base form of each non-stopword
    review = ' '.join(review)                                  # this will join all the stemmed words (from the previous review) into sentences with ' '(spaces)
    Corpus.append(review)     

In [6]:
# Creating Bag of Words model using CountVectorizer from SKLearn library
from sklearn.feature_extraction.text import CountVectorizer
CV = CountVectorizer(max_features=5000)                        # Initializing and making an instance of CountVeactorizer

In [7]:
# Fitting the Bag of words model into Corpus and creating the X (independent variable) with only the vectorized feature
X = CV.fit_transform(Corpus).toarray()                         # Transforming the Corpus with CountVEctorizer and converting into an Array object

In [8]:
# After getting our X variable from the Corpus, now we need to get the dependent Y variable from the Dataset which will basically be the Label feature
Y = pd.get_dummies(Dataset['label'], drop_first=True)          # Since the label column was categorical (having ham and spam) we will use get_dummies encoding to convert it to binary representation of 0 and 1

In [9]:
# Since Y is a dataframe, we need to convert it to an array. We will do that using iloc function and just getting the values out of it
Y = Y.iloc[:,:].values

In [10]:
# Splitting into train and test test for modelling
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.25, random_state=0)

In [11]:
# Training the model using Naive Bayes Classifier (Since naive bayes is a classification technique that worls completely on probability, it is a good classifier for NLP)
from sklearn.naive_bayes import MultinomialNB                  # We choose multinomial because it works for any number of classes
MultiNB = MultinomialNB()
spam_detection = MultiNB.fit(X_train, Y_train)                 # Fitting the NB model on the train set (x_train and y_train)

  y = column_or_1d(y, warn=True)


In [12]:
### Predicting on the test set
Y_pred = spam_detection.predict(X_test)                        # Using the trained spam_detection model to predict values in the test set (x_test)

In [13]:
### Now since we have the y_pred, we need to compare the values of y_pred with the values of y_test to see how accurate our model has performed in the test set
# For this, we will see the confusion matrix and accuracy score from sklearn
from sklearn.metrics import confusion_matrix, accuracy_score
conf_matrix = confusion_matrix(Y_pred, Y_test)
accuracy = accuracy_score(Y_pred, Y_test)                      # 0.9842067480258435 (98% accuracy score which is very good)

In [14]:
print(conf_matrix)

[[1198   12]
 [  10  173]]


In [15]:
print(accuracy)

0.9842067480258435
