In [1]:
#--create and train a basic spam classifier
#--NLP data-preprocessing
#--naive bayes model training and test

In [2]:
#--connect colab to google drive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
#--import required libraries

import pandas as pd

import nltk
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> all


    Downloading collection 'all'
       | 
       | Downloading package abc to /root/nltk_data...
       |   Unzipping corpora/abc.zip.
       | Downloading package alpino to /root/nltk_data...
       |   Unzipping corpora/alpino.zip.
       | Downloading package averaged_perceptron_tagger to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger.zip.
       | Downloading package averaged_perceptron_tagger_ru to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger_ru.zip.
       | Downloading package basque_grammars to /root/nltk_data...
       |   Unzipping grammars/basque_grammars.zip.
       | Downloading package bcp47 to /root/nltk_data...
       | Downloading package biocreative_ppi to /root/nltk_data...
       |   Unzipping corpora/biocreative_ppi.zip.
       | Downloading package bllip_wsj_no_aux to /root/nltk_data...
       |   Unzipping models/bllip_wsj_no_aux.zip.
       | Downloading package book_grammars to


---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

In [4]:
#--import data and read the dataset
data_file_path="/content/gdrive/MyDrive/Learning_AI/NLP/dataset/SMSSpamCollection"

data=pd.read_csv(data_file_path, sep="\t",names=['labels','messages'])
# data=pd.read_csv(data_file_path)

data.head()

Unnamed: 0,labels,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [7]:
lemmatizer=WordNetLemmatizer()

In [9]:
#--text preprocessing

def text_preprocessing(sentences):

  '''
  perform text preprocessing on given list of sentences
  input: sentences -> list of sentences
  output: corpus -> list of processed sentences
  '''
  corpus=[]
  for i in range(len(sentences)):
    #--remove unwanted words
    review=re.sub('[^a-zA-Z]',' ',sentences[i])
    #--convert to lower cases
    review=review.lower()
    #--get the individual words
    review=review.split()
    #--apply stemming or lemmatization
    review=[lemmatizer.lemmatize(word) for word in review if word not in stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)

  return corpus

In [11]:
messages_list=list(data['messages'])

corpus=text_preprocessing(messages_list)
# corpus

In [13]:
#--apply bag of words
from sklearn.feature_extraction.text import CountVectorizer

cv=CountVectorizer(max_features=3000)
X=cv.fit_transform(corpus).toarray()

In [14]:
X.shape

(5572, 3000)

In [19]:
#--convert labels to class
y=pd.get_dummies(data['labels'])['ham'].values
y.shape

(5572,)

In [20]:
#--split into train/test

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [22]:
#--build a basic naive bayes model
from sklearn.naive_bayes import MultinomialNB
nb_model=MultinomialNB().fit(X_train,y_train)

In [23]:
#--perform inference on test dataset
y_pred=nb_model.predict(X_test)

In [24]:
#--compute confusion matrix
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,y_pred)
cm

array([[151,   9],
       [  9, 946]])

In [25]:
#--compute validation accuracy
from sklearn.metrics import accuracy_score
acc=accuracy_score(y_test,y_pred)
acc

0.9838565022421525