#### Naive Bayes Spam Classification using Sklearn

#### 1. Load dataset

In [1]:
import pandas as pd

In [4]:
# Location of the tab-separated SMS corpus, relative to the notebook.
corpus_link = 'datasets/SMSSpamCollection.tsv'

# The file is read with header=None, so the two column names are supplied
# directly at read time instead of being assigned afterwards.
full_corpus = pd.read_csv(
    corpus_link,
    sep="\t",
    header=None,
    names=['label', 'text'],
)
full_corpus.head()

Unnamed: 0,label,text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


#### 2. Train-Test split

In [9]:
from sklearn.model_selection import train_test_split

In [12]:
# Seed the shuffle so the split (and everything fitted on it) is identical after
# a kernel restart, and stratify on the label so the train and test partitions
# keep the same ham/spam proportions as the full corpus.
X_train, X_test, y_train, y_test = train_test_split(
    full_corpus['text'],
    full_corpus['label'],
    random_state=42,
    stratify=full_corpus['label'],
)

#### 2b. Tokenization and text cleaning

In [5]:
import re
import string

import nltk
# Fetch the NLTK data packages this notebook relies on. The final call is the
# cell's last expression, so its return value is what the cell displays.
nltk.download('stopwords')  # stop-word lists, read below via nltk.corpus.stopwords
nltk.download('wordnet')  # WordNet data; presumably what WordNetLemmatizer looks words up in
nltk.download('omw-1.4')  # Open Multilingual Wordnet; presumably needed by the WordNet reader - TODO confirm

[nltk_data] Downloading package stopwords to /home/max/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/max/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/max/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [16]:
# Shared lemmatizer instance used by clean_text().
wn = nltk.WordNetLemmatizer()
# English stop words from the NLTK 'stopwords' corpus; tokens found here are discarded.
stopwords = nltk.corpus.stopwords.words('english')

def rem_stopwords(sentence: list):
  """Return the tokens of ``sentence`` that are not in the module-level ``stopwords``.

  The surviving tokens keep their original order, and duplicates are preserved.
  """
  kept = []
  for token in sentence:
    if token in stopwords:
      continue
    kept.append(token)
  return kept


def is_fully_digits(text):
  """Tell whether every character of ``text`` is a digit (per ``str.isdigit``).

  An empty ``text`` yields ``True``, because it contains no non-digit character.
  """
  # Count the offending characters instead of collecting per-character flags.
  non_digit_count = sum(1 for char in text if not char.isdigit())
  return non_digit_count == 0


def clean_text(text: str):
  """Turn a raw message into a list of lowercase, lemmatized tokens.

  Steps: delete ASCII punctuation (``string.punctuation``), lowercase, split on
  runs of non-word characters, discard empty and all-digit tokens, discard
  stop words, then lemmatize the remaining tokens with the module-level ``wn``.

  Args:
    text: the raw message.

  Returns:
    list: the cleaned tokens, in the order they appeared in ``text``.
  """
  # Punctuation is deleted, not replaced by a space, so "don't" becomes "dont".
  cleaned_punct: str = "".join(symbol for symbol in text if symbol not in string.punctuation)

  # Raw string: "\W" inside a plain literal is an invalid escape sequence, which
  # raises a DeprecationWarning (a SyntaxWarning from Python 3.12 on).
  tokenized: list = re.split(r"\W+", cleaned_punct.lower())

  # re.split yields '' when the text starts or ends with a separator; drop those
  # explicitly together with tokens made up only of digits.
  cleaned_fully_digits = [word for word in tokenized if word and not is_fully_digits(word)]

  # Remove stopwords (via the shared helper) & lemmatize
  nostopwords_and_lemmatized = [wn.lemmatize(word) for word in rem_stopwords(cleaned_fully_digits)]

  return nostopwords_and_lemmatized

#### 3. Vectorization

In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [18]:
# fit(X_train) followed by transform(X_train) ran the clean_text analyzer
# (punctuation stripping, stop-word filtering, lemmatization) over every training
# message twice. fit_transform() learns the vocabulary and builds the same
# document-term matrix in a single pass; `vect` is left fitted, so it can still
# be used to transform other data afterwards.
vect = CountVectorizer(analyzer=clean_text)
X_vect_train = vect.fit_transform(X_train)

#### 4. Train the Naive Bayes classifier

In [20]:
from sklearn.naive_bayes import MultinomialNB

In [21]:
# Additive smoothing strength passed to MultinomialNB as `alpha`.
SMOOTHING_ALPHA = 0.1

model = MultinomialNB(alpha=SMOOTHING_ALPHA)
# fit() is the cell's last expression, so the fitted estimator's repr is displayed.
model.fit(X=X_vect_train, y=y_train)

MultinomialNB(alpha=0.1)