In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import *
import nltk
nltk.download('stopwords')
%matplotlib inline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Load the dataset; the file is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')
# NOTE(review): `le` is not referenced by any other visible cell — confirm it
# is still needed before keeping it around.
le = preprocessing.LabelEncoder()

In [3]:
# Convert the frame to a NumPy array so its columns can be sliced by position.
data = df.to_numpy()

In [4]:
# Column 0 becomes the target `y`; column 1 becomes `X`, the texts that are
# cleaned and vectorized further down.
y, X = data[:, 0], data[:, 1]

In [5]:
# Sanity check: both arrays should report the same number of rows.
X.shape, y.shape

((5572,), (5572,))

In [6]:
# Text-cleaning helpers used by getStem().
# The pattern is a raw string: in a plain literal '\w' is an invalid escape
# sequence, which Python warns about (and is slated to reject eventually).
tokenizer = RegexpTokenizer(r'\w+')   # split text into runs of word characters
sw = set(stopwords.words('english'))  # set for fast membership tests
ps = PorterStemmer()

In [7]:
def getStem(review):
    """Normalize one text into a space-separated string of stemmed tokens.

    The text is lower-cased, split with the module-level ``tokenizer``,
    filtered against the stop-word collection ``sw``, and every remaining
    token is stemmed with ``ps``.

    Parameters
    ----------
    review : str
        Raw text of a single document.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces
        (an empty string when nothing survives the filtering).
    """
    words = tokenizer.tokenize(review.lower())
    # Stop words are filtered on the raw token, i.e. before stemming.
    return ' '.join(ps.stem(word) for word in words if word not in sw)

In [8]:
def getDoc(document):
    """Clean every text of a collection with ``getStem``.

    Parameters
    ----------
    document : iterable of str
        The raw texts, one entry per document.

    Returns
    -------
    list of str
        The cleaned texts, in the same order as the input.
    """
    return [getStem(text) for text in document]

In [9]:
# Clean every text in X (lower-casing, stop-word removal, stemming).
stemmed_doc = getDoc(X)

In [None]:
# Peek at the first ten cleaned texts.
stemmed_doc[:10]

In [11]:
# Bag-of-words vectorizer with default settings; it is fitted in the next cell.
cv = CountVectorizer()

In [12]:
# Learn the vocabulary from the cleaned texts and build the document-term
# count matrix in one step (`vc` is that matrix, not the vocabulary itself).
# NOTE(review): the vocabulary is learned from all rows, before the
# train/test split below.
vc = cv.fit_transform(stemmed_doc)

In [13]:
# Dense ndarray of token counts. toarray() is used instead of todense()
# because todense() yields an np.matrix, which NumPy discourages and recent
# scikit-learn versions refuse as estimator input.
# NOTE(review): this rebinds X, which until now held column 1 of `data`.
X = vc.toarray()

In [14]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [15]:
# NB from sklearn
from sklearn.naive_bayes import MultinomialNB

In [16]:
# Train a multinomial Naive Bayes classifier on the training split. fit()
# returns the estimator itself, so construction and training are chained.
# np.asarray ensures a plain ndarray is handed to the estimator.
model = MultinomialNB().fit(np.asarray(X_train), y_train)
# Last expression of the cell: mean accuracy on the held-out split.
model.score(np.asarray(X_test), y_test)

0.977705274605764

In [17]:
# Three hand-written sample messages used to try out the trained classifier.
messages = [
    """
    Hi Kunal,
We invite you to participate in MishMash - India’s largest online diversity hackathon.
The hackathon is a Skillenza initiative and sponsored by Microsoft, Unity, Unilever, Gojek, Rocketium and Jharkhand Government.
We have a special theme for you - Deep Tech/Machine Learning - sponsored by Unilever, which will be perfect for you.
    """,
    """Join us today at 12:00 PM ET / 16:00 UTC for a Red Hat DevNation tech talk on AWS Lambda and serverless Java with Bill Burke.
Have you ever tried Java on AWS Lambda but found that the cold-start latency and memory usage were far too high?
In this session, we will show how we optimized Java for serverless applications by leveraging GraalVM with Quarkus to
provide both supersonic startup speed and a subatomic memory footprint.""",

    """We really appreciate your interest and wanted to let you know that we have received your application.
There is strong competition for jobs at Intel, and we receive many applications. As a result, it may take some time to get back to you.
Whether or not this position ends up being a fit, we will keep your information per data retention policies,
so we can contact you for other positions that align to your experience and skill set.
"""
]

In [18]:
def prepare(messages):
    """Vectorize raw texts with the already-fitted ``cv``.

    Parameters
    ----------
    messages : iterable of str
        Raw texts to clean and vectorize.

    Returns
    -------
    The result of ``cv.transform`` on the cleaned texts.
    """
    # transform(), not fit_transform(): refitting would build a new vocabulary.
    return cv.transform(getDoc(messages))
# NOTE(review): this rebinds `messages` from the list of raw strings to the
# vectorized result, so running the cell a second time passes that result
# back into prepare(). Consider a separate name if the cell must be re-runnable.
messages = prepare(messages)

In [19]:
# Classify the vectorized sample messages and display the predicted labels.
y_pred = model.predict(messages)
y_pred

array(['ham', 'spam', 'ham'], dtype='<U4')