In [3]:
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [4]:
### Load the spam/ham message dataset (column v1 = label, column v2 = message text)
df = pd.read_csv("datasets/spam.csv", encoding='ISO-8859-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
# Convert the frame to a 2-D object array: column 0 is the ham/spam label,
# column 1 is the message text; the remaining columns are NaN in the rows displayed.
data = df.to_numpy()
data

array([['ham',
        'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
        nan, nan, nan],
       ['ham', 'Ok lar... Joking wif u oni...', nan, nan, nan],
       ['spam',
        "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
        nan, nan, nan],
       ...,
       ['ham',
        'Pity, * was in mood for that. So...any other suggestions?', nan,
        nan, nan],
       ['ham',
        "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free",
        nan, nan, nan],
       ['ham', 'Rofl. Its true to its name', nan, nan, nan]], dtype=object)

In [7]:
# Column 0 carries the ham/spam label (target), column 1 the raw message text (input).
y, x = data[:, 0], data[:, 1]
x.shape, y.shape

((5572,), (5572,))

In [8]:
# Text-cleaning helpers shared by getStem().
# The pattern is a raw string: '\w' inside a plain string literal is an invalid
# escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
tokenizer = RegexpTokenizer(r'\w+')   # splits text into runs of word characters
sw = set(stopwords.words('english'))  # set gives O(1) stopword membership tests
ps = PorterStemmer()

In [9]:
def getStem(review):
    """Clean a single message and return it as one space-joined string.

    The text is lower-cased, split into tokens with the module-level
    ``tokenizer``, filtered against the stopword set ``sw``, and each
    surviving token is stemmed with ``ps``.

    Parameters
    ----------
    review : str
        Raw message text.

    Returns
    -------
    str
        The stemmed, stopword-free tokens joined by single spaces.
    """
    words = tokenizer.tokenize(review.lower())
    # Filter stopwords first, then stem only what is kept.
    stems = (ps.stem(word) for word in words if word not in sw)
    return ' '.join(stems)

In [10]:
def getDoc(document):
    """Return a list with every message in ``document`` cleaned by getStem().

    Parameters
    ----------
    document : iterable of str
        Raw messages, processed in iteration order.

    Returns
    -------
    list of str
        One cleaned string per input message, in the same order.
    """
    return [getStem(sentence) for sentence in document]

In [11]:
# Clean every raw message. `x` must still hold the raw text strings here,
# because getDoc() -> getStem() calls .lower() on each element.
stemmed_doc = getDoc(x)

In [13]:
# Preview the first ten cleaned messages.
stemmed_doc[:10]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send å 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea å 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030']

In [15]:
cv = CountVectorizer()

# Learn the vocabulary from the cleaned messages and build the sparse
# document-term count matrix.
vc = cv.fit_transform(stemmed_doc)
# .toarray() returns a plain ndarray. .todense() would return np.matrix, which
# newer scikit-learn versions reject with a TypeError in fit/predict/score.
# NOTE: this rebinds `x` from raw text to the feature matrix, so the earlier
# `stemmed_doc = getDoc(x)` cell cannot be re-run after this one.
x = vc.toarray()

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.33)


In [16]:
# Multinomial Naive Bayes from scikit-learn, trained on the word-count features.
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(x_train, y_train)

# Mean accuracy on the held-out split.
model.score(x_test, y_test)

0.977705274605764

In [18]:
# Three sample messages to run through the trained model: a scholarship
# promotion, a recruiting announcement, and a placement-training notice.
messages = [
    """
        Really. The last 18 months have been rough on all of us. Unemployment and erosion in savings have led to somewhat economic turmoil & unnecessary stress. But in the case of students, it has to do a lot with uncertainty. 
        What next? What's ahead? What should I do? How can I make myself emerge as my best version post-pandemic? We get that a lot. And in such times, we are doing our bit of bringing in some stability & light at the end of the tunnel by offering GATE scholarships up to ₹30 lakhs to deserving students like you.
        We'd absolutely love it if as many students as possible took our GATE Scholarship Test on the 25'th of July & claim it all. Including you. Just by registering yourself here (Takes less than 10 seconds), take charge of the future you deserve.
        Abhijit Nath,    Program Manager, Vidyalankar Infinite.""",
    
    """
        Yocket
        Greetings from Yocket and StupidSid!
        We trust that you're doing well and staying safe during these troubled times.
        If you have loved using either one or both the platforms, we have some exciting news for you!!
        As you know, StupidSid and Yocket strive to help students and currently we’ve 500,000 users on Yocket.
        We trust you had a good experience with us in the past. Since you know what we offer as a platform for students, how about getting to know how we work internally and help students just like you and aspirants who wish to study abroad? 
        If you are someone who believes in the power of technology and likes to challenge traditional ways of working, then Yocket is the place to be! Join our young and dynamic team in this journey to becoming a great company with jolly employees!
        We are currently hiring in:- Technology, Marketing, Sales, Counseling and many more…
""",
    
    """This mail is to inform you regarding Placement training.
        Schedule for Batch Number - 02
        Session Details are as follows
        ·       Title - PICT - Batch 02 - Aptitude Session - 02
        ·       Date & Time - Jul 21, 2021 03:00 PM India
        ·       Link - https://us02web.zoom.us/meeting/register/tZMtcuCvqjwuEtAcwDALnUScrl1IZ7_aj_4j
        Note:
        •        All the students are requested to register in advance for the meeting.
        •        After registering, students will receive a confirmation email containing information about joining the meeting.
        •        CC ID is compulsory to be there in username. 
        •        IF THE NAME OR USER ID IS NOT IN THE FORMAT (CC ID_YOUR NAME) THEN ATTENDANCE FOR THAT STUDENT WILL NOT BE CONSIDERED."""

]

In [20]:
def prepare(messages):
    """Clean raw messages and encode them with the already-fitted vectorizer.

    Parameters
    ----------
    messages : iterable of str
        Raw message texts.

    Returns
    -------
    The count matrix produced by ``cv.transform`` for the cleaned messages.
    """
    cleaned = getDoc(messages)
    # Use transform(), never fit_transform(): refitting would build a NEW
    # vocabulary that no longer matches the one the model was trained on.
    vectors = cv.transform(cleaned)
    return vectors

# NOTE: this rebinds `messages` from the raw strings to their vectorised form,
# so running this cell a second time would pass a matrix into getDoc().
messages = prepare(messages)

In [22]:
# `messages` is the vectorised matrix returned by prepare(), not the raw strings.
y_pred = model.predict(messages)
y_pred

array(['ham', 'ham', 'spam'], dtype='<U4')