# Importing the data

In [1]:
import pandas as pd

In [2]:
# Load the labelled e-mail corpus from a CSV file (path is relative to the
# working directory) and show the frame as the cell's output.
data = pd.read_csv(filepath_or_buffer='spam_or_not_spam.csv')
data

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0
...,...,...
2995,abc s good morning america ranks it the NUMBE...,1
2996,hyperlink hyperlink hyperlink let mortgage le...,1
2997,thank you for shopping with us gifts for all ...,1
2998,the famous ebay marketing e course learn to s...,1


# Preprocessing the data

In [3]:
import nltk

# Fetch the NLTK resources used by the preprocessing cell below.
# 'stopwords' backs nltk.corpus.stopwords; the Punkt models back
# nltk.tokenize.word_tokenize.
nltk.download('stopwords')
# Older NLTK releases load the tokenizer from the pickled 'punkt' package.
nltk.download('punkt')
# Newer NLTK releases (3.9 and later) load it from 'punkt_tab' instead; without
# this download word_tokenize raises LookupError on a fresh machine.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /home/zephyr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/zephyr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# English stop words, stored in a set for fast membership checks while filtering.
english_stop_words = nltk.corpus.stopwords.words('english')
stop_words_set = set(english_stop_words)

# Snowball stemmer for English, shared by every call to the cleaning function.
stemmer = nltk.stem.snowball.SnowballStemmer(language='english')

def clean_and_word_tokenize(string):
    """Normalise a raw text into a space-separated string of stemmed tokens.

    The text is tokenized with NLTK, every token is lower-cased, tokens found
    in ``stop_words_set`` are dropped, and the survivors are stemmed with
    ``stemmer``.

    Args:
        string: Text to clean. Any value that is not a ``str`` (for example
            ``None`` or NaN) is treated as missing.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` when the input
        is not a string.
    """
    # Guard first: missing values must not reach the tokenizer.
    if not isinstance(string, str):
        return ''

    # Same per-token order as a step-by-step pipeline: lower -> filter -> stem.
    lowered_tokens = (token.lower() for token in nltk.tokenize.word_tokenize(string))
    stems = (
        stemmer.stem(token)
        for token in lowered_tokens
        if token not in stop_words_set
    )
    return ' '.join(stems)

In [5]:
# Replace the text in the first column (the e-mail body) with its cleaned,
# stemmed form.
# NOTE(review): this overwrites the column in place, so running the cell a
# second time cleans already-cleaned text instead of the raw e-mails.
email_text = data.iloc[:, 0]
data.iloc[:, 0] = email_text.map(clean_and_word_tokenize)
data

Unnamed: 0,email,label
0,date wed number aug number number number numbe...,0
1,martin post tasso papadopoulo greek sculptor b...,0
2,man threaten explos moscow thursday august num...,0
3,klez virus die alreadi prolif virus ever klez ...,0
4,ad cream spaghetti carbonara effect pasta make...,0
...,...,...
2995,abc good morn america rank number christma toy...,1
2996,hyperlink hyperlink hyperlink let mortgag lend...,1
2997,thank shop us gift occas free gift number numb...,1
2998,famous ebay market e cours learn sell complet ...,1


# Extracting the features

In [6]:
import sklearn
# Import the submodules used below explicitly: attribute access such as
# sklearn.model_selection on the bare package is not guaranteed to resolve on
# every scikit-learn release.
import sklearn.feature_extraction.text
import sklearn.model_selection
import sklearn.naive_bayes

In [7]:
# Bag-of-words features: one column per vocabulary term, one row per e-mail.
# NOTE(review): the vocabulary is learned from every row, including the rows
# that are later held out for testing.
email_text = data.iloc[:, 0]
vectorizer = sklearn.feature_extraction.text.CountVectorizer()
X = vectorizer.fit_transform(email_text)

# Target labels come from the second column.
y = data.iloc[:, 1]

# Training the model

In [8]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
split = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split

# fit() returns the estimator itself, so it can be chained onto construction;
# the bare name on the last line keeps the estimator as the cell's output.
classifier_model = sklearn.naive_bayes.MultinomialNB().fit(X_train, y_train)
classifier_model

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


# Evaluating the model

In [9]:
# Score the fitted classifier on the held-out rows and report it as a
# percentage with one decimal place.
score = classifier_model.score(X=X_test, y=y_test)
print(f"The accuracy of the model on the splitted test: {score:.1%}")

The accuracy of the model on the splitted test: 99.3%


In [10]:
def testing_model(string_input):
    """Classify one raw text with the trained model.

    The text is cleaned with ``clean_and_word_tokenize`` (the same function
    applied to the training e-mails), encoded with the already fitted
    ``vectorizer``, and passed to ``classifier_model``.

    Args:
        string_input: The raw message to classify.

    Returns:
        The result of ``classifier_model.predict`` for the single encoded
        message; callers read the predicted label from element ``0``.
    """
    cleaned_text = clean_and_word_tokenize(string_input)
    # transform() expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([cleaned_text])
    return classifier_model.predict(features)

In [11]:
# Hand-written messages used as a quick sanity check of the trained model.
test_inputs = [
    'Hey, are we still meeting at 3pm?',
    'Congratulations, you have won a free iPhone!',
    'Can you send the report by tomorrow?',
    'URGENT! Claim your $1000 gift card now!',
    'Let’s grab lunch this week.',
    'You’ve been selected for a limited-time offer!',
    'Don’t forget the team call at 10am.',
    'Act now to get 50% off!',
    'Just checking in—how’s everything going?',
    'This isn’t spam. I really need your help with something.'
]

# Print each message followed by the model's verdict (label 1 means spam).
for test_input in test_inputs:
    prediction = testing_model(test_input)
    verdict = 'Spam' if prediction[0] == 1 else 'Not Spam'
    print(f"Input: {test_input}\nPrediction: {verdict}\n")

Input: Hey, are we still meeting at 3pm?
Prediction: Not Spam

Input: Congratulations, you have won a free iPhone!
Prediction: Spam

Input: Can you send the report by tomorrow?
Prediction: Spam

Input: URGENT! Claim your $1000 gift card now!
Prediction: Spam

Input: Let’s grab lunch this week.
Prediction: Not Spam

Input: You’ve been selected for a limited-time offer!
Prediction: Not Spam

Input: Don’t forget the team call at 10am.
Prediction: Not Spam

Input: Act now to get 50% off!
Prediction: Not Spam

Input: Just checking in—how’s everything going?
Prediction: Not Spam

Input: This isn’t spam. I really need your help with something.
Prediction: Not Spam

