# Prediction of spam text using scikit-learn

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import string
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
##1. Load the spam.tsv file using the tab separator option in pd.read_csv and print the first 5 rows
df = pd.read_csv('spam.tsv', sep='\t', header=None)
df = df.rename(columns = {0: 'class', 1: 'message'})
df.head()

Unnamed: 0,class,message
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!


In [4]:
# Print the number of records of type spam and type ham
print(df['class'].value_counts())
print("")
print('There are 746 and 4821 records in type spam and type ham respectively.')

ham     4821
spam     746
Name: class, dtype: int64

There are 746 and 4821 records in type spam and type ham respectively.


In [5]:
#2. Replace the Class column with 0s where it is ham and 1s where it is spam
df['class'] = df['class'].map({'spam': 1, 'ham': 0})
df

Unnamed: 0,class,message
0,0,I've been searching for the right words to tha...
1,1,Free entry in 2 a wkly comp to win FA Cup fina...
2,0,"Nah I don't think he goes to usf, he lives aro..."
3,0,Even my brother is not like to speak with me. ...
4,0,I HAVE A DATE ON SUNDAY WITH WILL!!!
...,...,...
5562,1,This is the 2nd time we have tried 2 contact u...
5563,0,Will ü b going to esplanade fr home?
5564,0,"Pity, * was in mood for that. So...any other s..."
5565,0,The guy did some bitching but I acted like i'd...


In [6]:
#3. Initialize a CountVectorizer using English stop-words
# Set X to be the values in the Message column of the dataset
# Set y to be the values in Class column of the dataset
# Set y to be of type int

CV = CountVectorizer(stop_words="english")
X = df['message']
y = df['class']
y = y.astype(int)

In [7]:
#4. Use train_test_split function to create X_train, X_test, y_train & y_test with a test size ratio of 0.2
# Call the fit_transform method, passing X_train as argument and store it as X_train_CV

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train_CV = CV.fit_transform(X_train)

In [8]:
#5. Fit a multinomial Naive Bayes model on X_train_CV & y_train

NB = MultinomialNB()
NB.fit(X_train_CV, y_train)

MultinomialNB()

In [9]:
#6. Testing the accuracy
# Transform X_test using CV.transform & predict y_predict using the trained model
# calculate the accuracy score by using y_test and comparing it against y_pred

X_test1 = CV.transform(X_test)
y_pred = NB.predict(X_test1)
print(accuracy_score(y_test, y_pred))

0.9847396768402155


In [10]:
#7. Test a random message using the below code: Enter your own string in the message and check what the model predicts!

msg = "Congratulations! You have won the lottery!"
msgInput = CV.transform([msg])
predict = NB.predict(msgInput)
if(predict[0]==0):
    print("Spam detected!")
else:
    print("Doesn't look like spam")

Doesn't look like spam
