In [1]:
# Data is tab seprated instead of space seprated So we use separator

In [2]:
import pandas as pd

In [3]:
# Loading data-

messages = pd.read_csv('SMSSpamCollection.csv', sep='\t', names=["label","message"])

In [4]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [6]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer # for lemmanization

In [7]:
# Data Cleaning & Preprocessing-

In [8]:
# Creating object-

wordnet = WordNetLemmatizer()

In [9]:
corpus = []
for i in range (0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i]) # removing comma,spaces etc from the sentences other than a-z, A-Z
    review = review.lower() # lowering the words of each sentences
    review = review.split() # getting list of words
    
    review = [wordnet.lemmatize(word) for word in review if not word in set(stopwords.words('english'))] # list comprehension
    review = ''.join(review) # joining list of words into review
    corpus.append(review) # appending to the list we created

In [10]:
# Creating Bag of Words model-

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(max_features=5000) # Selecting max 5000 features you can change it on your own; That means I have selected top 5000 most frequent words.
X = cv.fit_transform(corpus).toarray()

In [13]:
y = pd.get_dummies(messages['label']) # Model wont understand ham & spam so we converted them; whenever there is ham or spam it will be indicated by 1
y = y.iloc[:,1].values # so we converted them into just one column (ham=0,spam=1)

In [14]:
# Train-Test split-

In [15]:
from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=0.20,random_state=0)

In [16]:
# Training Model usin Naive Bayes classifier- Since Naive Byes model works best for NLP.

In [17]:
from sklearn.naive_bayes import MultinomialNB # It is Naive Byes library
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [18]:
y_pred = spam_detect_model.predict(X_test)

In [19]:
# Comparing y_test and y_pred

In [20]:
from sklearn.metrics import confusion_matrix # Confusion matrix- Will give 2+2 dimensional matrix which will tell you that how many number of elements are correctly predicted

In [21]:
confusion_m = confusion_matrix(y_test,y_pred)
confusion_m

array([[955,   0],
       [156,   4]], dtype=int64)

In [22]:
# Checking accuracy score

In [23]:
from sklearn.metrics import accuracy_score

In [24]:
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.8600896860986547

In [25]:
# Accuracy is 86%