In [1]:
import pandas as pd

# Load the tab-separated SMS dataset. Because `names` is supplied, pandas treats the
# first line of the file as data and labels the two columns "label" and "message".
# NOTE(review): with the default quoting rules a message containing an unbalanced '"'
# may be merged with following lines — verify the row count against the raw file and
# consider quoting=csv.QUOTE_NONE if rows are missing.
messages = pd.read_csv('Data/SMSSpamCollection', sep='\t',
                           names=["label", "message"])

In [2]:
messages.head()  # preview the first rows; the label "ham" marks a message that is not spam

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Data Cleaning & Preprocessing

In [3]:
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()
# Build the stop-word lookup once. Calling stopwords.words('english') inside the
# comprehension rebuilds the list for every token and then scans it linearly; a set
# created up front gives the same membership answers with constant-time lookups.
# Requires the NLTK 'stopwords' and 'wordnet' resources to be available locally.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly rather than looking each one up by label,
# so the loop does not depend on the frame having a default 0..n-1 index.
for raw_message in messages['message']:
    # Replace everything except ASCII letters with spaces, lower-case, split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_message).lower().split()
    # Drop stop words and lemmatize what remains.
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in english_stopwords]
    # Keep one cleaned, space-joined string per message.
    corpus.append(' '.join(lemmas))

## Creating the TF-IDF Model

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Cap the vocabulary at the 5000 most frequent terms instead of keeping every word.
# NOTE(review): the vectorizer is fitted on the whole corpus before the train/test split,
# so the IDF weights also see the test messages — consider fitting on the training split only.
cv = TfidfVectorizer(max_features=5000)
X = cv.fit_transform(corpus).toarray()  # dense array: one row per message, one column per term

In [5]:
X.shape  # (number of messages, number of TF-IDF features kept)

(5572, 5000)

In [6]:
# One-hot encode the label column: one indicator column per distinct label value.
y=pd.get_dummies(messages['label'])

In [7]:
y  # indicator frame with one column per label ("ham", "spam")

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
5567,0,1
5568,1,0
5569,1,0
5570,1,0


**Instead of keeping two columns we can keep only one, i.e. `spam`. If the `spam` column has the value 0, the `ham` column must have the value 1, and vice versa.**

In [8]:
# Binary target: the "spam" indicator column as a NumPy array (1 = spam, 0 = ham).
# It is recomputed from `messages` and selected by column name rather than derived
# from the previous value of `y` by position, so re-running this cell does not fail
# and the result does not depend on column order.
y = pd.get_dummies(messages['label'])['spam'].values

In [9]:
y  # 1 = spam, 0 = ham

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

## Train_Test_split

In [10]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the messages for evaluation; the fixed random_state makes the split repeatable.
# NOTE(review): the classes are imbalanced — consider stratify=y so both splits keep the same spam ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

##  Model_Creation

In [11]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial naive Bayes classifier on the training split; fit() returns the
# fitted estimator, which is what gets stored as the model.
spam_detect_model=MultinomialNB().fit(X_train,y_train)

## prediction

In [12]:
# Predicted labels (0 = ham, 1 = spam) for the held-out test messages.
y_pred=spam_detect_model.predict(X_test)

In [13]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

# Confusion matrix: rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_test, y_pred))
# Overall fraction of test messages classified correctly.
print(f"Accuracy Score: {accuracy_score(y_test, y_pred)}")
# Per-class precision, recall and F1, plus their averages.
print(f"Classification report: {classification_report(y_test, y_pred)}")

[[955   0]
 [ 26 134]]
Accuracy Score: 0.9766816143497757
Classification report:               precision    recall  f1-score   support

           0       0.97      1.00      0.99       955
           1       1.00      0.84      0.91       160

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

