## <span style = "color:maroon">Spam classifier using NLP</span>

### 1. Importing all the libraries

In [30]:
# Importing all the libraries

import nltk
import pandas as pd
import numpy as np
import re
from nltk import PorterStemmer
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import jaccard_score, f1_score, precision_score

### 2. Data preprocessing

#### 2.1 Reading the data

In [5]:
# Load the tab-separated SMS corpus. The file carries no header row, so the
# two column names are supplied here.
messages = pd.read_csv(
    './smsspamcollection/SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'message'],
)

#### 2.2 Text cleaning, stopword removal and lemmatization

In [12]:
# Build the cleaned corpus: one normalized string per SMS.
# Requires the NLTK 'stopwords' and 'wordnet' corpora to be available locally
# (e.g. installed once via nltk.download).
lmt = WordNetLemmatizer()

# Build the stopword set once; rebuilding it for every word is loop-invariant
# work that dominates the run time of this cell.
stop_words = set(stopwords.words('english'))

corpus = []
for text in messages['message']:
    # Replace every non-letter character with a space, lowercase the text and
    # split it into whitespace-separated tokens.
    words = re.sub('[^A-Za-z]', ' ', text).lower().split()
    # Drop stopwords, lemmatize the remaining tokens and re-join them.
    corpus.append(' '.join(lmt.lemmatize(word) for word in words if word not in stop_words))

### 3. Using the Bag of words model

In [19]:
# Learn the vocabulary from the corpus and count token occurrences per message.
cv = CountVectorizer()
bow_sparse = cv.fit_transform(corpus)

# Densify the document-term matrix into a regular array for the later cells.
X = bow_sparse.toarray()

### 4. Model training

In [22]:
# Creating the label data
# Encode the target explicitly: 1 for messages labelled 'spam', 0 otherwise.
# Comparing against the label text does not depend on the column order of a
# one-hot encoding, and always yields an integer array (the dtype returned by
# pd.get_dummies differs between pandas versions).
y = (messages['label'] == 'spam').astype(int).values

In [26]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [28]:
# Fit a multinomial Naive Bayes classifier on the training split.
# The trailing semicolon keeps the cell from echoing the estimator.
model = MultinomialNB()
model.fit(X_train, y_train);

### 5. Model prediction

In [29]:
# Predict labels for the held-out test features with the fitted model.
y_pred = model.predict(X_test)

In [31]:
# Jaccard similarity between the true and predicted test labels.
jaccard = jaccard_score(y_test, y_pred)
print("The jaccard score is:", jaccard)

The jaccard score is: 0.8313953488372093


In [32]:
# F1 score of the predictions on the test split.
f1 = f1_score(y_test, y_pred)
print("The F1 score is:", f1)

The F1 score is: 0.9079365079365079


In [33]:
# Precision of the predictions on the test split.
precision = precision_score(y_test, y_pred)
print("The precision is:", precision)

The precision is: 0.8614457831325302
