In [13]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [14]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rushil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/rushil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/rushil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
import sklearn
print(dir(sklearn.model_selection))

['BaseCrossValidator', 'BaseShuffleSplit', 'GridSearchCV', 'GroupKFold', 'GroupShuffleSplit', 'KFold', 'LearningCurveDisplay', 'LeaveOneGroupOut', 'LeaveOneOut', 'LeavePGroupsOut', 'LeavePOut', 'ParameterGrid', 'ParameterSampler', 'PredefinedSplit', 'RandomizedSearchCV', 'RepeatedKFold', 'RepeatedStratifiedKFold', 'ShuffleSplit', 'StratifiedGroupKFold', 'StratifiedKFold', 'StratifiedShuffleSplit', 'TimeSeriesSplit', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__getattr__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_plot', '_search', '_split', '_validation', 'check_cv', 'cross_val_predict', 'cross_val_score', 'cross_validate', 'learning_curve', 'permutation_test_score', 'train_test_split', 'typing', 'validation_curve']


**Data Gathering**

In [19]:
df = pd.read_csv("SMSSpamCollection", sep = '\t', names = ['Label','Msg'] )
df.head()

Unnamed: 0,Label,Msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


**Exploratory Data Analysis (EDA)**

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   5572 non-null   object
 1   Msg     5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [21]:
df.isnull().sum() ## find missing values

Label    0
Msg      0
dtype: int64

In [22]:
df['Label'].value_counts() 

Label
ham     4825
spam     747
Name: count, dtype: int64

**Data Preprocessing**

In [28]:
corpus = []
lm = WordNetLemmatizer() ## Create lemmatiser object

for i in range(len(df)):
    # Remove characters that are not letters or numbers
    review = re.sub('[^a-zA-Z0-9]', ' ', df['Msg'][i])
    
    # Convert to lowercase to ensure uniformity
    review = review.lower()
    
    # Split the review into words
    review = review.split()
    
    # Remove stopwords from the review
    review = [data for data in review if data not in stopwords.words('english')]
    
    # Lemmatize the words to reduce them to their base form
    review = [lm.lemmatize(data) for data in review]
    #Join the processed words back into a single string
    review = " ".join(review)
    
    # Append the processed review to the corpus
    corpus.append(review)
 

In [29]:
df['Msg'][0] #access first message

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [30]:
len(df['Msg'])

5572

In [31]:
len(corpus)

5572

In [32]:
df['Msg'] = corpus
df.head()

Unnamed: 0,Label,Msg
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah think go usf life around though


**Model Building**

In [33]:
##data splitting
x = df['Msg'] ##dependent labels
y = df['Label'] ##independent labels

In [34]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 10)

In [35]:
len(x_train), len(y_train)

(3900, 3900)

In [36]:
len(x_test), len(y_test)

(1672, 1672)

**Perform Vectorization (Convert text data into vectors)**

In [39]:
##Using TF-IDF 
tf_obj = TfidfVectorizer()
x_train_tfidf = tf_obj.fit_transform(x_train).toarray()
x_train_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [44]:
x_train_tfidf.shape

(3900, 6613)

**Pipeline**

In [45]:
text_mnb = Pipeline([('tfidf',TfidfVectorizer()),('mnb',MultinomialNB())])

In [47]:
text_mnb.fit(x_train, y_train)

In [48]:
##Accuracy score on testing data
y_pred_test = text_mnb.predict(x_test)
print("Accuracy Score: ", accuracy_score(y_test, y_pred_test)*100)

Accuracy Score:  95.93301435406698


In [49]:
##Accuracy Score on training data
y_pred_train = text_mnb.predict(x_train)
print("Accuracy Score: ", accuracy_score(y_train, y_pred_train)*100)

Accuracy Score:  98.33333333333333


In [50]:
#Confusion Matrix on Testing Data
y_pred_test = text_mnb.predict(x_test)
print("Confusion Matrix on Test Data:\n", confusion_matrix(y_test,y_pred_test))

Confusion Matrix on Test Data:
 [[1457    0]
 [  68  147]]


In [51]:
#Classification Report on Testing Data
y_pred_test = text_mnb.predict(x_test)
print("Classification Reportx on Test Data:\n", classification_report(y_test,y_pred_test))

Classification Reportx on Test Data:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1457
        spam       1.00      0.68      0.81       215

    accuracy                           0.96      1672
   macro avg       0.98      0.84      0.89      1672
weighted avg       0.96      0.96      0.96      1672



**Prediction on User Data**

In [53]:
def preprocess_data(text):
    review = re.sub('^a-zA-Z0-9',' ',text)
    review = review.lower()
    review = review.split()
    review = [data for data in review if data not in stopwords.words('english')]
    review = [lm.lemmatize(data) for data in review]
    review = " ".join(review)
    return [review]

In [56]:
user_data = df['Msg'][0]
print(user_data)
user_data = preprocess_data(user_data)
user_data

go jurong point crazy available bugis n great world la e buffet cine got amore wat


['go jurong point crazy available bugis n great world la e buffet cine got amore wat']

In [59]:
text_mnb.predict(user_data)[0]

'ham'

In [60]:
class prediction:
    
    def __init__(self,data):
        self.data = data
        
    def user_data_preprocessing(self):
        lm = WordNetLemmatizer()
        review = re.sub('^a-zA-Z0-9',' ',self.data)
        review = review.lower()
        review = review.split()
        review = [data for data in review if data not in stopwords.words('english')]
        review = [lm.lemmatize(data) for data in review]
        review = " ".join(review)
        return [review]
    
    def user_data_prediction(self):
        preprocess_data = self.user_data_preprocessing()
        
        if text_mnb.predict(preprocess_data)[0] == 'spam':
            return 'This Message is Spam'
            
        else:
            return 'This Message is Ham'  

In [61]:
df.head()

Unnamed: 0,Label,Msg
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah think go usf life around though


In [62]:
user_data = df['Msg'][2]
print(user_data)
prediction(user_data).user_data_prediction()

free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry question std txt rate c apply 08452810075over18


'This Message is Spam'

In [63]:
user_data = df['Msg'][3]
print(user_data)
prediction(user_data).user_data_prediction()

u dun say early hor u c already say


'This Message is Ham'

In [64]:
user_data = df['Msg'][11]
print(user_data)
prediction(user_data).user_data_prediction()

six chance win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6days 16 tsandcs apply reply hl 4 info


'This Message is Spam'

In [68]:
user_data = df['Msg'][113]
print(user_data)
prediction(user_data).user_data_prediction()

ok wif co like 2 try new thing scared u dun like mah co u said loud


'This Message is Ham'