In [None]:
# Importing libraries
import pandas as pd 
import nltk, re 
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB 
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Make sure the NLTK 'stopwords' corpus is available locally.
# The call's return value is displayed as the cell result.
STOPWORDS_PACKAGE = 'stopwords'
nltk.download(STOPWORDS_PACKAGE)

[nltk_data] Downloading package stopwords to C:\Users\M
[nltk_data]     Ghous\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Read the labelled messages (columns used later: 'Category' and 'Message')
# from the CSV file in the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer='spam.csv')

In [None]:
# Preview the first five rows to check the columns and the raw text.
dataset.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Class balance: how many rows carry each 'Category' label.
dataset['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [None]:
# Shared text-cleaning helpers:
#   stop_words - English stop words held in a set for fast membership tests
#   ps         - Porter stemmer used to reduce each kept token to its stem
stop_words = {*stopwords.words('english')}
ps = PorterStemmer()

In [None]:
# Text normalisation applied to every message before vectorisation.
def preprocessing(text):
    """Return a cleaned, stemmed, space-joined version of one message.

    Steps: lower-case the text, replace every non-letter character with a
    space, split on whitespace, drop English stop words, then Porter-stem
    the remaining tokens.

    Parameters
    ----------
    text : str
        Raw message text. Any non-string value (for example the float NaN
        pandas uses for an empty CSV field, or None) is treated as an empty
        message instead of raising AttributeError on ``.lower()``.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; '' when no token
        survives (or when ``text`` is not a string).
    """
    if not isinstance(text, str):
        return ''
    # Digits, punctuation and non-ASCII characters all become separators.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text.lower())
    # Stop words are filtered on the un-stemmed token, i.e. before stemming.
    return ' '.join(
        ps.stem(word) for word in letters_only.split() if word not in stop_words
    )
# Clean every message and store the result next to the raw text.
dataset['clean_text'] = dataset['Message'].map(preprocessing)

In [None]:
# Encode the target label as an integer: 'ham' -> 0, 'spam' -> 1.
label_to_num = {'ham': 0, 'spam': 1}
dataset['category_num'] = dataset['Category'].map(label_to_num)
# Series.map leaves NaN for any label missing from the dict; fail here with a
# clear message instead of later inside model fitting.
assert dataset['category_num'].notna().all(), "unexpected value in 'Category' (expected 'ham' or 'spam')"

In [None]:
# Convert the cleaned messages into TF-IDF weighted term vectors.
# Note: fit_transform here learns the vocabulary and IDF weights from every
# row of 'clean_text', not from a training subset.
clean_corpus = dataset['clean_text']
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(clean_corpus)

In [None]:
# Convert the cleaned messages into binary (present/absent) count vectors.
# ngram_range=(3, 3) means only three-token sequences become features, so this
# is a bag of trigrams rather than a bag of single words.
# NOTE(review): confirm trigram-only features are intended.
cv = CountVectorizer(
    ngram_range=(3, 3),
    binary=True,
)
X_cv = cv.fit_transform(dataset['clean_text'])

In [None]:
# Split the cleaned text and labels 80/20 (fixed seed), then build the TF-IDF
# features. The vectorizer is fitted on the training messages only and merely
# applied to the test messages, so no vocabulary or IDF statistics leak from
# the held-out rows into the features the models are trained on.
# `tfidf` is refitted here and therefore ends up fitted on the training split.
text_train, text_test, Y_train, Y_test = train_test_split(
    dataset['clean_text'], dataset['category_num'], test_size=0.2, random_state=43
)
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

In [None]:
# Candidate classifiers to compare, keyed by the name printed with their results.
models = {
    'Naive_Bayes': MultinomialNB(),
    'Logistic_Regression': LogisticRegression(),
    # LinearSVC's dual solver uses a random number generator; seed it (same
    # value as the train/test split) so repeated runs give the same model.
    'SVM': LinearSVC(random_state=43),
}

In [None]:
# Fit each model on the TF-IDF training split, then print its confusion matrix
# and classification report for the test split.
for name in models:
    model = models[name]
    print(f"\n{name} Results:")
    # fit() returns the estimator itself, so the prediction can be chained.
    prediction = model.fit(X_train, Y_train).predict(X_test)
    print(confusion_matrix(Y_test, prediction))
    print(classification_report(Y_test, prediction))


Naive_Bayes Results:
[[952   0]
 [ 36 127]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       952
           1       1.00      0.78      0.88       163

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115


Logistic_Regression Results:
[[951   1]
 [ 44 119]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       952
           1       0.99      0.73      0.84       163

    accuracy                           0.96      1115
   macro avg       0.97      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115


SVM Results:
[[950   2]
 [ 18 145]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       952
           1       0.99      0.89      0.94       163

    accuracy                           0.98    

In [None]:
# Split the cleaned text and labels 80/20 (fixed seed), then build the binary
# count features. The CountVectorizer is fitted on the training messages only
# and merely applied to the test messages, so n-grams that occur only in the
# held-out rows do not leak into the training vocabulary.
# `cv` is refitted here and therefore ends up fitted on the training split.
text_train, text_test, Y_train, Y_test = train_test_split(
    dataset['clean_text'], dataset['category_num'], test_size=0.2, random_state=43
)
X_train = cv.fit_transform(text_train)
X_test = cv.transform(text_test)

In [None]:
# Fresh, unfitted copies of the candidate classifiers for the count-vector
# features, keyed by the name printed with their results.
models = {
    'Naive_Bayes': MultinomialNB(),
    'Logistic_Regression': LogisticRegression(),
    # LinearSVC's dual solver uses a random number generator; seed it (same
    # value as the train/test split) so repeated runs give the same model.
    'SVM': LinearSVC(random_state=43),
}

In [None]:
# Fit each model on the count-vector training split, then print its confusion
# matrix and classification report for the test split.
for name in models:
    model = models[name]
    print(f"\n{name} Results:")
    # fit() returns the estimator itself, so the prediction can be chained.
    prediction = model.fit(X_train, Y_train).predict(X_test)
    print(confusion_matrix(Y_test, prediction))
    print(classification_report(Y_test, prediction))


Naive_Bayes Results:
[[669 283]
 [  7 156]]
              precision    recall  f1-score   support

           0       0.99      0.70      0.82       952
           1       0.36      0.96      0.52       163

    accuracy                           0.74      1115
   macro avg       0.67      0.83      0.67      1115
weighted avg       0.90      0.74      0.78      1115


Logistic_Regression Results:
[[952   0]
 [ 81  82]]
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       952
           1       1.00      0.50      0.67       163

    accuracy                           0.93      1115
   macro avg       0.96      0.75      0.81      1115
weighted avg       0.93      0.93      0.92      1115


SVM Results:
[[952   0]
 [ 75  88]]
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       952
           1       1.00      0.54      0.70       163

    accuracy                           0.93    