In [26]:
import pandas as pd

In [2]:
sms_text = pd.read_csv("spam.csv", encoding='latin-1')

In [3]:
sms_text.dropna(how="any", inplace=True, axis=1)
sms_text.columns = ['label', 'message']

In [4]:
sms_text.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### EDA

In [5]:
sms_text.describe()

Unnamed: 0,label,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [6]:
sms_text.groupby('label').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


**There are more ham messages than spam messages**

### Encoding Labels

In [7]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(sms_text['label'])

sms_text['label_encoded'] = le.transform(sms_text['label'])
sms_text.head()

Unnamed: 0,label,message,label_encoded
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
sms_text['length_message'] = sms_text.message.apply(len)
sms_text.head()

Unnamed: 0,label,message,label_encoded,length_message
0,ham,"Go until jurong point, crazy.. Available only ...",0,111
1,ham,Ok lar... Joking wif u oni...,0,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,155
3,ham,U dun say so early hor... U c already then say...,0,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,61


In [9]:
sms_text[sms_text.label=='ham'].describe()

Unnamed: 0,label_encoded,length_message
count,4825.0,4825.0
mean,0.0,71.023627
std,0.0,58.016023
min,0.0,2.0
25%,0.0,33.0
50%,0.0,52.0
75%,0.0,92.0
max,0.0,910.0


In [10]:
sms_text[sms_text.label=='spam'].describe()

Unnamed: 0,label_encoded,length_message
count,747.0,747.0
mean,1.0,138.866131
std,0.0,29.183082
min,1.0,13.0
25%,1.0,132.5
50%,1.0,149.0
75%,1.0,157.0
max,1.0,224.0


**Spam messages have more characters**

## Text Pre processing

In [11]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [12]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/premmevada/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/premmevada/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Remove punctuation and stop words and stemming

In [13]:
def process_text(text):
    stemmer = PorterStemmer()
    #tokenize the text
    tokens = word_tokenize(text)
    #Convert to lower case and remove stop words
    STOPWORDS = set(stopwords.words('english'))
    stemmed_tokens = [stemmer.stem(word.lower()) for word in tokens 
                      if word.isalpha() and word.lower() not in STOPWORDS]
    return ' '.join(stemmed_tokens)

In [14]:
sms_text['cleaned_message'] = sms_text['message'].apply(process_text)

In [15]:
sms_text['cleaned_message'].iloc[0]

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

## Vectorisation

#### CountVectoriser

In [16]:
X = sms_text['cleaned_message']
y = sms_text['label_encoded']

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(len(X_train), len(y_train))
print(len(X_test), len(y_test))

4179 4179
1393 1393


In [18]:
from sklearn.feature_extraction.text import CountVectorizer

# instantiate the vectorizer
vect = CountVectorizer()
vect.fit(X_train)

In [19]:
X_train_dtm = vect.transform(X_train)
X_test_dtm = vect.transform(X_test)

## Machine learning 

In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Instantiate the Naive Bayes classifier
nb = MultinomialNB()

# Train the classifier on the training data
nb.fit(X_train_dtm, y_train)

# Predict the labels for the test set
y_pred = nb.predict(X_test_dtm)

# Evaluate the model performance
accuracy = metrics.accuracy_score(y_test, y_pred)
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
classification_report = metrics.classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_report)

Accuracy: 0.9784637473079684
Confusion Matrix:
 [[1191   11]
 [  19  172]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99      1202
           1       0.94      0.90      0.92       191

    accuracy                           0.98      1393
   macro avg       0.96      0.95      0.95      1393
weighted avg       0.98      0.98      0.98      1393



In [21]:
from sklearn.model_selection import cross_val_score

# Initializing the Naive Bayes classifier
nb_classifier = MultinomialNB()

# Performing cross-validation
cv_scores = cross_val_score(nb_classifier, X_train_dtm, y_train, cv=5, scoring='accuracy')

print("Cross-validation scores (5-fold):", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

# Fitting the model to the entire training dataset
nb_classifier.fit(X_train_dtm, y_train)

# Evaluating test set
from sklearn.metrics import f1_score

y_pred = nb_classifier.predict(X_test_dtm)
test_f1_score = f1_score(y_test, y_pred, average='weighted')

print("Test set F1 score:", test_f1_score)

Cross-validation scores (5-fold): [0.97727273 0.97129187 0.97607656 0.96650718 0.95928144]
Mean CV accuracy: 0.97008595249692
Test set F1 score: 0.9782691282331716


In [25]:
def predict_message_spam_or_ham(message):
    # Pre-process the user input
    cleaned_message = process_text(message)  
    
    # Vectorize the input using the same vectorizer
    message_vect = vect.transform([cleaned_message])
    
    # Predict using the trained model
    prediction = nb_classifier.predict(message_vect)
    
    # Return the prediction result
    return 'ham' if prediction[0] == 0 else 'spam'

# Prompt the user to input a message
user_message = input("Enter a message to predict if it's spam or ham: ")

# Make a prediction based on the user input
prediction_result = predict_message_spam_or_ham(user_message)

# Print the prediction result
print(f'The message is predicted to be: {prediction_result}')


Enter a message to predict if it's spam or ham:  hi there how have you been ?


The message is predicted to be: ham
