In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

In [4]:
# Load the tab-separated SMS corpus. The file carries no header row, so the
# two column names are supplied at read time rather than patched on afterwards.
df = pd.read_csv(
    'sms_spam_collection.csv',
    sep='\t',
    header=None,
    names=['label', 'message'],
)

# A notebook cell only renders its final expression, so a bare `df.head()`
# placed before `df.describe()` is evaluated and then thrown away. Display the
# preview explicitly and leave the summary as the cell's last expression.
display(df.head())
df.describe()

Unnamed: 0,label,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [6]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# shuffle repeatable across runs.
# NOTE(review): no `stratify=` argument is passed, so the ham/spam ratio of
# each split is left to the shuffle - confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,
    random_state=42,
)


In [7]:
# Two-stage text classifier: token counts from CountVectorizer are fed straight
# into a multinomial Naive Bayes model. Both stages use their default settings.
pipeline = make_pipeline(
    CountVectorizer(),
    MultinomialNB(),
)

In [8]:
# Fit the CountVectorizer + MultinomialNB pipeline on the training split, then
# score its predictions against the held-out test labels.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Naive Bayes Classifier Accuracy:", accuracy)
# sep="" stops print from inserting its default space after the "\n"; that
# stray space pushed the report's header row one column out of line with the
# per-class rows beneath it.
print("Classification Report:\n", classification_report(y_test, y_pred), sep="")

Naive Bayes Classifier Accuracy: 0.9919282511210762
Classification Report:
               precision    recall  f1-score   support

         ham       0.99      1.00      1.00       966
        spam       1.00      0.94      0.97       149

    accuracy                           0.99      1115
   macro avg       1.00      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [11]:
# 5-fold cross-validation of the pipeline over every message in `df` (the whole
# corpus, not just the training split), reporting each fold and their mean.
cv_scores = cross_val_score(
    pipeline,
    df['message'],
    df['label'],
    cv=5,
)
print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", cv_scores.mean())

Cross-Validation Scores: [0.98475336 0.98654709 0.98294434 0.98294434 0.98473968]
Mean Cross-Validation Accuracy: 0.9843857629356497


In [12]:
# Search space: unigram-only vs. unigram+bigram features, crossed with three
# values of the Naive Bayes `alpha` parameter. The `<step>__<param>` keys
# address the pipeline steps by the names make_pipeline gave them.
param_grid = {
    'countvectorizer__ngram_range': [(1, 1), (1, 2)],
    'multinomialnb__alpha': [0.1, 1.0, 10.0],
}

# Tune on the training split only, with 5-fold cross-validation per candidate.
grid = GridSearchCV(pipeline, param_grid, cv=5)
grid.fit(X_train, y_train)

print("Best Parameters:", grid.best_params_)
print("Best Cross-Validation Score:", grid.best_score_)

Best Parameters: {'countvectorizer__ngram_range': (1, 2), 'multinomialnb__alpha': 0.1}
Best Cross-Validation Score: 0.9851904697196178


In [13]:
# Take the pipeline selected by the grid search and score it on the held-out
# test split, mirroring the baseline evaluation so the two runs are comparable.
best_model = grid.best_estimator_
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print("Best Model Accuracy:", accuracy_best)
# sep="" stops print from inserting its default space after the "\n"; that
# stray space pushed the report's header row one column out of line with the
# per-class rows beneath it.
print("Classification Report for Best Model:\n", classification_report(y_test, y_pred_best), sep="")

Best Model Accuracy: 0.9910313901345291
Classification Report for Best Model:
               precision    recall  f1-score   support

         ham       0.99      1.00      0.99       966
        spam       0.99      0.94      0.97       149

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115

