SPAM SMS DETECTION

1 - Importing Necessary Libraries

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

2 - Loading and Exploring the Dataset

In [4]:
# Load the dataset with a different encoding
file_path = 'D:/Mustufahussain/CodSoft/Machine Learning/Dataset/Spam/spam.csv'
data = pd.read_csv(file_path, encoding='ISO-8859-1')

# Display the first few rows of the dataset
print("Dataset Overview:")
print(data.head())


Dataset Overview:
     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


3 - Data Preprocessing

In [5]:
# Clean the text data: remove extra spaces, convert to lowercase
print("\nPreprocessing Text Data...")

data['v2'] = data['v2'].str.replace(r'\s+', ' ', regex=True).str.strip().str.lower()

# Encoding the labels ('ham' as 0, 'spam' as 1)
data['v1'] = data['v1'].map({'ham': 0, 'spam': 1})

# Show cleaned dataset
print("\nCleaned Dataset Preview:")
print(data.head())


Preprocessing Text Data...

Cleaned Dataset Preview:
   v1                                                 v2 Unnamed: 2  \
0   0  go until jurong point, crazy.. available only ...        NaN   
1   0                      ok lar... joking wif u oni...        NaN   
2   1  free entry in 2 a wkly comp to win fa cup fina...        NaN   
3   0  u dun say so early hor... u c already then say...        NaN   
4   0  nah i don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


4 - Feature Extraction using TF-IDF

In [6]:
# Convert the text data into numerical features using TF-IDF
print("\nConverting Text Data to Numerical Features using TF-IDF...")

tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = tfidf_vectorizer.fit_transform(data['v2'])
y = data['v1']

print(f"\nShape of X: {X.shape}")


Converting Text Data to Numerical Features using TF-IDF...

Shape of X: (5572, 5000)


5 - Train-Test Split

In [7]:
# Split the data into training and testing sets
print("\nSplitting the data into training and testing sets...")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nShape of X_train: {X_train.shape}, Shape of X_test: {X_test.shape}")


Splitting the data into training and testing sets...

Shape of X_train: (4457, 5000), Shape of X_test: (1115, 5000)


6 - Model Training with Naive Bayes

In [8]:
# Train a Naive Bayes model
print("\nTraining Naive Bayes model...")

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Make predictions
nb_predictions = nb_model.predict(X_test)

# Evaluate the model
print("\nNaive Bayes Model Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, nb_predictions)}")
print(confusion_matrix(y_test, nb_predictions))
print(classification_report(y_test, nb_predictions))


Training Naive Bayes model...

Naive Bayes Model Evaluation:
Accuracy: 0.9757847533632287
[[965   0]
 [ 27 123]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.82      0.90       150

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.98      0.97      1115



7 - Model Training with Logistic Regression

In [9]:
# Train a Logistic Regression model
print("\nTraining Logistic Regression model...")

log_reg_model = LogisticRegression(max_iter=1000)
log_reg_model.fit(X_train, y_train)

# Make predictions
log_reg_predictions = log_reg_model.predict(X_test)

# Evaluate the model
print("\nLogistic Regression Model Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, log_reg_predictions)}")
print(confusion_matrix(y_test, log_reg_predictions))
print(classification_report(y_test, log_reg_predictions))


Training Logistic Regression model...

Logistic Regression Model Evaluation:
Accuracy: 0.9479820627802691
[[962   3]
 [ 55  95]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       0.97      0.63      0.77       150

    accuracy                           0.95      1115
   macro avg       0.96      0.82      0.87      1115
weighted avg       0.95      0.95      0.94      1115



8 - Model Training with Support Vector Machine (SVM)

In [10]:
# Train a Support Vector Machine (SVM) model
print("\nTraining Support Vector Machine (SVM) model...")

svm_model = SVC()
svm_model.fit(X_train, y_train)

# Make predictions
svm_predictions = svm_model.predict(X_test)

# Evaluate the model
print("\nSupport Vector Machine (SVM) Model Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, svm_predictions)}")
print(confusion_matrix(y_test, svm_predictions))
print(classification_report(y_test, svm_predictions))


Training Support Vector Machine (SVM) model...

Support Vector Machine (SVM) Model Evaluation:
Accuracy: 0.9748878923766816
[[964   1]
 [ 27 123]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       0.99      0.82      0.90       150

    accuracy                           0.97      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115

