# SMS Classifier Model 

In [1]:
import pandas as pd
import numpy as np
import string
import re

In [2]:
# Load the dataset
data = pd.read_csv('spam.csv', encoding='latin-1')
data = data[['v1', 'v2']]  # Select only the label and text columns
data.columns = ['label', 'text']

In [3]:
# Preprocessing
stop_words = set([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
    "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was",
    "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and",
    "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off",
    "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any",
    "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so",
    "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"])

In [4]:
def preprocess_text(text):
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'\b\d+\b', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    tokens = text.split()  # Tokenization
    filtered_tokens = [word for word in tokens if word not in stop_words]  # Remove stop words
    processed_text = ' '.join(filtered_tokens)  # Reconstruct the text
    return processed_text

data['text'] = data['text'].apply(preprocess_text)

In [5]:
# Split the data into train and test sets
train_size = int(0.8 * len(data))
train_data = data[:train_size]
test_data = data[train_size:]

In [6]:
# Train the classifier
word_counts_spam = {}
word_counts_ham = {}

for index, row in train_data.iterrows():
    words = row['text'].split()
    for word in words:
        if row['label'] == 'spam':
            word_counts_spam[word] = word_counts_spam.get(word, 0) + 1
        else:
            word_counts_ham[word] = word_counts_ham.get(word, 0) + 1

spam_probability = len(train_data[train_data['label'] == 'spam']) / len(train_data)

In [7]:
# Predictions
def predict(message):
    words = message.split()
    spam_score = 0
    ham_score = 0
    for word in words:
        spam_score += np.log((word_counts_spam.get(word, 0) + 1) / (len(word_counts_spam) + len(word_counts_ham)))
        ham_score += np.log((word_counts_ham.get(word, 0) + 1) / (len(word_counts_spam) + len(word_counts_ham)))
    spam_score += np.log(spam_probability)
    ham_score += np.log(1 - spam_probability)
    return 'spam' if spam_score > ham_score else 'ham'

In [8]:
# Evaluate the model
y_true = test_data['label'].tolist()
y_pred = [predict(text) for text in test_data['text'].tolist()]

In [9]:
# Calculate accuracy
accuracy = sum(1 for true, pred in zip(y_true, y_pred) if true == pred) / len(y_true)
print("Accuracy:", accuracy)

Accuracy: 0.9668161434977578
