# Spam Sms Detection

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [3]:
df = pd.read_csv('/content/spam.csv', encoding='ISO-8859-1')

In [4]:
df = df.iloc[:,:2]
df.columns = ['label','text']
df['label'] = df['label'].map({'ham':0, 'spam':1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['label'].map({'ham':0, 'spam':1})


In [5]:
print(df.columns)

Index(['label', 'text'], dtype='object')


In [10]:
def preprocess_text(text):
  text = text.lower()
  text = re.sub(f"[{string.punctuation}]",'',text)
  text = re.sub(r'\d+','',text)
  text = re.sub(r'\s+', ' ', text).strip()
  return text


In [11]:
df['text'] = df['text'].apply(preprocess_text)

In [12]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])
y = df['label']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [14]:
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(kernel='linear')
}

In [15]:
best_model = None
best_accuracy = 0

for name, model in models.items():
  model.fit(X_train,y_train)
  y_pred = model.predict(X_test)
  acc = accuracy_score(y_test,y_pred)
  print(f"{name}Accuracy:{acc:.4f}")

  if acc>best_accuracy:
    best_accuracy = acc
    best_model = model
print(f"Best model: {best_model} with accuracy {best_accuracy:.4f}")


Naive BayesAccuracy:0.9444
Logistic RegressionAccuracy:0.9534
SVMAccuracy:0.9704
Best model: SVC(kernel='linear') with accuracy 0.9704


In [17]:
def predict_sms(message):
  processed_message = preprocess_text(message)
  message_vector = vectorizer.transform([processed_message])
  prediction = best_model.predict(message_vector)
  return "Spam" if prediction[0] == 1 else "Ham"

sms = input("enter an sms message :")
print('prediction', predict_sms(sms))

enter an sms message :hello how are you
prediction Ham
