In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv('spamsms.csv', encoding='latin-1')

# Display first few rows
print(df.head())

# Rename columns if needed
df = df.rename(columns={df.columns[0]: 'label', df.columns[1]: 'message'})
df = df[['label', 'message']]

   type                                               text Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [3]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Download stopwords if not done
import nltk
nltk.download('stopwords')

ps = PorterStemmer()

def clean_text(text):
    text = text.lower()                           # Lowercase
    text = re.sub(r'\d+', '', text)               # Remove digits
    text = re.sub(r'[^\w\s]', '', text)           # Remove punctuation
    text = text.split()                           # Tokenize
    text = [ps.stem(word) for word in text if word not in stopwords.words('english')]
    return " ".join(text)

df['cleaned'] = df['message'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kanth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp310-cp310-win_amd64.whl.metadata (14 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.15.3-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.0-cp310-cp310-win_amd64.whl (10.7 MB)
   ---------------------------------------- 0.0/10.7 MB ? eta -:--:--
   ---------- ----------------------------- 2.9/10.7 MB 15.2 MB/s eta 0:00:01
   ---------------------- ----------------- 6.0/10.7 MB 14.7 MB/s eta 0:00:01
   --------------------------------- ------ 8.9/10.7 MB 14.6 MB/s eta 0:00:01
   ---------------------------------------- 10.7/10.7 MB 13.9 MB/s eta 0:00:00
Downloading scipy-1.15.3-cp310-cp310-win_amd64.whl (41.3 MB)
   ---------------------------------------- 0.0/41.3 MB ? eta -:--:--
   -- ------------------------------------- 2.4/41.3 MB 14.9 MB/s eta 0:00:03
   ----

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(df['cleaned']).toarray()

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Convert 'ham' to 0 and 'spam' to 1
encoder = LabelEncoder()
y = encoder.fit_transform(df['label'])

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

model = MultinomialNB()
model.fit(X_train, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [10]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9766816143497757
Confusion Matrix:
 [[965   0]
 [ 26 124]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.83      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [11]:
def predict_email(text):
    cleaned = clean_text(text)
    vect_text = vectorizer.transform([cleaned]).toarray()
    pred = model.predict(vect_text)
    return "Spam" if pred[0] == 1 else "Ham"

# Example
print(predict_email("Congratulations! You've won a $1000 gift card. Claim now!"))

Spam
