In [1]:
import pandas as pd

In [2]:
# Load the raw SMS dataset; the file is decoded as Latin-1 rather than the UTF-8 default.
df = pd.read_csv(
    "spam.csv",
    encoding="latin-1",
)

In [3]:
# Peek at the first five rows to see which columns the CSV exposes.
df.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Keep only the two populated columns; the "Unnamed" ones show up empty in the preview.
wanted_columns = ['v1', 'v2']
df = df[wanted_columns]
df.head(n=5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Swap the opaque v1/v2 headers for descriptive names (assigned by position).
descriptive_names = ['label', 'message']
df.columns = descriptive_names
df.head(n=5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Encode the text labels as integers for the classifier: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}

# Guard on dtype so re-running this cell is a no-op: once the column holds
# numbers, the `.str` accessor below would raise AttributeError.
if not pd.api.types.is_numeric_dtype(df['label']):
    encoded = df['label'].str.strip().str.lower().map(label_codes)

    # `map` turns every value missing from the dict into NaN; stop here and
    # report the offending values instead of letting NaN labels reach training.
    unmapped = encoded.isna()
    if unmapped.any():
        raise ValueError(
            f"Unrecognised label values: {df.loc[unmapped, 'label'].unique().tolist()}"
        )

    # Build a new frame rather than writing into the existing one in place.
    df = df.assign(label=encoded)

In [7]:
# Confirm the label column now holds 0/1 codes.
df.head(n=5)

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Shuffle every row (frac=1) with a fixed seed, then renumber the index from 0.
shuffled = df.sample(frac=1, random_state=42)
df = shuffled.reset_index(drop=True)
df.head(n=5)

Unnamed: 0,label,message
0,0,"Funny fact Nobody teaches volcanoes 2 erupt, t..."
1,0,I sent my scores to sophas and i had to do sec...
2,1,We know someone who you know that fancies you....
3,0,Only if you promise your getting out as SOON a...
4,1,Congratulations ur awarded either å£500 of CD ...


In [9]:
# Features are the raw message texts; targets are the 0/1 labels.
X, y = df['message'], df['label']

In [11]:
import spacy

In [12]:
# Only lemmas and the lexical flags is_alpha / is_stop are used in this notebook,
# so skip the dependency parser and the entity recogniser: they would otherwise
# run on every message without contributing anything to the cleaned text.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [13]:
def clean_text(text):
    """Normalise one message into a space-joined string of lemmas.

    The text is lower-cased and run through the notebook-level spaCy pipeline
    ``nlp``. Only purely alphabetic tokens that are not stop words are kept,
    and each kept token is replaced by its lemma.

    Parameters
    ----------
    text : str
        Raw message. Any non-string value (``None``, or the float ``NaN`` that
        pandas uses for missing cells) is treated as an empty message.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; ``""`` when no token
        survives or when ``text`` is not a string.
    """
    # Missing cells arrive as None/NaN and have no .lower(); treat them as empty
    # instead of failing in the middle of a Series.apply.
    if not isinstance(text, str):
        return ""

    doc = nlp(text.lower())
    return " ".join(
        token.lemma_ for token in doc if token.is_alpha and not token.is_stop
    )
    

In [24]:
# Run the cleaner over every message, one element at a time.
x_clean = X.map(clean_text)

In [25]:
from sklearn.model_selection import train_test_split

In [18]:
# Hold out 20% of the cleaned messages for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x_clean,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary and IDF weights are learned from the training messages only;
# the test messages are merely projected onto that vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix.
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Score the held-out messages and summarise the results.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.957847533632287
Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.98       949
           1       0.99      0.72      0.84       166

    accuracy                           0.96      1115
   macro avg       0.97      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [27]:
def predict_spam(message):
    """Classify a single raw message with the fitted notebook model.

    The message is cleaned with ``clean_text``, vectorised with the
    notebook-level ``vectorizer``, and passed to the notebook-level ``model``.

    Parameters
    ----------
    message : str
        Raw, uncleaned message text.

    Returns
    -------
    str
        ``'Spam'`` when the predicted label equals 1, otherwise ``'Not Spam'``.
    """
    message_clean = clean_text(message)
    message_tfidf = vectorizer.transform([message_clean])
    # predict() yields one label per input row; exactly one row was passed, so
    # take that element instead of relying on the truth value of a whole array.
    prediction = model.predict(message_tfidf)[0]
    return 'Spam' if prediction == 1 else 'Not Spam'

# Quick smoke test on an everyday, non-promotional sentence.
sample_message = "I am having a bad internet"
verdict = predict_spam(sample_message)
print(verdict)

Not Spam
