In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Load the SMS dataset: a tab-separated file with no header row, so the
# two columns are named explicitly.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(
    url,
    sep='\t',
    header=None,
    names=['label', 'message'],
)

# Encode the textual labels as integers (ham -> 0, spam -> 1).
df['label'] = df['label'].map(
    {
        'ham': 0,
        'spam': 1,
    }
)

# Targets are the encoded labels; features are bag-of-words token counts
# learned from the message column.
y = df['label']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['message'])

# Fit a multinomial Naive Bayes classifier on the full dataset.
# fit() returns the estimator itself, so it can be bound in one step.
model = MultinomialNB().fit(X, y)

def predict_spam_ham(message):
    """Classify a single text message as 'spam' or 'ham'.

    The message is vectorized with the module-level ``vectorizer`` and the
    resulting features are passed to the module-level ``model``.

    Args:
        message: The text to classify.

    Returns:
        'spam' if the model predicts label 1, otherwise 'ham'.
    """
    # The vectorizer works on a collection of documents, so the single
    # message is wrapped in a one-element list.
    features = vectorizer.transform([message])

    # Exactly one document went in, so only the first prediction matters.
    predicted_label = model.predict(features)[0]

    if predicted_label == 1:
        return 'spam'
    return 'ham'

# Example usage: run the classifier on two sample messages and print
# the predicted class for each.
for message in (
    "Congratulations! You've won a free cruise. Click here to claim your prize.",
    "Hey, are you free for lunch tomorrow?",
):
    print(f"The message '{message}' is predicted to be: {predict_spam_ham(message)}")


The message 'Congratulations! You've won a free cruise. Click here to claim your prize.' is predicted to be: spam
The message 'Hey, are you free for lunch tomorrow?' is predicted to be: ham
