Dataset: https://www.kaggle.com/code/adepvenugopal/detecting-sms-spam-using-machine-learning/input

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
import re

In [11]:
# Load dataset
df = pd.read_csv('sms.csv')

# Remove null values
df = df.dropna()

In [12]:
df.head()

Unnamed: 0,label,comment
0,0,Hope you are having a good week. Just checking in
1,0,K..give back my thanks.
2,0,Am also doing in cbe only. But have to pay.
3,1,"complimentary 4 STAR Ibiza Holiday or £10,000 ..."
4,1,okmail: Dear Dave this is your final notice to...


In [13]:
# Function to clean comments
def clean_comment(comment):
    # Remove URLs
    comment = re.sub(r'http\S+|www\S+|https\S+', '', comment, flags=re.MULTILINE)
    # Remove special characters and numbers
    comment = re.sub(r'\@\w+|\#','', comment)
    comment = re.sub(r'[^A-Za-z\s]', '', comment)
    # Convert to lowercase
    comment = comment.lower()
    return comment

# Clean the comments
df['comment'] = df['comment'].apply(clean_comment)

In [14]:
df.head()

Unnamed: 0,label,comment
0,0,hope you are having a good week just checking in
1,0,kgive back my thanks
2,0,am also doing in cbe only but have to pay
3,1,complimentary star ibiza holiday or cash nee...
4,1,okmail dear dave this is your final notice to ...


In [15]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['comment'], df['label'], test_size=0.2, random_state=42)

# Vectorize the comments
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [16]:
# Initialize and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_vect, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_vect)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.96


In [17]:
# Save the model and vectorizer
joblib.dump(model, 'sms_classifier_model.joblib')
joblib.dump(vectorizer, 'sms_vectorizer.joblib')

['sms_vectorizer.joblib']

In [19]:
# Load the saved model and vectorizer for single prediction
loaded_model = joblib.load('sms_classifier_model.joblib')
loaded_vectorizer = joblib.load('sms_vectorizer.joblib')

# Single prediction
def predict_comment(comment):
    cleaned_comment = clean_comment(comment)
    vect_comment = loaded_vectorizer.transform([cleaned_comment])
    prediction = loaded_model.predict(vect_comment)
    return 'spam' if prediction[0] == 1 else 'ham'

# sample_comment = "Congratulations! You've won a $1000 Walmart gift card. Click here to claim now."
sample_comment = "How are you man, please call me once you are back."
prediction = predict_comment(sample_comment)
print(f'The sample comment is predicted to be: {prediction}')

The sample comment is predicted to be: ham
