In [1]:
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Loading dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
data = pd.read_csv(url, sep="\t", header=None, names=["label", "message"])

# Preprocessing (convert labels to binary values: ham=0, spam=1)
data['label'] = data['label'].map({'ham': 0, 'spam': 1})


ValueError: Multiple files found in ZIP file. Only one file per ZIP: ['SMSSpamCollection', 'readme']

In [2]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
!unzip smsspamcollection.zip


--2024-12-29 10:40:54--  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘smsspamcollection.zip’

smsspamcollection.z     [  <=>               ] 198.65K   838KB/s    in 0.2s    

2024-12-29 10:40:55 (838 KB/s) - ‘smsspamcollection.zip’ saved [203415]

Archive:  smsspamcollection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  


In [3]:
# Loading the dataset from the extracted file
data = pd.read_csv('SMSSpamCollection', sep="\t", header=None, names=["label", "message"])

# Check the first few rows of the dataset
data.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Preprocessing (convert labels to binary values: ham=0, spam=1)
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

# Splitting the data into train and test sets
X = data['message']
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the split sizes
print(f"Training data size: {len(X_train)}")
print(f"Test data size: {len(X_test)}")


Training data size: 4457
Test data size: 1115


In [5]:
# Vectorizing text messages using TF-IDF
vectorizer = TfidfVectorizer(stop_words="english")
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Check the shape of the TF-IDF matrices
print(f"TF-IDF Train shape: {X_train_tfidf.shape}")
print(f"TF-IDF Test shape: {X_test_tfidf.shape}")


TF-IDF Train shape: (4457, 7441)
TF-IDF Test shape: (1115, 7441)


In [6]:
# Defining and training the Neural Network model
model = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=1000, random_state=42)
model.fit(X_train_tfidf, y_train)

# Check the training status
print("Model training complete.")


Model training complete.


In [7]:
# Make predictions on the test data
y_pred = model.predict(X_test_tfidf)

# Evaluate the accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")


Model Accuracy: 0.9919


In [8]:
def predict_message(message):
      # Transforming the input message to TF-IDF format
    message_tfidf = vectorizer.transform([message])

                  # Predicting the class (0 for ham, 1 for spam)
    prediction = model.predict_proba(message_tfidf)

                              # Returning the likeliness and the class label
    likeliness = prediction[0][1]  # Likelihood of the message being 'spam'
    label = "spam" if likeliness > 0.5 else "ham"

    return [likeliness, label]


In [9]:
# Testing the model with sample messages
print(predict_message("Congrats! You've won a free lottery. Call now!"))
print(predict_message("Hey, how are you doing today?"))


[0.07352002868791528, 'ham']
[7.759428175104481e-05, 'ham']


In [10]:
# Spam examples
print(predict_message("Congratulations! You've won a $1000 gift card. Claim now!"))
print(predict_message("Limited time offer! Get a free iPhone now. Visit [link] to claim!"))

# Ham examples
print(predict_message("Hey, are we still on for the meeting tomorrow?"))
print(predict_message("Just checking in to see how you're doing!"))


[0.9987971048911759, 'spam']
[0.3857186011273212, 'ham']
[0.0001253419378552186, 'ham']
[0.0001756014816565007, 'ham']
