In [75]:
import numpy as np
import pandas as pd
# import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# **DATA CLEANING**

In [87]:
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Checking for any missing values

In [89]:
df.isnull().sum()

Unnamed: 0,0
Category,0
Message,0


Checking for duplicate data

In [90]:
df.duplicated().sum()

415

Removing duplicate data

In [91]:
df.drop_duplicates(inplace = True)

In [92]:
df.duplicated().sum()

0

Verifying unique categories

In [93]:
print(df['Category'].unique())

['ham' 'spam']


Standardizing Category

In [94]:
df['Category'] = df['Category'].map({'ham': 0, 'spam': 1})

Validating distribution

In [95]:
print(df['Category'].value_counts())


Category
0    4516
1     641
Name: count, dtype: int64


Converting all Messages to Lower case

In [99]:
df["Message"] = df["Message"].str.lower()


Removing special charachters

In [100]:
df["Message"] = df["Message"].apply(lambda x: re.sub(r"[^a-z0-9\s]", "", x))

Trimming extra spaces

In [101]:
df["Message"] = df["Message"].str.strip()

Save the Cleaned Data

In [109]:
df.to_csv("cleaned_spam.csv", index=False)

# **BACK PROPOGATION**

In [110]:
df = pd.read_csv("cleaned_spam.csv")
df.head()

Unnamed: 0,Category,Message
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...


Checking for NaN values in Message

In [111]:
df['Message'].isna().sum()

2

Replace NaN values in 'Message' column with empty strings





In [112]:
df['Message'] = df['Message'].fillna('')

Convert text into numerical features (TF-IDF)

In [113]:
vectorizer = TfidfVectorizer(max_features=5000)  # Limit features to 5000 most important words
X = vectorizer.fit_transform(df["Message"]).toarray()
y = df["Category"].values  # Target labels (0 = ham, 1 = spam)

Split data into training and testing sets

In [114]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Define Neural Network with Forward and Backward Propagation

In [115]:
class NeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        # Initialize weights and biases
        self.W1 = np.random.randn(input_size, hidden_size) * 0.01  # Input → Hidden
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * 0.01  # Hidden → Output
        self.b2 = np.zeros((1, output_size))
        self.lr = learning_rate  # Learning rate

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))  # Sigmoid activation function

    def sigmoid_derivative(self, x):
        return x * (1 - x)  # Derivative for backpropagation

    def forward(self, X):
        # Forward propagation
        self.z1 = np.dot(X, self.W1) + self.b1
        self.a1 = self.sigmoid(self.z1)
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.a2 = self.sigmoid(self.z2)  # Output prediction
        return self.a2

    def backward(self, X, y, output):
        # Compute the error
        error = y - output  # Difference between actual and predicted values
        d_output = error * self.sigmoid_derivative(output)  # Gradient of output layer

        # Backpropagate error to hidden layer
        error_hidden = d_output.dot(self.W2.T)
        d_hidden = error_hidden * self.sigmoid_derivative(self.a1)

        # Update weights and biases using gradient descent
        self.W2 += self.a1.T.dot(d_output) * self.lr
        self.b2 += np.sum(d_output, axis=0, keepdims=True) * self.lr
        self.W1 += X.T.dot(d_hidden) * self.lr
        self.b1 += np.sum(d_hidden, axis=0, keepdims=True) * self.lr

    def train(self, X, y, epochs=1000):
        for epoch in range(epochs):
            output = self.forward(X)  # Forward pass
            self.backward(X, y.reshape(-1, 1), output)  # Backward pass

            # Print loss every 100 epochs
            if epoch % 100 == 0:
                loss = np.mean((y.reshape(-1, 1) - output) ** 2)  # Mean Squared Error
                print(f"Epoch {epoch}, Loss: {loss:.4f}")


### Train the Model

In [118]:
# Define network parameters
input_size = X_train.shape[1]  # Number of input features (TF-IDF features)
hidden_size = 10  # Hidden layer size
output_size = 1  # Single output (spam or ham)

# Initialize and train the neural network
nn = NeuralNetwork(input_size, hidden_size, output_size)
nn.train(X_train, y_train, epochs=1000)


Epoch 0, Loss: 0.2420
Epoch 100, Loss: 0.1224
Epoch 200, Loss: 0.1224
Epoch 300, Loss: 0.1224
Epoch 400, Loss: 0.1224
Epoch 500, Loss: 0.1224
Epoch 600, Loss: 0.1224
Epoch 700, Loss: 0.1224
Epoch 800, Loss: 0.1224
Epoch 900, Loss: 0.1224


## Make Predictions

In [117]:
# Predict on test data
predictions = nn.forward(X_test)
predictions = (predictions > 0.5).astype(int)  # Convert probabilities to 0 or 1

# Evaluate accuracy
accuracy = np.mean(predictions.flatten() == y_test)
print(f"Test Accuracy: {accuracy:.4f}")


Test Accuracy: 0.8682
