# Word2Vec Embeddings as Input

In [None]:
import numpy as np
import pickle
import os

In [None]:
data_path = '/kaggle/input/nlp-a3-word2vec'


def _load_pickle(filename):
    """Unpickle and return the object stored in ``data_path/filename``."""
    # SECURITY: pickle.load can execute arbitrary code during deserialization.
    # Only point data_path at artifacts that you produced or otherwise trust.
    with open(os.path.join(data_path, filename), 'rb') as f:
        return pickle.load(f)


# Word embeddings
word_embeddings = _load_pickle('word_embeddings.pkl')

# Vocabulary
vocabulary = _load_pickle('vocabulary.pkl')

# word2idx / idx2word mappings
word2idx = _load_pickle('word2idx.pkl')
idx2word = _load_pickle('idx2word.pkl')

In [None]:
# Sanity check: report how many entries each loaded artifact contains.
embedding_count = len(word_embeddings)
vocabulary_size = len(vocabulary)
print(f"Loaded {embedding_count} word embeddings.")
print(f"Vocabulary size: {vocabulary_size}")

# Email Dataset

## Libraries

In [None]:
import pandas as pd
import re

In [None]:
# Read the labelled e-mail dataset and preview its first rows.
csv_path = '/kaggle/input/spam-or-not-spam-dataset/spam_or_not_spam.csv'
df = pd.read_csv(csv_path)

df.head()

In [None]:
# Class distribution: number of rows for each value of 'label'.
label_counts = df['label'].value_counts()
label_counts

## Data Preprocessing

### Remove missing values

In [None]:
# Collect the index labels of every row whose 'email' cell is null ...
email_is_missing = df['email'].isnull()
missing_email_index = df.loc[email_is_missing].index

# ... and remove those rows from the frame.
df = df.drop(index=missing_email_index)

### Balancing Dataset

In [None]:
# Split the frame by class: label 1 is spam, label 0 is not spam.
labels = df['label']
spam_df = df[labels == 1]
not_spam_df = df[labels == 0]

### Undersampling Not Spam messages

In [None]:
# Undersample the not-spam messages down to the number of spam messages so
# both classes end up the same size. The count is derived from the data
# instead of being hard-coded, and capped at the rows available because
# sampling without replacement cannot return more rows than exist.
n_not_spam = min(len(spam_df), len(not_spam_df))
not_spam_sampled_df = not_spam_df.sample(n=n_not_spam, random_state=42)

### Combine balanced dataset

In [None]:
# Stack the spam rows on top of the sampled not-spam rows, shuffle the result
# with a fixed seed so the classes are interleaved, and renumber the index.
balanced_df = (
    pd.concat([spam_df, not_spam_sampled_df])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

### Preprocessing

In [None]:
def preprocess_text(text, stopwords=None, vocab=None):
    """Lower-case, clean and tokenize one e-mail body.

    Every character that is not ``a``-``z`` or whitespace is removed, the text
    is split on whitespace, and only tokens that are not stop words and that
    are present in the vocabulary are kept (original order preserved).

    Args:
        text: Raw e-mail text. Anything that is not a ``str`` (for example a
            NaN cell) yields an empty token list.
        stopwords: Optional container of words to discard. Defaults to the
            module-level ``stop_words``; when that name is not defined, no
            stop words are removed.
        vocab: Optional container of allowed words. Defaults to the
            module-level ``vocabulary``.

    Returns:
        list[str]: The surviving tokens.

    Raises:
        NameError: If ``vocab`` is omitted and no module-level ``vocabulary``
            exists.
    """
    if not isinstance(text, str):
        return []

    if stopwords is None:
        try:
            stopwords = stop_words
        except NameError:
            # No stop-word list was set up; skip that filter rather than
            # crashing on every row.
            stopwords = frozenset()
    if vocab is None:
        vocab = vocabulary

    cleaned = re.sub(r'[^a-z\s]', '', text.lower())
    return [
        word for word in cleaned.split()
        if word not in stopwords and word in vocab
    ]

In [None]:
# Tokenize every e-mail body and store the result in a new 'tokens' column.
emails = balanced_df['email']
balanced_df['tokens'] = emails.apply(preprocess_text)

In [None]:
# Show the raw text next to its token list for the first few e-mails.
preview_columns = ['email', 'tokens']
balanced_df[preview_columns].head()

# Vectorizing Emails Using Word2Vec Embeddings

In [None]:
embedding_size = 10  # length of the zero vector substituted for a missing word
max_email_length = 12  # number of token slots kept per e-mail


def vectorize_email(email_tokens, word_embeddings):
    """Turn a token list into a fixed-size matrix of word vectors.

    Only the first ``max_email_length`` tokens are used. A token found in
    ``word_embeddings`` contributes its stored vector; any other token
    contributes a zero vector. Shorter e-mails are padded at the end with
    zero vectors.

    Returns:
        np.ndarray with ``max_email_length`` rows, one per token slot.
    """
    rows = [
        word_embeddings[token] if token in word_embeddings
        else np.zeros(embedding_size)
        for token in email_tokens[:max_email_length]
    ]
    # Fill the remaining slots so every e-mail has the same number of rows.
    padding_needed = max_email_length - len(rows)
    rows.extend(np.zeros(embedding_size) for _ in range(padding_needed))
    return np.array(rows)

### Preparing Features and Labels

In [None]:
import numpy as np

# Vectorize the token list of every e-mail (the 'tokens' column) and stack the
# per-e-mail matrices into one array.
email_matrices = [
    vectorize_email(tokens, word_embeddings)
    for tokens in balanced_df['tokens']
]
X = np.array(email_matrices)

In [None]:
# Integer class labels as a 1-D array, one entry per e-mail.
label_column = balanced_df['label']
y = label_column.astype(int).values

# Neural Network

### Split data for training and test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. The split is stratified on y so
# both parts keep the same class proportions, and seeded so it is repeatable.
split_parts = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Collapse everything after the sample axis into a single feature axis, so
# each sample becomes one row.
n_train = X_train.shape[0]
n_test = X_test.shape[0]
X_train_flat = X_train.reshape(n_train, -1)
X_test_flat = X_test.reshape(n_test, -1)

### Initialization

In [None]:
# Number of input features per flattened e-mail.
input_size = max_email_length * embedding_size

np.random.seed(42)  # For reproducibility

# Number of units in each of the two hidden branches.
hidden_layer_size = 8

# Hidden branch 1: weights drawn from N(0, 1) and scaled by sqrt(2 / fan_in),
# biases start at zero. W1 must be a real assignment (not part of a comment),
# otherwise every later use of W1 raises NameError.
W1 = np.random.randn(input_size, hidden_layer_size) * np.sqrt(2 / input_size)
b1 = np.zeros((1, hidden_layer_size))

# Hidden branch 2: same scheme, independent draw.
W2 = np.random.randn(input_size, hidden_layer_size) * np.sqrt(2 / input_size)
b2 = np.zeros((1, hidden_layer_size))

# Output unit: consumes the concatenation of both branches, hence 2x the width.
V = np.random.randn(hidden_layer_size * 2, 1) * 0.01
c = np.zeros((1, 1))

### Activation Functions

In [None]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), element-wise.

    Uses a numerically stable formulation: the exponential is only ever taken
    of a non-positive number, so large-magnitude negative inputs no longer
    overflow in ``np.exp``.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        A float scalar for scalar input, otherwise a float array shaped like
        ``x``.
    """
    x = np.asarray(x, dtype=float)
    shrink = np.exp(-np.abs(x))  # always in (0, 1], cannot overflow
    result = np.where(x >= 0, 1.0 / (1.0 + shrink), shrink / (1.0 + shrink))
    # Indexing with () unwraps a 0-d result to a scalar; arrays pass through.
    return result[()]

def sigmoid_derivative(x):
    """Return the derivative of the sigmoid evaluated at ``x``.

    Computed as ``sigmoid(x) * (1 - sigmoid(x))``.
    """
    activation = sigmoid(x)
    complement = 1 - activation
    return activation * complement

def relu(x):
    """Return the element-wise maximum of 0 and ``x``."""
    floor = 0
    return np.maximum(floor, x)

def relu_derivative(x):
    """Return 1.0 where ``x`` is strictly positive and 0.0 elsewhere."""
    is_positive = x > 0
    return is_positive.astype(float)

### Forward Propagation

In [None]:
def forward_propagation(X_sample, params=None):
    """Run one forward pass through the network.

    The input feeds two parallel ReLU branches. Their activations are
    concatenated along axis 1 and passed to a single sigmoid output unit.

    Args:
        X_sample: 2-D array with one sample per row.
        params: Optional dict with keys 'W1', 'b1', 'W2', 'b2', 'V', 'c'.
            When omitted, the module-level arrays W1, b1, W2, b2, V and c are
            used, which is the original behavior.

    Returns:
        tuple: ``(A_output, cache)``, where ``A_output`` is the sigmoid
        output and ``cache`` holds the input and every intermediate value
        needed by the backward pass.
    """
    if params is None:
        w1, bias1, w2, bias2, v, bias_out = W1, b1, W2, b2, V, c
    else:
        w1, bias1 = params['W1'], params['b1']
        w2, bias2 = params['W2'], params['b2']
        v, bias_out = params['V'], params['c']

    # Hidden branch 1
    Z1 = np.dot(X_sample, w1) + bias1
    A1 = relu(Z1)

    # Hidden branch 2
    Z2 = np.dot(X_sample, w2) + bias2
    A2 = relu(Z2)

    # Concatenate both branches along the feature axis
    A_hidden = np.concatenate((A1, A2), axis=1)

    # Output unit
    Z_output = np.dot(A_hidden, v) + bias_out
    A_output = sigmoid(Z_output)

    cache = {
        'X_sample': X_sample,
        'Z1': Z1, 'A1': A1,
        'Z2': Z2, 'A2': A2,
        'A_hidden': A_hidden,
        'Z_output': Z_output, 'A_output': A_output
    }
    return A_output, cache

### Backward Propagation

In [None]:
def backward_propagation(y_true, cache, params=None, l2_strength=None):
    """Compute L2-regularized gradients for one forward pass.

    Args:
        y_true: Array of target labels; reshaped to a column internally.
        cache: Dict returned by ``forward_propagation``. The keys read here
            are 'X_sample', 'Z1', 'Z2', 'A_hidden' and 'A_output'.
        params: Optional dict holding 'W1', 'W2' and 'V'. Defaults to the
            module-level ``parameters`` dict.
        l2_strength: Optional L2 coefficient. Defaults to the module-level
            ``reg_strength``.

    Returns:
        dict: Gradients under the keys 'dW1', 'db1', 'dW2', 'db2', 'dV', 'dc'.

    Raises:
        NameError: If a default is needed but the corresponding module-level
            name has not been defined yet.
    """
    if params is None:
        params = parameters
    if l2_strength is None:
        l2_strength = reg_strength

    X_sample = cache['X_sample']
    Z1 = cache['Z1']
    Z2 = cache['Z2']
    A_hidden = cache['A_hidden']
    A_output = cache['A_output']

    # Output layer: gradient with respect to the pre-sigmoid value.
    dZ_output = A_output - y_true.reshape(-1, 1)
    dV = np.dot(A_hidden.T, dZ_output)
    dc = np.sum(dZ_output, axis=0, keepdims=True)

    dA_hidden = np.dot(dZ_output, params['V'].T)

    # A_hidden is [A1 | A2], so the first Z1.shape[1] columns belong to
    # branch 1 and the remaining columns to branch 2.
    branch_width = Z1.shape[1]
    dA1 = dA_hidden[:, :branch_width]
    dA2 = dA_hidden[:, branch_width:]

    dZ1 = dA1 * relu_derivative(Z1)
    dZ2 = dA2 * relu_derivative(Z2)

    dW1 = np.dot(X_sample.T, dZ1)
    db1 = np.sum(dZ1, axis=0, keepdims=True)

    dW2 = np.dot(X_sample.T, dZ2)
    db2 = np.sum(dZ2, axis=0, keepdims=True)

    # L2 penalty applies to the weight matrices only; biases are left alone.
    dW1 += l2_strength * params['W1']
    dW2 += l2_strength * params['W2']
    dV += l2_strength * params['V']

    return {
        'dW1': dW1, 'db1': db1,
        'dW2': dW2, 'db2': db2,
        'dV': dV, 'dc': dc
    }

### Update Parameters

In [None]:
def update_parameters(params, grads, learning_rate):
    """Apply one gradient-descent step to every parameter, in place.

    Each entry ``params[name]`` is decreased by
    ``learning_rate * grads['d' + name]`` using an in-place update, so any
    other reference to the same array sees the new values.

    Returns:
        The same ``params`` dict that was passed in.
    """
    for name in ('W1', 'b1', 'W2', 'b2', 'V', 'c'):
        step = learning_rate * grads['d' + name]
        params[name] -= step
    return params

### Training Loop

In [None]:
# Pack parameters into a dictionary. The dict stores references to the same
# arrays as the module-level names, and update_parameters changes them in
# place, so both views stay in sync during training.
parameters = {
    'W1': W1, 'b1': b1,
    'W2': W2, 'b2': b2,
    'V': V, 'c': c
}

# Training hyperparameters
num_epochs = 100
initial_lr = 0.01
decay = 0.001
# L2 regularization coefficient. Defined once, before the loop, because it is
# constant and backward_propagation reads it as a module-level name.
reg_strength = 0.001  # Adjust as needed

num_samples = X_train_flat.shape[0]

for epoch in range(num_epochs):
    # Inverse-time decay: depends only on the epoch, so compute it once here
    # instead of once per sample.
    learning_rate = initial_lr / (1 + decay * epoch)

    total_loss = 0
    for i in range(num_samples):
        X_sample = X_train_flat[i].reshape(1, -1)  # Shape: (1, input_size)
        y_sample = y_train[i]  # Scalar

        # Forward propagation
        A_output, cache = forward_propagation(X_sample)

        # Binary cross-entropy; 1e-8 keeps log() away from zero.
        loss = - (y_sample * np.log(A_output + 1e-8) + (1 - y_sample) * np.log(1 - A_output + 1e-8))
        # Add the L2 penalty on the weight matrices.
        loss = loss + (reg_strength / 2) * (
            np.sum(np.square(parameters['W1'])) +
            np.sum(np.square(parameters['W2'])) +
            np.sum(np.square(parameters['V']))
        )

        total_loss += loss

        # Backward propagation
        gradients = backward_propagation(np.array([y_sample]), cache)

        # Update parameters (one step per sample)
        parameters = update_parameters(parameters, gradients, learning_rate)

    # total_loss has accumulated (1, 1) arrays, hence the [0][0] below.
    average_loss = total_loss / num_samples
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss[0][0]}")

### Evaluating Model

In [None]:
def predict(X, parameters):
    """Return hard 0/1 predictions for every row of ``X``.

    Each row is reshaped to a single-sample batch, passed through
    ``forward_propagation``, and labelled 1 when the output is at least 0.5.

    NOTE(review): ``parameters`` is accepted but never read in this function;
    ``forward_propagation`` is called with the sample only.
    """
    return np.array([
        1 if forward_propagation(row.reshape(1, -1))[0] >= 0.5 else 0
        for row in X
    ])

In [None]:
# Import necessary metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Hard 0/1 predictions for the test split.
predictions_test = predict(X_test_flat, parameters)

# Score the predictions with each metric, in the order they are reported.
accuracy, precision, recall, f1 = (
    metric(y_test, predictions_test)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Confusion matrix, first as text ...
cm = confusion_matrix(y_test, predictions_test)
print("Confusion Matrix:")
print(cm)

# ... then as a plot.
fig, ax = plt.subplots()
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=['Not Spam', 'Spam'],
)
disp.plot(ax=ax)
plt.title('Confusion Matrix')
plt.show()

### Classification Threshold

In [None]:
def predict_probabilities(X, parameters):
    """Return the network's output value for every row of ``X``.

    Each row is reshaped to a single-sample batch and passed through
    ``forward_propagation``; the single output value is collected.

    NOTE(review): ``parameters`` is accepted but never read in this function.

    Returns:
        np.ndarray with one entry per row of ``X``.
    """
    outputs = (forward_propagation(row.reshape(1, -1))[0] for row in X)
    return np.array([output[0][0] for output in outputs])

In [None]:
from sklearn.metrics import roc_curve, auc

# Score the test set first: roc_curve needs the predicted probabilities, and
# y_scores has not been assigned by any earlier cell.
y_scores = predict_probabilities(X_test_flat, parameters)

fpr, tpr, roc_thresholds = roc_curve(y_test, y_scores)
roc_auc = auc(fpr, tpr)

In [None]:
# Step 1: Predicted probabilities for the test split
y_scores = predict_probabilities(X_test_flat, parameters)

# Step 2: 101 evenly spaced candidate thresholds from 0 to 1 inclusive
thresholds = np.linspace(0, 1, 101)

# Step 3: Precision, recall and F1 at every threshold.
# zero_division=0 scores a metric as 0 when it would otherwise be undefined.
precision_scores, recall_scores, f1_scores = [], [], []

for threshold in thresholds:
    predictions = (y_scores >= threshold).astype(int)
    precision, recall, f1 = (
        score_fn(y_test, predictions, zero_division=0)
        for score_fn in (precision_score, recall_score, f1_score)
    )
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Step 4: Draw all three metric curves against the threshold on one figure
plt.figure(figsize=(8, 6))
curves = (
    (precision_scores, 'Precision', 'blue'),
    (recall_scores, 'Recall', 'green'),
    (f1_scores, 'F1 Score', 'red'),
)
for values, curve_label, curve_color in curves:
    plt.plot(thresholds, values, label=curve_label, color=curve_color)
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Precision, Recall, and F1 Score vs. Threshold')
plt.legend()
plt.grid(True)
plt.show()

# Step 5: Pick the threshold with the highest F1 score (first one on ties)
best_f1_index = np.argmax(f1_scores)
optimal_threshold = thresholds[best_f1_index]
print(f'Optimal Threshold based on F1 Score: {optimal_threshold:.2f}')

# Step 6: Re-label the test samples with that threshold
predictions_adjusted = (y_scores >= optimal_threshold).astype(int)

# Step 7: Score the adjusted predictions.
# NOTE(review): the threshold was chosen on the same y_test that is scored
# here, so these numbers are likely optimistic.
accuracy_adjusted, precision_adjusted, recall_adjusted, f1_adjusted = (
    metric(y_test, predictions_adjusted)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
cm_adjusted = confusion_matrix(y_test, predictions_adjusted)

print(f"Adjusted Test Accuracy: {accuracy_adjusted * 100:.2f}%")
print(f"Adjusted Precision: {precision_adjusted:.2f}")
print(f"Adjusted Recall: {recall_adjusted:.2f}")
print(f"Adjusted F1 Score: {f1_adjusted:.2f}")
print("Adjusted Confusion Matrix:")
print(cm_adjusted)

# Step 8: ROC curve and the area under it
fpr, tpr, roc_thresholds = roc_curve(y_test, y_scores)
roc_auc = auc(fpr, tpr)

roc_label = f'ROC Curve (AUC = {roc_auc:.2f})'

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=roc_label)
# Dashed diagonal for visual reference
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()