In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

In [2]:
file_path = '/content/drive/MyDrive/Neuraaal/Assign_5/email_classification.csv'
data = pd.read_csv(file_path)

In [4]:
data

Unnamed: 0,email,label
0,Upgrade to our premium plan for exclusive acce...,ham
1,Happy holidays from our team! Wishing you joy ...,ham
2,We're hiring! Check out our career opportuniti...,ham
3,Your Amazon account has been locked. Click her...,spam
4,Your opinion matters! Take our survey and help...,ham
...,...,...
174,We're pleased to inform you that your refund h...,ham
175,Get rich quick! Invest in our revolutionary ne...,spam
176,Your free trial period is ending soon. Upgrade...,ham
177,Your order is on its way! Track your shipment ...,ham


In [5]:
texts = data['email'].values
labels = data['label'].values

In [6]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(texts, encoded_labels, test_size=0.2, random_state=42)

In [9]:
# Tokenize the text data
tokenizer = Tokenizer(num_words=5000)  # Use top 5000 words
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [11]:
# Pad sequences to ensure uniform input length
max_sequence_length = 100  # Truncate or pad to 100 words
X_train_padded = pad_sequences(X_train_seq, maxlen=max_sequence_length)
X_test_padded = pad_sequences(X_test_seq, maxlen=max_sequence_length)

In [12]:
# Define the RNN model
model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=max_sequence_length),  # Embedding layer
    SimpleRNN(64, activation='relu'),  # Simple RNN layer
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])



In [13]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [14]:
model.summary()

In [16]:
history = model.fit(X_train_padded, y_train, epochs=10, batch_size=32, validation_data=(X_test_padded, y_test))

Epoch 1/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step - accuracy: 0.9900 - loss: 0.0896 - val_accuracy: 0.8889 - val_loss: 0.2548
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 82ms/step - accuracy: 0.9868 - loss: 0.0529 - val_accuracy: 0.8611 - val_loss: 0.2748
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step - accuracy: 1.0000 - loss: 0.0498 - val_accuracy: 0.9167 - val_loss: 0.2110
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 1.0000 - loss: 0.0192 - val_accuracy: 0.8889 - val_loss: 0.2345
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 1.0000 - loss: 0.0184 - val_accuracy: 0.8889 - val_loss: 0.2416
Epoch 6/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 1.0000 - loss: 0.0044 - val_accuracy: 0.8889 - val_loss: 0.2270
Epoch 7/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━

In [17]:
test_loss, test_accuracy = model.evaluate(X_test_padded, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.8843 - loss: 0.2522
Test Loss: 0.24218140542507172
Test Accuracy: 0.8888888955116272
