In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for input_path in (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout,Bidirectional
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
import string

In [None]:
# Fetch NLTK's 'punkt' resource before any cell that calls word_tokenize runs.
# NOTE(review): some newer NLTK releases reportedly load 'punkt_tab' inside
# word_tokenize instead — confirm against the installed NLTK version.
nltk.download('punkt')

In [None]:
# Location of the raw spam/ham CSV inside the read-only Kaggle input mount.
SPAM_CSV_PATH = '/kaggle/input/email-spam-detection-dataset-classification/spam.csv'

# Decoded as latin-1 (presumably the file is not valid UTF-8 — verify if the source changes).
df = pd.read_csv(SPAM_CSV_PATH, encoding='latin-1')

In [None]:
# Tidy the raw frame in one chain:
#   1. discard the three 'Unnamed' columns,
#   2. give v1/v2 readable names,
#   3. encode the label as an integer (ham -> 0, spam -> 1).
df = (
    df
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={"v1": "Category", "v2": "Text"})
    .assign(Category=lambda frame: frame['Category'].map({'ham': 0, 'spam': 1}).astype(int))
)

In [None]:
# Number of rows per encoded label (0 = ham, 1 = spam).
class_counts = df['Category'].value_counts()
print(class_counts)

In [None]:
# Length in characters (not tokens) of every message; the cell displays the largest.
l = [len(message) for message in df['Text']]
max(l)

In [None]:
def preprocess_text(text):
    """Lower-case and tokenise a message, keeping purely alphabetic tokens.

    Args:
        text: The raw message. Anything that is not a ``str`` (for example the
            NaN pandas produces for an empty CSV cell, or ``None``) is treated
            as an empty message instead of raising ``AttributeError``.

    Returns:
        list[str]: Lower-cased tokens for which ``str.isalpha()`` is true, in
        their original order. Punctuation, numbers and mixed tokens are dropped.
    """
    # Guard first: .lower() only exists on strings, and a single missing cell
    # would otherwise abort the whole DataFrame.apply call.
    if not isinstance(text, str):
        return []

    return [token for token in word_tokenize(text.lower()) if token.isalpha()]

In [None]:
# One token list per message, produced element-wise by preprocess_text.
df['tokens'] = df['Text'].map(preprocess_text)

In [None]:
# Show the first rows of the frame as this cell's output.
df.head()

In [None]:
# Train 100-dimensional Word2Vec vectors on the tokenised messages.
#
# Reproducibility: `seed` pins the random initialisation, and a single worker
# thread removes the scheduling-dependent update order that multi-threaded
# training introduces. The corpus is small enough that one thread is fine.
# NOTE(review): gensim also derives initial vectors from string hashes, so a
# bit-identical rerun additionally needs PYTHONHASHSEED fixed before the kernel
# starts — confirm if exact reproducibility matters.
# NOTE(review): the vectors are fitted on every row of df, i.e. before the
# train/test split that happens in a later cell.
model_w2v = Word2Vec(
    sentences=df['tokens'].tolist(),
    vector_size=100,
    window=5,
    min_count=1,  # keep every token, even those seen once
    seed=42,
    workers=1,
)


print(model_w2v)

In [None]:
def get_word2vec_embeddings(tokens, model=None):
    """Look up one embedding vector per token.

    Args:
        tokens: Iterable of token strings.
        model: Object exposing a gensim-style ``.wv`` keyed-vector store
            (supports ``in``, ``[]`` and ``vector_size``). Defaults to the
            notebook-level ``model_w2v``.

    Returns:
        list: One vector per token, in input order. A token that is missing
        from the vocabulary is represented by a float32 zero vector whose
        length is the model's ``vector_size``.
    """
    keyed_vectors = (model_w2v if model is None else model).wv

    # Size and type the fallback from the model itself, so it keeps matching
    # the real vectors if the embedding width is ever changed.
    width = keyed_vectors.vector_size

    return [
        keyed_vectors[word] if word in keyed_vectors else np.zeros(width, dtype=np.float32)
        for word in tokens
    ]


# Swap every token list for the matching list of embedding vectors.
df['embeddings'] = df['tokens'].map(get_word2vec_embeddings)

# Plain-text preview of the first few rows.
print(df['embeddings'].head())

In [None]:
# Hard upper bound on the padded sequence length (the previously fixed value).
MAX_LENGTH_CAP = 1000

# The padded axis counts tokens per message, so size it from the longest
# embedded message instead of always allocating the cap. Every padded step costs
# one extra zero row per message in X and one extra LSTM step per sample.
# The max(1, ...) keeps the axis non-empty even if no message has any token.
# NOTE: anything that later feeds new text to the trained model must pad to this
# same max_length.
longest_sequence = int(df['embeddings'].map(len).max())
max_length = max(1, min(MAX_LENGTH_CAP, longest_sequence))

# Zero-pad (or cut) at the end of each sequence -> (n_messages, max_length, embedding_dim).
X = pad_sequences(df['embeddings'], maxlen=max_length, padding='post', truncating='post', dtype='float32')


print(X.shape)

In [None]:
# Integer labels aligned row-for-row with X (0 = ham, 1 = spam).
y = df['Category'].values

# Hold out 30% for testing. Stratifying on y keeps the ham/spam proportion the
# same in both partitions, so precision/recall on the test set are not skewed by
# an unlucky draw of the rarer class. random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Display the label vector taken from df['Category'] as this cell's output.
y

In [None]:

# Classifier: two stacked bidirectional LSTM layers over the padded embedding
# sequences, then a single sigmoid unit that outputs the spam probability.
model = Sequential([
    # First recurrent layer returns the full sequence so it can be stacked.
    Bidirectional(LSTM(64, return_sequences=True), input_shape=(max_length, 100)),
    # Second recurrent layer collapses the sequence to one vector per message.
    Bidirectional(LSTM(64)),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
# Train for 20 epochs, recording per-epoch metrics in `history`.
# NOTE(review): the test partition doubles as validation data here, so the
# validation curves and the final test score come from the same rows — consider
# a separate validation split if the curves are used to tune anything.
history = model.fit(X_train, y_train, epochs=20, batch_size=64, validation_data=(X_test, y_test))

test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")

# predict() returns one column of probabilities, shape (n, 1). Flatten it so the
# predictions have the same 1-D shape as y_test; comparing (n, 1) against (n,)
# element-wise would silently broadcast to an (n, n) matrix.
y_prob = model.predict(X_test).ravel()

# Threshold the probabilities at 0.5 to get hard 0/1 labels.
y_pred = (y_prob > 0.5).astype(int)

In [None]:
# Report each test-set metric on its own line, in a fixed order.
for label, metric in (
    ("Accuracy: ", accuracy_score),
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1 Score: ", f1_score),
):
    print(label, metric(y_test, y_pred))

In [None]:
# Open a new 12x6 inch matplotlib figure.
# NOTE(review): with the inline notebook backend a figure is normally rendered
# and closed at the end of the cell that created it, so plotting calls made in a
# different cell may not draw on this figure — confirm.
plt.figure(figsize=(12, 6))

In [None]:

# Create the figure and both axes inside this cell so the 12x6 size is applied
# to the figure that is actually drawn; relying on a figure opened in another
# cell does not work once that cell has already rendered its output.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: accuracy per epoch.
ax_acc.plot(history.history['accuracy'], label='Train Accuracy')
ax_acc.plot(history.history['val_accuracy'], label='Validation Accuracy')
ax_acc.set_title('Accuracy over epochs')
ax_acc.set_xlabel('Epochs')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()

# Right panel: loss per epoch.
ax_loss.plot(history.history['loss'], label='Train Loss')
ax_loss.plot(history.history['val_loss'], label='Validation Loss')
ax_loss.set_title('Loss over epochs')
ax_loss.set_xlabel('Epochs')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

fig.tight_layout()
plt.show()

In [None]:
# Write the trained classifier to the Kaggle working directory, which is kept as
# notebook output when a version is saved.
model.save('/kaggle/working/spam_classifier_model.h5')

In [None]:
# Write the Word2Vec model alongside the classifier; the same embeddings are
# needed to turn new text into model inputs.
model_w2v.save('/kaggle/working/word2vec_model.model')