In [None]:
!pip install --upgrade tensorflow
!pip install --upgrade keras 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import string 
import nltk
from nltk.corpus import stopwords 
nltk.download('stopwords')

import tensorflow as tf
import keras
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split 


In [None]:
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix 

In [None]:
!pip install imbalanced-learn 
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Load the two email exports. Rows from the first file are later labelled 0
# (ham) and rows from the second file 1 (spam).
ham = pd.read_csv('/kaggle/input/sodone/Ham 2.csv')
spam = pd.read_csv('/kaggle/input/sodone/Spam 2.csv')

In [None]:
# Build one free-text field per email from sender, subject and snippet.
# Missing values are replaced with '' first: pandas string concatenation
# propagates NaN, so a single empty field would otherwise turn the whole row's
# text into NaN (which the later str() conversion renders as the word "nan").
ham['text'] = (
    ham["From"].fillna('').astype(str)
    + ' ' + ham["Subject"].fillna('').astype(str)
    + " " + ham["Snippet"].fillna('').astype(str)
)
# Drop the three source columns and the remaining metadata columns so only the
# combined text is left.
ham = ham.drop(columns=["From", "Subject", "Snippet", "ID", "Thread", "Date", "To", "Labels", "Link"])

In [None]:
# Build one free-text field per email from sender, subject and snippet.
# Missing values are replaced with '' first: pandas string concatenation
# propagates NaN, so a single empty field would otherwise turn the whole row's
# text into NaN (which the later str() conversion renders as the word "nan").
spam['text'] = (
    spam["From"].fillna('').astype(str)
    + ' ' + spam["Subject"].fillna('').astype(str)
    + " " + spam["Snippet"].fillna('').astype(str)
)
# Drop the three source columns and the remaining metadata columns so only the
# combined text is left.
spam = spam.drop(columns=["From", "Subject", "Snippet", "ID", "Thread", "Date", "To", "Labels", "Link"])

In [None]:
# Binary target column: 0 for every ham row, 1 for every spam row.
ham['label'] = 0 
spam['label'] = 1

In [None]:
# Stack the ham and spam rows into one frame and renumber the index from 0.
combined = pd.concat([ham, spam], ignore_index=True)

In [None]:
# Display the combined frame.
combined

# Separate the data into features and labels 

In [None]:
# NOTE(review): these Series are taken before the cleaning cells reassign
# combined["text"], so `features` keeps the raw, uncleaned text.
features = combined["text"]
labels = combined["label"]

In [None]:
# Remove stopwords 
# English stop words, loaded lazily on first use and then shared by every call.
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # A frozenset gives constant-time membership tests; the raw NLTK list
        # would be scanned linearly for every word.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def remove_stopwords(text):
    """Lowercase ``text`` and drop English stop words.

    Args:
        text: Value to clean. It is converted with ``str()`` before
            processing, so non-string values are accepted.

    Returns:
        A string of the remaining lowercased words joined by single spaces.
        Empty if every word was a stop word or the input had no words.
    """
    stop_words = _english_stop_words()
    # Split on whitespace first, then lowercase each word before the lookup.
    lowered_words = (word.lower() for word in str(text).split())
    return " ".join(word for word in lowered_words if word not in stop_words)

In [None]:
# Strip stop words from every email; apply() passes each value straight to the helper.
combined["text"] = combined["text"].apply(remove_stopwords)
print(features)
combined.head(6)

# Remove punctuation

In [None]:
punctuations_list = string.punctuation
# Translation table that deletes every character in punctuations_list. It is
# the same for every input, so it is built once instead of once per character.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuations_list)


def remove_punctuations(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without ASCII punctuation. An empty string is
        returned unchanged; the earlier loop-based version raised
        ``UnboundLocalError`` for empty input because the table was only
        created inside the loop over the characters.
    """
    return text.translate(_PUNCTUATION_TABLE)

In [None]:
# Strip punctuation from every email; apply() passes each value straight to the helper.
combined["text"] = combined["text"].apply(remove_punctuations)
combined

In [None]:
# Number of rows per label before balancing.
combined["label"].value_counts()

# Balance the dataset 

In [None]:
# Oversample with a fixed seed so both labels end up equally represented.
# NOTE(review): this runs before train_test_split, so copies of the same email
# can land in both the train and the test set and inflate the test score.
# Consider splitting first and oversampling only the training part.
ros = RandomOverSampler(random_state = 42)
features_resampled, labels_resampled = ros.fit_resample(combined[["text"]], combined["label"])

In [None]:
# Number of rows per label after oversampling.
labels_resampled.value_counts()

In [None]:
# Rebuild a single frame that holds the resampled "text" and "label" columns
# side by side.
features_resampled = pd.DataFrame(features_resampled, columns=["text"])
labels_resampled = pd.DataFrame(labels_resampled, columns=["label"])
resampled = pd.concat([features_resampled, labels_resampled], axis=1)

In [None]:
# Display the balanced frame.
resampled

# Train/test split and text-to-sequence conversion

In [None]:
# Hold out 20% of the resampled rows for testing; the split is seeded.
train, test = train_test_split(resampled, test_size = 0.2, random_state = 42)

In [None]:
# Separate each partition into its text (X) and label (Y) columns.
train_X, train_Y = train["text"], train["label"]
test_X, test_Y = test["text"], test["label"]

In [None]:
# Label distribution in the training split.
print(train_Y.value_counts())

In [None]:
# Label distribution in the test split.
print(test_Y.value_counts())

In [None]:
# Tokenize the data: the tokenizer is fitted on the training text only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_X)

# Convert both splits to integer sequences with that same fitted tokenizer.
train_sequences = tokenizer.texts_to_sequences(train_X)
test_sequences = tokenizer.texts_to_sequences(test_X)

In [None]:
# test_sequences is a plain Python list at this point, so printing it whole
# would dump every test email into the output; show a small sample instead.
print(test_sequences[:5])

In [None]:
# Bring every sequence to exactly max_len entries: shorter ones are padded at
# the end, longer ones are cut off at the end.
max_len = 100 
train_sequences = pad_sequences(train_sequences, maxlen=max_len, padding='post', truncating='post')
test_sequences = pad_sequences(test_sequences, maxlen=max_len, padding='post', truncating='post')

In [None]:
# Shapes of the padded train and test arrays.
print(train_sequences.shape)
test_sequences.shape

In [None]:
# Show the padded training sequences.
print(train_sequences)

In [None]:
# Binary spam classifier: learned word embeddings -> LSTM -> dense head.
model = tf.keras.models.Sequential()
# Declare the input shape with an explicit Input layer. The Embedding
# `input_length` argument used previously is deprecated in Keras 3, and an
# Input layer also lets summary() report output shapes before training.
model.add(tf.keras.Input(shape=(max_len,)))
# input_dim is the number of known words plus one, because index 0 is never
# assigned to a word by the tokenizer.
model.add(tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=32))
# NOTE(review): the sequences are post-padded with zeros and no masking is
# configured, so the LSTM also reads the padding — confirm this is intended.
model.add(tf.keras.layers.LSTM(128))
model.add(tf.keras.layers.Dense(64, activation='relu'))
# A single sigmoid unit, paired with the binary cross-entropy loss below.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Vocabulary size, computed the same way as the Embedding layer's input_dim:
# number of words known to the tokenizer plus one.
vocab = len(tokenizer.word_index) + 1
print(vocab)

In [None]:
# Train for 10 epochs in batches of 32, holding out 20% of the training data
# for validation; the returned History object is kept in `history`.
history = model.fit(train_sequences, train_Y, epochs=10,batch_size=32, validation_split=0.2)

In [None]:
# Print the model summary again after training.
model.summary()

In [None]:
# Score the held-out sequences and report the loss and accuracy, one per line.
test_loss, test_accuracy = model.evaluate(test_sequences, test_Y)
print(f"Test Loss: {test_loss}", f"Test Accuracy: {test_accuracy}", sep="\n")