#### Importing all the libraries

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras import layers, models

#### loading the dataset

In [18]:
data=pd.read_csv(r"C:\Users\DELL\Downloads\archive.zip")

In [17]:
data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


#### handling missing values

In [21]:
# Checking for missing values
print("Number of missing values:")
print(data.isnull().sum())


Number of missing values:
Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64


In [23]:

# Handling duplicates
print("\nNumber of duplicate rows:", data.duplicated().sum())



Number of duplicate rows: 0


In [25]:
#since categorical values dont have much outliers we just handle missing values
# Handling missing values
data.dropna(inplace=True)


In [27]:
# Confirming if duplicates are removed

print("Number of duplicate rows:", data.duplicated().sum())


Number of duplicate rows: 0


In [28]:
# Handling missing values
data.dropna(inplace=True)

#### splitting the data

In [30]:
X = data['text']
y = data['label_num']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the sizes of the training and test sets
print("Training set size:", len(X_train))
print("Test set size:", len(X_test))

Training set size: 4136
Test set size: 1035


#### building tensor flow model

In [31]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


In [55]:
#tokenisation and padding
#vectorising the data into integers and ensuring same length sequence
tokenizer = Tokenizer(oov_token="<OOV>")#out of vocabulary words
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)#text is converted to a sequence of integers


In [56]:
#padding
max_length = 100  # Assuming a maximum sequence length of 100

X_train_padded = pad_sequences(sequences_train, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(sequences_test, maxlen=max_length, padding='post', truncating='post')


In [36]:
embedding_dim = 50  # Assuming an embedding dimension of 50

# Build the TensorFlow model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(word_index) + 1, output_dim=embedding_dim),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_padded, y_train, epochs=5, validation_split=0.2)

Epoch 1/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 189ms/step - accuracy: 0.6976 - loss: 0.5973 - val_accuracy: 0.7983 - val_loss: 0.3448
Epoch 2/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 178ms/step - accuracy: 0.8101 - loss: 0.3378 - val_accuracy: 0.7923 - val_loss: 0.3437
Epoch 3/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 180ms/step - accuracy: 0.7783 - loss: 0.3962 - val_accuracy: 0.7101 - val_loss: 0.6000
Epoch 4/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 177ms/step - accuracy: 0.7060 - loss: 0.5860 - val_accuracy: 0.9251 - val_loss: 0.2473
Epoch 5/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 178ms/step - accuracy: 0.9246 - loss: 0.2336 - val_accuracy: 0.8539 - val_loss: 0.4158


<keras.src.callbacks.history.History at 0x20bea3cd5b0>

#### Model evaluation

In [37]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [38]:
#converting the text data (X_test) into sequences of integers using the same tokenizer
sequences_test = tokenizer.texts_to_sequences(X_test)


In [39]:
# ensures that each sequence has a fixed length by adding padding tokens at the end of each sequence 
X_test_padded = pad_sequences(sequences_test, maxlen=max_length, padding='post', truncating='post')


In [41]:
#trained model to make predictions on the preprocessed test data 
y_pred = model.predict_classes(X_test_padded)
# Convert probabilities to class labels (0 or 1)
y_pred = (y_pred_prob > 0.5).astype(int)

AttributeError: 'Sequential' object has no attribute 'predict_classes'

In [44]:
# Predict probabilities for each class
y_pred_prob = model.predict(X_test_padded)

# Convert probabilities to class labels (0 or 1)
y_pred = (y_pred_prob > 0.5).astype(int)


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 48ms/step


In [46]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)


In [47]:
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.855072463768116
Precision: 0.9281437125748503
Recall: 0.5290102389078498
F1 Score: 0.6739130434782609
Confusion Matrix:
[[730  12]
 [138 155]]


#### True Positives (730): Spam emails correctly identified as spam.
#### True Negatives (155): Ham emails correctly identified as ham.
#### False Positives (12): Ham emails incorrectly classified as spam.
#### False Negatives (138): Spam emails incorrectly classified as ham.

In [54]:
# Assuming 'input_text' contains the input email text
input_text = "Your input email text here"

# Tokenize the input text
input_sequence = tokenizer.texts_to_sequences([input_text])

# Pad the input sequence to match the maximum sequence length
max_length = 5916  # Assuming the maximum sequence length is 5916
input_padded = pad_sequences(input_sequence, maxlen=max_length, padding='post')

# Predict the probability of spam
probability = model.predict(input_padded)[0][0]
print(f"Probability of spam: {probability}")

# Classify the email as spam or ham
threshold = 0.5
if probability > threshold:
    print("The email is spam.")
else:
    print("The email is ham.")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Probability of spam: 0.9598574638366699
The email is spam.
