<a href="https://colab.research.google.com/github/malgavesaurabh/Python-Projects/blob/main/EmailClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install faker

Collecting faker
  Downloading Faker-19.2.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faker
Successfully installed faker-19.2.0


In [None]:
import csv
from faker import Faker
import os

# Install the faker library if not already installed
try:
    import faker
except ImportError:
    !pip install Faker
    import faker

# Create a Faker object
fake = Faker()

# Number of email samples you want to generate (6000 in this case)
num_samples = 6000

# Function to generate a random email body for unknown category
def generate_unknown_email():
    """Build one synthetic outreach email for the "Unknown" category.

    All variable fields (subject, recipient name, filler paragraph, URL,
    company and sender address) come from the module-level ``fake`` object.

    Returns:
        str: The complete email text: subject line, greeting, body and
        sign-off, with no trailing newline.
    """
    # List elements are evaluated left to right, so the Faker calls run in
    # the same order as the fields appear in the finished text.
    fragments = [
        f"Subject: {fake.catch_phrase()}\n\n",
        f"Dear {fake.name()},\n\n",
        "I hope this email finds you well. We are reaching out to you to introduce our company and services. We believe that our offerings can add significant value to your business.\n\n",
        f"{fake.paragraph()}\n\n",
        f"If you are interested in learning more, please reply to this email or visit our website: {fake.url()}\n\n",
        "Thank you for your time and consideration.\n\n",
        "Best regards,\n",
        f"{fake.company()}\n",
        f"{fake.email()}",
    ]
    return "".join(fragments)

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Replace "/content/drive/MyDrive/DataSets" with the desired folder path in your Google Drive
folder_path = "/content/drive/MyDrive/DataSets"

# Create the "DataSets" folder if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

# Update the file path to save the CSV file in your Google Drive
file_path = os.path.join(folder_path, "unknown_dataset.csv")

# Generate the unknown email samples and save them to a CSV file.
# newline="" hands line-ending control to the csv module, which matters here
# because every email body contains embedded newlines.
with open(file_path, "w", newline="", encoding="utf-8") as csvfile:
    fieldnames = ["Label", "Email Body"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Every row carries the same label; only the generated body differs.
    for _ in range(num_samples):
        writer.writerow({"Label": "Unknown", "Email Body": generate_unknown_email()})

# Report the real sample count and file name instead of hard-coded values, so
# the message stays truthful when num_samples or file_path is changed.
print(f"{num_samples} unknown email samples have been generated and saved to '{os.path.basename(file_path)}'.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
6000 unknown email samples have been generated and saved to 'unknown_dataset.csv'.


In [None]:
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.preprocessing import LabelEncoder
import joblib

# Function to clean email bodies
def clean_email_bodies(email_bodies):
    """Strip boilerplate from raw email texts before tokenization.

    Each body goes through these steps, in order:
      1. remove a ``Subject:`` header (case-insensitive) up to the end of
         its line;
      2. remove everything from the first ``--`` onwards (treated as the
         signature);
      3. remove whitespace-delimited tokens that contain ``@``;
      4. remove runs of ten or more consecutive digits;
      5. collapse every whitespace run to one space and trim both ends.

    Args:
        email_bodies: Iterable of email texts. ``None`` and NaN entries
            (e.g. the NaN pandas produces for an empty CSV cell) are cleaned
            to ``""`` instead of making ``re.sub`` raise ``TypeError``; any
            other non-string value is converted with ``str()``.

    Returns:
        list[str]: One cleaned string per input entry, in input order, so
        the result stays aligned with a parallel sequence of labels.
    """
    # NOTE(review): the signature rule cuts at the first "--" anywhere in the
    # text, including inside a word or URL; confirm this is intended.
    substitutions = (
        (r"Subject:.*", "", re.IGNORECASE),  # subject header
        (r"--.*", "", re.DOTALL),            # signature through end of text
        (r"\S+@\S+", "", 0),                 # email addresses
        (r"\d{10,}", "", 0),                 # phone numbers
        (r"\s+", " ", 0),                    # extra whitespace
    )

    cleaned_bodies = []
    for body in email_bodies:
        if body is None or (isinstance(body, float) and body != body):
            # Missing value (None, or NaN: the only float unequal to itself).
            text = ""
        elif isinstance(body, str):
            text = body
        else:
            text = str(body)

        for pattern, replacement, flags in substitutions:
            text = re.sub(pattern, replacement, text, flags=flags)
        cleaned_bodies.append(text.strip())
    return cleaned_bodies

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

import os  # imported here because this cell does not otherwise import os

# File path to save the trained model in Google Drive
model_save_path = '/content/drive/MyDrive/Models/email_classification_model.h5'
# File path to save the label encoder in Google Drive
label_encoder_save_path = '/content/drive/MyDrive/Models/label_encoder.joblib'
# File path to save the fitted tokenizer in Google Drive
tokenizer_save_path = '/content/drive/MyDrive/Models/tokenizer.joblib'

# Make sure the output folder exists before anything is written into it;
# joblib.dump opens its target file directly and fails on a missing folder.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)

# File path to the dataset in Google Drive
dataset_path = '/content/drive/MyDrive/DataSets/DemoDataSet.csv'

# Step 1: Load and preprocess the dataset
data = pd.read_csv(dataset_path)
labels = data['Label']
email_bodies = data['Email Body']

# Clean the email bodies
email_bodies_cleaned = clean_email_bodies(email_bodies)

# Encode the labels to integers
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Save the label encoder
joblib.dump(label_encoder, label_encoder_save_path)

# Tokenize the text data and convert to numerical representation
# NOTE(review): the vocabulary and max_sequence_length are derived from the
# full dataset before the train/test split below, so the held-out emails
# influence preprocessing; confirm this is acceptable for the reported accuracy.
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(email_bodies_cleaned)
sequences = tokenizer.texts_to_sequences(email_bodies_cleaned)
max_sequence_length = max(len(sequence) for sequence in sequences)
X = pad_sequences(sequences, maxlen=max_sequence_length)

# Save the fitted tokenizer as well: encoding a new email needs exactly this
# word index, and it cannot be rebuilt from the saved model file.
joblib.dump(tokenizer, tokenizer_save_path)

# Step 2: Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, labels_encoded, test_size=0.2, random_state=42)

# Step 3: Build the CNN model
embedding_dim = 100
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_sequence_length))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))  # Dropout layer to reduce overfitting
model.add(Dense(len(label_encoder.classes_), activation='softmax'))  # Output layer with softmax for multiple classes

# Labels are integer-encoded (not one-hot), hence the sparse loss variant
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 4: Train the model on the training data
# validation_split holds out a further 20% of the training portion
batch_size = 32
epochs = 10
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)

# Step 5: Save the trained model
model.save(model_save_path)
print(f"Model saved at: {model_save_path}")

# Step 6: Evaluate the model on the testing data
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {accuracy * 100:.2f}%")



KeyboardInterrupt: ignored

In [None]:
import pandas as pd
import re
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import joblib
import numpy as np

# Function to clean email bodies
def clean_email_body(email_body):
    """Clean a single raw email text before it is tokenized.

    The substitutions are applied strictly in the order listed below, each
    one operating on the output of the previous step.

    Args:
        email_body (str): Raw email text.

    Returns:
        str: The cleaned text with leading and trailing whitespace removed.
    """
    rules = (
        (r"Subject:.*", "", re.IGNORECASE),  # subject header up to end of line
        (r"--.*", "", re.DOTALL),            # signature: first "--" through the end
        (r"\S+@\S+", "", 0),                 # email addresses
        (r"\d{10,}", "", 0),                 # phone numbers (10+ digits in a row)
        (r"\s+", " ", 0),                    # collapse whitespace runs
    )

    text = email_body
    for pattern, replacement, flags in rules:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text.strip()

import os  # imported here because this cell does not otherwise import os

# File path to the saved model in Google Drive
model_path = '/content/drive/MyDrive/Models/email_classification_model.h5'
# File path to the label encoder in Google Drive
label_encoder_path = '/content/drive/MyDrive/Models/label_encoder.joblib'
# Optional file holding a fitted Keras Tokenizer persisted with joblib.dump;
# it is consulted only when the kernel does not already hold a `tokenizer`.
tokenizer_path = '/content/drive/MyDrive/Models/tokenizer.joblib'

# Load the trained model
model = load_model(model_path)

# Load the label encoder
# (joblib.load unpickles the file; only load artifacts you created yourself)
label_encoder = joblib.load(label_encoder_path)

# This cell never creates a tokenizer itself. Reuse the one left in the kernel
# by a training run if there is one, otherwise restore it from disk, and fail
# with an actionable message rather than a bare NameError.
try:
    tokenizer
except NameError:
    if os.path.exists(tokenizer_path):
        tokenizer = joblib.load(tokenizer_path)
    else:
        raise RuntimeError(
            "No fitted tokenizer is available: run the training cell in this "
            f"session, or store the fitted tokenizer at {tokenizer_path}."
        ) from None

# The padding length must equal the sequence length the model was built with.
# When the kernel no longer holds it, read it from the loaded model's input shape.
try:
    max_sequence_length
except NameError:
    max_sequence_length = model.input_shape[1]
    if max_sequence_length is None:
        raise RuntimeError(
            "The loaded model does not record a fixed input length; "
            "set max_sequence_length to the value used during training."
        ) from None

# Get user input for email body
new_email_body = input("Enter the body of the email: ")
cleaned_email_body = clean_email_body(new_email_body)

# Preprocess the new data
sequences = tokenizer.texts_to_sequences([cleaned_email_body])
if not sequences[0]:
    # Empty input, or no word known to the tokenizer: the model would only
    # see padding, so the prediction below carries no information.
    print("Warning: the email contains no words known to the tokenizer; the prediction is not meaningful.")
X_new = pad_sequences(sequences, maxlen=max_sequence_length)

# Make predictions using the model
predictions = model.predict(X_new)

# Decode the predictions to their original labels and confidence scores
# (argmax picks the most probable class; its softmax output is the score)
predicted_labels = label_encoder.inverse_transform(predictions.argmax(axis=1))
confidence_scores = np.max(predictions, axis=1)

# Print the prediction and confidence score
print(f"Predicted Label: {predicted_labels[0]}")
print(f"Confidence Score: {confidence_scores[0]:.4f}")


Enter the body of the email: can you verify the below employee
Predicted Label: Employment Verification
Confidence Score: 0.5291
