## Build your own Project
## Email Search AI

Develop a generative search system for emails that helps an organisation find and validate past decisions, strategies, and data in a huge corpus of email threads. An email-thread dataset from Kaggle is used for this project.

## Loading datasets and importing libraries

In [None]:
import pandas as pd
import ast
import re
import os

In [None]:
# Directory containing the email-thread CSV files. It can be overridden with the
# EMAIL_THREAD_DATA_DIR environment variable; the default is the original
# location, so existing setups behave exactly as before.
DATA_DIR = os.environ.get(
    'EMAIL_THREAD_DATA_DIR',
    '/Applications/course/Upgrad/AI&ML/gen ai/module 11-HelpmateAI/dataset_email_thread/CSV',
)

# Load both CSV files from the same directory.
details = pd.read_csv(os.path.join(DATA_DIR, 'email_thread_details.csv'))
summaries = pd.read_csv(os.path.join(DATA_DIR, 'email_thread_summaries.csv'))

In [None]:
# Preview the first rows of the `details` DataFrame
details.head()

In [None]:
# Preview the first rows of the `summaries` DataFrame
summaries.head()

In [None]:
# Normalise column types: parse 'timestamp' into datetimes (unparseable values
# become NaT) and evaluate the string cells of 'to' as Python literals.
def _parse_recipients(raw):
    """Evaluate a string cell as a Python literal; any non-string cell becomes []."""
    if not isinstance(raw, str):
        return []
    return ast.literal_eval(raw)


details['timestamp'] = pd.to_datetime(details['timestamp'], errors='coerce')
details['to'] = details['to'].apply(_parse_recipients)

In [None]:
# Clean up email body content
def clean_email_body(text):
    """Strip forwarding/quoting boilerplate from an email body and collapse whitespace.

    Args:
        text: Raw email body. Non-string values (``None``, or the NaN pandas
            uses for missing cells) are treated as an empty body.

    Returns:
        The cleaned body as a string with leading/trailing whitespace removed.
    """
    # A missing body would make re.sub raise TypeError and abort the whole
    # DataFrame.apply call, so map it to an empty string instead.
    if not isinstance(text, str):
        return ""
    # Remove separator rules (runs of '-' or '='), the "Original Message" /
    # "Forwarded by" markers, and entire "From: ..." header lines.
    text = re.sub(r"(-{2,}|={2,}|Original Message|Forwarded by|From:.*\n)", "", text)
    text = re.sub(r"\n{2,}", "\n", text)  # Collapse consecutive newlines to one
    text = re.sub(r"\s{2,}", " ", text)  # Collapse remaining whitespace runs to one space
    return text.strip()

# Store the cleaned version of every email body in a new 'cleaned_body' column
details['cleaned_body'] = details['body'].apply(clean_email_body)

In [None]:
# Attach the matching summary to each email row (left join on 'thread_id', so
# every row of `details` is kept), then build the text used for search:
# cleaned body + " " + summary, with a missing summary treated as "".
merged_df = details.merge(summaries, on='thread_id', how='left')
summary_text = merged_df['summary'].fillna("")
merged_df['search_text'] = merged_df['cleaned_body'] + " " + summary_text

In [None]:
# Preview the first rows of the merged DataFrame
merged_df.head()

In [None]:
# Display basic dataset information.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
print("Basic Information of Email Details Dataset:")
details.info()
print("\nBasic Information of Email Summaries Dataset:")
summaries.info()

# Check sample data
print("\nSample of Email Merged Data:")
print(merged_df[['thread_id', 'subject', 'timestamp', 'from', 'to', 'search_text']].head())

## Embedding Layers

In [None]:
from sentence_transformers import SentenceTransformer

# Define paths
CACHE_FOLDER = './model_cache'
EMBEDDING_FILE = 'emails_with_embeddings.pkl'

# Load model with persistent cache folder
model = SentenceTransformer('all-MiniLM-L6-v2', cache_folder=CACHE_FOLDER)

# Reuse previously computed embeddings only when the cache file still matches
# the data in memory (it has an 'embedding' column and the same number of rows).
# NOTE(security): pd.read_pickle can execute arbitrary code while unpickling;
# only load an EMBEDDING_FILE that this notebook created.
cached_df = pd.read_pickle(EMBEDDING_FILE) if os.path.exists(EMBEDDING_FILE) else None
cache_is_usable = (
    cached_df is not None
    and 'embedding' in cached_df.columns
    and len(cached_df) == len(merged_df)
)

if cache_is_usable:
    merged_df = cached_df
    print(" Loaded embeddings from cache.")
else:
    # Encode all texts in one batched call instead of one model.encode() call
    # per row; missing search texts are encoded as the empty string.
    texts = merged_df['search_text'].fillna("").astype(str).tolist()
    vectors = model.encode(texts, batch_size=64, show_progress_bar=True)
    merged_df['embedding'] = pd.Series(
        [vector.tolist() for vector in vectors],
        index=merged_df.index,
    )

    # Save the dataframe with embeddings
    merged_df.to_pickle(EMBEDDING_FILE)
    print(" Embeddings computed and saved.")

# Display sample embeddings
print("\nSample Embeddings:")
print(merged_df[['search_text', 'embedding']].head())


## Search Layer: Semantic Search

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Function to perform semantic search
def find_relevant_emails(search_query, num_results=5):
    """Return the emails whose embeddings are most similar to a query.

    Args:
        search_query: Query text; it is encoded with the module-level ``model``.
        num_results: Maximum number of rows to return. Zero or a negative
            value yields an empty result.

    Returns:
        The matching rows of ``merged_df`` (columns 'thread_id', 'subject',
        'timestamp', 'from', 'to', 'search_text'), ordered from highest to
        lowest cosine similarity.
    """
    query_vector = model.encode(search_query)
    similarity_scores = cosine_similarity([query_vector], merged_df['embedding'].tolist()).flatten()
    # Sort descending first, then truncate. The earlier `[-num_results:]` slice
    # returned *every* row for num_results == 0, because -0 == 0.
    limit = max(int(num_results), 0)
    top_matches = similarity_scores.argsort()[::-1][:limit]
    return merged_df.iloc[top_matches][['thread_id', 'subject', 'timestamp', 'from', 'to', 'search_text']]

# Try the semantic search on one sample query and show the hits
print("\nSample Search Results:")
sample_hits = find_relevant_emails("termination meeting")
print(sample_hits)

## Entity-Based Search

In [None]:
import spacy

# Load the spaCy "en_core_web_sm" pipeline; it is used below for
# named-entity recognition (NER)
nlp = spacy.load("en_core_web_sm")

# Entity extraction function
def extract_entities(text):
    """Return the named entities found in ``text``.

    Args:
        text: Text to analyse. Non-string values (``None``, NaN from a missing
            DataFrame cell) and blank strings are treated as having no entities.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per entity that
        the module-level ``nlp`` pipeline reports in ``doc.ents``.
    """
    # Missing bodies would make the pipeline raise; there is nothing to extract.
    if not isinstance(text, str) or not text.strip():
        return []
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

# Apply entity extraction to the raw 'body' column (not 'cleaned_body') and
# keep each row's list of (text, label) tuples in a new 'entities' column
merged_df['entities'] = merged_df['body'].apply(extract_entities)

# Function to search by entity
def search_by_entity(entity_name):
    """Return the emails that mention a given entity.

    A row matches when ``entity_name`` occurs (case-insensitively, as a
    substring) in the text of any of its extracted entities, or equals one of
    their labels exactly. The earlier check, ``entity_name in ent``, tested
    membership in the ``(text, label)`` tuple, so "Jeffrey" never matched an
    entity such as "Jeffrey Skilling".

    Args:
        entity_name: Entity text (or exact entity label) to look for.

    Returns:
        The matching rows of ``merged_df`` (columns 'thread_id', 'subject',
        'timestamp', 'from', 'to', 'search_text').
    """
    needle = entity_name.strip().lower()

    def mentions(ents):
        # An empty needle must not match: "" is a substring of every string.
        return any(
            bool(needle and needle in ent_text.lower()) or entity_name == label
            for ent_text, label in ents
        )

    results = merged_df[merged_df['entities'].apply(mentions)]
    return results[['thread_id', 'subject', 'timestamp', 'from', 'to', 'search_text']]

# Try the entity search on one sample name and show the hits
print("\nSample Entity-Based Search Results:")
entity_hits = search_by_entity("Jeffrey")
print(entity_hits)


## Topic Based Search

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Build the bag-of-words matrix for LDA: English stop words are removed, and
# terms that occur in more than 95% of the documents or in fewer than two
# documents are dropped.
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
doc_term_matrix = vectorizer.fit_transform(merged_df['search_text'])

# Fit a 5-topic LDA model (fixed seed so the fit is reproducible)
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(doc_term_matrix)

# Label each email with the index of its highest-weighted topic
topic_distribution = lda.transform(doc_term_matrix)
merged_df['topic'] = topic_distribution.argmax(axis=1)

# Function to filter emails by topic
def search_by_topic(topic_num):
    """Return the rows of ``merged_df`` whose 'topic' value equals ``topic_num``.

    Only the 'thread_id', 'subject', 'timestamp' and 'search_text' columns are
    included in the result.
    """
    in_topic = merged_df['topic'] == topic_num
    wanted_columns = ['thread_id', 'subject', 'timestamp', 'search_text']
    return merged_df[in_topic][wanted_columns]

# Try the topic filter on topic index 4 and show the hits
print("\nSample Topic-Based Search Results:")
topic_hits = search_by_topic(4)
print(topic_hits)

## Generation Layer

In [None]:
import torch
from transformers import pipeline

# Pick the pipeline device: CUDA device 0 when available, otherwise -1
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Build the summarization pipeline used by the generation layer
# (consider t5-small if speed is more important than quality)
summarizer = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
    device=device,
)

def batch_summarize_fast(texts, max_length=50, min_length=10, chunk_size=16):
    """
    Summarize texts in batches with the module-level ``summarizer`` pipeline.

    Args:
        texts: List of texts to summarize. Non-string items (e.g. NaN from a
            missing DataFrame cell) are summarized as the empty string.
        max_length: ``max_length`` passed to the pipeline for each summary.
        min_length: ``min_length`` passed to the pipeline for each summary.
        chunk_size: Number of texts handed to the pipeline per call.

    Returns:
        A list of strings with one entry per input text, in input order. If a
        batch fails, each of its entries is ``"Error: <message>"`` instead of
        a summary.
    """
    summaries = []
    for start in range(0, len(texts), chunk_size):
        # Replace non-string entries so the pipeline only ever receives strings.
        batch = [
            text if isinstance(text, str) else ""
            for text in texts[start:start + chunk_size]
        ]

        try:
            # truncation=True asks the tokenizer to cut over-long inputs to the
            # model's maximum length. Without it, one long email makes the call
            # fail and turns the entire batch into "Error: ..." entries.
            result = summarizer(
                batch,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,
                truncation=True,
            )
            summaries.extend(res['summary_text'] for res in result)
        except Exception as e:
            # Deliberate best-effort: keep one placeholder per input so the
            # output stays aligned with `texts`.
            summaries.extend(f"Error: {e}" for _ in batch)

    return summaries

# Apply function directly to the DataFrame.
# The result gets its own name: rebinding `summaries` here would overwrite the
# summaries DataFrame loaded from CSV and break re-running the earlier cells
# that merge it or call .info() on it.
email_bodies = merged_df['body'].tolist()
generated_summaries = batch_summarize_fast(email_bodies, max_length=60, min_length=15, chunk_size=32)  # Increase batch size if GPU allows

# Store results
merged_df['generated_summary'] = generated_summaries

# Display sample summaries
print("\nGenerated Summaries:")
print(merged_df[['thread_id', 'subject', 'generated_summary']].head())


In [None]:
# List the distinct values of the 'subject' column
details.subject.unique().tolist()

In [None]:
# Sample queries used to exercise the search and generation layers below
queries = [
    "New Address",                              # Query 1
    "Citizens request for proposal",            # Query 2
    "RE: What's Up"                             # Query 3
]

# Function to test each query in the Search Layer and Generation Layer separately
def test_query_for_screenshots(query, top_k=3):
    """Print the top semantic-search hits for a query and a summary generated from them.

    Args:
        query: Query text passed to ``find_relevant_emails``.
        top_k: Number of search results to retrieve and summarize.

    Returns:
        None. All output is printed.
    """
    print(f"\nTesting for Query: {query}")
    print("="*60)

    # Search Layer - retrieve the top_k results. The heading reports top_k
    # rather than a hard-coded 3, so it stays correct for other values.
    print(f"\n**Top {top_k} Results from Search Layer (Semantic Search):**")
    semantic_results = find_relevant_emails(query, top_k)
    print(semantic_results[['thread_id', 'subject', 'timestamp', 'from', 'to', 'search_text']])

    # Capture screenshot here for Search Layer

    # Generation Layer - summarize the concatenated search hits.
    # batch_summarize_fast is the summarizer defined in this notebook; the
    # previously called `batch_summarize` is not defined and raised NameError.
    combined_text = " ".join(semantic_results['search_text'].tolist())
    generated_summary = batch_summarize_fast([combined_text])[0]
    print("\n**Final Generated Answer from Generation Layer:**")
    print(generated_summary)

    # Capture screenshot here for Generation Layer
    print("="*60)

# Run the search + generation test for every sample query, using the
# function's default top_k of 3 (screenshots are captured manually)
for query in queries:
    test_query_for_screenshots(query)

In [None]:
# Sample queries for the per-query cells below, which index this list as
# queries[0], queries[1] and queries[2]
queries = [
    "New Address",                              # Query 1
    "Citizens request for proposal",            # Query 2
    "RE: What's Up"                             # Query 3
]

# Output 1 - Top 3 Results from Search Layer for Query 1

In [None]:
# Search Layer - retrieve and show the top 3 results for Query 1
query_1 = queries[0]
print(f"\nTesting for Query: {query_1}")
print("=" * 60)

print("\n**Top 3 Results from Search Layer (Semantic Search):**")
semantic_results_1 = find_relevant_emails(query_1, num_results=3)
display_columns = ['thread_id', 'subject', 'timestamp', 'from', 'to', 'search_text']
print(semantic_results_1[display_columns])

# Output 2 - Final Generated Answer from Generation Layer for Query 1

In [None]:
# Generation Layer - summarize the concatenated search hits for Query 1.
# batch_summarize_fast is the summarizer defined in this notebook; the
# previously called `batch_summarize` is not defined and raised NameError.
combined_text_1 = " ".join(semantic_results_1['search_text'].tolist())
generated_summary_1 = batch_summarize_fast([combined_text_1])[0]
print("\n**Final Generated Answer from Generation Layer:**")
print(generated_summary_1)

# Output 3 - Top 3 Results from Search Layer for Query 2

In [None]:
# Search Layer - retrieve and show the top 3 results for Query 2
query_2 = queries[1]
print(f"\nTesting for Query: {query_2}")
print("=" * 60)

print("\n**Top 3 Results from Search Layer (Semantic Search):**")
semantic_results_2 = find_relevant_emails(query_2, num_results=3)
display_columns = ['thread_id', 'subject', 'timestamp', 'from', 'to', 'search_text']
print(semantic_results_2[display_columns])

# Output 4 - Final Generated Answer from Generation Layer for Query 2

In [None]:
# Generation Layer - summarize the concatenated search hits for Query 2.
# batch_summarize_fast is the summarizer defined in this notebook; the
# previously called `batch_summarize` is not defined and raised NameError.
combined_text_2 = " ".join(semantic_results_2['search_text'].tolist())
generated_summary_2 = batch_summarize_fast([combined_text_2])[0]
print("\n**Final Generated Answer from Generation Layer:**")
print(generated_summary_2)

# Output 5 - Top 3 Results from Search Layer for Query 3

In [None]:
# Search Layer - retrieve and show the top 3 results for Query 3
query_3 = queries[2]
print(f"\nTesting for Query: {query_3}")
print("=" * 60)

print("\n**Top 3 Results from Search Layer (Semantic Search):**")
semantic_results_3 = find_relevant_emails(query_3, num_results=3)
display_columns = ['thread_id', 'subject', 'timestamp', 'from', 'to', 'search_text']
print(semantic_results_3[display_columns])

# Output 6 - Final Generated Answer from Generation Layer for Query 3

In [None]:
# Generation Layer - summarize the concatenated search hits for Query 3.
# batch_summarize_fast is the summarizer defined in this notebook; the
# previously called `batch_summarize` is not defined and raised NameError.
combined_text_3 = " ".join(semantic_results_3['search_text'].tolist())
generated_summary_3 = batch_summarize_fast([combined_text_3])[0]
print("\n**Final Generated Answer from Generation Layer:**")
print(generated_summary_3)