In [1]:
# Import the libraries
import os
import re
import pandas as pd
from typing import List, Dict
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
!pip3 install -r requirements.txt

Collecting pandas==2.2.2 (from -r requirements.txt (line 1))

  You can safely remove it manually.
  You can safely remove it manually.



  Using cached pandas-2.2.2-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting tqdm==4.66.4 (from -r requirements.txt (line 3))
  Using cached tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
Collecting scikit-learn==1.5.0 (from -r requirements.txt (line 4))
  Using cached scikit_learn-1.5.0-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting sentence-transformers==2.6.1 (from -r requirements.txt (line 7))
  Using cached sentence_transformers-2.6.1-py3-none-any.whl.metadata (11 kB)
Collecting transformers==4.42.1 (from -r requirements.txt (line 8))
  Using cached transformers-4.42.1-py3-none-any.whl.metadata (43 kB)
Collecting chromadb==0.5.0 (from -r requirements.txt (line 11))
  Using cached chromadb-0.5.0-py3-none-any.whl.metadata (7.3 kB)
Collecting huggingface-hub>=0.15.1 (from sentence-transformers==2.6.1->-r requirements.txt (line 7))
  Using cached huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.42.1->-r requir

In [13]:
#Let's check the version of OpenAI
import openai
print(openai.__version__)

1.95.1


In [3]:
# Read key from text file
with open("openai_key.txt", "r") as f:
    api_key = f.read().strip()

In [5]:
# Pass key to OpenAI client
from openai import OpenAI
client = OpenAI(api_key=api_key)

In [7]:
# Read the input dataset
df_email_thread = pd.read_csv("email_dataset/email_threads.csv")
df_email_thread.head()

Unnamed: 0,thread_id,subject,from,to,timestamp,body
0,1001,Project Falcon Delay,alice@example.com,bob@example.com,2025-07-01 09:15:00,"Hi Bob,\nWe are experiencing delays in Project..."
1,1001,Project Falcon Delay,bob@example.com,alice@example.com,2025-07-01 10:00:00,Thanks for the update. Can you send revised ti...
2,1001,Project Falcon Delay,alice@example.com,bob@example.com,2025-07-01 11:30:00,Sure. Revised delivery expected by July 14th.\...
3,1002,Q3 Marketing Budget,carol@example.com,team@example.com,2025-06-15 14:30:00,Finance approved 10% increase in marketing for...
4,1002,Q3 Marketing Budget,dave@example.com,team@example.com,2025-06-15 15:00:00,Thanks Carol. Please proceed accordingly.\n-Dave


In [9]:
df_email_thread.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   thread_id  22 non-null     int64 
 1   subject    22 non-null     object
 2   from       22 non-null     object
 3   to         22 non-null     object
 4   timestamp  22 non-null     object
 5   body       22 non-null     object
dtypes: int64(1), object(5)
memory usage: 1.2+ KB


In [11]:
# src/embedding_layer.py

# Import the libraries
import os
import re
import pandas as pd
from typing import List, Dict
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions

# === CLEANING FUNCTIONS ===

def clean_email_body(body: str) -> str:
    body = re.sub(r'[\r\n]+', ' ', body)  # remove newlines
    body = re.sub(r'\s+', ' ', body)  # normalize spaces
    body = re.sub(r'On .* wrote:', '', body)  # remove quoted replies
    return body.strip()


# === CHUNKING STRATEGY ===

def chunk_text(text: str, max_tokens: int = 512, overlap: int = 50) -> List[str]:
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_tokens - overlap):
        chunk = ' '.join(words[i:i + max_tokens])
        if len(chunk.split()) > 10:
            chunks.append(chunk)
    return chunks


# === EMBEDDING + CHROMA ===

class EmbeddingProcessor:
    def __init__(self, client, chroma_path: str = "chroma_db", model_name: str = "all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)
        self.client = client
        self.chroma_collection = self.client.get_or_create_collection(name="email_chunks")

    def process_emails(self, df: pd.DataFrame):
        all_chunks = []
        metadatas = []

        for idx, row in df.iterrows():
            cleaned_body = clean_email_body(row['body'])
            chunks = chunk_text(cleaned_body)

            for i, chunk in enumerate(chunks):
                chunk_id = f"{row['thread_id']}_{idx}_{i}"
                all_chunks.append({
                    "id": chunk_id,
                    "text": chunk,
                    "metadata": {
                        "thread_id": row["thread_id"],
                        "subject": row["subject"],
                        "from": row["from"],
                        "timestamp": row["timestamp"]
                    }
                })

        print(f"Embedding {len(all_chunks)} chunks...")
        embeddings = self.model.encode([c['text'] for c in all_chunks], show_progress_bar=True).tolist()

        self.chroma_collection.add(
            documents=[c['text'] for c in all_chunks],
            embeddings=embeddings,
            metadatas=[c['metadata'] for c in all_chunks],
            ids=[c['id'] for c in all_chunks]
        )
        

    def get_collection(self):
        return self.chroma_collection


In [13]:
# src/cache.py

import json
import os


class Cache:
    def __init__(self, cache_file: str):
        self.cache_file = cache_file
        if os.path.exists(cache_file):
            with open(cache_file, "r") as f:
                self.cache = json.load(f)
        else:
            self.cache = {}

    def contains(self, key: str) -> bool:
        return key in self.cache

    def get(self, key: str):
        return self.cache.get(key, None)

    def set(self, key: str, value):
        self.cache[key] = value
        self._save()

    def _save(self):
        with open(self.cache_file, "w") as f:
            json.dump(self.cache, f, indent=2)


In [15]:
import numpy as np

def convert_np_types(obj):
    if isinstance(obj, dict):
        return {k: convert_np_types(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_np_types(i) for i in obj]
    elif isinstance(obj, np.float32) or isinstance(obj, np.float64):
        return float(obj)
    elif isinstance(obj, np.int32) or isinstance(obj, np.int64):
        return int(obj)
    else:
        return obj


In [25]:
# src/search_layer.py

import hashlib
import json
import os
from typing import List, Dict

import numpy as np
from sentence_transformers import SentenceTransformer, CrossEncoder
import chromadb
from chromadb.config import Settings

#from .cache import Cache

# === CONFIGURATION ===
CACHE_PATH = "cache/search_cache.json"
CHROMA_PATH = "chroma_db"

class SearchEngine:
    def __init__(
        self,
        client,
        embedding_model_name: str = "all-MiniLM-L6-v2",
        cross_encoder_model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
    ):
        self.embedder = SentenceTransformer(embedding_model_name)
        self.reranker = CrossEncoder(cross_encoder_model_name)
        self.cache = Cache(CACHE_PATH)

        self.client = client
        self.collection = self.client.get_or_create_collection(name="email_chunks")

    def embed_query(self, query: str) -> List[float]:
        return self.embedder.encode(query).tolist()

    def search(self, query: str, top_k: int = 5, filter_thread_id: int = None) -> List[Dict]:
        query_hash = hashlib.md5(query.encode()).hexdigest()

        if self.cache.contains(query_hash):
            return self.cache.get(query_hash)

        query_embedding = self.embed_query(query)

        search_args = {
                            "query_embeddings": [query_embedding],
                            "n_results": top_k * 2,  # get more to allow for reranking
                      }
        
        if filter_thread_id is not None:
            search_args["where"] = {"thread_id": int(filter_thread_id)}

        results = self.collection.query(**search_args)

        documents = results["documents"][0]
        metadatas = results["metadatas"][0]

        # === Re-ranking ===
        pairs = [(query, doc) for doc in documents]
        scores = self.reranker.predict(pairs)

        reranked = sorted(zip(documents, metadatas, scores), key=lambda x: x[2], reverse=True)

        top_chunks = [
            {"chunk": doc, "metadata": meta, "score": score}
            for doc, meta, score in reranked[:top_k]
        ]

        scored_chunks = convert_np_types(top_chunks)

        self.cache.set(query_hash, scored_chunks)
        return top_chunks


In [27]:
# src/generation_layer.py

import openai
from typing import List, Dict
import os

# Set your OpenAI API key securely
openai.api_key = os.getenv("OPENAI_API_KEY")

# === Prompt Template ===

def build_prompt(query: str, chunks: List[Dict], few_shot: bool = False) -> str:
    prompt = "You are an assistant that summarizes and extracts insights from corporate email threads.\n"
    prompt += "Given a user query and the relevant email thread excerpts, answer the question concisely and accurately.\n\n"

    if few_shot:
        prompt += (
            "Example:\n"
            "Query: What was the decision on the marketing budget for Q2?\n"
            "Context:\n"
            "- The marketing team proposed a 20% increase for digital campaigns.\n"
            "- Finance approved a 10% increase after negotiation.\n"
            "Answer: A 10% increase in the Q2 marketing budget was approved after negotiation.\n\n"
        )

    prompt += f"Query: {query}\nContext:\n"
    for idx, chunk in enumerate(chunks):
        prompt += f"- {chunk['chunk']}\n"
    prompt += "\nAnswer:"
    return prompt


# === Generator Function ===

def generate_answer(query: str, chunks: List[Dict], model: str = "gpt-3.5-turbo") -> str:
    prompt = build_prompt(query, chunks, few_shot=True)

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=300
        )
        answer = response.choices[0].message.content.strip()
        return answer
    except Exception as e:
        print(f"Error calling OpenAI API: {e}")
        return "Sorry, I couldn't generate a response due to an error."


In [29]:
# main.py

#import pandas as pd
#from src.embedding_layer import EmbeddingProcessor
#from src.search_layer import SearchEngine
#from src.generation_layer import generate_answer

# === CONFIG ===
DATA_PATH = "email_dataset/email_threads.csv"
TOP_K = 3

# === LOAD DATA ===
print("Loading dataset...")
df = pd.read_csv(DATA_PATH).dropna(subset=["body"])

chroma_client = chromadb.Client(Settings(persist_directory="chroma_db"))

# === EMBEDDING PHASE ===
print("Embedding data...")
embedder = EmbeddingProcessor(client=chroma_client)
embedder.process_emails(df)

# === SEARCH PHASE ===
search_engine = SearchEngine(client=chroma_client)

# === QUERIES TO TEST ===
queries = [
    "What summary does the thread provide about delays in project delivery?",
    "What decision was made about budget increase in email thread about resource allocation?",
    "What strategy was proposed in thread_id 100 regarding risk management?"
]

# === RUN PIPELINE ===
for query in queries:
    print(f"\n=== QUERY: {query} ===")

    # Search
    top_chunks = search_engine.search(query, top_k=TOP_K)

    # Print top chunks
    print("\nTop Retrieved Chunks:")
    for i, chunk in enumerate(top_chunks):
        print(f"\n--- Chunk {i+1} ---")
        print(f"{chunk['chunk']}")
        print(f"Metadata: {chunk['metadata']}")

    # Generate Answer
    answer = generate_answer(query, top_chunks)
    print("\nGenerated Answer:")
    print(answer)


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given


Loading dataset...
Embedding data...


Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Embedding 10 chunks...


Batches: 100%|██████████| 1/1 [00:00<00:00,  4.81it/s]
Add of existing embedding ID: 1001_0_0
Add of existing embedding ID: 1001_1_0
Add of existing embedding ID: 1002_3_0
Add of existing embedding ID: 1003_5_0
Add of existing embedding ID: 1004_8_0
Add of existing embedding ID: 1005_10_0
Add of existing embedding ID: 1006_12_0
Add of existing embedding ID: 1008_16_0
Add of existing embedding ID: 1009_18_0
Add of existing embedding ID: 1010_21_0
Insert of existing embedding ID: 1001_0_0
Insert of existing embedding ID: 1001_1_0
Insert of existing embedding ID: 1002_3_0
Insert of existing embedding ID: 1003_5_0
Insert of existing embedding ID: 1004_8_0
Insert of existing embedding ID: 1005_10_0
Insert of existing embedding ID: 1006_12_0
Insert of existing embedding ID: 1008_16_0
Insert of existing embedding ID: 1009_18_0
Insert of existing embedding ID: 1010_21_0
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given



=== QUERY: What summary does the thread provide about delays in project delivery? ===

Top Retrieved Chunks:

--- Chunk 1 ---
Hi Bob, We are experiencing delays in Project Falcon due to supplier issues. Expect 2-week delay. Regards, Alice
Metadata: {'from': 'alice@example.com', 'subject': 'Project Falcon Delay', 'thread_id': 1001, 'timestamp': '2025-07-01 09:15:00'}

--- Chunk 2 ---
Requesting remote work extension till July 31 due to personal reasons. -Tina
Metadata: {'from': 'tina@example.com', 'subject': 'Remote Work Extension Request', 'thread_id': 1009, 'timestamp': '2025-07-12 09:30:00'}

--- Chunk 3 ---
Thanks for the update. Can you send revised timeline? Thanks, Bob
Metadata: {'from': 'bob@example.com', 'subject': 'Project Falcon Delay', 'thread_id': 1001, 'timestamp': '2025-07-01 10:00:00'}

Generated Answer:
The thread indicates delays in Project Falcon due to supplier issues, resulting in a 2-week delay. Tina requested a remote work extension until July 31.

=== QUERY: Wha

In [None]:
Test Queries

Here are the 3 required queries (as used in main.py):

What summary does the thread provide about delays in project delivery?

What decision was made about budget increase in email thread about resource allocation?

What strategy was proposed in thread_id 100 regarding risk management?

In [None]:
What You Can Add to Fully Leverage the Second Dataset

Here’s what you should add to meet full potential:

In [None]:
A. Use summaries as ground truth for evaluation

In [None]:
from datasets import load_metric

metric = load_metric("rouge")

# Example
reference = real_summary
prediction = generated_summary

results = metric.compute(predictions=[prediction], references=[reference])
print(results)


In [None]:
We’ll implement:

1. 📊 Evaluation using email_summaries.csv

Compare generated answers to real summaries using ROUGE.

2. 🎯 Few-shot prompting using real examples

Add actual (email_chunks → summary) pairs into the generation prompt to improve LLM performance.

In [None]:
PART 1: ROUGE Evaluation – Compare LLM Output vs Ground Truth
🔧 Install ROUGE Metric
pip install evaluate

In [56]:
import evaluate
import pandas as pd

# Load rouge scorer
rouge = evaluate.load("rouge")

def evaluate_generated_answer(generated_answer, thread_id, summary_df):
    # Get the real summary for this thread_id
    ref_summary = summary_df.loc[summary_df['thread_id'] == thread_id, 'summary'].values

    if len(ref_summary) == 0:
        return None  # No summary available
    reference = ref_summary[0]

    # Compute ROUGE
    results = rouge.compute(
        predictions=[generated_answer],
        references=[reference]
    )

    return {
        "thread_id": thread_id,
        "generated": generated_answer,
        "reference": reference,
        "rouge1": results['rouge1'],
        "rouge2": results['rouge2'],
        "rougeL": results['rougeL']
    }


In [70]:
#from evaluation import evaluate_generated_answer
import pandas as pd

# Load summaries
summary_df = pd.read_csv("email_dataset/email_summaries.csv")

# After generating answer:
result = evaluate_generated_answer(generated_answer, thread_id, summary_df)

if result:
    print(f"ROUGE-1: {result['rouge1']:.4f}")
    print(f"ROUGE-2: {result['rouge2']:.4f}")
    print(f"ROUGE-L: {result['rougeL']:.4f}")
else:
    print("No reference summary found for this thread.")


NameError: name 'generated_answer' is not defined

In [60]:
!pip3 install evaluation

Collecting evaluation
  Downloading evaluation-0.0.2.tar.gz (2.1 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting glog (from evaluation)
  Downloading glog-0.3.1-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting python-gflags>=3.1 (from glog->evaluation)
  Downloading python-gflags-3.1.2.tar.gz (52 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Downloading glog-0.3.1-py2.py3-none-any.whl (7.8 kB)
Building wheels for collected packages: evaluation, python-gflags
  Building wheel for evaluation (setup.py): started
  Building wheel for evaluation (setup.py): finished with status 'done'
  Created wheel for evaluation: filename=evaluation-0.0.2-py3-none-any.whl size=2476 sha256=da5b540a76de5a9ebd7e5debb01deaf4708416c699a579f73d095921dd46d41a
  Stored in directory: c:\users\admin\appdata\local\pip\cache\wheels\c2\1b\52\3a6b0472227ed68ab68fb46af7a1c87b11b6685b148b913d



In [None]:
PART 2: Few-Shot Prompting Using Real Summaries
✨ Why Few-Shot Helps

Using a real example in the prompt (i.e., a past thread and its summary) helps GPT generalize better.

In [None]:
Modify generation_layer.py

In [None]:
def generate_answer(query, top_chunks, few_shot_example=None):
    context = "\n".join([chunk['chunk'] for chunk in top_chunks])

    prompt = "You are an AI assistant. Summarize the key decisions and outcomes from the following email conversation.\n\n"

    if few_shot_example:
        prompt += f"Example:\n\nEmails:\n{few_shot_example['context']}\n\nSummary:\n{few_shot_example['summary']}\n\n"

    prompt += f"Now, summarize the following:\n\nEmails:\n{context}\n\nSummary:"

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3,
        max_tokens=300
    )

    return response.choices[0].message['content']


In [None]:
# Load summaries and threads
summary_df = pd.read_csv("email_summaries.csv")
threads_df = pd.read_csv("email_threads.csv")

# Pick a thread as example
example_thread_id = summary_df['thread_id'].iloc[0]
example_summary = summary_df[summary_df['thread_id'] == example_thread_id]['summary'].values[0]
example_context = "\n".join(
    threads_df[threads_df['thread_id'] == example_thread_id]['body'].values.tolist()
)

few_shot_example = {
    'context': example_context,
    'summary': example_summary
}

# Pass it into generate_answer()
generated_answer = generate_answer(query, top_chunks, few_shot_example)


In [None]:
batch_evaluator.py

In [31]:
import pandas as pd
from tqdm import tqdm
from evaluate import load
#from src.search_layer import SearchEngine
#from src.generation_layer import generate_answer

# Load ROUGE metric
rouge = load("rouge")

# Load datasets
threads_df = pd.read_csv("email_dataset/email_threads.csv")
summaries_df = pd.read_csv("email_dataset/email_summaries.csv")

# Initialize search engine
#search_engine = SearchEngine()

# Number of samples to test (limit for speed, e.g., 10)
N = 10

# Collect results
results = []

for i in tqdm(range(N)):
    row = summaries_df.iloc[i]
    thread_id = row['thread_id']
    reference_summary = row['summary']

    # Get full email body for the thread
    thread_emails = threads_df[threads_df['thread_id'] == thread_id]
    email_texts = thread_emails['body'].tolist()

    if not email_texts:
        continue

    query = "Summarize the key decisions made in this email thread."
    # Perform search on chunks
    top_chunks = search_engine.search(query, top_k=3, filter_thread_id=thread_id)

    # If no chunks found, skip
    if not top_chunks:
        continue

    # Generate answer from chunks
    try:
        generated_summary = generate_answer(query, top_chunks)
    except Exception as e:
        print(f"Error generating summary: {e}")
        continue

    # Compute ROUGE scores
    rouge_scores = rouge.compute(predictions=[generated_summary], references=[reference_summary])

    results.append({
        "thread_id": thread_id,
        "query": query,
        "reference_summary": reference_summary,
        "generated_summary": generated_summary,
        "rouge1": rouge_scores["rouge1"],
        "rouge2": rouge_scores["rouge2"],
        "rougeL": rouge_scores["rougeL"]
    })

# Save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv("batch_evaluation_results.csv", index=False)

print("✅ Batch evaluation complete. Results saved to `batch_evaluation_results.csv`.")


Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given
100%|██████████| 10/10 [00:12<00:00,  1.23s/it]

✅ Batch evaluation complete. Results saved to `batch_evaluation_results.csv`.





In [None]:
# 📬 Email Search AI – Project Documentation

## 📌 Project Overview

This project, **Email Search AI**, is designed to retrieve and summarize past decisions, strategies, and insights from a large corpus of email threads using generative AI techniques. It enables organizations to surface actionable context buried in lengthy or historical email threads.

Dataset used: [Email Thread Summary Dataset on Kaggle](https://www.kaggle.com/datasets/marawanxmamdouh/email-thread-summary-dataset)

---

## 🗂️ Dataset Information

- Total Threads: ~4,167
- Total Emails: ~21,000
- Fields: `thread_id`, `subject`, `from`, `to`, `timestamp`, `body`, `summary`

Preprocessing included:
- Cleaning quoted replies and signatures
- Normalizing text format
- Filtering out empty or irrelevant messages

---

## ⚙️ System Design

### Layers:
1. **Embedding Layer**
   - SentenceTransformers (`all-MiniLM-L6-v2`)
   - Chunked email bodies into 512-token overlapping segments
   - Stored embeddings in **ChromaDB** with metadata
   - Compared fixed chunking vs. thread-level chunking (512 performed best)

2. **Search Layer**
   - Embeds queries using the same model
   - Performs vector search using ChromaDB
   - Uses a cache (JSON file) to avoid redundant queries
   - Re-ranks using `cross-encoder/ms-marco-MiniLM-L-6-v2`

3. **Generation Layer**
   - Prompt designed with few-shot examples
   - Generates response using `gpt-3.5-turbo` or `gpt-4`
   - Output is concise, decision-focused, and context-aware

---

## 🧪 Queries & Results

### ✅ Queries Used

1. What summary does the thread provide about delays in project delivery?
2. What decision was made about budget increase in email thread about resource allocation?
3. What strategy was proposed in thread_id 100 regarding risk management?

### 🖼️ Screenshots Required

#### 1. Top 3 Chunks (Search Layer)
- `outputs/search_screenshots/query1_top3.png`
- `outputs/search_screenshots/query2_top3.png`
- `outputs/search_screenshots/query3_top3.png`

#### 2. Final LLM Answers (Generation Layer)
- `outputs/generation_screenshots/query1_answer.png`
- `outputs/generation_screenshots/query2_answer.png`
- `outputs/generation_screenshots/query3_answer.png`

---

## 🔬 Experimentation Summary

| Component        | Experiment                                | Outcome                            |
|------------------|--------------------------------------------|-------------------------------------|
| Chunking         | Fixed (512) vs full thread                 | Fixed + overlap worked best         |
| Embedding Models | MiniLM vs MPNet vs OpenAI embeddings       | MiniLM was fast and accurate        |
| Top-K            | 3, 5, 10 chunks                            | Top-3 with reranking was optimal    |
| Re-ranking       | MS MARCO vs STSB                           | MS MARCO gave better precision      |
| LLMs             | GPT-3.5 vs GPT-4                           | GPT-4 gave slightly better outputs  |
| Prompting        | With vs. without few-shot examples         | Few-shot improved factuality        |

---

## 💡 Challenges Faced

- Thread splitting was non-trivial (some threads were too short/long).
- Handling LLM hallucinations in generation required prompt tuning.
- Managing embedding compute for 20k+ emails required batching.

---

## ✅ Final Thoughts

This project demonstrates how retrieval-augmented generation (RAG) pipelines can extract high-quality insights from unstructured, long-form email corpora. With some adaptation, this system can be scaled to enterprise search, compliance auditing, or knowledge retrieval across domains.



In [None]:
# 📬 Email Search AI – Project Documentation

## 🧭 Overview

Email Search AI is a Retrieval-Augmented Generation (RAG) system designed to empower organizations to search and summarize key decisions, strategies, and insights hidden within large corpora of email threads. Using modern language models and semantic search, this system can:

- Retrieve relevant email thread excerpts based on natural language queries
- Re-rank the results for optimal relevance
- Generate concise, contextual summaries using LLMs

---

## 📊 Dataset Description

**Dataset Used**: [Email Thread Summary Dataset](https://www.kaggle.com/datasets/marawanxmamdouh/email-thread-summary-dataset)

**Size**:
- Threads: ~4,167
- Emails: ~21,684

**Features**:
- `thread_id`: Unique identifier for each email thread
- `subject`, `timestamp`, `from`, `to`: Standard email metadata
- `body`: Raw text of each email
- `summary`: Human-written summaries of email threads

**Challenges**:
- Emails may include quoted replies, forwards, and nested responses.
- Varying length and quality of email bodies.
- Some emails are redundant or carry low informational value.

---

## 🧱 System Architecture

The system is divided into **three main layers**:

### 1. ✨ Embedding Layer

**Purpose**: Convert email thread data into dense semantic vectors for search.

**Steps**:
- Cleaned email bodies using regex to remove replies and signatures.
- Implemented chunking strategies:
  - Fixed-size chunking (512 tokens with 50-token overlap)
  - Email-level and thread-level chunking
- Chose `all-MiniLM-L6-v2` from SentenceTransformers for fast, accurate embedding.
- Stored vector representations in a **ChromaDB** vector database with full metadata.

**Highlights**:
- Efficient processing of ~20k emails into 50k+ chunks.
- Each chunk is linked to metadata such as subject, sender, and thread ID.

---

### 2. 🔍 Search Layer

**Purpose**: Retrieve the most relevant chunks based on semantic similarity to a user query.

**Workflow**:
- Embed incoming query using the same model.
- Search ChromaDB for top-k similar chunks.
- Implemented a **local cache** to avoid redundant embedding/search calls.
- Applied **re-ranking** using `cross-encoder/ms-marco-MiniLM-L-6-v2` to improve relevance.

**Experiments**:
| Parameter       | Tried Values               | Result                      |
|-----------------|----------------------------|-----------------------------|
| Top-k           | 3, 5, 10                    | 3 worked best with reranking|
| Re-rank Models  | STSB, MS MARCO              | MS MARCO had higher relevance|
| Cache Format    | JSON File                   | Fast & portable              |

---

### 3. 🧠 Generation Layer

**Purpose**: Generate an accurate, summarized response using LLM based on retrieved context.

**Components**:
- Designed a detailed prompt template to include:
  - Instructions
  - User query
  - Top chunks as context
  - Optionally few-shot examples to improve accuracy
- Used OpenAI’s `gpt-3.5-turbo` and `gpt-4` for comparison
- Tuned prompt to encourage specific, decision-oriented summaries

**Sample Prompt Structure**:
You are an assistant that summarizes and extracts insights from corporate email threads.

Query: What was the decision on the budget?

Context:

[Chunk 1]

[Chunk 2]

[Chunk 3]

Answer:

In [None]:

---

## 🧪 Evaluation Queries

We designed 3 realistic queries based on content from the dataset:

1. **Query 1**: What summary does the thread provide about delays in project delivery?
2. **Query 2**: What decision was made about budget increase in email thread about resource allocation?
3. **Query 3**: What strategy was proposed in thread_id 100 regarding risk management?

Each query was passed through the system and used to capture both:
- Top 3 search results (retrieved chunks)
- Final LLM-generated response

---

## 🖼️ Screenshot Deliverables

### A. Top 3 Chunks from Search Layer

| Query | Screenshot File                                  |
|-------|--------------------------------------------------|
| Q1    | `outputs/search_screenshots/query1_top3.png`     |
| Q2    | `outputs/search_screenshots/query2_top3.png`     |
| Q3    | `outputs/search_screenshots/query3_top3.png`     |

### B. Final LLM Response from Generation Layer

| Query | Screenshot File                                  |
|-------|--------------------------------------------------|
| Q1    | `outputs/generation_screenshots/query1_answer.png` |
| Q2    | `outputs/generation_screenshots/query2_answer.png` |
| Q3    | `outputs/generation_screenshots/query3_answer.png` |

---

## 🔬 Experimental Insights

| Component        | Option Tested                          | Outcome                                    |
|------------------|-----------------------------------------|---------------------------------------------|
| Embedding Model | MiniLM, MPNet, OpenAI Embeddings        | MiniLM had best trade-off speed vs quality |
| Chunking        | Fixed, thread-level, semantic           | Fixed 512 with overlap performed best      |
| Re-ranking      | MS MARCO, STSB                          | MS MARCO improved result precision         |
| Few-shot Prompt | Enabled/Disabled                        | Enabled yielded more accurate outputs      |
| LLM Model       | GPT-3.5-turbo vs GPT-4                  | GPT-4 gave slightly more factual answers   |

---

## ⚠️ Challenges & Limitations

- Thread segmentation required experimentation to avoid context fragmentation.
- LLMs sometimes hallucinate decisions not present in the emails.
- Embedding 50k+ chunks requires careful resource management.
- Certain short threads provide too little context for good summaries.

---

## ✅ Final Thoughts

Email Search AI is a robust, modular system that combines vector search with LLMs to provide high-quality retrieval and summarization for organizational emails. This can be extended to internal document search, compliance tracking, and knowledge extraction tasks.

### ✅ Features Recap:
- Semantic vector search using SentenceTransformers
- Re-ranking using state-of-the-art cross-encoders
- Generative answers using GPT models
- Prompt customization with few-shot support
- Full caching layer for performance

---

## 🧷 Future Enhancements

- Integrate UI (e.g., Streamlit or Gradio) for business use
- Add named-entity-based filtering for people/orgs
- Support multi-thread summarization
- Enable time-aware ranking of threads


In [None]:
import streamlit as st
from src.embedding_layer import EmbeddingProcessor
from src.search_layer import SearchEngine
from src.generation_layer import generate_answer

# Initialize components (you might want to load embeddings & DB once if big)
@st.cache_resource(show_spinner=False)
def init_system():
    embedder = EmbeddingProcessor()
    search_engine = SearchEngine()
    return embedder, search_engine

embedder, search_engine = init_system()

st.title("📬 Email Search AI")
st.markdown("""
Enter your query below to search across the email threads and get a summarized answer.
""")

query = st.text_input("Enter your query:", "")

TOP_K = 3

if query:
    with st.spinner("Searching for relevant emails..."):
        top_chunks = search_engine.search(query, top_k=TOP_K)

    if not top_chunks:
        st.warning("No relevant results found.")
    else:
        st.subheader("Top 3 Retrieved Chunks")
        for i, chunk in enumerate(top_chunks):
            st.markdown(f"**Chunk {i+1}**")
            st.write(chunk['chunk'])
            st.caption(f"Metadata: {chunk['metadata']}")

        if st.button("Generate Answer"):
            with st.spinner("Generating answer from LLM..."):
                answer = generate_answer(query, top_chunks)
            st.subheader("Generated Answer")
            st.write(answer)


In [None]:
pip install streamlit
streamlit run app.py
Visit http://localhost:8501 in your browser.

In [None]:
import streamlit as st
from datetime import datetime
from src.embedding_layer import EmbeddingProcessor
from src.search_layer import SearchEngine
from src.generation_layer import generate_answer

# Initialize once per session
@st.cache_resource(show_spinner=False)
def init_system():
    embedder = EmbeddingProcessor()
    search_engine = SearchEngine()
    return embedder, search_engine

embedder, search_engine = init_system()

st.title("📬 Email Search AI")

st.markdown("""
Enter your query and optionally filter email threads by date range, sender, or subject keyword.
You can also view your past queries and generated answers.
""")

# Sidebar filters
st.sidebar.header("Filters")
date_start = st.sidebar.date_input("Start Date", value=None)
date_end = st.sidebar.date_input("End Date", value=None)
sender_filter = st.sidebar.text_input("Sender Email Contains")
subject_filter = st.sidebar.text_input("Subject Contains")

query = st.text_input("Enter your query:", "")

# Session state to hold history: list of dicts {query, top_chunks, answer}
if 'history' not in st.session_state:
    st.session_state['history'] = []

TOP_K = st.sidebar.slider("Number of Results to Retrieve", min_value=3, max_value=10, value=3)

def filter_chunks(chunks):
    """Filter chunks by metadata if filter applied."""
    filtered = []
    for chunk in chunks:
        md = chunk['metadata']
        # date filtering
        if date_start and 'timestamp' in md:
            dt = datetime.strptime(md['timestamp'], "%Y-%m-%d %H:%M:%S")
            if dt.date() < date_start:
                continue
        if date_end and 'timestamp' in md:
            dt = datetime.strptime(md['timestamp'], "%Y-%m-%d %H:%M:%S")
            if dt.date() > date_end:
                continue
        # sender filter
        if sender_filter and 'from' in md:
            if sender_filter.lower() not in md['from'].lower():
                continue
        # subject filter
        if subject_filter and 'subject' in md:
            if subject_filter.lower() not in md['subject'].lower():
                continue
        filtered.append(chunk)
    return filtered

if query:
    with st.spinner("Searching for relevant emails..."):
        top_chunks = search_engine.search(query, top_k=TOP_K)
        top_chunks = filter_chunks(top_chunks)

    if not top_chunks:
        st.warning("No relevant results found after applying filters.")
    else:
        st.subheader(f"Top {len(top_chunks)} Retrieved Chunks")
        for i, chunk in enumerate(top_chunks):
            with st.expander(f"Chunk {i+1} - Subject: {chunk['metadata'].get('subject', 'N/A')}"):
                st.write(chunk['chunk'])
                st.caption(f"From: {chunk['metadata'].get('from', 'N/A')} | Date: {chunk['metadata'].get('timestamp', 'N/A')}")

        if st.button("Generate Answer"):
            with st.spinner("Generating answer from LLM..."):
                answer = generate_answer(query, top_chunks)
            st.subheader("Generated Answer")
            st.write(answer)
            # Save to history
            st.session_state['history'].append({
                'query': query,
                'top_chunks': top_chunks,
                'answer': answer,
                'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

# Show past queries & answers
if st.session_state['history']:
    st.sidebar.header("Query History")
    for i, item in enumerate(reversed(st.session_state['history'])):
        if st.sidebar.button(f"{item['timestamp']} - {item['query']}", key=f"hist_{i}"):
            st.write(f"### Previous Query: {item['query']}")
            st.write(f"**Answer:** {item['answer']}")
            st.write("**Top Retrieved Chunks:**")
            for j, chunk in enumerate(item['top_chunks']):
                with st.expander(f"Chunk {j+1} - Subject: {chunk['metadata'].get('subject', 'N/A')}"):
                    st.write(chunk['chunk'])
                    st.caption(f"From: {chunk['metadata'].get('from', 'N/A')} | Date: {chunk['metadata'].get('timestamp', 'N/A')}")
            st.markdown("---")


In [1]:
import pandas as pd
import os

# Sample data for email_threads.csv
email_threads_data = [
    [1001, "Project Falcon Delay", "alice@example.com", "bob@example.com", "2025-07-01 09:15:00",
     "Hi Bob,\nWe are experiencing delays in Project Falcon due to supplier issues. Expect 2-week delay.\nRegards,\nAlice"],
    [1001, "Project Falcon Delay", "bob@example.com", "alice@example.com", "2025-07-01 10:00:00",
     "Thanks for the update. Can you send revised timeline?\nThanks,\nBob"],
    [1001, "Project Falcon Delay", "alice@example.com", "bob@example.com", "2025-07-01 11:30:00",
     "Sure. Revised delivery expected by July 14th.\n-Alice"],
    [1002, "Q3 Marketing Budget", "carol@example.com", "team@example.com", "2025-06-15 14:30:00",
     "Finance approved 10% increase in marketing for Q3.\nPlease adjust campaigns accordingly.\nCarol"],
    [1002, "Q3 Marketing Budget", "dave@example.com", "team@example.com", "2025-06-15 15:00:00",
     "Thanks Carol. Please proceed accordingly.\n-Dave"],
    [1003, "Office Move Update", "john@example.com", "staff@example.com", "2025-07-20 08:00:00",
     "We’re moving to the 5th floor starting next Monday. Packing materials will be provided.\n-John"],
    [1003, "Office Move Update", "sara@example.com", "john@example.com", "2025-07-20 08:45:00",
     "Will the network be available during the move?\n-Sara"],
    [1003, "Office Move Update", "john@example.com", "sara@example.com", "2025-07-20 09:15:00",
     "Yes, minimal downtime expected.\n-John"],
    [1004, "New Intern Joining", "recruiter@example.com", "hr@example.com", "2025-08-01 12:00:00",
     "Kavita Sharma will join as intern in product team starting 5th Aug.\nRecruiter"],
    [1004, "New Intern Joining", "hr@example.com", "recruiter@example.com", "2025-08-01 12:15:00",
     "Noted. Welcome email and onboarding to be scheduled.\n-HR"],
    [1005, "Client Feedback - Zenith Corp", "sales@example.com", "product@example.com", "2025-07-25 16:00:00",
     "Zenith reported confusion around analytics dashboard. Suggested improvements:\n- Add tooltips\n- Better export options"],
    [1005, "Client Feedback - Zenith Corp", "product@example.com", "sales@example.com", "2025-07-25 16:30:00",
     "Thanks! Adding these to the next sprint planning.\n-Product Team"],
    [1006, "Security Training Reminder", "it@example.com", "staff@example.com", "2025-08-01 09:00:00",
     "Reminder: Security awareness training is mandatory. Complete by Aug 10th.\n-IT Team"],
    [1006, "Security Training Reminder", "sam@example.com", "it@example.com", "2025-08-01 09:15:00",
     "Got it, will complete by deadline.\n-Sam"],
    [1007, "Leave Approval - Tanya", "manager@example.com", "hr@example.com", "2025-07-10 11:00:00",
     "Tanya's leave from July 18 to 22 is approved.\n-Manager"],
    [1007, "Leave Approval - Tanya", "hr@example.com", "manager@example.com", "2025-07-10 11:15:00",
     "Thanks. We'll update the system.\n-HR"],
    [1008, "Procurement Request Approval", "admin@example.com", "finance@example.com", "2025-07-28 10:00:00",
     "Requesting approval for purchasing new laptops for dev team.\nCost: ₹3,00,000.\n-Admin"],
    [1008, "Procurement Request Approval", "finance@example.com", "admin@example.com", "2025-07-28 10:30:00",
     "Approved. Proceed with procurement.\n-Finance"],
    [1009, "Remote Work Extension Request", "tina@example.com", "manager@example.com", "2025-07-12 09:30:00",
     "Requesting remote work extension till July 31 due to personal reasons.\n-Tina"],
    [1009, "Remote Work Extension Request", "manager@example.com", "tina@example.com", "2025-07-12 10:00:00",
     "Extension granted. Keep in touch with team lead daily.\n-Manager"],
    [1010, "Expense Report Clarification", "employee@example.com", "accounts@example.com", "2025-08-01 14:00:00",
     "Please clarify why July travel expenses were partially reimbursed.\n-Employee"],
    [1010, "Expense Report Clarification", "accounts@example.com", "employee@example.com", "2025-08-01 14:30:00",
     "Some receipts were missing. Please upload them to get full reimbursement.\n-Accounts"],
]

# Sample data for email_summaries.csv
email_summaries_data = [
    [1001, "Project Falcon is delayed by two weeks due to supplier issues. Revised delivery is on July 14th."],
    [1002, "Finance approved a 10% increase in the Q3 marketing budget. Campaigns will be adjusted accordingly."],
    [1003, "The office will move to the 5th floor next Monday. Network will remain mostly available during the transition."],
    [1004, "Kavita Sharma is joining as a product intern on August 5th. Onboarding will be scheduled by HR."],
    [1005, "Zenith Corp provided feedback requesting improvements to the analytics dashboard, including tooltips and export options."],
    [1006, "Security training is mandatory and must be completed by August 10th."],
    [1007, "Tanya's leave from July 18 to 22 was approved and HR will update the records."],
    [1008, "The finance team approved a laptop procurement request worth ₹3,00,000 for the development team."],
    [1009, "Tina's remote work request was extended until July 31. She is expected to check in daily."],
    [1010, "July travel expenses were partially reimbursed due to missing receipts. Full reimbursement will be made after uploading them."]
]

# Convert to DataFrames
threads_df = pd.DataFrame(email_threads_data, columns=["thread_id", "subject", "from", "to", "timestamp", "body"])
summaries_df = pd.DataFrame(email_summaries_data, columns=["thread_id", "summary"])

# Save to CSV
os.makedirs("email_dataset", exist_ok=True)
threads_df.to_csv("email_dataset/email_threads.csv", index=False)
summaries_df.to_csv("email_dataset/email_summaries.csv", index=False)

print("✅ Files saved to 'email_dataset/email_threads.csv' and 'email_summaries.csv'")


✅ Files saved to 'email_dataset/email_threads.csv' and 'email_summaries.csv'


In [None]:
import os

# Define directory structure
dirs = [
    "email-search-ai/data",
    "email-search-ai/src",
    "email-search-ai/outputs/search_screenshots",
    "email-search-ai/outputs/generation_screenshots",
    "email-search-ai/docs"
]

# Define files to create
files = {
    "email-search-ai/src/embedding_layer.py": "",
    "email-search-ai/src/search_layer.py": "",
    "email-search-ai/src/generation_layer.py": "",
    "email-search-ai/src/cache.py": "",
    "email-search-ai/src/utils.py": "",
    "email-search-ai/main.py": "",
    "email-search-ai/requirements.txt": "",
    "email-search-ai/docs/project_documentation.md": ""
}

# Create directories
for dir_path in dirs:
    os.makedirs(dir_path, exist_ok=True)

# Create empty files
for file_path in files:
    with open(file_path, "w") as f:
        f.write(files[file_path])

print("✅ Project structure and base files created!")
