In [18]:
import numpy as np
from dotenv import load_dotenv
import os
import logging
import sys
import json
from openai import OpenAI
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
load_dotenv()

import sys
import json
import os

sys.path.append("..")  # Adds the parent directory to sys path

from mailio_ai_libs.create_embeddings import Embedder

# Send INFO-level logs to the notebook output through exactly one stdout handler.
# force=True replaces any handlers already attached to the root logger, so re-running
# this cell no longer stacks handlers (which printed every log record several times).
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [2]:
# Locations of the precomputed embedding matrix and of its companion index file.
base_data_dir = "../data"
data_dir = base_data_dir + "/embeddings_distilbert_base_uncased_mean_pooling"
embeddings_path, index_path = (
    f"{data_dir}/{file_name}" for file_name in ("embeddings.npy", "embeddings_index.npy")
)

In [3]:
# Load the precomputed embedding matrix from disk.
# NOTE(review): presumably one row per email, in the same order as the index file
# loaded further down -- confirm against the code that wrote these files.
embeddings = np.load(embeddings_path)

In [4]:
# Pairwise cosine similarity between every pair of rows in `embeddings`.
# Used below to flag documents that are too similar to one another so they can be
# dropped from the dataset.
similarity_matrix = cosine_similarity(embeddings)

In [94]:
# Zero out the diagonal (self-similarity) and everything below it, in place, so that each
# unordered pair of documents is represented exactly once, at a position with row < column.
lower_rows, lower_cols = np.tril_indices_from(similarity_matrix)
similarity_matrix[lower_rows, lower_cols] = 0

In [95]:
# (row, column) positions of every document pair whose cosine similarity exceeds the threshold.
# Each row of `duplicates` is one such pair.
threshold = 0.6 # trying to find very non-similar documents so the queries are more diverse
duplicates = np.argwhere(similarity_matrix > threshold)

In [96]:
# Deduplicate: for every flagged (first, second) pair, mark the second document for removal.
# The second member of each pair is the second column of `duplicates`.
to_remove = set(duplicates[:, 1])

In [99]:
# Load the index that accompanies the embeddings and drop the entries flagged as duplicates.
# NOTE(review): assumes position k of the index corresponds to row k of the embedding matrix -- confirm.
index = np.load(index_path)
positions_to_drop = list(to_remove)
non_dups_index = np.delete(arr=index, obj=positions_to_drop)

In [100]:
# Sanity check: number of index entries before and after removing the flagged duplicates.
index.shape, non_dups_index.shape

((16328,), (1415,))

In [61]:
# OpenAI client used to turn an email into the kind of search query a user might type
# (an approximation of real user behaviour).
# NOTE(review): the key is read from OPEN_API_KEY, not the more common OPENAI_API_KEY --
# confirm this matches the variable name used in the .env file.
llm = OpenAI(
    api_key=os.environ.get("OPEN_API_KEY"),
)

In [101]:
# Peek at the first two entries that survived de-duplication.
non_dups_index[:2]

array(['<0c208ddf-f19b-4d5f-aff3-78abc6389906@ind1s06mta1526.xt.local>',
       '<180ed996-ee8a-4b95-a977-ca4c4ad3c87d@atl1s07mta1306.xt.local>'],
      dtype='<U181')

In [109]:
base_data_dir = "../data"
# Sorted because os.listdir returns entries in arbitrary order; a stable file order keeps
# `documents` -- and any seeded random sample drawn from it -- reproducible across machines.
jsonl_files = sorted(
    f for f in os.listdir(base_data_dir)
    if f.endswith(".jsonl") and os.path.isfile(os.path.join(base_data_dir, f))
)

# Plain Python set of the ids that survived de-duplication: O(1) membership tests instead
# of an np.isin scan over the whole index for every line.
non_dup_ids = set(non_dups_index.tolist())


def _build_document_text(record):
    """Join an email's subject and sentences into a single string.

    Args:
        record: dict parsed from one JSONL line; may contain "subject" (str) and
            "sentences" (list of str).

    Returns:
        The subject followed by the space-joined sentences, using whichever of the two
        are present and non-empty; "" when the record has neither.
    """
    subject = record.get("subject")
    sentences = record.get("sentences") or []
    parts = []
    if subject:
        parts.append(subject)
    if sentences:
        parts.append(" ".join(sentences))
    return " ".join(parts)


documents = []

# Load the emails the evaluation queries will be generated for, keeping only the
# non-duplicate documents.
for file in jsonl_files:
    file_path = os.path.join(base_data_dir, file)
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.read().split("\n")
    for line in tqdm(lines, desc=f"Processing {file}"):
        if not line.strip():
            continue
        record = json.loads(line)
        # Keys are checked on the parsed object, not as substrings of the raw line.
        if "message_id" not in record:
            continue
        message_id = record["message_id"]
        if message_id not in non_dup_ids:
            continue
        text = _build_document_text(record)
        if not text:
            # Nothing to build a query from; previously this reused the text of an
            # earlier email (or raised NameError on the first such record).
            continue
        documents.append({
            "text": text,
            "id": message_id
        })


Processing emails_inbox.jsonl: 100%|██████████| 2695/2695 [00:01<00:00, 2450.94it/s]
Processing emails_goodreads.jsonl: 100%|██████████| 4025/4025 [00:01<00:00, 2420.52it/s]
Processing emails_archive.jsonl: 100%|██████████| 5715/5715 [00:02<00:00, 2361.37it/s]


In [110]:
# Draw 100 distinct emails at random; the fixed seed makes the draw repeatable for a given
# `documents` order. Note that `documents` is rebound to the sample, so re-running this
# cell alone samples from the previous sample rather than from the full list.
np.random.seed(42)
documents = np.random.choice(a=documents, size=100, replace=False)

In [112]:
# System prompt sent to the OpenAI model: it asks for a short, user-like search query
# that would retrieve the given email.
llm_query = "\n".join([
    "",
    "You're a user using an email search client. ",
    "Your task is to create a short and simple query to find the email in the dataset.",
    'The query should be something a user might enter to find the email. For example, if the email is a receipt, the question might be "digitalocean bill in June 2021" or "John credit card".',
    "",
])

In [113]:
# openAI api call
def generate_query(user_input: str, model: str = "gpt-4o-mini", system_prompt=None, client=None) -> str:
    """Ask the chat model for a user-like search query that should find the given email.

    Args:
        user_input: Text of the email (sent as the user message).
        model: Name of the chat model to call.
        system_prompt: System message to send; defaults to the notebook-level `llm_query`.
        client: Object exposing `chat.completions.create(...)`; defaults to the
            notebook-level `llm` client.

    Returns:
        The generated query with surrounding whitespace removed, or "" when the model
        returned no text content.
    """
    if client is None:
        client = llm
    if system_prompt is None:
        system_prompt = llm_query
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_input}
        ]
    )
    content = completion.choices[0].message.content
    # `content` can be None (it is optional in the API response); returning "" keeps the
    # return type a string so downstream JSON writing and embedding do not receive null.
    return content.strip() if content else ""

In [114]:
# Generate one query per sampled email and keep it next to the email's text and message id.
output = []
for email in documents:
    email_text = email['text']
    generated_query = generate_query(email_text)
    output.append(dict(text=email_text, id=email['id'], query=generated_query))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST

In [115]:
# Persist the evaluation dataset: one JSON object (text, id, query) per line.
os.makedirs("../data/evaluation_dataset", exist_ok=True)
with open("../data/evaluation_dataset/sample_queries.jsonl", "w") as f:
    f.writelines(json.dumps(entry) + "\n" for entry in output)

In [21]:
# Convenience step for development: read the query file back as a list of raw lines.
# NOTE(review): this reads sample_queries_cleaned.jsonl, which no cell shown here writes --
# presumably a manually cleaned copy of sample_queries.jsonl; confirm.
cleaned_queries_path = "../data/evaluation_dataset/sample_queries_cleaned.jsonl"
with open(cleaned_queries_path) as f:
    lines = f.read()
json_lines = lines.split("\n")

In [22]:
# Embed every query read from the cleaned query file and save the stacked vectors,
# in file order, next to the evaluation dataset.
# NOTE(review): the model loaded here is all-MiniLM-L6-v2, while the document embeddings
# directory is named after distilbert-base-uncased -- confirm both sides are meant to differ.
query_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(query_model_name)
model = AutoModel.from_pretrained(query_model_name)

embedder = Embedder(model, tokenizer)

query_embeddings = []
for line in tqdm(json_lines, desc="Embedding queries"):
    if not line:
        # Skip empty lines (e.g. the one produced by a trailing newline).
        continue
    query = json.loads(line)["query"]
    query_embeddings.extend(embedder.embed([query]))

# Note: this rebinds `embeddings`, which earlier in the notebook held the document embeddings.
embeddings = np.vstack(query_embeddings)
np.save("../data/evaluation_dataset/query_embeddings.npy", embeddings)

INFO:datasets:PyTorch version 2.5.1 available.
PyTorch version 2.5.1 available.
PyTorch version 2.5.1 available.
PyTorch version 2.5.1 available.
PyTorch version 2.5.1 available.
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
Use pytorch device_name: mps
Use pytorch device_name: mps
Use pytorch device_name: mps
Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


Embedding queries: 100%|██████████| 89/89 [00:00<00:00, 185.88it/s]
