In [1]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

In [3]:
# Chat model name passed to ChatOpenAI further down
MODEL = "gpt-4o-mini"
# Directory in which the Chroma vector store is persisted
db_name = "vector_db"

In [4]:
# Pull variables from a local .env file into the process environment
load_dotenv()
# Keep an OPENAI_API_KEY that is already set; otherwise fall back to the placeholder value
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [5]:
# Read in documents using LangChain's loaders
# Take everything in all the sub-folders of our knowledge base

# Keep only directories: DirectoryLoader expects a directory, so a stray file sitting
# next to the sub-folders must not be treated as a document type.
# Sorting makes the load order the same on every run.
folders = sorted(path for path in glob.glob("knowledge-base/*") if os.path.isdir(path))

def add_metadata(doc, doc_type):
    """Tag ``doc`` in place with its ``doc_type`` and return the same object."""
    doc.metadata.update({"doc_type": doc_type})
    return doc

# With thanks to CG and Jon R, students on the course, for this fix needed for some users
# Read every markdown file as UTF-8
text_loader_kwargs = {'encoding': 'utf-8'}
# If that doesn't work, some Windows users might need to uncomment the next line instead
# text_loader_kwargs={'autodetect_encoding': True}

# Load all *.md files (recursively) from each sub-folder; the sub-folder name becomes the doc_type
documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    documents.extend([add_metadata(doc, doc_type) for doc in folder_docs])

# Split into chunks of roughly 1000 characters with 200 characters of overlap
# (the captured output shows the splitter can emit a chunk longer than chunk_size)
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# NOTE(review): the email pipeline cell further down reassigns the global `chunks`,
# so the vector store is built from whichever of the two cells ran last — confirm which is intended
chunks = text_splitter.split_documents(documents)

print(f"Total number of chunks: {len(chunks)}")
print(f"Document types found: {set(doc.metadata['doc_type'] for doc in documents)}")

Created a chunk of size 1088, which is longer than the specified 1000


Total number of chunks: 123
Document types found: {'products', 'contracts', 'employees', 'company'}


In [6]:
pip install google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client imapclient

Note: you may need to restart the kernel to use updated packages.


In [7]:
import os

# List files in the current directory
# (the OAuth cell below reads credentials.json from this directory via a relative path)
print(os.listdir())


['vector_db', 'Cover Letter Generator.ipynb', 'knowledge-base', 'credentials.json', '.gradio', '.ipynb_checkpoints', 'rag_implementation.ipynb']


In [12]:
from google_auth_oauthlib.flow import InstalledAppFlow

# Request read-only access to the user's Gmail
SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]

# credentials.json (the OAuth client-secrets file) is read from the working directory
flow = InstalledAppFlow.from_client_secrets_file("credentials.json", SCOPES)
creds = flow.run_local_server(port=0)  # Opens a browser to authenticate

# Save credentials for reuse
# NOTE: token.pickle holds the granted credentials — keep it (and credentials.json) out of version control
import pickle
with open("token.pickle", "wb") as token_file:
    pickle.dump(creds, token_file)


Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=75585877953-gbc23lsmbmesd8l1idufa3klrnmoid35.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A62382%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fgmail.readonly&state=GBcj31gFxdoR0nzjZIMHMWUCUtuVdC&access_type=offline


In [13]:
from googleapiclient.discovery import build
import pickle

# Load saved credentials
# NOTE: unpickling can execute arbitrary code; only load a token.pickle that this notebook wrote itself
with open("token.pickle", "rb") as token_file:
    creds = pickle.load(token_file)

service = build("gmail", "v1", credentials=creds)

# Fetch up to 10 messages from the mailbox
results = service.users().messages().list(userId="me", maxResults=10).execute()
messages = results.get("messages", [])

for msg in messages:
    msg_id = msg["id"]
    message = service.users().messages().get(userId="me", id=msg_id, format="full").execute()

    # Extract headers; a message without a payload simply yields no headers
    headers = message.get("payload", {}).get("headers", [])

    # Header field names are case-insensitive ("Subject", "subject", "SUBJECT" are the same header),
    # so compare them lower-cased instead of falling back to the default on a case mismatch
    subject = next((h["value"] for h in headers if h["name"].lower() == "subject"), "No Subject")
    sender = next((h["value"] for h in headers if h["name"].lower() == "from"), "Unknown Sender")
    snippet = message.get("snippet", "No Snippet Available")

    # Print Email Details
    # NOTE: this output contains personal data; clear it before sharing the notebook
    print(f"📧 Email ID: {msg_id}")
    print(f"   🏷️ Subject: {subject}")
    print(f"   📤 From: {sender}")
    print(f"   🔍 Snippet: {snippet}\n")


📧 Email ID: 194ec572b8f987c2
   🏷️ Subject: Your Upcoming Interview with Education Pioneers
   📤 From: Education Pioneers sent by AppointmentPlus <noreply@appointment-plus.com>
   🔍 Snippet: Greetings Naufal, Just a quick note to remind you that you have an upcoming interview with a member of the Education Pioneers team. Please note that the interview time below is listed in EASTERN TIME.

📧 Email ID: 194ebd5b4d7172a8
   🏷️ Subject: Spreadsheet shared with you: "10 Feb - CMU Marketing Interview"
   📤 From: "Mohamad Naufal Nafian (via Google Sheets)" <drive-shares-dm-noreply@google.com>
   🔍 Snippet: Mohamad Naufal Nafian shared a spreadsheet Mohamad Naufal Nafian (mnafian@andrew.cmu.edu) has invited you to edit the following spreadsheet: 10 Feb - CMU Marketing Interview Open Google LLC, 1600

📧 Email ID: 194ebac152b12cbc
   🏷️ Subject: Re: All Questionnaires Received
   📤 From: Vosyn Recruitment <no-reply@vosyn.breezy-mail.com>
   🔍 Snippet: Dear Naufal, Thank you for submitting your a

In [73]:
import os
import pickle
import glob
from googleapiclient.discovery import build
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# ✅ Load a pre-trained sentence-embedding model; query_email_rag below uses it to embed the query text
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# 🔹 Load Gmail API Credentials
def load_gmail_service():
    """Build a Gmail API client from the credentials pickled in ``token.pickle``.

    Returns:
        The object returned by ``build("gmail", "v1", credentials=...)``.
    """
    # NOTE: unpickling can execute arbitrary code; token.pickle must be a file this notebook wrote itself.
    with open("token.pickle", "rb") as saved_token:
        stored_credentials = pickle.load(saved_token)
    return build("gmail", "v1", credentials=stored_credentials)

# 🔹 Fetch Latest Emails with Timestamp
def fetch_emails(service, max_results=500):
    """Fetch up to ``max_results`` Gmail messages as dicts, sorted oldest first.

    Args:
        service: Gmail API client exposing ``users().messages().list/get``.
        max_results: Maximum number of messages to fetch. Listing follows
            ``nextPageToken``, so the limit is honoured even when it spans
            several result pages.

    Returns:
        A list of dicts with keys ``id``, ``subject``, ``body`` (the message
        ``snippet``, not the full body) and ``timestamp`` (``internalDate``
        as an int), sorted by ascending ``timestamp``.
    """
    page_limit = 500  # largest page size requested from messages.list in one call
    messages_api = service.users().messages()

    # Collect message ids page by page until we have enough or the mailbox is exhausted
    message_refs = []
    page_token = None
    while len(message_refs) < max_results:
        list_kwargs = {
            "userId": "me",
            "maxResults": min(page_limit, max_results - len(message_refs)),
        }
        if page_token:
            list_kwargs["pageToken"] = page_token
        results = messages_api.list(**list_kwargs).execute()
        message_refs.extend(results.get("messages", []))
        page_token = results.get("nextPageToken")
        if not page_token:
            break

    emails = []
    for msg in message_refs[:max_results]:
        msg_id = msg["id"]
        message = messages_api.get(userId="me", id=msg_id, format="metadata").execute()

        headers = message.get("payload", {}).get("headers", [])
        # Header field names are case-insensitive, so match "subject" regardless of case
        subject = next(
            (header["value"] for header in headers if header["name"].lower() == "subject"),
            "No Subject",
        )
        timestamp = message.get("internalDate", "0")  # Internal timestamp in milliseconds
        body = message.get("snippet", "")

        emails.append({
            "id": msg_id,
            "subject": subject,
            "body": body,
            "timestamp": int(timestamp)  # Convert to integer for sorting
        })

    # ✅ Sort emails chronologically (oldest first)
    emails.sort(key=lambda x: x["timestamp"])

    return emails

# 🔹 Email classification by keyword matching on subject + snippet (no NLP model is involved)
def classify_email(email):
    """Assign an email to a category by case-insensitive keyword search.

    The subject and body are joined and lower-cased, then checked against each
    category's keywords in a fixed order; the first category with a hit wins.
    Matching is by substring, so e.g. "intern" also matches "international".

    Args:
        email: Dict with at least ``subject`` and ``body`` entries.

    Returns:
        One of "recruitment", "school", "promotions", "general" or "other".
    """
    searchable_text = f"{email['subject']} {email['body']}".lower()

    # Order matters: earlier categories take precedence over later ones
    keyword_rules = (
        ("recruitment", ("job", "application", "intern", "hiring", "recruiter")),
        ("school", ("assignment", "course", "event", "exam", "lecture")),
        ("promotions", ("newsletter", "promotion", "discount", "offer")),
        ("general", ("verification", "subscription", "notification", "alert")),
    )
    for category, keywords in keyword_rules:
        for keyword in keywords:
            if keyword in searchable_text:
                return category
    return "other"

# 🔹 Store Emails in a Structured Way
def store_emails(emails):
    """Write each email to ``email-knowledge-base/<category>/<id>.txt`` as UTF-8 text.

    The category comes from ``classify_email``. Each file starts with a
    ``Timestamp:`` line and a ``Subject:`` line, followed by a blank line and
    the body. A file that already exists for the same id is overwritten.

    Args:
        emails: Iterable of dicts with ``id``, ``subject``, ``body`` and ``timestamp``.
    """
    root_dir = "email-knowledge-base"
    os.makedirs(root_dir, exist_ok=True)

    for email in emails:
        target_dir = os.path.join(root_dir, classify_email(email))
        os.makedirs(target_dir, exist_ok=True)

        target_file = os.path.join(target_dir, f"{email['id']}.txt")
        with open(target_file, "w", encoding="utf-8") as out_file:
            header_block = f"Timestamp: {email['timestamp']}\n"
            header_block += f"Subject: {email['subject']}\n\n{email['body']}"
            out_file.write(header_block)

# 🔹 Load stored emails from disk, sort them chronologically and split them into chunks for the vector database
def load_into_vector_db():
    """Load the stored email files, order them by timestamp and split them into chunks.

    Reads every ``*.txt`` file under the category sub-folders of
    ``email-knowledge-base``, tags each document with its category
    (``doc_type``) and its parsed ``timestamp``, sorts the documents oldest
    first and splits them with ``RecursiveCharacterTextSplitter``.

    Despite the name, nothing is written to a vector store here; the caller
    embeds the returned chunks.

    Returns:
        The list of chunks produced by ``split_documents``.
    """
    def extract_timestamp(doc):
        """Return the integer on a leading ``Timestamp: `` line, or 0 if missing or invalid."""
        first_line = doc.page_content.split("\n", 1)[0]
        if first_line.startswith("Timestamp: "):
            try:
                return int(first_line.replace("Timestamp: ", "").strip())
            except ValueError:
                return 0  # Unparseable timestamps sort as oldest
        return 0  # Files without a timestamp line sort as oldest

    # Category folders only; a stray file at this level is not a category
    folders = [path for path in glob.glob("email-knowledge-base/*") if os.path.isdir(path)]

    documents = []
    for folder in folders:
        doc_type = os.path.basename(folder)
        # store_emails writes UTF-8, so read with the same encoding rather than the platform default
        loader = DirectoryLoader(
            folder, glob="**/*.txt", loader_cls=TextLoader, loader_kwargs={"encoding": "utf-8"}
        )
        folder_docs = loader.load()
        for doc in folder_docs:
            doc.metadata["doc_type"] = doc_type
            # Keep the timestamp as metadata so it stays attached to the document, not only used for sorting
            doc.metadata["timestamp"] = extract_timestamp(doc)
        documents.extend(folder_docs)

    # ✅ Sort documents chronologically (oldest first)
    documents.sort(key=lambda doc: doc.metadata["timestamp"])

    # ✅ Recursive splitting: break on newlines first, then spaces, then between characters
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, separators=["\n", " ", ""])
    chunks = text_splitter.split_documents(documents)

    print(f"Total number of chunks: {len(chunks)}")
    print(f"Email categories found: {set(doc.metadata['doc_type'] for doc in documents)}")

    # ✅ Print the first and last processed email for verification
    # (a first timestamp of 0 means at least one file had no parseable Timestamp line)
    if documents:
        first_timestamp = documents[0].metadata["timestamp"]
        last_timestamp = documents[-1].metadata["timestamp"]
        print(f"📩 First processed email timestamp: {first_timestamp}")
        print(f"📩 Last processed email timestamp: {last_timestamp}")

    return chunks


# 🔹 Query Function with Improved Retrieval
def query_email_rag(query, index, chunks, k=3):
    """Return the text of the chunks whose vectors are closest to ``query``.

    Args:
        query: Text to embed with the module-level ``embedding_model``.
        index: Object with a FAISS-style ``search(vectors, k)`` method returning
            ``(distances, indices)``. Assumed to have been built from ``chunks``
            in the same order — verify against the caller.
        chunks: Sequence of objects with a ``page_content`` attribute.
        k: Number of neighbours to request (defaults to 3, the previous fixed value).

    Returns:
        A list of up to ``k`` ``page_content`` strings, best match first.
    """
    query_embedding = embedding_model.encode([query])[0]

    # ✅ Retrieve the top-k most relevant chunks
    _, indices = index.search(np.array([query_embedding], dtype=np.float32), k=k)

    # FAISS pads the result with -1 when fewer than k vectors are available; without this
    # filter, -1 would silently pick the LAST chunk as if it were a match.
    relevant_chunks = [chunks[i].page_content for i in indices[0] if 0 <= i < len(chunks)]

    print("🔍 Top Matching Chunks:")
    for chunk in relevant_chunks:
        print(f"📜 {chunk[:300]}...\n")

    return relevant_chunks

# 🔹 Main Execution
# In a notebook kernel __name__ is "__main__", so this guard runs every time the cell executes.
if __name__ == "__main__":
    service = load_gmail_service()
    emails = fetch_emails(service, max_results=300)
    store_emails(emails)
    # `chunks` becomes a notebook global; the vector-store cell below embeds it
    chunks = load_into_vector_db()

Total number of chunks: 4504
Email categories found: {'recruitment', 'general', 'school', 'promotions', 'other'}
📩 First processed email timestamp: 0
📩 Last processed email timestamp: 1739134877000


In [74]:
# Number of emails returned by fetch_emails in the previous cell
print(f"Total number of emails processed: {len(emails)}")

Total number of emails processed: 300


In [75]:
# Put the chunks of data into a Vector Store that associates a Vector Embedding with each chunk
# Chroma is a popular open source Vector Database based on SQLite
# NOTE(review): `chunks` is whatever the most recently executed cell assigned. In the recorded run
# that is the email chunks, so email text would be sent to the OpenAI embeddings API — confirm intended.

embeddings = OpenAIEmbeddings()

# If you would rather use the free Vector Embeddings from HuggingFace sentence-transformers
# Then replace embeddings = OpenAIEmbeddings()
# with:
# from langchain.embeddings import HuggingFaceEmbeddings
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Delete the existing collection if the persist directory already exists
# (presumably so that re-running this cell does not accumulate duplicate entries)

if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Create vectorstore

vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 4504 documents


In [79]:
# create a new Chat with OpenAI, using the model named in MODEL
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG; k is how many chunks to use
retriever = vectorstore.as_retriever(search_kwargs={"k": 25})

# putting it together: set up the conversation chain with the LLM configured above, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

In [80]:
# Wrapping that in a function

def chat(question, history):
    """Answer ``question`` through ``conversation_chain`` and return the answer text.

    ``history`` is part of the callback signature used by ``gr.ChatInterface``
    but is not read here; the chain is configured with its own memory object.
    """
    return conversation_chain.invoke({"question": question})["answer"]

In [81]:
# And in Gradio:
# NOTE: `view` receives the return value of .launch(), not the ChatInterface object itself

view = gr.ChatInterface(chat, type="messages").launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7868

To create a public link, set `share=True` in `launch()`.
