In [36]:
# !pip install openai PyMuPDF faiss-cpu python-dotenv

In [37]:
import os
import openai
import fitz  # PyMuPDF
import numpy as np
import faiss
import sqlite3
import pickle
import re
from dotenv import load_dotenv
from IPython.display import display, Markdown

# Read variables from a local .env file so the API key does not have to be
# written into the notebook itself.
load_dotenv()  # to load OPENAI_API_KEY from .env file
# Module-level OpenAI client shared by the embedding and chat helpers defined
# later in this notebook. os.getenv returns None when OPENAI_API_KEY is unset.
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [38]:
def extract_pdf_text(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        One string holding the text of all pages, in page order.
    """
    # The context manager closes the document (and its file handle) even when
    # text extraction raises part-way through; the original never closed it.
    with fitz.open(pdf_path) as doc:
        # join() avoids the quadratic cost of growing a string page by page.
        return ''.join(page.get_text() for page in doc)

def chunk_text(text, chunk_size=500, overlap=0):
    """Split text into chunks of at most ``chunk_size`` whitespace-separated words.

    Args:
        text: The text to split. Words are delimited by any whitespace and are
            re-joined with single spaces inside each chunk.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words each chunk shares with the previous one.
            Defaults to 0 (no overlap, the original behaviour). Must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings; empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        # A negative size used to yield an empty list silently, dropping all text.
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a window reaches the end of the text, any further window would
        # only repeat words already emitted (matters only when overlap > 0).
        if start + chunk_size >= len(words):
            break
    return chunks

def process_pdfs(pdf_files):
    """Extract and chunk the text of several PDFs.

    Args:
        pdf_files: Iterable of PDF paths, processed in the given order.

    Returns:
        A ``(chunks, sources)`` pair of equally long lists, where
        ``sources[i]`` is the path of the PDF that ``chunks[i]`` came from.
    """
    all_chunks = []
    all_sources = []
    for path in pdf_files:
        pieces = chunk_text(extract_pdf_text(path))
        for piece in pieces:
            all_chunks.append(piece)
            # Record the originating file once per chunk so both lists stay aligned.
            all_sources.append(path)
    return all_chunks, all_sources

def init_db(db_name="chunks.db"):
    """Open the SQLite database and make sure the ``chunks`` table exists.

    Args:
        db_name: Path of the SQLite database file.

    Returns:
        The open ``sqlite3`` connection; the caller is responsible for closing it.
    """
    create_table_sql = """
        CREATE TABLE IF NOT EXISTS chunks (
            id INTEGER PRIMARY KEY,
            source TEXT,
            chunk TEXT
        )
    """
    connection = sqlite3.connect(db_name)
    cursor = connection.cursor()
    cursor.execute(create_table_sql)
    connection.commit()
    return connection

def store_chunks(conn, chunks, sources):
    """Insert chunks and their source paths into the ``chunks`` table.

    Args:
        conn: Open ``sqlite3`` connection to a database with a ``chunks`` table.
        chunks: Chunk texts, in the order they should be stored.
        sources: Source path for each chunk; must be as long as ``chunks``.

    Raises:
        ValueError: If ``chunks`` and ``sources`` differ in length.
    """
    chunks = list(chunks)
    sources = list(sources)
    if len(chunks) != len(sources):
        # zip() would silently drop the surplus items, so the stored rows would
        # no longer line up one-to-one with the chunks the caller holds.
        raise ValueError(
            f"chunks and sources must have the same length "
            f"(got {len(chunks)} and {len(sources)})"
        )
    # The connection context manager commits on success and rolls back if the
    # insert fails, so a failed batch is never left half-written.
    with conn:
        conn.executemany(
            "INSERT INTO chunks (source, chunk) VALUES (?, ?)",
            zip(sources, chunks),
        )

def retrieve_chunks(conn):
    """Return every stored chunk in insertion (id) order.

    Args:
        conn: Open ``sqlite3`` connection to a database with a ``chunks`` table.

    Returns:
        A list of ``(id, source, chunk)`` tuples sorted by ``id``.
    """
    c = conn.cursor()
    # Callers map vector-index positions onto this list by position, so the row
    # order must be deterministic; SQL gives no ordering guarantee without
    # an explicit ORDER BY.
    c.execute("SELECT id, source, chunk FROM chunks ORDER BY id")
    return c.fetchall()


In [39]:
def create_embeddings(text_chunks, model="text-embedding-3-small", batch_size=100):
    """Embed text chunks with the OpenAI embeddings endpoint.

    Args:
        text_chunks: Iterable of strings to embed.
        model: Name of the embedding model to request.
        batch_size: Number of chunks sent per API request; must be positive.
            Batching replaces the previous one-request-per-chunk loop.

    Returns:
        A float32 array with one row per chunk, in the order of
        ``text_chunks``. An empty input yields an array of shape ``(0, 0)``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    text_chunks = list(text_chunks)
    embeddings = []
    for start in range(0, len(text_chunks), batch_size):
        batch = text_chunks[start:start + batch_size]
        response = client.embeddings.create(input=batch, model=model)
        # Each returned item carries the position of its input within the
        # request; sort on it so rows stay aligned with the chunks.
        ordered = sorted(response.data, key=lambda item: item.index)
        embeddings.extend(item.embedding for item in ordered)

    if not embeddings:
        # Keep the result 2-D so callers can still inspect .shape safely.
        return np.empty((0, 0), dtype='float32')
    return np.array(embeddings, dtype='float32')

def store_embeddings(embeddings, file_name="faiss.index"):
    """Build a flat L2 FAISS index from the embeddings and write it to disk.

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dimension)``.
        file_name: Path the index file is written to.

    Returns:
        The populated FAISS index.

    Raises:
        ValueError: If ``embeddings`` is not 2-D or has a zero-length
            vector dimension (for example when no chunks were embedded).
    """
    # FAISS works on C-contiguous float32 data; normalising here also accepts
    # plain lists and float64 arrays.
    embeddings = np.ascontiguousarray(embeddings, dtype='float32')
    if embeddings.ndim != 2 or embeddings.shape[1] == 0:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dimension) "
            f"with dimension > 0; got shape {embeddings.shape}"
        )
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    faiss.write_index(index, file_name)
    return index

def load_embeddings(file_name="faiss.index"):
    """Read a FAISS index from disk.

    Args:
        file_name: Path of the index file to read.

    Returns:
        The index object returned by ``faiss.read_index``.
    """
    index = faiss.read_index(file_name)
    return index

In [40]:
def search_embeddings(query, index, text_chunks, sources, top_k=3, model="text-embedding-3-small"):
    """Find the stored chunks closest to a query.

    Args:
        query: The question or search text to embed.
        index: FAISS index whose vector positions line up with ``text_chunks``.
        text_chunks: Sequence of chunk texts, indexed by vector position.
        sources: Sequence of source paths, parallel to ``text_chunks``.
        top_k: Maximum number of neighbours to return.
        model: Embedding model; should match the one used to build the index.

    Returns:
        A list of ``(chunk, source)`` tuples, nearest first. It holds fewer
        than ``top_k`` items when the index contains fewer vectors.
    """
    response = client.embeddings.create(input=[query], model=model)
    query_embedding = np.array(response.data[0].embedding, dtype='float32').reshape(1, -1)
    _, indices = index.search(query_embedding, top_k)
    # FAISS pads the result with -1 when fewer than top_k vectors exist; using
    # -1 as a Python index would silently return the *last* chunk instead.
    hits = [int(i) for i in indices[0] if i >= 0]
    return [(text_chunks[i], sources[i]) for i in hits]

def get_chatgpt_response(query, relevant_chunks, model="gpt-4o"):
    """Ask the chat model to answer a question from the retrieved chunks.

    Args:
        query: The user's question.
        relevant_chunks: Iterable of ``(chunk, source)`` pairs; only the chunk
            text is placed in the prompt.
        model: Chat model to request.

    Returns:
        The message content of the first completion choice.
    """
    passages = []
    for passage, _source in relevant_chunks:
        passages.append(passage)

    separator = "\n\n---\n\n"
    context = separator.join(passages)
    prompt = f"Using the context, answer the question:\n\nContext:\n{context}\n\nQuestion: {query}"
    messages = [{"role": "user", "content": prompt}]

    # temperature=0.0 is passed through unchanged from the original request.
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.0
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [41]:
def fix_latex_delimiters(text):
    r"""Convert LaTeX ``\[...\]`` / ``\(...\)`` delimiters to ``$$...$$`` / ``$...$``.

    Args:
        text: Markdown text that may contain backslash-delimited math.

    Returns:
        The text with display math wrapped in ``$$`` and inline math in ``$``.
        Whitespace just inside inline delimiters is removed, because several
        Markdown math renderers do not treat ``$ x $`` (padded) as math.
    """
    # Callable replacements insert the captured math verbatim, so backslashes
    # in the LaTeX are never re-read as replacement-template escapes.
    text = re.sub(
        r'\\\[(.*?)\\\]',
        lambda match: '$$' + match.group(1) + '$$',
        text,
        flags=re.DOTALL,
    )
    text = re.sub(
        r'\\\((.*?)\\\)',
        lambda match: '$' + match.group(1).strip() + '$',
        text,
        flags=re.DOTALL,
    )
    return text

In [None]:
# List your PDF files
pdf_files = [
    "doc1.pdf",
    "doc2.pdf"
    ]  # Update this list

# Initialize database
conn = init_db()
try:
    # Process PDFs and compute embeddings first, so nothing on disk is touched
    # if extraction or the embedding request fails.
    chunks, sources = process_pdfs(pdf_files)
    embeddings = create_embeddings(chunks)

    # Rebuild the chunk table from scratch. The FAISS index is recreated on
    # every run, so rows left over from an earlier run would no longer line up
    # with its vector positions.
    conn.execute("DELETE FROM chunks")
    store_chunks(conn, chunks, sources)

    # Store embeddings
    index = store_embeddings(embeddings)
finally:
    # Close database connection, even when a step above raised
    conn.close()

In [None]:
# Read the stored chunks, releasing the database handle even if the read fails.
conn = sqlite3.connect("chunks.db")
try:
    chunk_data = retrieve_chunks(conn)
finally:
    conn.close()
index = load_embeddings("faiss.index")

if not chunk_data:
    # zip(*[]) would otherwise fail with an opaque unpacking error.
    raise RuntimeError("chunks.db contains no chunks; run the ingestion cell first.")
chunk_ids, chunk_sources, chunk_texts = zip(*chunk_data)

# Index positions are used to look up rows by position, so the two stores must
# describe the same set of chunks.
if index.ntotal != len(chunk_texts):
    raise RuntimeError(
        f"faiss.index holds {index.ntotal} vectors but chunks.db holds "
        f"{len(chunk_texts)} chunks; delete both files and rebuild them together."
    )

query = "Explain cosmological redshift equations"
relevant_chunks = search_embeddings(query, index, chunk_texts, chunk_sources, top_k=3)
answer = get_chatgpt_response(query, relevant_chunks)

# Convert \(...\) / \[...\] math to $-delimiters before rendering as Markdown.
answer_fixed = fix_latex_delimiters(answer)
display(Markdown(answer_fixed))

To determine the apparent tangential speed of the sources as seen from Earth, we need to convert the angular separation into a physical distance and then calculate the speed.

Given:
- Distance to the galaxy, $d_A = 100$ Mpc.
- Angular separation for one source, $\theta_1 = 1.00$ milliarcseconds.
- Angular separation for the other source, $\theta_2 = 0.27$ milliarcseconds.
- Time interval, $\Delta t = 1$ year.

First, convert the angular separations from milliarcseconds to radians:
1 milliarcsecond = $10^{-3}$ arcseconds = $10^{-3} \times \frac{\pi}{180 \times 3600} \approx 4.848 \times 10^{-9}$ radians.

For the first source:
$$ \theta_1 = 1.00 \times 10^{-3} \times \frac{\pi}{180 \times 3600} \, \text{radians} $$

For the second source:
$$ \theta_2 = 0.27 \times 10^{-3} \times \frac{\pi}{180 \times 3600} \, \text{radians} $$

Next, calculate the physical (transverse) distance traveled by each source using the small-angle formula $l = \theta \times d_A$, with $\theta$ in radians.

For the first source:
$$ l_1 = \theta_1 \times d_A $$

For the second source:
$$ l_2 = \theta_2 \times d_A $$

Convert the distance to meters (1 Mpc = $ 3.086 \times 10^{22} $ meters):
$$ d_A = 100 \times 3.086 \times 10^{22} \, \text{meters} $$

Calculate the physical distances:
$$ l_1 = \left(1.00 \times 10^{-3} \times \frac{\pi}{180 \times 3600}\right) \times (100 \times 3.086 \times 10^{22}) $$
$$ l_2 = \left(0.27 \times 10^{-3} \times \frac{\pi}{180 \times 3600}\right) \times (100 \times 3.086 \times 10^{22}) $$

Finally, calculate the apparent tangential speeds:
$$ v_1 = \frac{l_1}{\Delta t} $$
$$ v_2 = \frac{l_2}{\Delta t} $$

Since $\Delta t = 1$ year, convert it to seconds (1 year = $ 3.156 \times 10^7 $ seconds).

$$ v_1 = \frac{l_1}{3.156 \times 10^7} $$
$$ v_2 = \frac{l_2}{3.156 \times 10^7} $$

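Evaluating these expressions gives $l_1 \approx 1.50 \times 10^{16}$ m and $l_2 \approx 4.04 \times 10^{15}$ m, so $v_1 \approx 4.74 \times 10^{8}$ m/s ($\approx 1.58\,c$) and $v_2 \approx 1.28 \times 10^{8}$ m/s ($\approx 0.43\,c$). These are the apparent tangential speeds of the two sources in meters per second.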