In [None]:
import os
import json

from dotenv import load_dotenv
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.documents import Document
from langchain import hub
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict


In [None]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Enable LangSmith tracing for everything run from this notebook.
os.environ["LANGSMITH_TRACING"] = "true"

# Fail fast, with an actionable message, when the LangSmith key is absent.
# A KeyError is raised so the failure type matches a plain os.environ lookup.
if "LANGSMITH_API_KEY" not in os.environ:
    raise KeyError(
        "LANGSMITH_API_KEY is not set; add it to your .env file or environment"
    )

In [None]:
# NOTE(review): not referenced by any cell visible here (the loads below pass
# the path literal directly) — confirm before removing.
osdi25_path = "data/osdi_atc25.json"
def json_to_docs(json_path):
    """Load a sessions JSON file and build one Document per paper with an abstract.

    The file is read as a mapping of session name -> list of paper dicts.
    Each paper is expected to provide "abstract", "title", "authors" and
    "link".

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        A list of Document objects. The abstract becomes the page content;
        title, authors, link and the session name are stored as metadata.

    Papers whose abstract is missing or None are skipped. Papers that fail to
    convert (for example, a missing metadata field) are printed and skipped
    so one bad record does not abort the whole load.
    """
    # Decode explicitly as UTF-8 so non-ASCII text in titles/abstracts is read
    # the same way regardless of the platform's default encoding.
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    docs = []
    for session, papers in data.items():
        for paper in papers:
            # .get() treats an absent "abstract" key the same as an explicit null.
            abstract = paper.get("abstract")
            if abstract is None:
                continue
            try:
                doc = Document(
                    page_content=abstract,
                    metadata={
                        "title": paper["title"],
                        "authors": paper["authors"],
                        "link": paper["link"],
                        "session": session,
                    },
                )
            except Exception as e:
                # Best-effort load: report the offending record and move on.
                print(e)
                print(paper)
                continue
            docs.append(doc)

    return docs

# Build the corpus by loading each conference's sessions file in order.
docs = []
for sessions_file in (
    "data/osdi_atc25.json",
    "data/osdi24_sessions.json",
    "data/nsdi25_sessions.json",
    "data/sosp24_sessions.json",
    "data/eurosys25_sessions.json",
):
    docs.extend(json_to_docs(sessions_file))
        

In [None]:
# Embedding model used to index and query the papers; the OpenAI key is read
# from the environment (a missing key raises KeyError here).
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=os.environ["OPENAI_API_KEY"]
)
# In-memory vector store backed by the embedding model above; it starts empty.
vector_store = InMemoryVectorStore(embeddings)

In [None]:
# Chat model used for answer generation. ChatOpenAI is pointed at the
# OpenRouter endpoint via base_url, authenticated with OPENROUTER_API_KEY
# from the environment (a missing key raises KeyError here).
llm = ChatOpenAI(
    model="qwen/qwen3-30b-a3b:free",
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ["OPENROUTER_API_KEY"]
)

In [None]:
# Embed and index every loaded paper. The return value is captured so the
# cell does not echo it as output.
# NOTE(review): re-running this cell presumably inserts the documents a second
# time — confirm before re-running against an already populated store.
doc_ids = vector_store.add_documents(docs)

In [None]:
# Pull the "rlm/rag-prompt" prompt from the hub; generate() fills it with a
# "question" and a "context" value.
prompt = hub.pull("rlm/rag-prompt")

# Define state for application
class State(TypedDict):
    """State shared between the graph steps (retrieve, then generate)."""

    # The user's question; read by both retrieve and generate.
    question: str
    # Documents returned by retrieve; read by generate.
    context: List[Document]
    # Content of the model's response, returned by generate.
    answer: str


# Define application steps
def retrieve(state: State):
    """Look up the stored documents most similar to the question.

    Reads state["question"], runs a similarity search on the module-level
    vector_store, and returns the partial state update {"context": <results>}.
    """
    question = state["question"]
    return {"context": vector_store.similarity_search(question)}


def _format_context_doc(doc):
    """Render one retrieved paper as text: available metadata first, then the abstract."""
    header = [
        f"{label}: {doc.metadata[key]}"
        for label, key in (("Title", "title"), ("Session", "session"), ("Link", "link"))
        if doc.metadata.get(key)
    ]
    if not header:
        # No usable metadata: fall back to the raw page content.
        return doc.page_content
    return "\n".join(header + [f"Abstract: {doc.page_content}"])


def generate(state: State):
    """Answer the question with the LLM, using the retrieved papers as context.

    Reads state["question"] and state["context"], fills the module-level
    prompt with them, invokes the module-level llm, and returns the partial
    state update {"answer": <response content>}.

    Each paper is passed to the model with its title, session and link (when
    present in the metadata) in addition to the abstract, so the answer can
    name and reference the papers rather than only paraphrase abstracts.
    """
    docs_content = "\n\n".join(_format_context_doc(doc) for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response.content}


# Compile application and test
# Wire the pipeline START -> retrieve -> generate, then compile it.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [None]:
# Run the compiled graph on a sample question and print the "answer" entry
# of the resulting state.
response = graph.invoke({"question": "Point out a list of relevant papers for LLM inference"})
print(response["answer"])

In [None]:
# Query by example: the search text is a long abstract-style passage rather
# than a short keyword query (the keyword variant is kept commented out below).
# query = "Give a list of relevant papers for LLM inference, LLM serving, serverless, large models."
query = "Large language model (LLM) applications are evolving beyond simple chatbots into dynamic, general-purpose agentic programs, which scale LLM calls and output tokens to help AI agents reason, explore, and solve complex tasks. However, existing LLM serving systems ignore dependencies between programs and calls, missing significant opportunities for optimization. Our analysis reveals that programs submitted to LLM serving engines experience long cumulative wait times, primarily due to head-of-line blocking at both the individual LLM request and the program.  To address this, we introduce Autellix, an LLM serving system that treats programs as first-class citizens to minimize their end-to-end latencies. Autellix intercepts LLM calls submitted by programs, enriching schedulers with programlevel context. We propose two scheduling algorithms—for single-threaded and distributed programs—that preempt and prioritize LLM calls based on their programs’ previously completed calls. Our evaluation demonstrates that across diverse LLMs and agentic workloads, Autellix improves throughput of programs by 4-15× at the same latency compared to state-of-the-art systems, such as vLLM."
# Request the 10 closest matches.
# NOTE: this rebinds `docs`, which earlier held the full corpus passed to
# vector_store.add_documents; after this cell it holds only the search results.
docs = vector_store.similarity_search(query, k=10)

In [None]:
# Print each document in `docs`: title with session, abstract, link, blank line.
for doc in docs:
    meta = doc.metadata
    print(f"{meta['title']} ({meta['session']})")
    print(doc.page_content)
    print(meta["link"])
    print()

In [None]:
# TODO: Merge this repo with my conference scraper
# TODO: Add the conference names and dates to the metadata
# TODO: Add more conferences (MLSys, HPCA, ISCA); now I can realistically process all of this information
# TODO: Add separate sets of AI conferences (NeurIPS, ICML, ICLR, etc.)
# TODO: Add additional data sources (e.g. arxiv, email feeds, blogs)
# TODO: Experiment with better / different embedding models
# TODO: Add a way to store the embedding databases offline for different groups of papers / conferences
# TODO: Run some kind of cronjob that processes data feeds and alerts me of new relevant information, storing any information that was relevant
# TODO: Experiment with better ways of doing the actual similarity search, like what queries I should use.
