In [267]:
import os
import uuid
from dataclasses import dataclass
from datetime import datetime
from typing import List, Any, Optional

import feedparser
import requests
import streamlit as st
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.schema import Document
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_mistralai import ChatMistralAI
from sortedcontainers import SortedList

from utils.api_keys import fetch_api_key

In [311]:
def setup_environment():
    """Export the LangSmith tracing flag and the API keys into ``os.environ``.

    The tracing flag is set first; each key is then looked up with
    ``fetch_api_key`` and stored under its environment-variable name.
    """
    os.environ["LANGCHAIN_TRACING_V2"] = "true"

    # (environment variable, key name passed to fetch_api_key), in lookup order.
    credentials = (
        ("LANGCHAIN_API_KEY", "langsmith"),
        ("MISTRAL_API_KEY", "mistral"),
        ("HF_TOKEN", "HuggingFace"),
    )
    for env_var, key_name in credentials:
        os.environ[env_var] = fetch_api_key(key_name, False)

In [312]:
# Populate os.environ with the tracing flag and the three API keys.
setup_environment()

Key loading successful.
Key loading successful.
Key loading successful.


In [269]:
@dataclass
class ResearchPaper:
    """Record holding the metadata of one research paper."""
    id: str
    title: str
    category: str
    authors: List[str]
    author_comment: str
    published: datetime
    summary: str
    link: str
    pdf_url: str
    journal_ref: str

    @property
    def content(self) -> str:
        """Render the paper as text, one ``>>> Label: value`` line per field.

        Authors are joined with ", " and the publication date is shown as
        YYYY-MM-DD.
        """
        rows = (
            ("Id", self.id),
            ("Title", self.title),
            ("Category", self.category),
            ("Authors", ", ".join(self.authors)),
            ("Author Comment", self.author_comment),
            ("Published", self.published.strftime("%Y-%m-%d")),
            ("Summary", self.summary),
            ("Link", self.link),
            ("PDF URL", self.pdf_url),
            ("Journal Reference", self.journal_ref),
        )
        return "\n".join(f">>> {label}: {value}" for label, value in rows)

    def pretty_print(self):
        """Print the text produced by ``content`` to stdout."""
        print(self.content)

In [270]:
class PaperManager:
    """Manages a collection of research papers kept sorted by publication date."""

    def __init__(self):
        # Keyed SortedList: insertion keeps papers ordered by ``published``.
        self.papers = SortedList(key=lambda paper: paper.published)

    def append(self, paper):
        """Insert ``paper`` at the position given by its publication date."""
        self.papers.add(paper)

    def __getitem__(self, idx):
        """Return the paper (or slice of papers) at ``idx`` in date order."""
        return self.papers[idx]

    def get_papers_by_date_range(self, start_date: str, end_date: str) -> "List[ResearchPaper]":
        """Return the papers published between two months, both inclusive.

        Args:
            start_date: First month of the range, formatted ``MM-YYYY``.
            end_date: Last month of the range, formatted ``MM-YYYY``.

        Returns:
            Papers whose ``published`` timestamp falls anywhere from the first
            instant of ``start_date``'s month to the last instant of
            ``end_date``'s month, oldest first. Empty when the range is
            inverted or nothing matches.

        Raises:
            ValueError: If either argument does not match ``MM-YYYY``.
        """
        start = datetime.strptime(start_date, "%m-%Y")
        end_month = datetime.strptime(end_date, "%m-%Y")

        # strptime yields the first instant of the end month; comparing against
        # it would drop every paper published later in that month. Use the
        # start of the following month as an exclusive upper bound instead.
        if end_month.month == 12:
            end_exclusive = end_month.replace(year=end_month.year + 1, month=1)
        else:
            end_exclusive = end_month.replace(month=end_month.month + 1)

        # Bisect directly on the sort key; no placeholder papers are needed.
        start_idx = self.papers.bisect_key_left(start)
        end_idx = self.papers.bisect_key_left(end_exclusive)

        return list(self.papers[start_idx:end_idx])

In [271]:
class ArXivRetriever:
    """Handles retrieval of papers from ArXiv API"""

    def __init__(self, api_url: str ="http://export.arxiv.org/api/query?"):
        self.api_url = api_url

    @staticmethod
    def _extract_paper_id(entry_id: str) -> str:
        """Return the arXiv identifier contained in an Atom entry id URL.

        Old-style identifiers carry an archive prefix (``cond-mat/0102536v1``)
        that must be kept for the PDF URL to resolve, so everything after
        ``/abs/`` is returned. Ids without that marker fall back to the last
        path segment.
        """
        marker = "/abs/"
        if marker in entry_id:
            return entry_id.split(marker, 1)[1]
        return entry_id.split("/")[-1]

    def fetch_papers(self, query: str, max_results: int=5):
        """Query the API and return the matching papers.

        Args:
            query: Search expression, matched against all fields (``all:``).
            max_results: Maximum number of entries requested from the API.

        Returns:
            A ``PaperManager`` holding one ``ResearchPaper`` per feed entry.
            An empty ``PaperManager`` is returned (and an error is shown via
            Streamlit) when the HTTP request fails.
        """
        # Let requests build the query string so that spaces, quotes, '&', '#'
        # and similar characters in the query are percent-encoded rather than
        # being interpreted as URL syntax.
        params = {"search_query": f"all:{query}", "max_results": max_results}
        try:
            response = requests.get(self.api_url, params=params, timeout=10)
            response.raise_for_status()
            feed = feedparser.parse(response.content)
            papers = PaperManager()
            for entry in feed.entries:
                paper_id = self._extract_paper_id(entry.id)
                paper = ResearchPaper(
                    id = paper_id,
                    title = entry.title,
                    category = entry.category,
                    authors = [author.name for author in entry.authors],
                    author_comment = entry.get("arxiv_comment", "No comments available"),
                    published = datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ"),
                    summary = entry.summary,
                    link =entry.link,
                    pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf",
                    journal_ref = entry.get("arxiv_journal_ref", "No journal reference")
                )
                papers.append(paper)
            return papers
        except requests.exceptions.RequestException:
            # Network/HTTP failures are reported to the UI and degrade to an
            # empty result set instead of propagating.
            st.error("Error fetching papers. Please try again later.")
            return PaperManager()

In [272]:
class PaperSearchAssistant:
    """Assistant that uses LLM to optimize search queries and fetch papers"""

    def __init__(self, llm: Any):
        """Store the chat model, create the retriever and build the rewrite prompt.

        Args:
            llm: Chat model that can be piped after a prompt and whose
                response exposes the generated text as ``.content``.
        """
        self.llm = llm
        self.arxiv_retriever = ArXivRetriever()

        self.prompt = ChatPromptTemplate.from_messages([
            ("system", """You are a research paper search assistant. Your task is to:
            1. Analyze the user's query
            2. Correct any spelling mistakes
            3. Format it into an optimal search query for ArXiv
            4. Return only the reformulated query without any additional text or explanations.

            Make sure to:
            - Preserve technical terms and acronyms
            - Use Boolean operators (AND, OR) when appropriate
            - Include relevant synonyms for important terms"""),
            ("user", "{query}")
        ])


    def search_papers(self, query: str,
                      max_results: int = 5) -> "PaperManager":
        """Rewrite ``query`` with the LLM and fetch the matching papers.

        Args:
            query: Free-text user query; may contain spelling mistakes.
            max_results: Maximum number of papers requested from the retriever.

        Returns:
            Whatever ``ArXivRetriever.fetch_papers`` returns for the rewritten
            query (a ``PaperManager``, indexable in publication-date order).
        """
        chain = self.prompt | self.llm
        # Model replies often carry leading/trailing whitespace or newlines.
        cleaned_query = chain.invoke({"query": query}).content.strip()
        if not cleaned_query:
            # An empty rewrite would search for nothing; use the raw query.
            cleaned_query = query
        return self.arxiv_retriever.fetch_papers(cleaned_query, max_results=max_results)

In [307]:
class PaperAssistant:
    """
    A class for reading PDFs and answering questions using RAG (Retrieval Augmented Generation).
    """

    def __init__(
        self,
        paper: "ResearchPaper",
        llm: Any,
        embedding_model: Optional[Any]=None,
        k: int = 10,
        cache_dir: Optional[str] = None
    ):
        """
        Load the paper's PDF, index it and build the question-answering chain.

        Args:
            paper: Paper whose ``pdf_url`` is loaded and whose metadata is
                embedded in the system prompt.
            llm: Chat model used to generate answers.
            embedding_model: Embedding model for the vector store. When
                omitted, ``sentence-transformers/all-MiniLM-L6-v2`` is used.
            k: Number of documents the retriever returns per question.
            cache_dir: Stored on the instance only.
                NOTE(review): not consulted anywhere in this class yet.

        Raises:
            ValueError: If the PDF cannot be loaded or indexed.
        """
        self.paper = paper
        self.llm = llm
        self.embedding_model = embedding_model
        self.k = k
        self.cache_dir = cache_dir
        self.docs: List[Document] = []

        self._initialize_vectorstore()
        self._setup_rag_chain()

    @staticmethod
    def _escape_braces(text: str) -> str:
        """Double ``{`` and ``}`` so the text is literal inside a prompt template."""
        return str(text).replace("{", "{{").replace("}", "}}")

    def _initialize_vectorstore(self) -> None:
        """
        Load the PDF pages, embed them and expose a similarity retriever.

        Raises:
            ValueError: If no pages are extracted or any step fails; the
                original exception is attached as the cause.
        """
        try:
            self.docs = self._pdf_to_documents(self.paper.pdf_url)
            if not self.docs:
                raise ValueError("No content extracted from PDF")
            if not self.embedding_model:
                self.embedding_model = HuggingFaceEmbeddings( model_name = "sentence-transformers/all-MiniLM-L6-v2")
            # Give every assistant its own collection. Without a name Chroma
            # falls back to its default collection, so stores created in the
            # same process would pile their chunks into one shared index and
            # retrieval could return pages of other papers or duplicates.
            collection_name = f"paper-{uuid.uuid4().hex}"
            self.vectorstore = Chroma.from_documents(
                documents=self.docs,
                embedding=self.embedding_model,
                collection_name=collection_name,
            )
            self.retriever = self.vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": self.k})

        except Exception as e:
            raise ValueError(f"Failed to initialize vector store: {str(e)}") from e

    def _setup_rag_chain(self) -> None:
        """
        Set up the RAG chain for question answering.

        The system prompt embeds the paper's title, category, authors and
        publication date, followed by the ``{context}`` slot that the
        stuff-documents chain fills with retrieved pages.
        """
        # The prompt text is parsed as a template, so braces coming from the
        # paper metadata (e.g. LaTeX in titles) must be escaped or they would
        # be treated as template variables.
        title = self._escape_braces(self.paper.title)
        category = self._escape_braces(self.paper.category)
        authors = self._escape_braces(', '.join(self.paper.authors))

        system_prompt = f"""
        You are an assistant for answering questions about the research paper titled:
        "{title}"

        Paper Context (Reference only when relevant):
        1. Category: {category}
        2. Authors: {authors}
        3. Published in {self.paper.published.strftime('%Y-%m-%d')}

        Use the following pieces of retrieved context to answer the question.
        If you don't know the answer, say that you don't know.
        Use three sentences maximum and keep the answer concise.

        Context:
        {{context}}
        """

        prompt = ChatPromptTemplate.from_messages([
            ("system", system_prompt),
            ("human", "{input}")
        ])

        question_answer_chain = create_stuff_documents_chain(self.llm, prompt)
        self.rag_chain = create_retrieval_chain( self.retriever, question_answer_chain)

    @staticmethod
    def _pdf_to_documents(url) -> "List[Document]":
        """Load the PDF at ``url`` and return the Documents yielded by the loader."""
        loader = PyPDFLoader(url)
        return list(loader.lazy_load())

    async def a_answer(self, question: str) -> Optional[dict]:
        """Asynchronously answer ``question`` with the RAG chain.

        Returns:
            The chain's response; the notebook reads the text from its
            ``"answer"`` key.

        Raises:
            ValueError: If the chain fails; the original exception is the cause.
        """
        try:
            response = await self.rag_chain.ainvoke({
                "input": question
            })
            return response
        except Exception as e:
            raise ValueError(f"Failed to process question: {str(e)}") from e

    def answer(self, question: str) -> Optional[dict]:
        """Synchronous counterpart of ``a_answer``.

        Raises:
            ValueError: If the chain fails; the original exception is the cause.
        """
        try:
            response = self.rag_chain.invoke({
                "input": question
            })
            return response
        except Exception as e:
            raise ValueError(f"Failed to process question: {str(e)}") from e

    def pages(self) -> int:
        """Return the number of pages in the pdf file."""
        return len(self.docs)

    def get_average_chunk_size(self) -> float:
        """Return the average chunk size in characters (0 when nothing is loaded)."""
        if not self.docs:
            return 0
        return sum(len(doc.page_content) for doc in self.docs) / len(self.docs)

    def get_paper_stats(self) -> dict:
        """
        Get comprehensive statistics about the paper.

        Returns:
            Dictionary with three sections: ``paper_info`` (metadata copied
            from the paper), ``content_stats`` (page and whitespace-split word
            counts, averages rounded to two decimals) and ``processing_info``
            (embedding model object, retriever ``k`` and document count).
        """
        word_counts = [len(doc.page_content.split()) for doc in self.docs]
        total_words = sum(word_counts)
        return {
            "paper_info": {
                "title": self.paper.title,
                "authors": self.paper.authors,
                "category": self.paper.category,
                "published_date": self.paper.published.strftime('%Y-%m-%d'),
                "journal_ref": self.paper.journal_ref
            },
            "content_stats": {
                "total_pages": self.pages(),
                "total_words": total_words,
                "average_words_per_page": round(total_words / len(word_counts) if word_counts else 0, 2),
                "average_chunk_size": round(self.get_average_chunk_size(), 2)
            },
            "processing_info": {
                "embedding_model": self.embedding_model,
                "retrieval_chunks": self.k,
                "vectorstore_documents": len(self.docs)
            }
        }

In [283]:
# Chat model shared by the search assistant and the RAG chains below.
llm = ChatMistralAI(model = "mistral-large-2402")
# Assistant that rewrites a query with the LLM before searching arXiv.
assistant = PaperSearchAssistant(llm = llm)

In [284]:
# Misspelled query; presumably deliberate, to exercise the spelling
# correction requested in the assistant's prompt.
query = "Meuchanical peokperties of Still"
# Rewrite the query with the LLM and fetch up to the default 5 papers.
results = assistant.search_papers(query)

In [285]:
# Results are ordered by publication date, so index 0 is the earliest paper.
results[0].pretty_print()

>>> Id: 1202.4135v1
>>> Title: Nanoscale austenite reversion through partitioning, segregation, and
  kinetic freezing: Example of a ductile 2 GPa Fe-Cr-C steel
>>> Category: cond-mat.mtrl-sci
>>> Authors: L. Yuan, D. Ponge, J. Wittig, P. Choi, J. A. Jiménez, D. Raabe
>>> Author Comment: in press Acta Materialia 2012
>>> Published: 2012-02-19
>>> Summary: Austenite reversion during tempering of a Fe-13.6Cr-0.44C (wt.%) martensite
results in an ultrahigh strength ferritic stainless steel with excellent
ductility. The austenite reversion mechanism is coupled to the kinetic freezing
of carbon during low-temperature partitioning at the interfaces between
martensite and retained austenite and to carbon segregation at
martensite-martensite grain boundaries. An advantage of austenite reversion is
its scalability, i.e., changing tempering time and temperature tailors the
desired strength-ductility profiles (e.g. tempering at 400{\deg}C for 1 min.
produces a 2 GPa ultimate tensile strength (UTS

In [286]:
# PDF address that the retriever built from the paper id.
results[0].pdf_url

'https://arxiv.org/pdf/1202.4135v1.pdf'

In [287]:
# The loader below is pointed at the remote PDF URL of the first result.
file_path = results[0].pdf_url

In [288]:
# PDF loader for the selected paper; pages are read in a later cell.
loader = PyPDFLoader(file_path)

In [289]:
# Display the URL handed to the loader.
file_path

'https://arxiv.org/pdf/1202.4135v1.pdf'

In [172]:
pages = []
async for page in loader.alazy_load():
    pages.append(page)

In [173]:
# NOTE(review): this displays every loaded page; a slice such as pages[:1]
# would keep the cell output short.
pages

[Document(metadata={'source': 'https://arxiv.org/pdf/1202.4135v1.pdf', 'page': 0}, page_content='1 \n \nNanoscale austenite reversion through partitioning, segregation, \nand kinetic freezing: Example of a ductile 2 GPa Fe-Cr-C steel \n \n \n \nL. Yuan1, D. Ponge1, J. Wittig1,2, P. Choi1, J. A. Jiménez3, D. Raabe1 \n \n1  Max-Planck-Institut für Eisenforschung, Max-Planck-Str. 1, 40237 Düsseldorf, Germany \n2  Vanderbilt University, Nashville, TN 37235-1683, USA \n3  CENIM-CSIC, Avda. Gregorio del Amo 8, 28040-Madrid, Spain \n \n \n \nAbstract  \nAustenite reversion during tempering of a Fe -13.6Cr-0.44C (wt.%) martensit e results in an ultra -\nhigh strength ferritic stainless steel with excellent ductility. The austenite reversion mechanism is \ncoupled to the kinetic freezing of carbon during low-temperature partitioning at the interfaces \nbetween martensite and retained austenite and to carbon segregation at martensite -martensite grain \nboundaries. An advantage of austenite reve

In [176]:
# Inspect the second page: its metadata first, then the extracted text.
print(f"{pages[1].metadata}\n")
print(pages[1].page_content)

{'source': 'https://arxiv.org/pdf/1202.4135v1.pdf', 'page': 1}

2 
 
  
1. Introduction 
A high demand exists for lean, ductile, and high strength Fe-Cr stainless steels in the fields of 
energy conversion, mobility, and industrial infrastructure . As c onventional martensitic stainless 
steels (MSS) typically exhibit brittle behavior, supermartensitic Fe-Cr stainless steels (SMSS) with 
enhanced ductility  have been designed in the past years by reducing carbon (<0.03  wt.%) and 
adding nickel (4%-6.5 wt.%) and molybdenum (2.5  wt.%) [1-4]. The heat-treated microstructures of 
these materials are characterized by tempered martensite and retained austenite[1-4].  
In this work  we present an alternative approach of designing MSS steels with both, high strength 
and ductility. Our method is based on nanoscale austenite reversion and martensite relaxation  via a 
modest heat treatment at 300-500°C for several minutes . We make the surprising observation that 
this method leads to very hi

Key loading successful.


In [183]:
# Same sentence-transformers model that PaperAssistant uses as its default.
embedding_model  = HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2")

In [184]:
# Embed one Document per PDF page and index them in Chroma.
# NOTE(review): no collection_name is passed, so Chroma's default is used.
vectorstore = Chroma.from_documents(documents= pages, embedding=embedding_model,)

In [185]:
# Sanity check: the three pages most similar to the term "ductility".
vectorstore.similarity_search("ductility", k=3)

[Document(metadata={'page': 16, 'source': 'https://arxiv.org/pdf/1202.4135v1.pdf'}, page_content='17 \n \nstress-true strain curves and their corresponding derivatives (strain hardening) after 400°C heat \ntreatment at different times. The data reveal that the tempering, which increases the austenite \ncontent via reversion, leads indeed to higher strain hardening reserves at the later stages of \ndeformation due t o the TRIP effect, Fig. 6b. Since longer heat treatments lead to higher volume \nfractions of reverted austenite the TRIP -related strain hardening assumes a higher level for these \nsamples (Fig. 10). \nAnother important effect that might promote ductility in this context is the wide distribution of the \naustenite dispersion and stability (carbon content) which are both characteristic for this material. As \nrevealed in Fig. 8 we can differentiate 3 types of austenite, Fig. 9a: The first type is the as-quenched \nretained austenite with the nominal carbon content and relat

In [189]:
# Look at one raw page Document (index 8) for comparison.
pages[8]

Document(metadata={'source': 'https://arxiv.org/pdf/1202.4135v1.pdf', 'page': 8}, page_content='9 \n \nto the variation in the carbon distribution, the chromium content is the same in the martensite, the \ninterface, and the austenite, Fig. 8a. \n \n3.4.2   400°C tempered condition after quenching \nAfter 1 minute tempering at 400°C, a carbon enriched austenite layer (15-20 nm width) is observed \nbetween two abutting martensite regions  (Fig. 8b). The thin austenite zone contains in average \nabout 6.86 at.% carbon  while the martensite matrix contains only about 0.82 at.% carbon. The \nidentification of the phases in these diagrams follows their characteristic carbon content.  \nAfter 30 minutes tempering  (Fig. 8c), different carbon enriched areas appear. They correspond to \nindividual phases. The analyzed volume can be divided into 2 zones. The top region with low \ncarbon content corresponds to martensite. The bottom zone with higher carbon content corresponds \nto austenite. Ins

In [186]:
# Retriever returning the 10 most similar pages per query.
retriever = vectorstore.as_retriever(search_type = "similarity", search_kwargs = {"k":10})

In [190]:
# Retrieve the pages most relevant to the test question used throughout this notebook.
retrieved_docs = retriever.invoke("What is the role of carbon partitioning in the tempering process, and how does it affect the microstructure of martensite and austenite during tempering at 400°C?")

In [257]:
# Text of the first retrieved page.
retrieved_docs[0].page_content

'13 \n \nlayer in Fig. 8b is about 15 nm. With increasing tempering time, more reverted austenite is formed \n(Fig. 4,9).  \nIn summary, t he behavior of carbon in the current alloy can be described as follows: during \nquenching, carbon segregate s to martensite-martensite grain boundaries (equilibrium segregation) \nor to martensite-retained austenite interfaces (partitioning plus kinetic freezing) . In the first case \n(equilibrium segregation between two lath martensite zones) d uring tempering, these carbon  \nenriched areas in the martensite revert to austenite  when the driving force is high enough owing to \nthe favorable nucleation barrier at the interfaces . In the second case (partitioning at retained \naustenite) the carbon enrichment leads to austenite growth  according to local equilibrium. If the so \nreverted austenite is located at or in the vicinity of the austenite -martensite phase boundary, carbon \ncan diffuse from the reverted austenite further into the retained 

In [194]:
# Show the configured chat model (the API key is masked in its repr).
llm

ChatMistralAI(client=<httpx.Client object at 0x0000018D3118BF50>, async_client=<httpx.AsyncClient object at 0x0000018D3118B5F0>, mistral_api_key=SecretStr('**********'), endpoint='https://api.mistral.ai/v1', model='mistral-large-2402')

In [206]:
# Generic question-answering instructions; the retrieved pages are
# substituted for {context} by the stuff-documents chain.
system_prompt = " ".join((
    "You are an assistant for question-answering tasks.",
    "Use the following pieces of retrieved context to answer the question.",
    "If you don't know the answer, say that you don't know.",
    "Use three sentences maximum and keep the answer concise.",
)) + "\n\n{context}"

In [207]:
# Two-message chat prompt: the system instructions (with the {context} slot)
# followed by the user's question in {input}.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}")
])

In [208]:
# Chain that stuffs the retrieved documents into the prompt and calls the LLM.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# Full RAG pipeline: retrieve with `retriever`, then answer with the chain above.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [211]:
# Run the manually assembled RAG chain on the test question.
response = rag_chain.invoke({"input":"What is the role of carbon partitioning in the tempering process, and how does it affect the microstructure of martensite and austenite during tempering at 400°C?"})

In [229]:
# Check what kind of object the chain returns.
type(response)

dict

In [308]:
# Same pipeline through the PaperAssistant class; with no embedding_model
# given, it falls back to its default sentence-transformers model.
reader = PaperAssistant(paper = results[0],
                   llm= llm,
                   )

In [309]:
# Ask the same test question through the asynchronous entry point.
ans = await reader.a_answer("What is the role of carbon partitioning in the tempering process, and how does it affect the microstructure of martensite and austenite during tempering at 400°C?")

In [310]:
# The generated text is stored under the "answer" key of the response.
ans["answer"]

"During the tempering process, carbon partitioning plays a significant role in the microstructure evolution of martensite and austenite in the Fe-Cr-C steel. Carbon segregates to martensite-martensite grain boundaries or martensite-retained austenite interfaces during quenching. When tempered at 400°C, carbon-enriched areas in the martensite revert to austenite due to a favorable nucleation barrier at the interfaces. This reverted austenite has a high carbon content, which can further diffuse into the retained austenite if it is located near the austenite-martensite phase boundary, leading to its stabilization. However, if the diffusion distance is too far, carbides form inside the martensite due to the high concentration of carbon. The continuous decrease in martensite's carbon content during tempering results from carbon partitioning to austenite, austenite reversion, and carbide formation. Medium-range diffusion of carbon is observed, while substitutional elements experience only sh