# LCEL and chains

In [1]:
%load_ext autoreload
%autoreload 2

Failed to read module file 'C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2800.0_x64__qbz5n2kfra8p0\Lib\functools.py' for module 'functools': UnicodeDecodeError
Traceback (most recent call last):
  File "c:\Users\manuelalberto.romero\Documents\repos\dslabs\dslab-rag-e2e\.venv\Lib\site-packages\IPython\core\extensions.py", line 62, in load_extension
    return self._load_extension(module_str)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\manuelalberto.romero\Documents\repos\dslabs\dslab-rag-e2e\.venv\Lib\site-packages\IPython\core\extensions.py", line 77, in _load_extension
    mod = import_module(module_str)
          ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2800.0_x64__qbz5n2kfra8p0\Lib\importlib\__init__.py", line 90, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<frozen impor

In [2]:
import os
from dotenv import load_dotenv

from langchain_core.prompts import ChatPromptTemplate, PromptTemplate, FewShotChatMessagePromptTemplate
from langchain_core.runnables import RunnableConfig
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_mistralai import  ChatMistralAI
from src import utils, conf

# Params

In [3]:
conf_settings = conf.load(file="settings.yaml")
conf_settings

LLM_WORKHORSE = conf_settings.llm_workhorse
LLM_FLAGSHIP = conf_settings.llm_flagship
EMBEDDINGS = conf_settings.embeddings

# Environment Variables

In [4]:
load_dotenv()

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# What is a Langchain chain

It is a composition element that allow to build an structured pipeline to perform IA Generative tasks, specially (but not only) for RAGs


Langchain chains are built (in version 1.x or above) using LCEL (LangChain Expression Language)

Its core principles are: composability, streaming, async, parallelism

The main chains are abstractions layers for:
* LLMs
* Prompts
* VectorStores (Retriever + Embedding)
* Embeddings



## ChatPromptTemplate

In [5]:
lst_chat_hist = [
        ("system", "You are a helpful assistant."),
        ("user", "{question}")  # variables syntax
]

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant."),
    ("user", "{question}")
])

prompt.invoke("Can you tell me the distance from the Earth to the Moon?")

ChatPromptValue(messages=[SystemMessage(content='You are a helpful assistant.', additional_kwargs={}, response_metadata={}), HumanMessage(content='Can you tell me the distance from the Earth to the Moon?', additional_kwargs={}, response_metadata={})])

In [6]:
prompt = ChatPromptTemplate.from_template("Tell me an interesting fact about {topic}")

prompt.format_prompt(topic="Devops")

ChatPromptValue(messages=[HumanMessage(content='Tell me an interesting fact about Devops', additional_kwargs={}, response_metadata={})])

In [7]:
prompt.invoke({"topic": "Devops"})

ChatPromptValue(messages=[HumanMessage(content='Tell me an interesting fact about Devops', additional_kwargs={}, response_metadata={})])

In [8]:
prompt.invoke(input={"topic": "Devops"})

ChatPromptValue(messages=[HumanMessage(content='Tell me an interesting fact about Devops', additional_kwargs={}, response_metadata={})])

In [9]:
try:
    prompt.invoke(topic="Devops")
except Exception as err:
    print(err)

BasePromptTemplate.invoke() missing 1 required positional argument: 'input'


## FewShotChatMessagePromptTemplate

In [10]:
# 1. Define examples
examples = [  # input/output keys
    {"input": "Q: What is LangChain?", "output": "A: LangChain is a framework for building applications powered by large language models (LLMs)."},
    {"input": "Q: What is LCEL?", "output": "A: LCEL (LangChain Expression Language) is a way to build chains using composable operators like | for clarity and power."},
]

# 2. Create an example prompt template: input/output keys
example_prompt = ChatPromptTemplate.from_messages([
    ("human", "{input}"),
    ("ai", "{output}")
])

# 3. Few-shot wrapper
few_shot_prompt = FewShotChatMessagePromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
)

# 4. Final prompt template (instructions + few-shots + new user question)
final_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a concise AI assistant. Answer clearly.\
     The answer style should be like the following examples:"),
    few_shot_prompt,
    ("human", "{question}")
])

In [11]:
example_prompt.invoke(examples[0])

ChatPromptValue(messages=[HumanMessage(content='Q: What is LangChain?', additional_kwargs={}, response_metadata={}), AIMessage(content='A: LangChain is a framework for building applications powered by large language models (LLMs).', additional_kwargs={}, response_metadata={}, tool_calls=[], invalid_tool_calls=[])])

In [12]:
(few_shot_prompt
          .format_prompt() 
          .to_messages()
)

[HumanMessage(content='Q: What is LangChain?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='A: LangChain is a framework for building applications powered by large language models (LLMs).', additional_kwargs={}, response_metadata={}, tool_calls=[], invalid_tool_calls=[]),
 HumanMessage(content='Q: What is LCEL?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='A: LCEL (LangChain Expression Language) is a way to build chains using composable operators like | for clarity and power.', additional_kwargs={}, response_metadata={}, tool_calls=[], invalid_tool_calls=[])]

In [13]:
final_prompt.invoke("What is langgraph?").to_messages()

[SystemMessage(content='You are a concise AI assistant. Answer clearly.     The answer style should be like the following examples:', additional_kwargs={}, response_metadata={}),
 HumanMessage(content='Q: What is LangChain?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='A: LangChain is a framework for building applications powered by large language models (LLMs).', additional_kwargs={}, response_metadata={}, tool_calls=[], invalid_tool_calls=[]),
 HumanMessage(content='Q: What is LCEL?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='A: LCEL (LangChain Expression Language) is a way to build chains using composable operators like | for clarity and power.', additional_kwargs={}, response_metadata={}, tool_calls=[], invalid_tool_calls=[]),
 HumanMessage(content='What is langgraph?', additional_kwargs={}, response_metadata={})]

# LLMs

In [14]:
from openai import OpenAI  # native

client_openai = OpenAI(
    api_key=OPENAI_API_KEY
)

response = client_openai.responses.create(
    model=LLM_WORKHORSE,
    input="Tell me a joke about devops",
    temperature=0.2,
    max_output_tokens=128,
)

print(response.output_text)


Sure! Here's a DevOps joke for you:

Why do DevOps engineers always carry a ladder?

Because they‚Äôre always working on the deployment pipeline! üòÑ


In [15]:
llm = ChatOpenAI(  # Langchain wrapper
    model=LLM_WORKHORSE,
    # temperature=0.2,
    max_tokens=128,
    )

# How to call the LLM in langchain?

llm.invoke("Tell me a joke about devops")


AIMessage(content='Why did the DevOps engineer go broke?\n\nBecause he kept losing containers! üö¢üòÑ', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 19, 'prompt_tokens': 14, 'total_tokens': 33, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_75546bd1a7', 'id': 'chatcmpl-D8lhswT4rQU2p4Y27XGOlRU8kUpXC', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='lc_run--019c56c6-943c-7de3-88a6-18ed58627563-0', tool_calls=[], invalid_tool_calls=[], usage_metadata={'input_tokens': 14, 'output_tokens': 19, 'total_tokens': 33, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

## Calling a Chain

**invoke (synchronous single input)**

* Runs the chain once, blocking until it finishes.
* Input = single dict or string (depending on your chain).
* Output = single result.

‚úÖ Use when you just need one response and don‚Äôt care about concurrency.

In [17]:
result = llm.invoke("What is LangChain?")
print(result)

content='LangChain is a framework designed to simplify the development of applications powered by large language models (LLMs). It provides tools and abstractions to help developers build complex workflows that combine LLM calls with other components such as data sources, APIs, and external computation. LangChain facilitates tasks like prompt management, chaining multiple LLM calls, integrating with document loaders and vector databases, and managing conversation memory, making it easier to create applications like chatbots, question-answering systems, and other AI-driven tools.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 100, 'prompt_tokens': 12, 'total_tokens': 112, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-4.1-mini-2025-04-14',

**ainvoke (asynchronous single input)**

* Async version of invoke.
* Returns a coroutine ‚Üí you must await it (inside async def).
* Non-blocking ‚Üí allows parallel I/O (important for web apps, APIs).

‚úÖ Use when building async applications (FastAPI, Streamlit, etc.) or when you want multiple requests in parallel.

```python
import asyncio

async def main():
    result = await llm.ainvoke({"question": "What is LCEL?"})
    print(result)

asyncio.run(main())
```

**batch (synchronous multiple inputs)**

* Run the chain on a list of inputs (e.g., multiple questions).
* Executes them one by one under the hood (but can be parallelized with config).
* Returns a list of results in the same order.

‚úÖ Use when you have a list of tasks and don‚Äôt need async.

In [18]:
questions = [
    "What is LangChain?",
    "What is LCEL?",
    "What is a vector database?"
]

results = llm.batch(questions,
                    config=RunnableConfig(max_concurrency=10),
                    )
for r in results:
    print(r)

content='**LangChain** is an open-source framework designed to help developers build applications powered by large language models (LLMs). It provides tools and abstractions that simplify the integration of LLMs with other data sources and APIs, enabling the creation of more complex and functional language model applications.\n\n### Key Features of LangChain:\n- **Prompt Management:** Helps design, manage, and optimize prompts sent to LLMs.\n- **Chains:** Allows chaining together multiple calls to LLMs or other components to create complex workflows.\n- **Memory:** Maintains conversational state or context over multiple interactions.\n- **Data Augmentation:** Integrates LLM' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 128, 'prompt_tokens': 12, 'total_tokens': 140, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_token

**There is also:**
* abatch ‚Üí async version of batch.

**stream (synchronous streaming)** 
* Instead of waiting for the entire response, you get tokens/chunks as they arrive.
* Great for CLI apps or cases where you want immediate output.


```python
# Streaming call
for chunk in chain.stream({"question": "Explain LangChain Expression Language in simple terms."}):
    print(chunk, end="", flush=True)
```

Here, token by token results are returned as generated, and the application is blocked. It is usefull when developing a CLI

** astream (asynchronous streaming) **
* Same as stream, but async-friendly.
* Perfect for web apps (FastAPI, Streamlit, etc.) where you want token-by-token output and not block the application.

```python 
import asyncio

async def main():
    async for chunk in chain.astream({"question": "Give me a short poem about LCEL."}):
        print(chunk, end="", flush=True)

    print("\n---\nDone!")

asyncio.run(main())
```


| Method    | Input       | Output style              | Use case                   |
| --------- | ----------- | ------------------------- | -------------------------- |
| `invoke`  | 1 input     | 1 final result            | Simple calls               |
| `ainvoke` | 1 input     | 1 final result            | Async apps                 |
| `batch`   | many inputs | list of results           | Bulk jobs                  |
| `abatch`  | many inputs | list of results           | Async bulk                 |
| `stream`  | 1 input     | generator of chunks       | CLI / sync streaming       |
| `astream` | 1 input     | async generator of chunks | Web apps / async streaming |


# Chaining:

* Chaining means linking multiple components (prompt templates, LLMs, output parsers, retrievers, tools, etc.) together into a pipeline.
* The pipe operator (|) is the heart of LCEL ‚Äî it lets you compose these components like LEGO blocks.
* Each component is a Runnable (anything that can accept input and produce output).

In [19]:
prompt = ChatPromptTemplate.from_template("Tell me an interesting fact about {topic}")

llm = ChatOpenAI(
    model=LLM_WORKHORSE,
    temperature=0.2,
    max_tokens=128,
    )


chat = prompt | llm 

chat.invoke(input="Roman Empire")

AIMessage(content='An interesting fact about the Roman Empire is that it had an extensive and sophisticated network of roads‚Äîover 250,000 miles at its peak! These roads were so well constructed that many of them are still in use today. The phrase "All roads lead to Rome" comes from this impressive infrastructure, which helped the Romans efficiently manage their vast empire, enabling rapid military movement, trade, and communication across Europe, North Africa, and the Middle East.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 90, 'prompt_tokens': 15, 'total_tokens': 105, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_75546bd1a7', 'id': 'chatcmpl-D8liVr6fje0jhgJLc9xKTx5cArlce', 'service_

In [20]:
chat.invoke(input={"topic": "Roman Empire"})

AIMessage(content='An interesting fact about the Roman Empire is that it had an extensive and sophisticated network of roads‚Äîover 250,000 miles at its peak! These roads were so well constructed that some are still in use today. The phrase "All roads lead to Rome" comes from this impressive infrastructure, which helped the Romans efficiently manage their vast empire by facilitating trade, military movement, and communication.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 77, 'prompt_tokens': 15, 'total_tokens': 92, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_75546bd1a7', 'id': 'chatcmpl-D8lidCcLT6JHQB11rItpHWkQz0EBy', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': Non

In [21]:
type(prompt)

langchain_core.prompts.chat.ChatPromptTemplate

# Output Parsers

In [80]:
from langchain_core.output_parsers import StrOutputParser

chain = (llm
    | StrOutputParser()
)

chain.invoke("Tell me a joke about devops")


"Sure! Here's a DevOps joke for you:\n\nWhy do DevOps engineers prefer dark mode?\n\nBecause light attracts bugs! üêõüòÑ"

In [83]:
data= """
A LA ATT. DE SEGUROS LLOYD:
EN DON BENITO, A 08 MARZO 2014

YO, CARMEN ESPA√ëOLA ESPA√ëOLA,
CON DNI 99999999R, QUIERO DARME
DE BAJA DEL SEGURO DE COCHE QUE
TENGO CON USTEDES POR LA VENTA DEL
MISMO.

EL N√öMERO DE P√ìLIZA ES h2024038
Y SE CORRESPONDE CON UN OPEL CORSA 1.2L
CON MATR√çCULA 5473 BXM

[signature]

Dta. CARMEN ESPA√ëOLA
ESPA√ëOLA
"""


In [84]:
from pydantic import BaseModel, Field
from typing import Optional


class DatosPoliza(BaseModel):
    fecha: Optional[str] = Field(description="Si existe, la fecha a la que se firma la solicitud")
    dni:  Optional[str] = Field(description="Si existe, el DNI o n√∫mero de pasarporte del solicitante")
    nro_poliza:  Optional[str]  = Field(description="Si existe, el n√∫mero de p√≥liza sobre el que el solicitante desea realizar una acci√≥n")
    marca_model:  Optional[str] = Field(description="Si aparece, la marca y/o el modelo del veh√≠culo asociado a la p√≥liza")
    matricula:  Optional[str] = Field(description="Si aparece, la matr√≠cula veh√≠culo asociado a la p√≥liza")


In [85]:
# Configure the model to output structured data using the Pydantic model
llm_with_struct_outputs = llm.with_structured_output(DatosPoliza)

prompt = ChatPromptTemplate.from_messages([
    {"role": "system", 
     "content": "Eres un analista que se dedica al cribado de solicitudes de seguros. Tu misi√≥n es leer detenidamente el correo de un asegurado y extraer informaci√≥n clave)"},
    {"role": "human", "content": "Correo de solicitud {data}."},
])

In [86]:
# Generate a structured response by invoking the RunnableSequence
response = llm_with_struct_outputs.invoke("What's the weather in Paris?")

In [88]:
chain_policy_parsing = prompt | llm_with_struct_outputs

chain_policy_parsing.invoke({"data": data})

DatosPoliza(fecha='08 MARZO 2014', dni='99999999R', nro_poliza='h2024038', marca_model='OPEL CORSA 1.2L', matricula='5473 BXM')

# Other components

## Embeddings

In [25]:
# Embeddins!
embeddings = OpenAIEmbeddings()
q_vec = embeddings.embed_query("Tell me a joke about devops")
len(q_vec)

1536

## VectorDB and Retriever

### Create a client

In [31]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

client_qdrant = QdrantClient(":memory:")

try:
    response= client_qdrant.get_collections()
    lst_collections = response.collections
    print(f"{len(lst_collections)=}")
except Exception as err:
    print(err)

len(lst_collections)=0


### Create an index (collection)

In [None]:

EMB_DIM = len(q_vec)
# #############################################
if client_qdrant.collection_exists("tutorial"):
    client_qdrant.delete_collection("tutorial")
# #############################################

client_qdrant.create_collection(
    collection_name="tutorial",
    vectors_config=VectorParams(
        size=EMB_DIM,
        distance=Distance.COSINE),
)


### Load docs

In [38]:
from langchain_core.documents import Document
from langchain_qdrant import QdrantVectorStore


docs = [
    Document(
        page_content="John J. Hopfield and Geoffrey Hinton received the Nobel Prize in Physics in 2024 for their groundbreaking work on artificial neural networks, a foundation of modern AI. Hopfield developed an associative memory model in the 1980s that allows networks to store and reconstruct patterns. Building on this, Hinton developed the Boltzmann machine, which uses statistical physics principles to recognize and classify data. These pioneering contributions are essential for today's machine learning technologies, enhancing applications from medical imaging to material science.",
        metadata={"source": "wikipedia", "topic": "Physics"}
    ),
    Document(
        page_content="In Chemistry, David Baker, Demis Hassabis, and John Jumper were honored win Nobel Prize in 2024 for their breakthroughs in protein structure prediction. Baker‚Äôs work in computational protein design enables the creation of novel proteins, while Hassabis and Jumper, known for their work with DeepMind's AlphaFold, developed an AI that accurately predicts protein structures‚Äîa long-standing challenge in biology. This advancement could lead to transformative applications in drug development and synthetic biology.",
        metadata={"source": "wikipedia", "topic": "Chemistry"}
    ),
]



# This example is wrong: https://qdrant.tech/documentation/frameworks/langchain/#using-an-existing-collection
# Use embedding instead of embeddings, like in Langchain documentation:
# https://python.langchain.com/api_reference/_modules/langchain_qdrant/qdrant.html#QdrantVectorStore.from_existing_collection

# from memory:
vector_store = QdrantVectorStore(
    client=client_qdrant,
    collection_name="tutorial",
    embedding=embeddings,
)
# uuids = [str(uuid.uuid4()) for _ in range(len(docs))]

uuids = [
    '2690cf82-ebfd-48bc-bd52-c61a595a212a',
    '0e8f454e-3ebf-434b-a7cf-26489695bcd0'
    ]


vector_store.add_documents(documents=docs, ids=uuids)  # Add only once!

['2690cf82-ebfd-48bc-bd52-c61a595a212a',
 '0e8f454e-3ebf-434b-a7cf-26489695bcd0']

### Query the vector store

In [39]:
vector_store.similarity_search("Who is Geoffrey Hinton")

[Document(metadata={'source': 'wikipedia', 'topic': 'Physics', '_id': '2690cf82-ebfd-48bc-bd52-c61a595a212a', '_collection_name': 'tutorial'}, page_content="John J. Hopfield and Geoffrey Hinton received the Nobel Prize in Physics in 2024 for their groundbreaking work on artificial neural networks, a foundation of modern AI. Hopfield developed an associative memory model in the 1980s that allows networks to store and reconstruct patterns. Building on this, Hinton developed the Boltzmann machine, which uses statistical physics principles to recognize and classify data. These pioneering contributions are essential for today's machine learning technologies, enhancing applications from medical imaging to material science."),
 Document(metadata={'source': 'wikipedia', 'topic': 'Chemistry', '_id': '0e8f454e-3ebf-434b-a7cf-26489695bcd0', '_collection_name': 'tutorial'}, page_content="In Chemistry, David Baker, Demis Hassabis, and John Jumper were honored win Nobel Prize in 2024 for their break

### Retriever

A langchain vector_store is not a `chain` object and does not have a `.invoke()` method
We need to cast it to `retriever` that is the abstraction layer over the vector_store 
to query documents (contexts)

In [40]:
retriever = vector_store.as_retriever(k=1)

retriever.invoke("Nobel Price Physics")

[Document(metadata={'source': 'wikipedia', 'topic': 'Chemistry', '_id': '0e8f454e-3ebf-434b-a7cf-26489695bcd0', '_collection_name': 'tutorial'}, page_content="In Chemistry, David Baker, Demis Hassabis, and John Jumper were honored win Nobel Prize in 2024 for their breakthroughs in protein structure prediction. Baker‚Äôs work in computational protein design enables the creation of novel proteins, while Hassabis and Jumper, known for their work with DeepMind's AlphaFold, developed an AI that accurately predicts protein structures‚Äîa long-standing challenge in biology. This advancement could lead to transformative applications in drug development and synthetic biology."),
 Document(metadata={'source': 'wikipedia', 'topic': 'Physics', '_id': '2690cf82-ebfd-48bc-bd52-c61a595a212a', '_collection_name': 'tutorial'}, page_content="John J. Hopfield and Geoffrey Hinton received the Nobel Prize in Physics in 2024 for their groundbreaking work on artificial neural networks, a foundation of modern

## Doc Loaders

```python
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType


FILE_PATH = [(path_input / "Divulgacion-Planetaria-Althera.pdf").as_posix()]  # for multiple files
EXPORT_TYPE = ExportType.MARKDOWN   # ExportType.DOC_CHUNKS
TOKENIZER_NAME ="cl100k_base"

loader = DoclingLoader(
    file_path=FILE_PATH,
    export_type=EXPORT_TYPE,
)

lst_docs = loader.load()
```

In [66]:

def read_md(path, encoding="utf-8"):
    with open(path, "r", encoding=encoding) as f:
        doc_md = f.read()
    return doc_md


from IPython.display import display, Markdown
doc_md = read_md("data/interim/Divulgacion-Planetaria-Althera.md")

Markdown(doc_md[:1000])

# Un nuevo y fascinante vecino: Alth√©ra

# √çndice

1. Historia del descubrimiento
2. Conoce a Alth√©ra
3. Los soles de Alth√©ra
4. Estructura general de Alth√©ra
5. Planetas interiores
6. Planetas exteriores
7. Lunas y sat√©lites menores
8. Fen√≥menos destacados
9. Habitabilidad y astrobiolog√≠a
10. Conclusiones y perspectivas futuras

# 1. Historia del descubrimiento

## 1.1 Primeras observaciones y sospechas iniciales

El sistema binario Alth√©ra ( HD 4579 AB ) fue detectado por primera vez en el a√±o 2032 durante una campa√±a de observaci√≥n del Observatorio Espacial James Webb , dirigida por la astrof√≠sica chilena Dra. Mariela Estay . La misi√≥n principal era estudiar la composici√≥n atmosf√©rica de exoplanetas candidatos a la habitabilidad, pero un patr√≥n an√≥malo en el flujo luminoso proveniente de la constelaci√≥n de Ori√≥n llam√≥ la atenci√≥n del equipo. El an√°lisis de curvas de luz revel√≥ oscilaciones peri√≥dicas dobles, un indicio claro de la presencia de dos estrellas en √≥rbita mutua y varios 

In [67]:
corpus = [Document(
    page_content=doc_md,
    metadata={
        "source": "Divulgacion-Planetaria-Althera.md"
    }
)]

### Text Splitters

In [68]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

TOKENIZER_NAME ="cl100k_base"

text_splitter_rcs = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name=TOKENIZER_NAME,
    chunk_size=256,  # from_tiktoken_encoder: tokens
    chunk_overlap=25  # tokens
)
corpus_rcs = text_splitter_rcs.split_documents(corpus)  # corpus is a list of LG Documents
len(corpus_rcs)

51

In [69]:


Markdown(corpus_rcs[0].page_content)

# Un nuevo y fascinante vecino: Alth√©ra

# √çndice

1. Historia del descubrimiento
2. Conoce a Alth√©ra
3. Los soles de Alth√©ra
4. Estructura general de Alth√©ra
5. Planetas interiores
6. Planetas exteriores
7. Lunas y sat√©lites menores
8. Fen√≥menos destacados
9. Habitabilidad y astrobiolog√≠a
10. Conclusiones y perspectivas futuras

# 1. Historia del descubrimiento

## 1.1 Primeras observaciones y sospechas iniciales

In [70]:
Markdown(corpus_rcs[1].page_content)

## 1.1 Primeras observaciones y sospechas iniciales

El sistema binario Alth√©ra ( HD 4579 AB ) fue detectado por primera vez en el a√±o 2032 durante una campa√±a de observaci√≥n del Observatorio Espacial James Webb , dirigida por la astrof√≠sica chilena Dra. Mariela Estay . La misi√≥n principal era estudiar la composici√≥n atmosf√©rica de exoplanetas candidatos a la habitabilidad, pero un patr√≥n an√≥malo en el flujo luminoso proveniente de la constelaci√≥n de Ori√≥n llam√≥ la atenci√≥n del equipo. El an√°lisis de curvas de luz revel√≥ oscilaciones peri√≥dicas dobles, un indicio claro de la presencia de dos estrellas en √≥rbita mutua y varios cuerpos orbitando de forma circumbinaria.

## 1.2 Confirmaci√≥n mediante t√©cnicas combinadas

En los meses siguientes, un consorcio internacional liderado por la Agencia Espacial Europea (ESA) y el Instituto Max Planck de Astronom√≠a despleg√≥ observaciones complementarias utilizando:

In [71]:
Markdown(corpus_rcs[2].page_content)

- ELT (Extremely Large Telescope) en el Desierto de Atacama para espectroscop√≠a de alta resoluci√≥n.
- Telescopio Espacial Nancy Grace Roman para fotometr√≠a de gran precisi√≥n en tr√°nsitos.
- Interferometr√≠a de radio desde la red Very Long Baseline Array (VLBA) para afinar la distancia y par√°metros orbitales del sistema.

Fue el equipo del astr√≥nomo estadounidense Dr. Jonathan Kepler-Saunders quien confirm√≥, mediante el m√©todo de velocidad radial ultraestable, la existencia de cinco planetas principales y varios cinturones de escombros.

## 1.3 Descubrimiento revolucionario de la zona habitable circumbinaria

In [76]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

headers_to_split_on = [
    ("#", "Header 1"),
    # ("##", "Header 2"),

]

text_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on,
    strip_headers =False
    )
corpus_mds = text_splitter.split_text(doc_md)


In [77]:
Markdown(corpus_mds[0].page_content)

# Un nuevo y fascinante vecino: Alth√©ra

In [78]:
Markdown(corpus_mds[1].page_content)

# √çndice  
1. Historia del descubrimiento
2. Conoce a Alth√©ra
3. Los soles de Alth√©ra
4. Estructura general de Alth√©ra
5. Planetas interiores
6. Planetas exteriores
7. Lunas y sat√©lites menores
8. Fen√≥menos destacados
9. Habitabilidad y astrobiolog√≠a
10. Conclusiones y perspectivas futuras

In [79]:
Markdown(corpus_mds[2].page_content)

# 1. Historia del descubrimiento  
## 1.1 Primeras observaciones y sospechas iniciales  
El sistema binario Alth√©ra ( HD 4579 AB ) fue detectado por primera vez en el a√±o 2032 durante una campa√±a de observaci√≥n del Observatorio Espacial James Webb , dirigida por la astrof√≠sica chilena Dra. Mariela Estay . La misi√≥n principal era estudiar la composici√≥n atmosf√©rica de exoplanetas candidatos a la habitabilidad, pero un patr√≥n an√≥malo en el flujo luminoso proveniente de la constelaci√≥n de Ori√≥n llam√≥ la atenci√≥n del equipo. El an√°lisis de curvas de luz revel√≥ oscilaciones peri√≥dicas dobles, un indicio claro de la presencia de dos estrellas en √≥rbita mutua y varios cuerpos orbitando de forma circumbinaria.  
## 1.2 Confirmaci√≥n mediante t√©cnicas combinadas  
En los meses siguientes, un consorcio internacional liderado por la Agencia Espacial Europea (ESA) y el Instituto Max Planck de Astronom√≠a despleg√≥ observaciones complementarias utilizando:  
- ELT (Extremely Large Telescope) en el Desierto de Atacama para espectroscop√≠a de alta resoluci√≥n.
- Telescopio Espacial Nancy Grace Roman para fotometr√≠a de gran precisi√≥n en tr√°nsitos.
- Interferometr√≠a de radio desde la red Very Long Baseline Array (VLBA) para afinar la distancia y par√°metros orbitales del sistema.  
Fue el equipo del astr√≥nomo estadounidense Dr. Jonathan Kepler-Saunders quien confirm√≥, mediante el m√©todo de velocidad radial ultraestable, la existencia de cinco planetas principales y varios cinturones de escombros.  
## 1.3 Descubrimiento revolucionario de la zona habitable circumbinaria  
El hallazgo m√°s impactante lleg√≥ en 2034, cuando la misi√≥n LUVOIR-B (Large UV/Optical/IR Surveyor) detect√≥ la firma espectral de vapor de agua, ox√≠geno molecular y metano en la atm√≥sfera de Aurelia III , un planeta ubicado en la zona habitable del sistema, orbitando a ambos soles. Este fue el primer caso documentado de un mundo potencialmente habitable en un sistema binario cercano -a tan solo 42,7 a√±os luz de la Tierra -, lo que lo convierte en un candidato ideal para futuras misiones de exploraci√≥n interestelar.  
## 1.4 Importancia cient√≠fica y proyecci√≥n futura  
El descubrimiento de Alth√©ra revolucion√≥ la astrobiolog√≠a y la f√≠sica orbital por tres razones clave:  
1. Din√°mica circumbinaria estable - demostr√≥ que los planetas pueden mantener √≥rbitas estables y climas equilibrados alrededor de dos soles, desafiando modelos anteriores.
2. Qu√≠mica atmosf√©rica compleja - Aurelia III presenta una mezcla de gases que, en equilibrio fotoqu√≠mico, sugieren procesos biol√≥gicos o geoqu√≠micos activos.
3. Proximidad relativa - su cercan√≠a permite observaciones directas en la pr√≥xima d√©cada con telescopios como el Habitable Worlds Observatory (HWO) y misiones de espectrometr√≠a directa de superficie como Starshot Spectra .  
En 2036, la Uni√≥n Astron√≥mica Internacional otorg√≥ a este hallazgo el Premio Messier de Descubrimiento Astron√≥mico y estableci√≥ el Programa Alth√©ra , un plan coordinado de investigaci√≥n que combina observaciones remotas, simulaciones clim√°ticas y dise√±o de futuras sondas interestelares.