In [None]:
!pip install --upgrade pinecone

Collecting pinecone
  Downloading pinecone-7.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone)
  Downloading pinecone_plugin_assistant-1.7.0-py3-none-any.whl.metadata (28 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting packaging<25.0,>=24.2 (from pinecone-plugin-assistant<2.0.0,>=1.6.0->pinecone)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Downloading pinecone-7.3.0-py3-none-any.whl (587 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.6/587.6 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_assistant-1.7.0-py3-none-any.whl (239 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.0/240.0 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Downloading packagin

In [None]:
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer, CrossEncoder
import json
import numpy as np
import pandas as pd
from typing import List, Dict, Any
import math
import os
from openai import OpenAI

from pinecone import Pinecone, ServerlessSpec


## Loading chunks

In [None]:
chunks_path = "/content/chunks.jsonl"
# The file holds one JSON object per line. Blank lines (for example a trailing
# newline at the end of the file) are skipped so they do not reach json.loads
# and raise json.JSONDecodeError.
with open(chunks_path, "r", encoding="utf-8") as f:
    chunks = [json.loads(line) for line in f if line.strip()]

In [None]:
chunks[120]

{'id': 'ec8ff485-f8e7-4997-b7ff-5c4977cd622d',
 'text': '(CITATION) and (CITATION) . Our values are in average higher than the aforementioned relations during the L/T transition. Since (CITATION) uses also uses a sample of ultracool dwarfs extracted from the UltracoolSheet catalogue, we can directly compare our effective temperature determinations with their semi-empirical values. Figure illustrates this comparison, confirming a good consistency between the two sets and a deviation towards higher values in our temperatures for the L/T transition. This transition is still a less understood phase of ultracool dwarf evolution. The increase of cloud opacity from early-L to late-L dwarfs, and the evolution to cloudless T dwarfs, hugely complicates the modelling of these atmospheres. In the future, a better treatment of clouds for this transition in atmospheric models will be the key to mitigating this effect. The results obtained in this study indicate that the methodology presented by (CIT

# PINECONE

## Creating Pinecone index

In [None]:
# Pinecone client; `pinecone_key` must already exist in the session.
pc = Pinecone(api_key=pinecone_key)

index_name = "thesis-chat"
dimension = 768  # all-mpnet-base-v2
metric = "cosine"

# Create the serverless index only when no index with this name is listed yet.
already_exists = any(idx.name == index_name for idx in pc.list_indexes())
if not already_exists:
    serverless_spec = ServerlessSpec(
        cloud="aws",         # or "gcp", check your account availability
        region="us-east-1",  # pick a region close to your server
    )
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric=metric,       # "cosine" | "dotproduct" | "euclidean"
        spec=serverless_spec,
    )

# Handle used for the data-plane calls (stats, upsert, query) below.
index = pc.Index(index_name)

In [None]:
pc.list_indexes()[0]

{
    "name": "thesis-chat",
    "metric": "cosine",
    "host": "thesis-chat-gs4mxea.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 768,
    "deletion_protection": "enabled",
    "tags": null
}

In [None]:
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'v1': {'vector_count': 130}},
 'total_vector_count': 130,
 'vector_type': 'dense'}

## Upserting Embeddings in Pinecone index

In [None]:
# --- config ---
# Settings for the upsert step: which index/namespace to write to, how many
# records go into each upsert call, and which embeddings file to read.
PINECONE_API_KEY = pinecone_key  # `pinecone_key` must already be defined in the session
INDEX_NAME = "thesis-chat"
NAMESPACE = "multiling"                 # namespace the vectors are written to; change it to keep versions side by side
BATCH_SIZE = 200                 # records per upsert call (original note: Pinecone likes 100–500; tune as needed)
EMBED_FILE = "/content/chunks_with_embeddings_multiling.jsonl"  # JSONL with one chunk (including its 'embedding') per line
# Every embedding read from EMBED_FILE is checked against this length; the
# index itself was created with dimension 768.
# NOTE(review): the original comment attributes 768 to all-mpnet-base-v2, but
# EMBED_FILE is the "multiling" export — confirm which model produced it.
EXPECTED_DIM = 768               # all-mpnet-base-v2

# Re-create the client and index handle so this cell does not depend on the
# index-creation cell having been run.
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(INDEX_NAME)

def iter_jsonl(path: str):
    """Lazily yield one parsed JSON value per non-blank line of ``path``.

    The file is read as UTF-8. Lines that are empty or contain only
    whitespace are skipped; every other line is passed to ``json.loads``,
    so a malformed line raises ``json.JSONDecodeError``.
    """
    with open(path, "r", encoding="utf-8") as handle:
        yield from (json.loads(raw) for raw in handle if raw.strip())

# (Optional) wipe the namespace first if you want a clean slate
# index.delete(delete_all=True, namespace=NAMESPACE)

In [None]:
# --- helpers to sanitize metadata ---
from typing import Any, Dict, List

def _clean_value(v: Any):
    # Drop None entirely
    if v is None:
        return None
    # Allowed scalars
    if isinstance(v, (str, int, float, bool)):
        return v
    # Lists must be list of strings
    if isinstance(v, list):
        out = [str(x) for x in v if x is not None]
        return out
    # Fallback: stringify (e.g., dicts)
    return str(v)

def sanitize_meta(meta: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of ``meta`` with every value passed through ``_clean_value``.

    Keys whose cleaned value is ``None`` are omitted from the result; the
    input dict is not modified.
    """
    candidates = ((key, _clean_value(value)) for key, value in meta.items())
    return {key: value for key, value in candidates if value is not None}

# Optional: keep text metadata bounded (Pinecone metadata should be small)
MAX_TEXT_CHARS = 4000  # adjust if you like

def clip_text(s: str) -> str:
    """Return ``s`` as a string of at most ``MAX_TEXT_CHARS`` characters.

    ``None`` becomes the empty string; any other value is converted with
    ``str`` and cut after ``MAX_TEXT_CHARS`` characters (shorter strings are
    returned whole).
    """
    if s is None:
        return ""
    # Slicing is a no-op for strings that are already short enough.
    return str(s)[:MAX_TEXT_CHARS]

# --- batching / flush with sanitization + better error output ---
buffer = []  # records queued for the next upsert call
n = 0        # number of vectors upserted so far

def flush():
    """Upsert the queued records into ``NAMESPACE`` and empty the queue.

    Does nothing when the queue is empty. If the upsert raises, the batch is
    scanned for the first record whose metadata contains ``None`` values,
    that record is reported, and the original exception is re-raised (the
    queue and the counter are left untouched in that case). On success the
    counter ``n`` grows by the batch size and the running total is printed.
    """
    global buffer, n
    if len(buffer) == 0:
        return
    try:
        index.upsert(vectors=buffer, namespace=NAMESPACE)
    except Exception:
        # Try to pinpoint the first offending record
        print("Upsert failed; inspecting batch...")
        for record in buffer:
            none_fields = {
                key: value
                for key, value in record.get("metadata", {}).items()
                if value is None
            }
            if none_fields:
                print("Found None metadata fields in record id:", record.get("id"), "->", none_fields)
                break
        raise  # re-raise after printing details
    n = n + len(buffer)
    print(f"Upserted {n} vectors in total...")
    buffer = []

# --- main loop (reads from your JSONL with embeddings) ---
# Each row is validated, turned into an {id, values, metadata} record and
# queued; the queue is sent whenever it reaches BATCH_SIZE.
#
# row_idx (position of the row in the file) is used for the fallback id.
# The counter `n` only advances inside flush(), so using it here would give
# every id-less row of the same batch an identical "chunk-<n>" id.
for row_idx, row in enumerate(iter_jsonl(EMBED_FILE)):
    emb = row.get("embedding")
    if not emb:
        raise ValueError(
            f"Row {row_idx} has no 'embedding'. "
            f"Did you run the embedding step and save {EMBED_FILE}?"
        )
    if len(emb) != EXPECTED_DIM:
        raise ValueError(f"Embedding dim mismatch: got {len(emb)}, expected {EXPECTED_DIM}")

    vid = row.get("id") or f"chunk-{row_idx}"

    # Only these fields are stored as metadata; missing ones come back as
    # None from row.get() and are removed by sanitize_meta().
    raw_meta = {
        "text": clip_text(row.get("text", "")),
        "type": row.get("type"),
        "chapter_key": row.get("chapter_key"),
        "chapter": row.get("chapter"),
        "section_key": row.get("section_key"),
        "section": row.get("section"),
        "subsection_key": row.get("subsection_key"),
        "subsection": row.get("subsection"),
        "thesis_part": row.get("thesis_part"),
    }
    meta = sanitize_meta(raw_meta)

    buffer.append({
        "id": vid,
        "values": emb,      # list[float], length EXPECTED_DIM (checked above)
        "metadata": meta,   # cleaned
    })

    if len(buffer) >= BATCH_SIZE:
        flush()

flush()  # send the final partial batch
print("Done.")

Upserted 130 vectors in total...
Done.


In [None]:
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'multiling': {'vector_count': 130},
                'v1': {'vector_count': 130}},
 'total_vector_count': 260,
 'vector_type': 'dense'}

## LLM Test

In [None]:
# --- config ---
PINECONE_API_KEY = pinecone_key  # `pinecone_key` must already be defined in the session
INDEX_NAME = "thesis-chat"
NAMESPACE = "multiling"                 # change if you want versioning
RETRIEVE_TOP_K = 50   # candidates fetched from Pinecone by vector similarity
RERANK_TOP_K = 10     # candidates kept after cross-encoder reranking

pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(INDEX_NAME)

# NOTE(review): the query is encoded with all-mpnet-base-v2 but searched in
# the "multiling" namespace — confirm the vectors stored there were produced
# by this same encoder; vectors from a different model are not comparable.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # fast & good

query = "Cuáles son las mayores dificutlades a la hora de estimar la temperatura efectiva de las enanas ultrafrías?"
qvec = model.encode([query], convert_to_numpy=True)[0].tolist()

# Stage 1: nearest-neighbour search over the stored chunk embeddings.
res = index.query(
    vector=qvec,
    top_k=RETRIEVE_TOP_K,
    include_metadata=True,
    namespace=NAMESPACE
)

# Stage 2: score every (query, chunk text) pair with the cross-encoder and
# keep the best RERANK_TOP_K. The reranker is skipped when nothing was
# retrieved (e.g. empty namespace) so the cell yields an empty `top`
# instead of failing inside predict().
pairs = [(query, m["metadata"].get("text","")) for m in res["matches"]]
scores = reranker.predict(pairs) if pairs else []
order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)

top = [res["matches"][i] for i in order[:RERANK_TOP_K]]
# for i, m in enumerate(top, 1):
#     print(f"#{i}  score={scores[order[i-1]]:.3f}  ch={m['metadata'].get('chapter')} | sec={m['metadata'].get('section')} | type={m['metadata'].get('type')} .")
#     print(m["metadata"].get("text","")[:400], "…\n")

In [None]:
# === Build prompt from top passages and ask an LLM ===

# 1) Prepare numbered context blocks (dedupe + trim)
def as_path(md):
    """Build a one-line location label for a chunk from its metadata dict.

    For chapter, section and subsection a part of the form
    ``"<prefix><key>: <title>"`` is produced when the key or the title is
    present (prefixes ``Ch.``, ``S.``, ``SS.``); a ``"Text type: <type>"``
    part is added when ``type`` is set. Parts that end with ``:`` after
    stripping (i.e. no title text follows the colon) are discarded, and the
    remaining parts are joined with ``" | "``. Returns ``""`` when nothing
    is left.
    """
    levels = (
        ("Ch.", "chapter_key", "chapter"),
        ("S.", "section_key", "section"),
        ("SS.", "subsection_key", "subsection"),
    )
    parts = []
    for prefix, key_field, title_field in levels:
        key = md.get(key_field) or ''
        title = md.get(title_field) or ''
        if key or title:
            parts.append(f"{prefix}{key}: {title}".strip())
    text_type = md.get("type")
    if text_type:
        parts.append(f"Text type: {text_type}".strip())
    kept = [part for part in parts if part and not part.endswith(':')]
    return " | ".join(kept).strip()

def trim(s: str, max_chars=1200):
    """Strip ``s`` and cap it at ``max_chars`` characters.

    A falsy ``s`` (``None`` or ``""``) yields ``""``. When the stripped text
    is longer than ``max_chars`` it is cut there and ``" …"`` is appended,
    so a truncated result is two characters longer than ``max_chars``.
    """
    cleaned = (s or "").strip()
    if len(cleaned) > max_chars:
        return cleaned[:max_chars] + " …"
    return cleaned

# Deduplicate by vector id (if present); matches without any id fall back to
# the Python object identity, so they are never treated as duplicates of each
# other.
seen = set()
contexts = []
for m in top:
    mid = m.get("id") or m.get("vector", {}).get("id") or id(m)
    if mid not in seen:
        seen.add(mid)
        md = m.get("metadata", {}) or {}
        contexts.append({
            "text": trim(md.get("text","")),
            "path": as_path(md),
            "score": m.get("score", 0.0)
        })

# Keep the best K for generation (`top` is already in reranked order)
K = 6
contexts = contexts[:K]

# Render each context as "[[i]] <path>" followed by its text; the path is
# left out of the header when it is empty.
numbered_blocks = []
for i, c in enumerate(contexts, 1):
    header = f"[[{i}]]"
    if c["path"]:
        header = f"{header} {c['path']}"
    numbered_blocks.append(header + "\n" + c["text"])

context_blob = "\n\n---\n\n".join(numbered_blocks)

# 2) Build the prompt
# Adjacent string literals are concatenated verbatim, so every sentence but
# the last carries its own trailing space; without it the instructions run
# together ("...blocks.You answer...").
system_msg = (
    "You answer questions using ONLY the provided context blocks. "
    "You answer questions in a extended way, don't be concise. "
    "Cite the blocks you use by their bracket number like [1], [2]. "
    "If the answer is not contained in the context, say you don't know."
)
user_msg = f"Question: {query}\n\nContext:\n{context_blob}\n\n"

# 3) Call the LLM (OpenAI; you can swap in any provider)
# `OPENAI_API_KEY` must already be defined in the session.
client = OpenAI(api_key=OPENAI_API_KEY)
chat_messages = [
    {"role":"system","content":system_msg},
    {"role":"user","content":user_msg}
]

# Any failure (request error or unexpected response shape) is turned into a
# placeholder answer so the reporting code below still runs.
try:
    completion = client.chat.completions.create(
        model="gpt-5-mini",   # or "gpt-4o", "gpt-4.1-mini", etc.
        messages=chat_messages,
    )
    answer = completion.choices[0].message.content
except Exception as exc:
    answer = f"(LLM call failed: {exc})"

# Report the question, the model's answer and the context blocks that were
# sent, numbered the same way as in the prompt.
print(
    "=== Your question ===\n",
    query,
    "\n=== Answer ===\n",
    answer,
    "\n=== Sources used ===",
    sep="\n",
)
for i, c in enumerate(contexts, 1):
    source_label = c['path'] or '(no path)'
    print(f"[{i}] {source_label}")

=== Your question ===

Cuáles son las mayores dificutlades a la hora de estimar la temperatura efectiva de las enanas ultrafrías?

=== Answer ===

A continuación describo, basándome en los fragmentos proporcionados, las principales dificultades a la hora de estimar la temperatura efectiva (Teff) de las enanas ultrafrías, con explicaciones detalladas y las referencias a los bloques de contexto usados.

1) Física atmosférica compleja — nubes y transición L/T
- La fase de transición L→T es “menos entendida” y complica mucho el modelado: el aumento de opacidad por nubes en las L tempranas y la evolución hacia enanas T sin nubes “complica enormemente” el modelado de estas atmósferas. Esto provoca desviaciones en las determinaciones de Teff en esa región y requiere una mejor descripción de las nubes en los modelos para mitigar el problema [2].  

2) Espectros dominados por fuertes bandas moleculares
- Las enanas ultrafrías, con Teff muy bajas, tienen espectros dominados por fuertes bandas de

ch_in_emb: Main challenges (from the provided context):

- Spectra dominated by strong molecular absorption and alkali lines, which complicates identification and modelling of temperature-sensitive features (TiO, VO disappearance; strong H2O, metal hydrides; growing Na I and K I lines). This makes spectral fitting and Teff diagnostics intrinsically complex. [3]

- The L/T transition produces a narrow range (a “plateau”) in effective temperature: Teff changes very little across the spectral/color transition, so spectral type or colour can be a poor Teff discriminator in that regime. [1]

- The “synthetic gap”: differences between synthetic model spectra and observed data hinder direct application of models for Teff estimation, requiring specialized methods (e.g., transfer learning) to bridge the gap. [5]

- Strong molecular absorbers (e.g., CH4 in T dwarfs) alter relative band fluxes (H and K suppressed versus J), causing non-monotonic colour–Teff behaviour and complicating photometric Teff estimates. [3][1]

- Instrumental/resolution and survey-specific effects (need to adapt methods developed for high-resolution spectra to low-resolution, wide-field surveys) introduce domain-specific challenges that must be accounted for in the Teff determination pipeline. [2]

If you want, I can expand on any of these points or list methods used to mitigate each challenge (based on the same context).

v1: The main challenges, as discussed in the provided material, are:

- Complex, molecular-dominated spectra: ultracool dwarf spectra are dominated by strong molecular absorption bands (H2O, CH4, metal hydrides) and evolving atomic lines (e.g., Na, K). These strong and temperature-dependent molecular features make it hard to isolate simple, monotonic spectral diagnostics of effective temperature. The spectral appearance changes substantially across M → L → T types (disappearance of TiO/VO, appearance/strengthening of H2O and CH4), complicating Teff inference from spectra or colours alone [3].

- The L/T transition temperature degeneracy: across the L → T transition the effective temperature evolves very slowly (a near-constant Teff “plateau”), while colours and spectral appearance change markedly. That narrow Teff range during the L/T transition produces degeneracies and makes Teff estimation particularly uncertain in that regime [1].

- Mismatch between synthetic and observed data ("synthetic gap"): differences between model (synthetic) spectra and real observed spectra limit the reliability of direct model fitting for Teff. Bridging this synthetic–observed gap is a key difficulty that motivates transfer-learning or empirical approaches [5].

- Limited information in low-resolution data: many wide surveys produce low-resolution near‑IR spectra or only photometry, which carry less detailed spectral information and require adapting high-resolution parameter‑estimation methods or developing specialized low‑resolution techniques to recover Teff reliably [2], [5].

- Need for multi-dataset/ancillary information and careful selection: robust Teff estimation often requires combining spectra with multi-band photometry, parallaxes, proper motions, or comparison to empirical templates/catalogues (i.e., assembling complementary data and applying tools/ML pipelines), adding practical complexity to the estimation process [6].

If you want, I can summarise how each challenge is addressed in the referenced work (methods used to mitigate them) with the same citations.