In [3]:
import sys
sys.path.append("..")  # so Python can see project root

from dotenv import load_dotenv
import os

# Load .env file from the project root
load_dotenv("../.env")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Fail fast with a readable message: os.getenv returns None for an unset
# variable, and slicing None below would raise an opaque TypeError instead.
if not OPENAI_API_KEY or not PINECONE_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY and PINECONE_API_KEY must both be set (expected in ../.env)."
    )

# Sanity check only: show a short prefix of each key, never the full secret.
OPENAI_API_KEY[:4], PINECONE_API_KEY[:4] + "..."


('sk-p', 'pcsk...')

In [19]:
import re
import unicodedata

def make_pinecone_id(raw: str, max_len: int = 64) -> str:
    """
    Turn an arbitrary string into a safe Pinecone ID.

    Steps:
    - NFKD-normalize, then drop every non-ASCII character
    - Replace each run of characters outside [a-zA-Z0-9_-] with one hyphen
    - Collapse repeated hyphens and strip hyphens from both ends
    - Lowercase, truncate to ``max_len``, and strip any hyphen the cut exposed

    Args:
        raw: Source string (for example a file stem or a page title).
        max_len: Maximum length of the sanitized text.

    Returns:
        The sanitized ID, or the literal "id" when nothing survives sanitizing.
    """
    # Normalize and remove non-ASCII (accented letters keep their base letter)
    ascii_only = (
        unicodedata.normalize("NFKD", raw).encode("ascii", "ignore").decode("ascii")
    )

    # Replace any run of characters outside [a-zA-Z0-9_-] with a hyphen
    safe = re.sub(r"[^a-zA-Z0-9_-]+", "-", ascii_only)

    # Collapse multiple hyphens and strip edges
    safe = re.sub(r"-{2,}", "-", safe).strip("-")

    # Truncation can land right after a separator ("ab-cd"[:3] == "ab-"),
    # so trailing hyphens are stripped again after the cut.
    safe = safe.lower()[:max_len].rstrip("-")

    return safe or "id"

In [20]:
make_pinecone_id("Add a Multi-Question Page to your form – Help Center")


'add-a-multi-question-page-to-your-form-help-center'

In [6]:
from pathlib import Path

def load_local_html_docs(root_dir: str = "../data/raw"):
    """
    Read every ``*.html`` file located directly inside ``root_dir``.

    Args:
        root_dir: Directory to scan; sub-directories are not searched.

    Returns:
        A list of dicts, ordered by path, each with the keys "id" (file
        stem), "source_path" (the path as a string) and "html" (the file
        contents decoded as UTF-8).
    """
    # Path.glob yields entries in arbitrary filesystem order; sorting keeps the
    # document order (and everything derived from it) stable across runs.
    return [
        {
            "id": path.stem,
            "source_path": str(path),
            "html": path.read_text(encoding="utf-8"),
        }
        for path in sorted(Path(root_dir).glob("*.html"))
    ]

html_docs = load_local_html_docs()

In [7]:
len(html_docs)

2

In [21]:
from bs4 import BeautifulSoup

# Lower-case heading fragments that mark the end of the article body: the
# extractor stops collecting content at the first h2/h3/h4 whose lower-cased
# text contains one of these strings.
STOP_HEADINGS = {
    "was this article helpful?",
    "related articles",
}

def extract_article_from_html(html: str, source_path: str):
    """
    Extract the title and a markdown-like body from one help-center HTML page.

    The body consists of every non-empty <p>, <h2>, <h3> and <li> that follows
    the first <h1> in document order, up to (and excluding) the first h2/h3/h4
    whose text contains an entry of STOP_HEADINGS. Headings are rendered with
    "##"/"###" prefixes and list items with a "- " prefix.

    Args:
        html: Raw HTML of the page.
        source_path: Path the HTML was read from. Its stem is turned into the
            article ID, and the path is the fallback title when no <h1> exists.

    Returns:
        Dict with "id", "title", "content" (blocks joined by blank lines; empty
        when the page has no <h1>) and "metadata".
    """
    soup = BeautifulSoup(html, "html.parser")

    # Prefer the <main> landmark, then <body>, then the whole document
    main = soup.find("main") or soup.body or soup
    title_tag = main.find("h1")
    title = title_tag.get_text(strip=True) if title_tag else source_path

    content_tags = ["p", "h2", "h3", "li"]
    content_parts = []
    if title_tag:
        for el in title_tag.find_all_next():
            if el.name in ("h2", "h3", "h4"):
                heading_text = el.get_text(" ", strip=True).lower()
                if any(stop in heading_text for stop in STOP_HEADINGS):
                    break

            if el.name not in content_tags:
                continue

            # find_all_next() also yields descendants, and get_text() on the
            # outer element already contains their text. Without this guard a
            # <p> inside an <li> (or a nested <li>) would be emitted twice.
            if el.find_parent(content_tags) is not None:
                continue

            text = el.get_text(" ", strip=True)
            if not text:
                continue

            if el.name in ("h2", "h3"):
                level = 2 if el.name == "h2" else 3
                content_parts.append("#" * level + " " + text)
            elif el.name == "li":
                content_parts.append(f"- {text}")
            else:
                content_parts.append(text)

    content = "\n\n".join(content_parts)
    raw_id = Path(source_path).stem  # or title, or URL segment
    article_id = make_pinecone_id(raw_id)

    return {
        "id": article_id,
        "title": title,
        "content": content,
        "metadata": {
            "source": "typeform_help_center_snapshot",
        },
    }

# Parse every loaded HTML document into an article dict
# (keys: "id", "title", "content", "metadata").
articles = [
    extract_article_from_html(doc["html"], doc["source_path"])
    for doc in html_docs
]

In [22]:
articles

[{'id': 'add-a-multi-question-page-to-your-form-help-center',
  'title': 'Add a Multi-Question Page to your form',
  'content': "Traditionally, our forms have been designed to show one question at a time. This makes them feel more conversational and increases engagement. But sometimes, you might want to streamline your form by asking multiple questions on the same page. By adding a Multi-Question Page to your form, you can do just that.\n\nWhen you add a Multi-Question page to your form, here's how questions will be displayed to your respondents.\n\n## Supported question types\n\n- Checkbox\n\n- Dropdown\n\n- Email\n\n- Legal\n\n- Long Text\n\n- Multiple Choice\n\n- NPS\n\n- Opinion Scale\n\n- Phone Number\n\n- Ranking\n\n- Short Text\n\n- Statement\n\n- Yes/No\n\n## How to add a Multi-Question Page to your form\n\n1. Go to the form you want to edit and click + Add content .\n\n2. Select Multi-Question Page from the question menu.\n\n3. A new page will be added to your form where you c

In [23]:
from typing import List

def chunk_text(
    text: str,
    max_chars: int = 1200,
    overlap: int = 200,
) -> List[str]:
    """
    Simple character-based chunker with overlap.

    Args:
        text: Text to split.
        max_chars: Maximum characters per chunk; must be at least 1.
        overlap: Number of characters shared between consecutive chunks.
            Values >= max_chars fall back to max_chars // 4 and negative
            values are treated as 0, so the window always moves forward and
            no text is skipped.

    Returns:
        The chunks in order; an empty list for empty text. Every chunk except
        possibly the last is exactly max_chars long.

    Raises:
        ValueError: If max_chars is smaller than 1 (the window could never
            advance).
    """
    if not text:
        return []

    if max_chars < 1:
        raise ValueError(f"max_chars must be a positive integer, got {max_chars!r}")

    # Keep 0 <= overlap < max_chars so each step advances by at least one char
    if overlap >= max_chars:
        overlap = max_chars // 4
    elif overlap < 0:
        overlap = 0

    chunks = []
    n = len(text)
    start = 0

    while True:
        end = min(n, start + max_chars)
        chunks.append(text[start:end])

        if end == n:
            return chunks  # the last chunk reached the end of the text

        # Next window begins `overlap` characters before this one ended
        start = end - overlap


In [24]:
from typing import Dict, List

# Flat list of chunk records built from every article; each record has the
# keys "id", "text" and "metadata".
chunks: List[Dict] = []

for art in articles:
    article_id = art["id"]
    article_title = art["title"]
    article_content = art["content"]
    article_meta = art["metadata"]

    article_chunks = chunk_text(article_content, max_chars=1200, overlap=200)

    # `chunk_text_` carries a trailing underscore so it does not shadow the
    # chunk_text() function called above.
    for i, chunk_text_ in enumerate(article_chunks):
        chunks.append(
            {
                # Chunk IDs have the form "<article_id>::chunk-<index>"
                "id": f"{article_id}::chunk-{i}",
                "text": chunk_text_,
                "metadata": {
                    # Article metadata is spread first, so the explicit keys
                    # below win if the same key appears in both.
                    **article_meta,
                    "article_id": article_id,
                    "title": article_title,
                    "chunk_index": i,
                },
            }
        )

print(f"Created {len(chunks)} chunks from {len(articles)} articles.")


Created 14 chunks from 2 articles.


In [25]:
print("Example chunk length:", len(chunks[0]["text"]))

Example chunk length: 1200


In [26]:
from openai import OpenAI

client = OpenAI()  # expects OPENAI_API_KEY in env


def embed_texts(
    texts: List[str],
    model: str = "text-embedding-3-small",
    batch_size: int = 256,
) -> List[List[float]]:
    """
    Embed a list of texts with the OpenAI embeddings endpoint.

    Texts are sent in batches of at most ``batch_size`` so a large corpus does
    not go out as one oversized request, and an empty input returns [] without
    calling the API at all.

    Args:
        texts: Strings to embed.
        model: Embedding model name passed to the API.
        batch_size: Maximum number of texts per request; must be at least 1.

    Returns:
        One embedding per input text, in input order.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    texts = list(texts)  # allow any iterable and make slicing safe
    vectors: List[List[float]] = []
    for start in range(0, len(texts), batch_size):
        resp = client.embeddings.create(
            model=model,
            input=texts[start:start + batch_size],
        )
        vectors.extend(d.embedding for d in resp.data)
    return vectors

# Embed all chunk texts, then attach each vector to its chunk record.
chunk_texts = [c["text"] for c in chunks]
chunk_vectors = embed_texts(chunk_texts)

# Mutates the dicts in `chunks` in place by adding an "embedding" key;
# zip() pairs chunks and vectors by position.
for chunk, vec in zip(chunks, chunk_vectors):
    chunk["embedding"] = vec

print(f"Embedded {len(chunks)} chunks.")


Embedded 14 chunks.


In [27]:
chunks

[{'id': 'add-a-multi-question-page-to-your-form-help-center::chunk-0',
  'text': "Traditionally, our forms have been designed to show one question at a time. This makes them feel more conversational and increases engagement. But sometimes, you might want to streamline your form by asking multiple questions on the same page. By adding a Multi-Question Page to your form, you can do just that.\n\nWhen you add a Multi-Question page to your form, here's how questions will be displayed to your respondents.\n\n## Supported question types\n\n- Checkbox\n\n- Dropdown\n\n- Email\n\n- Legal\n\n- Long Text\n\n- Multiple Choice\n\n- NPS\n\n- Opinion Scale\n\n- Phone Number\n\n- Ranking\n\n- Short Text\n\n- Statement\n\n- Yes/No\n\n## How to add a Multi-Question Page to your form\n\n1. Go to the form you want to edit and click + Add content .\n\n2. Select Multi-Question Page from the question menu.\n\n3. A new page will be added to your form where you can add as many questions as you like. To begin 

In [16]:
import os
from pinecone import Pinecone, ServerlessSpec

# Read the Pinecone settings from the environment; the index name falls back
# to "typeform-helpcenter" when PINECONE_INDEX_NAME is not set.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
INDEX_NAME = os.getenv("PINECONE_INDEX_NAME", "typeform-helpcenter")

pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index once (idempotent if you rerun)
existing_indexes = [idx.name for idx in pc.list_indexes()]
if INDEX_NAME not in existing_indexes:
    # NOTE(review): index creation may not be instantaneous - confirm the index
    # is ready before the upsert cell runs on a fresh kernel.
    pc.create_index(
        name=INDEX_NAME,
        dimension=1536,          # text-embedding-3-small
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

# Handle used by the upsert and query cells; the bare name displays its repr.
index = pc.Index(INDEX_NAME)
index


<pinecone.data.index.Index at 0x7fce2c685a00>

In [28]:
# Build the upsert payload: one dict per chunk with the keys "id", "values"
# (the embedding) and "metadata".
to_upsert = []

for c in chunks:
    to_upsert.append(
        {
            "id": c["id"],
            "values": c["embedding"],
            "metadata": {
                "text": c["text"],  # store chunk text for convenience
                # Spread after "text": a "text" key inside the chunk metadata
                # would override the value set on the line above.
                **c["metadata"],
            },
        }
    )

len(to_upsert)

14

In [29]:
upsert_response = index.upsert(vectors=to_upsert)
upsert_response

{'upserted_count': 14}

In [30]:
def embed_query(query: str):
    """
    Embed a single query string and return its vector.

    Delegates to ``embed_texts`` instead of repeating the API call, so the
    query is always embedded with the same model as the indexed chunks;
    vectors from different models are not comparable.

    Args:
        query: The user's question.

    Returns:
        The embedding of ``query``.
    """
    return embed_texts([query])[0]

In [33]:
def retrieve(query: str, top_k: int = 5):
    """
    Look up the ``top_k`` closest index entries for ``query``.

    The query is embedded with ``embed_query`` and sent to the module-level
    ``index`` with metadata included.

    Returns:
        The "matches" entry of the query response.
    """
    response = index.query(
        vector=embed_query(query),
        top_k=top_k,
        include_metadata=True,
    )
    return response["matches"]

matches = retrieve("How do I create a multi language form?", top_k=5) #("What kind of question types are supported?", top_k=5)
len(matches), matches[0]

(5,
 {'id': 'create-multi-language-forms-help-center::chunk-0',
  'metadata': {'article_id': 'create-multi-language-forms-help-center',
               'chunk_index': 0.0,
               'source': 'typeform_help_center_snapshot',
               'text': 'Save time and reach a wider audience by creating a '
                       "single form with multiple language options. You'll have "
                       'the option to write your own translations or have AI '
                       'translate your form for you.\xa0 Depending on your '
                       'language settings, you can decide if you want to '
                       'automatically have your form translated for respondents '
                       'or give them the option to translate the form. The form '
                       "will be translated if the respondent's browser language "
                       "matches one of the translations you've added. All "
                       'responses will be displayed in one 

In [35]:
def build_context_from_matches(matches):
    """
    Render retrieved matches as one context string for the prompt.

    Each match becomes a block with its position, title, chunk ID and chunk
    text; a missing "text" or "title" in the metadata renders as an empty
    string. Blocks are separated by a "---" line surrounded by blank lines.
    """

    def _render(position, match):
        meta = match["metadata"]
        body = meta.get("text", "")
        heading = meta.get("title", "")
        return f"[{position}] Title: {heading}\nChunk ID: {match['id']}\nContent:\n{body}"

    return "\n\n---\n\n".join(
        _render(position, match) for position, match in enumerate(matches)
    )


In [36]:
def answer_with_rag(query: str, top_k: int = 5) -> str:
    """
    Answer ``query`` using only the chunks retrieved from the index.

    Retrieves up to ``top_k`` matches, renders them into a context block and
    asks the chat model to answer strictly from that context. When nothing is
    retrieved, a fixed fallback sentence is returned and no chat call is made.
    """
    hits = retrieve(query, top_k=top_k)
    if not hits:
        return "I couldn't find any relevant information in the help center."

    # The system prompt restricts the model to the supplied context
    system_prompt = (
        "You are a helpful Typeform support assistant. "
        "Use ONLY the provided context to answer the user's question. "
        "If the answer is not in the context, say you don't know."
    )
    user_prompt = f"Context:\n{build_context_from_matches(hits)}\n\nQuestion: {query}"

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.1,
    )
    return completion.choices[0].message.content


In [37]:
answer = answer_with_rag("How can I create a multi language form in Typeform?", top_k=5)
print(answer)

To create a multi-language form in Typeform, follow these steps:

1. **Open your form** and click the **Translations icon**.
2. Click **+ Add languages**.
3. **Select the languages** you'd like to add to your form, then click **Add** when you're done.
4. You'll see the status **Translation needed** for the languages you selected. Hover over the language and choose how you'd like to translate your form:
   - You can **Translate with AI** or **Download a template**.
5. If you choose to **Translate with AI**, the status will change to **Translated** once the translations are provided. You can edit these translations if needed.

Make sure you have a **Business, Talent, Growth Pro, Growth Custom, or Enterprise plan** to create multi-language forms. It's also recommended to finalize the text in your form before adding translations to avoid mismatches.


In [38]:
answer = answer_with_rag("Where can you spot a manta ray?", top_k=5)
print(answer)

I don't know.


In [39]:
answer = answer_with_rag("What kind of question types are supported?", top_k=5)
print(answer)

The supported question types for a Multi-Question Page are:

- Checkbox
- Dropdown
- Email
- Legal
- Long Text
- Multiple Choice
- NPS
- Opinion Scale
- Phone Number
- Ranking
- Short Text
- Statement
- Yes/No


In [46]:
import sys
sys.path.append("..")

from dotenv import load_dotenv
load_dotenv("../.env")

from app.ingest import run_ingestion_once
from app.rag import answer_with_rag, retrieve

ImportError: cannot import name 'run_ingestion_once' from 'app.ingest' (../app/ingest.py)