In [2]:
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
import sys

from langchain.document_loaders import PyPDFLoader, UnstructuredHTMLLoader
from langchain.schema import Document
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import os

In [3]:
from huggingface_hub import login
from dotenv import load_dotenv

# Read environment variables (including HUGGINGFACE_API_KEY) from a local .env file.
load_dotenv()
hf_token = os.getenv("HUGGINGFACE_API_KEY")

# Only authenticate when a token is actually configured: calling login() with
# token=None falls back to an interactive prompt, which stalls unattended
# "Restart & Run All" executions.
if hf_token:
    login(token=hf_token)
else:
    print("HUGGINGFACE_API_KEY is not set; skipping Hugging Face login.")

In [4]:
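# Dependency note: UnstructuredHTMLLoader needs the `unstructured` package.
# Install it once in the kernel environment with: %pip install unstructured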

In [5]:
# Sanity-check the scraped text file: show its total length and its first 500 characters.
sample_path = "./data/portugal_golden_visa_info.txt"
with open(sample_path, encoding="utf-8") as f:
    content = f.read()

preview = content[:500]
print("📄 文本长度:", len(content))
print("📄 前500字符:\n", preview)

📄 文本长度: 6417
📄 前500字符:
 === META INFORMATION ===
Title: Portugal Golden Visa: New Rules and Complete Guide 2025
Last Updated: 2025-06-29 00:19:34
Source URL: https://www.globalcitizensolutions.com/golden-visa-portugal/

=== KEY CATEGORIZED INFORMATION ===

** BENEFITS **
- Several benefits of the Portugal Investment Visa attract foreign investors to seek residency in Portugal. These benefits include:
- Free movement in the EU:As a Golden Visa holder, you can access 29 European Union countries. This offers unparalleled 


In [6]:
def ingest_all(
    base_path="./data/",
    persist_directory="./goldenvisa_chroma_db",
    embedding_model="BAAI/bge-m3",
):
    """Load the Golden Visa corpus, chunk it, embed it and store it in Chroma.

    The PDF and HTML files are read through their LangChain loaders
    (``load_and_split``), the TXT file is read as one long document, and every
    document gets its file name written to ``metadata["source"]``. All
    documents are then split into ~1024-character chunks, embedded and written
    to a persistent Chroma collection.

    Chunks are stored under deterministic ids (``"<source>::<n>"``), so running
    the function again with the same input files overwrites the existing
    entries instead of appending a second copy of every chunk.

    Args:
        base_path: Directory containing the source files.
        persist_directory: Directory where the Chroma store is written.
        embedding_model: Hugging Face model name passed to HuggingFaceEmbeddings.

    Raises:
        ValueError: If no chunk could be produced from the input files.
    """
    # Files to ingest, grouped by loader type.
    pdf_files = [
        "___Lei n.º 23_2007, de 04 de Julho.pdf",
        "ARI_alinea3_Pedido_PT-1.pdf",
        "ARI_alinea4_Pedido_PT-1.pdf",
        "ARI_alinea7_Pedido_PT-1.pdf",
        "ARI_-subalinea1-2022.pdf",
        "ARI_-subalinea2-2022.pdf",
        "ARI_-subalinea5-2022.pdf",
        "ARI_-subalinea6-2022.pdf",
        "ARI_-subalinea8-2022.pdf",
        "ARI_ReagrupamentoFamiliar-2022.pdf"
    ]
    txt_files = ["portugal_golden_visa_info.txt"]
    html_files = ["Arquivo.pt - preservada pelo Arquivo.pt.html"]

    all_docs = []

    def _load_with(loader_cls, file_names):
        """Load each file with ``loader_cls`` and tag its documents with the file name."""
        for file_name in file_names:
            loader = loader_cls(os.path.join(base_path, file_name))
            # load_and_split() is kept (rather than load()) so that chunk
            # boundaries stay the same as in previously built stores.
            docs = loader.load_and_split()
            for doc in docs:
                doc.metadata["source"] = file_name
            all_docs.extend(docs)

    # 1. PDFs.
    _load_with(PyPDFLoader, pdf_files)

    # 2. Plain-text files: each one becomes a single long Document.
    #    A file that cannot be read is reported and skipped (best effort).
    for txt_file in txt_files:
        path = os.path.join(base_path, txt_file)
        try:
            with open(path, encoding="utf-8") as f:
                content = f.read()
        except (OSError, UnicodeError) as e:
            print(f"⚠️ 读取 TXT 文件出错: {txt_file}, 错误: {e}")
            continue
        all_docs.append(Document(page_content=content, metadata={"source": txt_file}))

    # 3. HTML files.
    _load_with(UnstructuredHTMLLoader, html_files)

    print(f"📄 Loaded total {len(all_docs)} documents.")

    # 4. Split every document into overlapping chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(all_docs)
    print(f"🔍 Split into {len(chunks)} chunks.")

    if not chunks:
        # Fail early with a clear message instead of an opaque vector-store error.
        raise ValueError("No chunks were produced; nothing to ingest.")

    # Deterministic ids: "<source>::<running index within that source>".
    # The counter is keyed by source, so ids are unique within the batch, and a
    # re-run over unchanged files yields the same ids (overwrite, not duplicate).
    per_source_count = {}
    chunk_ids = []
    for chunk in chunks:
        source = chunk.metadata.get("source", "unknown")
        position = per_source_count.get(source, 0)
        per_source_count[source] = position + 1
        chunk_ids.append(f"{source}::{position}")

    # 5. Embed the chunks and build the persistent vector store.
    embedding = HuggingFaceEmbeddings(model_name=embedding_model)
    vector_store = Chroma.from_documents(
        documents=chunks,
        embedding=embedding,
        ids=chunk_ids,
        persist_directory=persist_directory,
    )
    # Recent Chroma versions persist automatically and deprecate/remove
    # persist(); only call it where it still exists.
    if hasattr(vector_store, "persist"):
        vector_store.persist()
    print(f"✅ Ingest finished. Vector store saved at {persist_directory}.")


In [7]:
# Build the vector store only when it does not exist yet, so that
# "Restart & Run All" neither re-embeds the corpus nor adds duplicate chunks.
# Delete the ./goldenvisa_chroma_db directory to force a rebuild.
if os.path.isdir("./goldenvisa_chroma_db"):
    print("Vector store already exists at ./goldenvisa_chroma_db; skipping ingest.")
else:
    ingest_all()

📄 Loaded total 187 documents.
🔍 Split into 771 chunks.


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


✅ Ingest finished. Vector store saved at ./goldenvisa_chroma_db.


  warn_deprecated(


In [6]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import FastEmbedEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import PromptTemplate
from langchain_core.documents import Document
from langchain_core.callbacks import StdOutCallbackHandler
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_community.embeddings import HuggingFaceEmbeddings

In [7]:
# Build the prompt template
def build_prompt():
    """Return the few-shot prompt used by the RAG chain.

    The template has two placeholders: ``{input}`` (the user question) and
    ``{context}`` (the retrieved document text). It instructs the model to
    answer only from the context, using Summary / Quotes / Conclusion sections.
    """
    template_text = """
<Instructions>
You are a legal assistant specialized in immigration policies. Use only the context below to answer the question.
If the answer is not in the context, say: "No context available for this question."

Answer must include:
1. Summary of the answer in one sentence.
2. Direct quote(s) from the source document(s), if available.
3. A final conclusion in your own words.
Format your answer using clear sections: Summary, Quotes, Conclusion.
</Instructions>

<Example>
Question: 黄金签证是否允许申请人家属一同移民？
Context: 根据葡萄牙法律第23/2007号第98条第2款，申请人可以携带其配偶、未成年子女以及经济依赖的家庭成员一同申请。

Answer:
Summary:
Yes, family members can accompany the applicant.

Quotes:
- “...可以携带其配偶、未成年子女以及经济依赖的家庭成员一同申请。”

Conclusion:
The law explicitly allows family reunification under the golden visa, so applicants can include family members.
</Example>

<Example>
Question: 是否必须在葡萄牙长期居住才能保持黄金签证资格？
Context: 申请人在持有黄金签证期间，每年只需在葡萄牙境内停留7天即可维持其居留资格。

Answer:
Summary:
No, long-term residence is not required.

Quotes:
- “每年只需在葡萄牙境内停留7天即可维持其居留资格。”

Conclusion:
The golden visa program offers flexible residency requirements, making it suitable for investors who travel frequently.
</Example>

<Input>
Question: {input}
Context: {context}
Answer:
</Input>
        """
    return PromptTemplate.from_template(template_text)


# Load the vector database
def load_vector_store(
    persist_directory="./goldenvisa_chroma_db",
    embedding_model="BAAI/bge-m3",
):
    """Open the persisted Chroma store with a matching embedding function.

    Args:
        persist_directory: Directory of the Chroma store to open.
        embedding_model: Hugging Face model name passed to
            HuggingFaceEmbeddings. It should be the same model that was used
            to build the store, otherwise query and document vectors are not
            comparable.

    Returns:
        A Chroma vector store bound to ``persist_directory``.
    """
    embedding = HuggingFaceEmbeddings(model_name=embedding_model)
    return Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding,
    )


# Build the retriever (configures k and the score threshold)
def build_retriever(vector_store, k=2, score_threshold=0.1):
    """Create a score-thresholded similarity retriever from ``vector_store``.

    Args:
        vector_store: Object exposing ``as_retriever(search_type=..., search_kwargs=...)``.
        k: Maximum number of documents to return per query (must be >= 1).
        score_threshold: Minimum relevance score a document needs in order to
            be returned (must be within [0, 1]).

    Returns:
        Whatever ``vector_store.as_retriever`` returns for these settings.

    Raises:
        ValueError: If ``k`` or ``score_threshold`` is out of range.
    """
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")
    if not 0 <= score_threshold <= 1:
        raise ValueError(f"score_threshold must be within [0, 1], got {score_threshold}")

    return vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={
            "k": k,
            "score_threshold": score_threshold,
        },
    )

# Append source references to the answer
def format_answer_with_sources(answer: str, docs: "list[Document]") -> str:
    """Return ``answer`` followed by a "References" list of source names.

    Each document contributes its ``metadata["source"]`` (``"N/A"`` when the
    key is missing). Sources are listed once each, in first-seen order, so
    several chunks retrieved from the same file do not repeat the reference.

    Args:
        answer: The model's answer text; surrounding whitespace is stripped.
        docs: The retrieved documents; may be empty.

    Returns:
        The stripped answer plus a references section. When ``docs`` is empty
        the section states that no relevant source documents were found.
    """
    body = answer.strip()
    header = "\n\n📚 References:\n"

    if not docs:
        return f"{body}{header}- No relevant source documents found."

    # dict.fromkeys de-duplicates while keeping insertion order.
    unique_sources = dict.fromkeys(
        doc.metadata.get("source", "N/A") for doc in docs
    )
    sources = "\n".join(f"- Source: {name}" for name in unique_sources)
    return f"{body}{header}{sources}"


# Build the RAG chain that returns the answer together with its sources
def rag_chain(model_name="mistral", verbose=True):
    """Assemble the retrieval-augmented QA pipeline and return a query function.

    The pipeline is: Ollama chat model + prompt from ``build_prompt()`` +
    retriever over the store from ``load_vector_store()``, combined with
    ``create_stuff_documents_chain`` and ``create_retrieval_chain``.

    Args:
        model_name: Name of the Ollama model to use.
        verbose: When True (the default) every invocation is traced to stdout
            through a StdOutCallbackHandler. Pass False to keep cell output
            limited to the answer.

    Returns:
        A function ``run_with_sources(user_input: str) -> str`` that runs the
        chain and returns the answer followed by the list of source documents.
    """
    # 1. LLM
    model = ChatOllama(model=model_name)

    # 2. Prompt
    prompt = build_prompt()

    # 3-4. Vector store and retriever
    vector_store = load_vector_store()
    retriever = build_retriever(vector_store)

    # 5. Chain that stuffs the retrieved documents into the prompt
    document_chain = create_stuff_documents_chain(model, prompt)

    # 6. Retrieval chain: retrieve, then answer
    chain = create_retrieval_chain(retriever, document_chain)

    # 7. Wrap the chain so callers get the answer plus its sources as one string
    def run_with_sources(user_input: str):
        """Run the chain on ``user_input`` and append the retrieved sources."""
        # A fresh handler per call, only when tracing is requested.
        config = {"callbacks": [StdOutCallbackHandler()]} if verbose else None
        result = chain.invoke({"input": user_input}, config=config)

        answer = result["answer"]
        # The retrieved documents are returned under "context"; fall back to
        # an empty list so the formatter reports that no source was found.
        docs = result.get("context", [])
        return format_answer_with_sources(answer, docs)

    return run_with_sources


In [8]:
# Build the pipeline once, then ask a question in Chinese.
qa_chain = rag_chain()
question_zh = "葡萄牙黄金签证是否允许全家移民？"
response = qa_chain(question_zh)
print(response)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given




[1m> Entering new RunnableSequence chain...[0m


[1m> Entering new RunnableAssign<context> chain...[0m


[1m> Entering new RunnableParallel<context> chain...[0m


[1m> Entering new RunnableSequence chain...[0m


[1m> Entering new RunnableLambda chain...[0m

[1m> Finished chain.[0m


Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given



[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RunnableAssign<answer> chain...[0m


[1m> Entering new RunnableParallel<answer> chain...[0m


[1m> Entering new RunnableSequence chain...[0m


[1m> Entering new RunnableAssign<context> chain...[0m


[1m> Entering new RunnableParallel<context> chain...[0m


[1m> Entering new RunnableLambda chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new PromptTemplate chain...[0m

[1m> Finished chain.[0m


[1m> Entering new StrOutputParser chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m
Summary:
Yes, family members can accompany the applicant, but it is not explicitly stated that all family members can move permanently with the applicant.

Quotes:
- "The Portugal Golden Visa offers flexibility in residence. The requirements a

In [9]:
# Reuse the qa_chain built in the previous cell: calling rag_chain() again
# would re-create the embedding model and reopen the vector store for nothing.
response = qa_chain("Does the Portuguese golden visa allow family reunification or family immigration?")
print(response)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given




[1m> Entering new RunnableSequence chain...[0m


[1m> Entering new RunnableAssign<context> chain...[0m


[1m> Entering new RunnableParallel<context> chain...[0m


[1m> Entering new RunnableSequence chain...[0m


[1m> Entering new RunnableLambda chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new RunnableAssign<answer> chain...[0m


[1m> Entering new RunnableParallel<answer> chain...[0m


[1m> Entering new RunnableSequence chain...[0m


[1m> Entering new RunnableAssign<context> chain...[0m


[1m> Entering new RunnableParallel<context> chain...[0m


[1m> Entering new RunnableLambda chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new PromptTemplate chain...[0m

[1m> Finished chain.[0m


[1m> Entering new StrOutputParser chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

