In [1]:
import sys
import os
from dotenv import load_dotenv

# Make the project's backend package importable and load its .env file.
# The project root is taken to be the parent of the working directory when
# the working directory path contains 'notebooks', otherwise the working
# directory itself.
# NOTE(review): this is a substring match on the whole path, so any ancestor
# directory whose name contains 'notebooks' also selects the parent branch.
current_dir = os.getcwd()
project_root = os.path.join(current_dir, '..') if 'notebooks' in current_dir else current_dir
backend_path = os.path.abspath(os.path.join(project_root, 'backend'))
env_path = os.path.join(backend_path, '.env')

# Load environment variables from backend/.env.
load_dotenv(env_path)

# Append only once so re-running the cell does not duplicate the entry.
if backend_path not in sys.path:
    sys.path.append(backend_path)

print(f"Backend path added: {backend_path}")

Backend path added: d:\My Data\Rag\rag-project01-framework\backend


In [2]:
from services.vector_store_service import VectorStoreService, VectorDBConfig
from services.search_service import SearchService
from utils.config import VectorDBProvider
import os

# Create one instance of each backend service. The notebook's later cells
# reuse these objects: vector_store_service for listing collections and
# indexing, search_service for querying.
vector_store_service = VectorStoreService()
search_service = SearchService()
print("Services initialized.")

  from pkg_resources import DistributionNotFound, get_distribution


Services initialized.


In [3]:
# List the collections that currently exist in the Chroma vector store.
collections = vector_store_service.list_collections(provider=VectorDBProvider.CHROMA)
print(f"Current Chroma collections: {collections}")

Current Chroma collections: ['074_ollama_20251217153128', '074_ollama_20251217153318', '074_ollama_20251217151221', '074_bedrock_20251217150304', 'DeepSeek_R1_jishubaogaozhongwenban_ollama_20251217155428', '074_ollama_20251217152903', '074_ollama_20251217152717', 'DeepSeek_R1_jishubaogaozhongwenban_ollama_20251217154928']


In [4]:
# Pick an existing embedded document, re-embed its text with a local Ollama
# model, and index the result into Chroma.
#
# Inputs (defined by earlier cells): backend_path, vector_store_service,
# VectorDBConfig, VectorDBProvider.
# Output: on success, `indexed_collection_name` holds the name of the newly
# created collection; on failure (or when nothing is indexed) the name is left
# undefined so the search cell can detect that and skip.
import json
import traceback
from datetime import datetime

# Drop any collection name left over from a previous run of this cell.
# Otherwise a failed re-run would leave the stale name in the namespace and the
# search cell's `'indexed_collection_name' in locals()` check would pass.
globals().pop("indexed_collection_name", None)

# Look for embedded documents under backend/02-embedded-docs.
embedded_docs_dir = os.path.join(backend_path, "02-embedded-docs")
# A missing directory is treated like an empty one so the user gets the
# actionable message below instead of a FileNotFoundError from os.listdir.
embedded_files = (
    [f for f in os.listdir(embedded_docs_dir) if f.endswith('.json')]
    if os.path.isdir(embedded_docs_dir)
    else []
)

if not embedded_files:
    print("No embedded documents found. Please run homework_1_embedding.ipynb first to generate embeddings.")
else:
    # The specific file we want to index.
    target_filename = "DeepSeek-R1-技术报告中文版_huggingface_20250312225906.json"
    file_path = os.path.join(embedded_docs_dir, target_filename)

    if os.path.exists(file_path):
        print(f"Using specified file: {target_filename}")

        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        print("Re-embedding content using local Ollama model (to ensure dimension match)...")
        # Rebuild the original chunk structure (text + positional metadata)
        # from the stored embedding records; the stored vectors are discarded.
        chunks = [
            {
                "content": item["metadata"]["content"],
                "metadata": {
                    "chunk_id": item["metadata"]["chunk_id"],
                    "page_number": item["metadata"]["page_number"],
                    "page_range": item["metadata"]["page_range"],
                    "word_count": item["metadata"]["word_count"],
                },
            }
            for item in data["embeddings"]
        ]

        # Imported lazily: only this branch needs the embedding service.
        from services.embedding_service import EmbeddingService, EmbeddingConfig, EmbeddingProvider

        # Throwaway service instance used only to regenerate the embeddings.
        temp_embedding_service = EmbeddingService()

        local_model = "nomic-embed-text"
        # Named separately from `config` below, which is a VectorDBConfig.
        embedding_config = EmbeddingConfig(
            provider=EmbeddingProvider.OLLAMA,
            model_name=local_model,
        )

        source_filename = data.get("filename", "unknown.pdf")
        input_data = {
            "chunks": chunks,
            "metadata": {"filename": source_filename},
        }

        try:
            print(f"Generating new embeddings for {len(chunks)} chunks using {local_model}...")
            # The second return value is not needed here. It is bound to a
            # named throwaway rather than `_`, which IPython uses for the
            # last cell output.
            new_embeddings, _unused = temp_embedding_service.create_embeddings(input_data, embedding_config)

            # Fail with a clear message instead of an IndexError on [0] below.
            if not new_embeddings:
                raise ValueError("Embedding service returned no embeddings; nothing to index.")

            # Write the re-embedded data next to the source file.
            temp_dir = os.path.dirname(file_path)
            temp_file_path = os.path.join(temp_dir, "temp_ollama_reembedded.json")

            # Build a payload in the format save_embeddings is expected to
            # produce (per the original author's note).
            output_data = {
                "filename": source_filename,
                "created_at": datetime.now().isoformat(),
                "embedding_provider": "ollama",
                "embedding_model": local_model,
                "vector_dimension": len(new_embeddings[0]["embedding"]),
                "embeddings": new_embeddings,
            }

            with open(temp_file_path, 'w', encoding='utf-8') as f:
                json.dump(output_data, f, ensure_ascii=False, indent=2)

            # From here on, index the re-embedded file instead of the original.
            target_file = "temp_ollama_reembedded.json"
            file_path = temp_file_path
            print(f"Created re-embedded temp file: {target_file}")

        except Exception as e:
            print(f"Re-embedding failed: {e}")
            print("Please ensure Ollama is running and model is pulled.")
            # Bare raise re-raises the active exception unchanged.
            raise

    else:
        print(f"Warning: Specified file {target_filename} not found!")
        # embedded_files is known to be non-empty in this branch, so the first
        # available file can always serve as the fallback.
        target_file = embedded_files[0]
        file_path = os.path.join(embedded_docs_dir, target_file)
        print(f"Falling back to first available file: {target_file}")

    print(f"Indexing file: {target_file}")

    # Index with Chroma. index_mode may not directly affect Chroma, but it is
    # passed to keep the interface consistent (per the original author's note).
    config = VectorDBConfig(
        provider=VectorDBProvider.CHROMA,
        index_mode="flat",
    )

    try:
        result = vector_store_service.index_embeddings(file_path, config)
        print("Indexing result:", result)
        indexed_collection_name = result["collection_name"]
    except Exception as e:
        # Best-effort: report the failure and leave indexed_collection_name
        # undefined so the search cell skips itself.
        print(f"Indexing failed: {e}")
        traceback.print_exc()

Using specified file: DeepSeek-R1-技术报告中文版_huggingface_20250312225906.json
Re-embedding content using local Ollama model (to ensure dimension match)...
Generating new embeddings for 5 chunks using nomic-embed-text...


  return OllamaEmbeddings(


Created re-embedded temp file: temp_ollama_reembedded.json
Indexing file: temp_ollama_reembedded.json
Indexing result: {'database': <VectorDBProvider.CHROMA: 'chroma'>, 'index_mode': 'flat', 'total_vectors': 5, 'index_size': 5, 'processing_time': 0.062643, 'collection_name': 'DeepSeek_R1_jishubaogaozhongwenban_ollama_20251217155922'}


In [None]:
# 使用 Chroma 进行搜索
if 'indexed_collection_name' in locals():
    query = "DeepSeek"
    print(f"Searching for '{query}' in collection '{indexed_collection_name}' using Chroma...")
    
    try:
        results = await search_service.search(
            query=query,
            collection_id=indexed_collection_name,
            provider=VectorDBProvider.CHROMA,
            top_k=3,
            threshold=0.3 
        )
        
        print(f"Search Results (Threshold=0.1):")
        if not results["results"]:
             print("No results found. The similarity score might be too low.")
             
        for res in results["results"]:
            print(f"- Score: {res['score']:.4f}")
            print(f"  Text: {res['text'][:100]}...")
            print(f"  Metadata: {res['metadata']}")
            
    except Exception as e:
        print(f"Search failed: {e}")
        import traceback
        traceback.print_exc()
else:
    print("Skipping search as indexing failed or no collection created.")

Searching for 'DeepSeek' in collection 'DeepSeek_R1_jishubaogaozhongwenban_ollama_20251217155922' using Chroma...
Search Results (Threshold=0.1):
- Score: 0.5666
  Text: • 其他方面： DeepSeek-R1 在多种任务中也表现出色，包括创意写作、通用问答、编辑、摘
要等。它在  AlpacaEval 2.0 上实现了  87.6% 的长度控制胜率，在  Are-na...
  Metadata: {'source': 'DeepSeek-R1-技术报告中文版.pdf', 'page': '5', 'chunk': 5, 'total_chunks': 5, 'page_range': '5', 'embedding_provider': 'ollama', 'embedding_model': 'nomic-embed-text', 'embedding_timestamp': '2025-12-17T15:59:22.229439'}
- Score: 0.5505
  Text: 1.1. 贡献
训练后：在基础模型上进行大规模强化学习
• 我们直接将强化学习（ RL ）应用于基础模型，而不依赖于监督微调（ SFT ）作为初步步骤
。这种方法使模型能够探索思维链（ CoT ）以解...
  Metadata: {'source': 'DeepSeek-R1-技术报告中文版.pdf', 'page': '4', 'chunk': 4, 'total_chunks': 5, 'page_range': '4', 'embedding_provider': 'ollama', 'embedding_model': 'nomic-embed-text', 'embedding_timestamp': '2025-12-17T15:59:19.141814'}
- Score: 0.5222
  Text: 目录
1 引言 3
1.1 贡献 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 4   1