In [1]:
pip install -U langgraph


Collecting langgraph
  Downloading langgraph-0.2.23-py3-none-any.whl.metadata (13 kB)
Downloading langgraph-0.2.23-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.8/104.8 kB[0m [31m926.7 kB/s[0m eta [36m0:00:00[0m0:01[0m:00:01[0m
[?25hInstalling collected packages: langgraph
  Attempting uninstall: langgraph
    Found existing installation: langgraph 0.2.19
    Uninstalling langgraph-0.2.19:
      Successfully uninstalled langgraph-0.2.19
Successfully installed langgraph-0.2.23
Note: you may need to restart the kernel to use updated packages.


In [5]:
!conda install nmslib

^C

CondaError: KeyboardInterrupt



In [None]:
!pip install pyserini
!pip install faiss-cpu
!pip install torch
!pip install sentence_transformers

In [2]:
from typing import Optional

import numpy as np
from pyserini.encode import DocumentEncoder
from sklearn.preprocessing import normalize
from transformers import AutoModel, AutoTokenizer


class LukeDocumentEncoder(DocumentEncoder):
    def __init__(
        self,
        model_name: str,
        tokenizer_name: Optional[str] = None,
        device: str = "cpu",
        l2_norm=False,
    ):
        self.device = device
        self.model = AutoModel.from_pretrained(model_name)
        self.model.to(self.device)
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(
                tokenizer_name if tokenizer_name else model_name, use_fast=True
            )
        except:
            self.tokenizer = AutoTokenizer.from_pretrained(
                tokenizer_name if tokenizer_name else model_name, use_fast=False
            )

        self.has_model = True
        self.l2_norm = l2_norm

    def encode(self, texts, span, max_length=256, **kwargs) -> np.ndarray:
        """Lukeを用いて入力Entityのベクトル化を行う"""
        tokenizer_kwargs = {
            "max_length": max_length,
            "truncation": True,
            "padding": "longest",
            "return_tensors": "pt",
        }

        inputs = self.tokenizer(text=texts, entity_spans=span, **tokenizer_kwargs)
        inputs.to(self.device)

        outputs = self.model(**inputs)
        entity_vector = outputs.entity_last_hidden_state.detach().cpu().numpy()
        batch_size = entity_vector.shape[0]
        entity_vector = entity_vector.reshape([batch_size, -1])
        if self.l2_norm:
            entity_vector = normalize(entity_vector, norm="l2", axis=1)
        return entity_vector

In [3]:
from pyserini.encode import FaissRepresentationWriter

# from transformers import AutoModel
encoder = LukeDocumentEncoder("studio-ousia/luke-japanese-base-lite")
# encoder = AutoModel.from_pretrained("bandainamco-mirai/distilbert-base-japanese")

embedding_writer = FaissRepresentationWriter("output/encoded", dimension=768)

# batch size 1 のサンプルデータ
data = {
    "start": [0],
    "end": [3],
    "contexts": ["渋谷区（しぶやく）は、東京都の区部南西部に位置する特別区。"],
    "entity-name": ["渋谷区"],
    "id": ["Q193638"],
}

with embedding_writer:
    spans = [[(start, end)] for start, end in zip(data["start"], data["end"])]
    kwargs = {
        "texts": data["contexts"],
        "span": spans,
    }
    embeddings = encoder.encode(**kwargs)
    data["vector"] = embeddings
    embedding_writer.write(data)

AttributeError: 'DistilBertModel' object has no attribute 'encode'

: 

In [None]:
!python -m pyserini.index.faiss \
  --input /output/encoded \
  --output /output/index \
  --hnsw

In [None]:
from typing import List, Tuple, Union

import faiss
import numpy as np
from pyserini.search import DenseSearchResult, FaissSearcher, PRFDenseSearchResult


class LukeFaissSearcher(FaissSearcher):
    """Faiss searcher for Luke.

    This code is based on the following code:
    https://github.com/castorini/pyserini/blob/b56d04a823d8fd063614524dec799ef84db0cac1/pyserini/search/faiss/_searcher.py#L379
    """

    def search(
        self,
        query: str,
        span: List[Tuple[int, int]],
        k: int = 10,
        threads: int = 1,
        return_vector: bool = False,
    ) -> Union[List[DenseSearchResult], Tuple[np.ndarray, List[PRFDenseSearchResult]]]:
        emb_q = self.query_encoder.encode(query, span)
        assert len(emb_q) == self.dimension
        emb_q = emb_q.reshape((1, len(emb_q)))
        faiss.omp_set_num_threads(threads)

        if return_vector:
            distances, indexes, vectors = self.index.search_and_reconstruct(emb_q, k)
            vectors = vectors[0]
            distances = distances.flat
            indexes = indexes.flat
            return emb_q, [
                PRFDenseSearchResult(self.docids[idx], score, vector)
                for score, idx, vector in zip(distances, indexes, vectors)
                if idx != -1
            ]
        else:
            distances, indexes = self.index.search(emb_q, k)
            distances = distances.flat
            indexes = indexes.flat
            return [
                DenseSearchResult(self.docids[idx], score)
                for score, idx in zip(distances, indexes)
                if idx != -1
            ]

In [None]:
from typing import Optional

import numpy as np
from pyserini.search import QueryEncoder


class LukeQueryEncoder(LukeDocumentEncoder, QueryEncoder):
    def __init__(
        self,
        model_name: str,
        tokenizer_name: Optional[str] = None,
        device: str = "cuda:0",
        l2_norm=False,
    ):
        super().__init__(model_name, tokenizer_name, device, l2_norm)

    def encode(self, texts, span, max_length=256, **kwargs) -> np.ndarray:
        entity_vector = super().encode(texts, span, max_length, **kwargs)
        return entity_vector.flatten()

In [None]:
encoder = LukeQueryEncoder("studio-ousia/luke-japanese-base-lite")
searcher = LukeFaissSearcher("/path/to/output/index", encoder)

query_mention = {
    "mention": "屋久島",
    "context": "同9時30分までの1時間に90mm、降り始めからの合計が319mmとなる豪雨を記録するなど、種子島や屋久島は局地的な豪雨となった。\n\n一方気象庁は22日、東日本と西日本を中心に5月の連休明けからの日照時間が",  # 今回はmentionの周囲50文字を取り出しています
    "start": 50,
    "end": 53,
}
kwargs = {
    "query": query_mention["context"],
    "span": [[(query_mention["start"], query_mention["end"])]],
}
results = seacher.search(**kwargs)

for i in range(0, 10):  # top-10
    entity_id = results[i].docid
    print(f"{i+1:2} {entity_id} {results[i].score:.5f}")

In [1]:
import json

import faiss
import numpy as np
from pyserini.index.lucene import LuceneIndexer
from pyserini.search.faiss import FaissSearcher
from pyserini.search.hybrid import HybridSearcher
from pyserini.search.lucene import LuceneSearcher
from sentence_transformers import SentenceTransformer

# サンプルデータ
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "A fast orange fox leaps above a sleepy canine.",
    "The lazy dog sleeps all day long.",
    "Foxes are known for their agility and speed.",
]


# JSONLファイルの作成
jsonl_file = "sample_docs.jsonl"
with open(jsonl_file, "w") as f:
    for i, doc in enumerate(documents):
        json.dump({"id": str(i), "contents": doc}, f)
        f.write("\n")

# インデックスの作成
index_dir = "sample_index"
indexer = LuceneIndexer(index_dir)
indexer.index(jsonl_file)

# BM25検索器の初期化
searcher = LuceneSearcher(index_dir)

# 密ベクトル検索のためのインデックス作成
model = SentenceTransformer("all-MiniLM-L6-v2")  # 軽量なモデルを使用
embeddings = model.encode(documents)
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings).astype("float32"))

# FAISSSearcherの初期化
dsearcher = FaissSearcher(index, model)

# ハイブリッド検索器の作成
hsearcher = HybridSearcher(dsearcher, searcher)

# 検索の実行
query = "fast fox"
hits = hsearcher.search(query)

# 結果の表示
for i, hit in enumerate(hits):
    print(f"{i+1:2} {hit.docid:4} {hit.score:.5f} {documents[int(hit.docid)]}")

  from .autonotebook import tqdm as notebook_tqdm
9月 21, 2024 9:15:18 午前 org.apache.lucene.store.MMapDirectory lookupProvider
警告: You are running with Java 22 or later. To make full use of MMapDirectory, please update Apache Lucene.


2024-09-21 09:15:18,429 INFO  [Thread-0] index.SimpleIndexer (SimpleIndexer.java:138) - Using DefaultEnglishAnalyzer
2024-09-21 09:15:18,439 INFO  [Thread-0] index.SimpleIndexer (SimpleIndexer.java:139) - Stemmer: porter
2024-09-21 09:15:18,439 INFO  [Thread-0] index.SimpleIndexer (SimpleIndexer.java:140) - Keep stopwords? false
2024-09-21 09:15:18,440 INFO  [Thread-0] index.SimpleIndexer (SimpleIndexer.java:141) - Stopwords file: null


AttributeError: 'LuceneIndexer' object has no attribute 'index'

In [2]:
from pyserini.search.lucene import LuceneSearcher

searcher = LuceneSearcher.from_prebuilt_index("msmarco-v1-passage")
hits = searcher.search("what is a lobster roll?")

for i in range(0, 10):
    print(f"{i+1:2} {hits[i].docid:7} {hits[i].score:.5f}")

  from .autonotebook import tqdm as notebook_tqdm
9月 21, 2024 8:46:55 午前 org.apache.lucene.store.MMapDirectory lookupProvider
警告: You are running with Java 22 or later. To make full use of MMapDirectory, please update Apache Lucene.


 1 7157707 11.00830
 2 6034357 10.94310
 3 5837606 10.81740
 4 7157715 10.59820
 5 6034350 10.48360
 6 2900045 10.31190
 7 7157713 10.12300
 8 1584344 10.05290
 9 533614  9.96350
10 6234461 9.92200


In [3]:
from pyserini.search.lucene import LuceneImpactSearcher

searcher = LuceneImpactSearcher.from_prebuilt_index(
    "msmarco-v1-passage-unicoil", "castorini/unicoil-msmarco-passage"
)
hits = searcher.search("what is a lobster roll?")

for i in range(0, 10):
    print(f"{i+1:2} {hits[i].docid:7} {hits[i].score:.5f}")

Attempting to initialize pre-built index msmarco-v1-passage-unicoil.
Unrecognized index name msmarco-v1-passage-unicoil


AttributeError: 'NoneType' object has no attribute 'search'

In [3]:
from pyserini.search.lucene import LuceneImpactSearcher

searcher = LuceneImpactSearcher.from_prebuilt_index("msmarco-v1-passage-unicoil")
hits = searcher.search("what is a lobster roll?")

for i in range(0, 10):
    print(f"{i+1:2} {hits[i].docid:7} {hits[i].score:.5f}")

TypeError: LuceneImpactSearcher.from_prebuilt_index() missing 1 required positional argument: 'query_encoder'

In [4]:
from pyserini.search.faiss import FaissSearcher, TctColBertQueryEncoder

encoder = TctColBertQueryEncoder("castorini/tct_colbert-msmarco")
searcher = FaissSearcher.from_prebuilt_index(
    "msmarco-passage-tct_colbert-hnsw", encoder
)
hits = searcher.search("what is a lobster roll")

for i in range(0, 10):
    print(f"{i+1:2} {hits[i].docid:7} {hits[i].score:.5f}")



Attempting to initialize pre-built index msmarco-passage-tct_colbert-hnsw.
Downloading index at https://rgw.cs.uwaterloo.ca/pyserini/indexes/faiss/faiss-hnsw.msmarco-v1-passage.tct_colbert.20210112.be7119.tar.gz...


faiss-hnsw.msmarco-v1-passage.tct_colbert.20210112.be7119.tar.gz: 31.1GB [5:24:47, 1.71MB/s]                               


Extracting /Users/tmina/.cache/pyserini/indexes/faiss-hnsw.msmarco-v1-passage.tct_colbert.20210112.be7119.tar.gz into /Users/tmina/.cache/pyserini/indexes/faiss-hnsw.msmarco-v1-passage.tct_colbert.20210112.be7119.6b7285a7f0163d1a547214396be20488...
Initializing msmarco-v1-passage.tct_colbert.hnsw...


: 

In [None]:
from pyserini.search.faiss import FaissSearcher, TctColBertQueryEncoder
from pyserini.search.hybrid import HybridSearcher
from pyserini.search.lucene import LuceneSearcher

ssearcher = LuceneSearcher.from_prebuilt_index("msmarco-v1-passage")
encoder = TctColBertQueryEncoder("castorini/tct_colbert-msmarco")
dsearcher = FaissSearcher.from_prebuilt_index(
    "msmarco-passage-tct_colbert-hnsw", encoder
)
hsearcher = HybridSearcher(dsearcher, ssearcher)
hits = hsearcher.search("what is a lobster roll")

for i in range(0, 10):
    print(f"{i+1:2} {hits[i].docid:7} {hits[i].score:.5f}")

# LangChain
- aa
- aa
- aa

In [None]:
from langchain_core.documents import Document

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.vectorstores import FAISS

# Documentオブジェクトのリストを作成
documents = [
    Document(page_content="LangChainは自然言語処理のためのフレームワークです。"),
    Document(page_content="FAISSは効率的な類似度検索を行うライブラリです。"),
    Document(page_content="BM25はキーワード検索アルゴリズムの一つです。"),
    Document(page_content="ハイブリッド検索は複数の検索手法を組み合わせます。"),
]

# BM25検索の設定
bm25_retriever = BM25Retriever.from_documents(documents)

# ベクトル検索の設定
embeddings = HuggingFaceEmbeddings()
vectorstore = FAISS.from_documents(documents, embeddings)
faiss_retriever = vectorstore.as_retriever()

# ハイブリッド検索の設定
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever], weights=[0.5, 0.5]
)

# 検索の実行
query = "効率的な検索手法"
results = ensemble_retriever.get_relevant_documents(query)

for doc in results:
    print(doc.page_content)