In [1]:
# Environment setup: create a uv project in the current directory, then add
# each third-party package this notebook uses as a project dependency.
!uv init
!uv add cohere
!uv add numpy
!uv add pandas
!uv add tqdm
!uv add faiss-cpu
!uv add scikit-learn
!uv add rank_bm25

Adding `[36mipynb[39m` as member of workspace `[36m/Users/ohmycloud/opt/learning/llm/hands-on-large-language-model[39m`
Initialized project `[36mipynb[39m`
[2K[2mResolved [1m151 packages[0m [2min 956ms[0m[0m                                       [0m
[2K[37m⠙[0m [2mpandas==2.3.3                                                                 [0m[2mAudited [1m27 packages[0m [2min 6ms[0m[0m
[2K[37m⠙[0m [2mpandas==2.3.3                                                                 [0m[2mResolved [1m151 packages[0m [2min 32ms[0m[0m
[2K[2mAudited [1m28 packages[0m [2min 1ms[0m[0m                                           [0m
[2K[2mResolved [1m151 packages[0m [2min 34ms[0m[0m                                        [0m
[2K[2mAudited [1m33 packages[0m [2min 2ms[0m[0m                                           [0m
[2K[37m⠙[0m [2mpandas==2.3.3                                                                 [0m[2mResolved [1m151 p

In [2]:
import os

import cohere
import numpy as np
import pandas as pd
from tqdm import tqdm

# Read the Cohere API key from the environment instead of embedding it in the
# notebook: notebooks get shared and committed, and a key stored in source
# leaks with them. Any key that was ever saved in a notebook should be revoked.
api_key = os.environ.get("COHERE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the COHERE_API_KEY environment variable before running this notebook."
    )
co = cohere.Client(api_key)

text = """
Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan.
It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine.
Set in a dystopian future where humanity is struggling to survive, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for mankind.
Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007.
Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar.
Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm.
Principal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles.
Interstellar uses extensive practical and miniature effects and the company Double Negative created additional digital effects.
Interstellar premiered on October 26, 2014, in Los Angeles.
In the United States, it was first released on film stock, expanding to venues using digital projectors.
The film had a worldwide gross over $677 million (and $773 million with subse quent re-releases), making it the tenth-highest grossing film of 2014.
It received acclaim for its performances, direction, screenplay, musical score, visual effects, ambition, themes, and emotional weight.
It has also received praise from many astronomers for its scientific accuracy and portrayal of theoretical astrophysics. Since its premiere, Interstellar gained a cult following,[5] and now is regarded by many sci-fi experts as one of the best science-fiction films of all time.
Interstellar was nominated for five awards at the 87th Academy Awards, winning Best Visual Effects, and received numerous other accolades"""

# Split the passage into sentence-like chunks at every period, then trim the
# spaces and newlines surrounding each chunk.
texts = [chunk.strip(' \n') for chunk in text.split('.')]
# Drop empty chunks (e.g. the trailing one produced when the passage ends with
# a period) so that blank strings are never embedded or indexed downstream.
texts = [chunk for chunk in texts if chunk]

In [3]:
# Embed every sentence, marking the inputs as documents to be searched.
response = co.embed(texts=texts, input_type="search_document").embeddings

# Stack the embeddings into one array and show its shape.
embeds = np.array(response)
print(embeds.shape)

(15, 4096)


In [4]:
import faiss

# Size of each embedding vector (second axis of the embeddings array).
dim = embeds.shape[1]
# Flat index that compares vectors with the L2 distance metric.
index = faiss.IndexFlatL2(dim)
print(index.is_trained)
# The vectors are converted to float32 before being added to the index.
index.add(np.asarray(embeds, dtype=np.float32))

True


In [5]:
def search(query, number_of_results=3):
    """Return the indexed sentences nearest to ``query`` in embedding space.

    Args:
        query: Free-text search query; embedded with ``input_type="search_query"``.
        number_of_results: Maximum number of neighbours to request from the index.

    Returns:
        A ``pandas.DataFrame`` with a ``texts`` column (the matching sentences)
        and a ``distance`` column (the distance reported by the index), in the
        order the index returned them. It has fewer than ``number_of_results``
        rows when the index cannot supply that many neighbours.
    """
    # 1. Embed the query.
    query_embed = co.embed(texts=[query], input_type="search_query").embeddings[0]
    # 2. Retrieve the nearest neighbours.
    distances, similar_item_ids = index.search(np.float32([query_embed]), number_of_results)
    # FAISS pads missing neighbours with id -1. Without this filter, -1 would
    # index texts[-1] and silently report the last sentence as a hit.
    found = similar_item_ids[0] >= 0
    hit_ids = similar_item_ids[0][found]
    hit_distances = distances[0][found]
    # 3. Format the results (a numpy array allows indexing by an id array).
    texts_np = np.array(texts)
    results = pd.DataFrame(data={'texts': texts_np[hit_ids], 'distance': hit_distances})
    # 4. Print a header and return the results.
    print(f"Query:'{query}'\nNearest neighbors:")
    return results

In [6]:
# Run the embedding-based search on a sample question and display the
# DataFrame it returns as the cell output.
query = "how precise was the science"
results = search(query)
results

Query:'how precise was the science'
Nearest neighbors:


Unnamed: 0,texts,distance
0,It has also received praise from many astronom...,10757.366211
1,Caltech theoretical physicist and 2017 Nobel l...,11566.133789
2,Interstellar uses extensive practical and mini...,11922.839844


In [7]:
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string

def bm25_tokenizer(text):
    """Turn ``text`` into a list of lowercase keyword tokens for BM25.

    The text is lowercased and split on whitespace; each piece has leading and
    trailing punctuation removed. Pieces that end up empty, or that appear in
    scikit-learn's English stop-word list, are discarded.
    """
    stop_words = _stop_words.ENGLISH_STOP_WORDS
    stripped = (word.strip(string.punctuation) for word in text.lower().split())
    return [word for word in stripped if word and word not in stop_words]


# Tokenize every passage (tqdm displays a progress bar while iterating).
tokenized_corpus = [bm25_tokenizer(passage) for passage in tqdm(texts)]

# Build the BM25 index over the tokenized passages.
bm25 = BM25Okapi(tokenized_corpus)

def keyword_search(query, top_k=3, num_candidates=15):
    """Print the best BM25 (lexical) matches for ``query``.

    Args:
        query: Free-text question; tokenized with ``bm25_tokenizer``.
        top_k: Number of hits to print.
        num_candidates: Number of highest-scoring passages to shortlist before
            sorting. Clamped to the number of scored passages.

    Returns:
        None. The query and the hits (score, passage) are printed.
    """
    print("Input question:", query)
    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    # np.argpartition raises ValueError when kth is out of bounds, so never
    # ask for more candidates than there are scores.
    num_candidates = max(0, min(num_candidates, len(bm25_scores)))
    if num_candidates:
        top_n = np.argpartition(bm25_scores, -num_candidates)[-num_candidates:]
    else:
        top_n = []
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    # Report the requested number of hits rather than a hard-coded "3".
    print(f"Top-{top_k} lexical search (BM25) hits")
    for hit in bm25_hits[0:top_k]:
        print("\t{:.3f}\t{}".format(hit['score'], texts[hit['corpus_id']].replace("\n", " ")))

# Try the BM25 keyword search on a sample question.
keyword_search(query = "how precise was the science")

100%|██████████| 15/15 [00:00<00:00, 47771.12it/s]

Input question: how precise was the science
Top-3 lexical search (BM25) hits
	1.793	Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan
	1.377	Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar
	0.000	It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine





In [8]:
# Call the Cohere rerank endpoint with the query and all sentences, asking for
# the top 3 results with their document text included.
# NOTE: this rebinds `results`, which an earlier cell used for the DataFrame
# returned by search().
query = "how precise was the science"
results = co.rerank(query=query, documents=texts, top_n=3, return_documents=True)
# Display the ranked result items as the cell output.
results.results

[RerankResponseResultsItem(document=RerankResponseResultsItemDocument(text='It has also received praise from many astronomers for its scientific accuracy and portrayal of theoretical astrophysics'), index=12, relevance_score=0.15239799),
 RerankResponseResultsItem(document=RerankResponseResultsItemDocument(text='The film had a worldwide gross over $677 million (and $773 million with subse quent re-releases), making it the tenth-highest grossing film of 2014'), index=10, relevance_score=0.05086082),
 RerankResponseResultsItem(document=RerankResponseResultsItemDocument(text='Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan'), index=0, relevance_score=0.0350424)]

In [9]:
# Print the position, relevance score and document text of each reranked item.
for idx, result in enumerate(results.results):
    print(idx, result.relevance_score , result.document.text)

0 0.15239799 It has also received praise from many astronomers for its scientific accuracy and portrayal of theoretical astrophysics
1 0.05086082 The film had a worldwide gross over $677 million (and $773 million with subse quent re-releases), making it the tenth-highest grossing film of 2014
2 0.0350424 Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan


In [11]:
def keyword_and_reranking_search(query, top_k=3, num_candidates=10):
    """Shortlist passages with BM25, then re-rank the shortlist with Cohere.

    Args:
        query: Free-text question; tokenized with ``bm25_tokenizer`` for BM25
            and passed unchanged to ``co.rerank``.
        top_k: Number of hits to print for each stage (also the ``top_n``
            passed to the rerank call).
        num_candidates: Number of highest-scoring BM25 passages handed to the
            reranker. Clamped to the number of scored passages.

    Returns:
        None. The BM25 hits and the re-ranked hits are printed.
    """
    print("Input question:", query)
    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    # np.argpartition raises ValueError when kth is out of bounds, so never
    # ask for more candidates than there are scores.
    num_candidates = max(0, min(num_candidates, len(bm25_scores)))
    if num_candidates:
        top_n = np.argpartition(bm25_scores, -num_candidates)[-num_candidates:]
    else:
        top_n = []
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    # Report the requested number of hits rather than a hard-coded "3".
    print(f"Top-{top_k} lexical search (BM25) hits")

    for hit in bm25_hits[0:top_k]:
        print("\t{:.3f}\t{}".format(hit['score'], texts[hit['corpus_id']].replace("\n", " ")))

    # Re-rank the BM25 candidates.
    docs = [texts[hit['corpus_id']] for hit in bm25_hits]
    print(f"\nTop-{top_k} hits by rank-API ({len(bm25_hits)} BM25 hits re-ranked)")

    # Nothing to re-rank: skip the API call instead of sending an empty list.
    if not docs:
        return

    results = co.rerank(query=query, documents=docs, top_n=top_k, return_documents=True)
    for hit in results.results:
        print("\t{:.3f}\t{}".format(hit.relevance_score, hit.document.text.replace("\n", " ")))

In [16]:
# Run the BM25 + rerank pipeline on a sample question.
keyword_and_reranking_search(query = "how precise was the science")

Input question: how precise was the science
Top-3 lexical search (BM25) hits
	1.793	Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan
	1.377	Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar
	0.000	Interstellar uses extensive practical and miniature effects and the company Double Negative created additional digital effects

Top-3 hits by rank-API (10 BM25 hits re-ranked)
	0.035	Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan
	0.032	It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine
	0.031	Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of In

In [12]:
query = "income generated"

# 1. Retrieval
# Embedding search is used here; ideally this would be a hybrid search.
results = search(query)

# 2. Grounded generation: wrap each retrieved sentence as a {'text': ...}
# document and pass the documents to the chat call together with the query.
docs_dict = [{'text': passage} for passage in results['texts']]
response = co.chat(message=query, documents=docs_dict)
print(response.text)

Query:'income generated'
Nearest neighbors:
The film generated a worldwide gross of over $677 million, and $773 million with subsequent re-releases.
