In [4]:
import json
import os

import pandas as pd
import torch
from langchain.chat_models import ChatAnthropic
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema.document import Document
from langchain.vectorstores import FAISS
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.schema import TextNode
from sentence_transformers import InputExample
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer

In [7]:
# Training split. JSON is UTF-8 by specification, so pin the encoding instead of
# relying on the platform's locale default.
with open("/Users/lichenghu/Desktop/embedding_finetune/train_dataset.json", encoding="utf-8") as jsonfile:
    train_dataset = json.load(jsonfile)

In [8]:
# Held-out split used by every evaluation below. Note the file is val_dataset.json
# even though the variable is called `test_dataset`.
# JSON is UTF-8 by specification, so pin the encoding instead of relying on the
# platform's locale default.
with open("/Users/lichenghu/Desktop/embedding_finetune/val_dataset.json", encoding="utf-8") as jsonfile:
    test_dataset = json.load(jsonfile)

In [9]:
# Do not hardcode the key here: export ANTHROPIC_API_KEY before starting the kernel.
# setdefault keeps a key that is already present instead of overwriting it with "".
os.environ.setdefault("ANTHROPIC_API_KEY", "")

In [10]:
# Chat model handle. In the visible cells it is only referenced by the
# commented-out llama_index variant of evaluate() (as `llm=chat`).
chat = ChatAnthropic(model='claude-2')

In [59]:
def evaluate(dataset,embed_model,top_k=5,verbose=False):
    """Measure top-k retrieval hits for an embedding model using a FAISS index.

    Args:
        dataset: Mapping with the keys ``'docs'`` (doc id -> text),
            ``'queries'`` (query id -> query text) and ``'relevant_docs'``
            (query id -> sequence of relevant doc ids).
        embed_model: Model name or path passed to ``HuggingFaceEmbeddings``
            as ``model_name``.
        top_k: Number of documents retrieved per query.
        verbose: Currently unused; kept so existing calls keep working.

    Returns:
        A list with one dict per query holding ``'is_hit'``, ``'retrieved'``,
        ``'expected'`` and ``'query'`` (the query id).

    Note:
        Only the first entry of ``relevant_docs[query_id]`` is treated as the
        expected document, i.e. one relevant document per query is assumed.
    """
    corpus = dataset['docs']
    queries = dataset['queries']
    relevant_docs = dataset['relevant_docs']

    # Keep the corpus id in the metadata so retrieved hits can be mapped back to it.
    documents = [
        Document(page_content=text, metadata={'id': id_})
        for id_, text in corpus.items()
    ]
    embedding = HuggingFaceEmbeddings(model_name=embed_model)
    db = FAISS.from_documents(documents, embedding)
    retriever = db.as_retriever(search_type="similarity",search_kwargs={'k': top_k})

    eval_results = []
    for query_id, query in tqdm(queries.items()):
        retrieved_docs = retriever.get_relevant_documents(query)
        retrieved_ids = [hit.metadata['id'] for hit in retrieved_docs]
        expected_id = relevant_docs[query_id][0]
        eval_results.append({
            'is_hit': expected_id in retrieved_ids,
            'retrieved': retrieved_ids,
            'expected': expected_id,
            'query': query_id,
        })
    return eval_results


In [7]:
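# NOTE: earlier llama_index-based variant of evaluate(), left commented out; the
# active evaluate() defined above uses langchain's FAISS retriever instead.
# def evaluate(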
#     dataset,
#     embed_model,
#     top_k=5,
#     verbose=False,
# ):
#     corpus = dataset['docs']
#     queries = dataset['queries']
#     relevant_docs = dataset['relevant_docs']

#     service_context = ServiceContext.from_defaults(embed_model=embed_model,llm=chat)
#     nodes = [TextNode(id_=id_, text=text) for id_, text in corpus.items()] 
#     index = VectorStoreIndex(
#         nodes, 
#         service_context=service_context, 
#         show_progress=True
#     )
#     retriever = index.as_retriever(similarity_top_k=top_k)

#     eval_results = []
#     for query_id, query in tqdm(queries.items()):
#         retrieved_nodes = retriever.retrieve(query)
#         retrieved_ids = [node.node.node_id for node in retrieved_nodes]
#         expected_id = relevant_docs[query_id][0]
#         is_hit = expected_id in retrieved_ids  # assume 1 relevant doc
        
#         eval_result = {
#             'is_hit': is_hit,
#             'retrieved': retrieved_ids,
#             'expected': expected_id,
#             'query': query_id,
#         }
#         eval_results.append(eval_result)
#     return eval_results

In [9]:
def evaluate_st(
    dataset,
    model_id,
    name,
    output_path='/Users/lichenghu/Desktop/embedding_finetune/',
):
    """Run sentence-transformers' InformationRetrievalEvaluator on a model.

    Args:
        dataset: Mapping with the keys ``'docs'``, ``'queries'`` and
            ``'relevant_docs'``; they are handed to the evaluator as the
            corpus, the queries and the relevance judgements.
        model_id: Name or path given to ``SentenceTransformer``. A leading
            ``"local:"`` is stripped when no path with that literal name
            exists.
        name: Evaluator name, forwarded as ``name=``.
        output_path: Directory forwarded to the evaluator call. Defaults to
            the directory this notebook originally hard-coded.

    Returns:
        Whatever the evaluator call returns (displayed as a single float in
        this notebook).
    """
    corpus = dataset['docs']
    queries = dataset['queries']
    relevant_docs = dataset['relevant_docs']

    # This notebook calls evaluate_st with "local:exp_finetune" -- presumably the
    # llama_index "local:<path>" embed-model convention (TODO confirm). Drop the
    # prefix unless something with that exact name really exists on disk.
    if model_id.startswith("local:") and not os.path.exists(model_id):
        model_id = model_id[len("local:"):]

    evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs, name=name)
    model = SentenceTransformer(model_id)
    return evaluator(model, output_path=output_path)

In [60]:
# Baseline: score "hkunlp/instructor-large" on the held-out split with evaluate()
# at its default top_k=5. The stored progress bar shows 3084 queries in about 4 minutes.
instructor = "hkunlp/instructor-large"
instructor_val_results = evaluate(test_dataset, instructor)

Using Maximum Sequence Length:  512


100%|███████████████████████████████████████| 3084/3084 [04:07<00:00, 12.48it/s]


In [62]:
# One row per query, with the columns evaluate() emits: is_hit, retrieved, expected, query.
df_instructor = pd.DataFrame(instructor_val_results)

In [63]:
# Hit rate = mean of the boolean is_hit column, i.e. the share of queries whose
# expected document appeared among the retrieved ids.
hit_rate_instructor = df_instructor['is_hit'].mean()
hit_rate_instructor

0.8281452658884566

In [12]:
# NOTE(review): exact duplicate of the previous cell. Its stored output carries an
# older execution count (In [12]) and a slightly different value, so it looks
# like a leftover from an earlier run -- consider deleting this cell.
hit_rate_instructor = df_instructor['is_hit'].mean()
hit_rate_instructor

0.8287937743190662

In [20]:
# Cross-check the baseline with sentence-transformers' IR evaluator. The value shown
# below matches the dot_score-MAP@100 column of the results table later in the notebook.
evaluate_st(test_dataset, "hkunlp/instructor-large", name='instructor')



Using Maximum Sequence Length:  512


0.7154374461035795

In [73]:
# Same hit-rate evaluation for the fine-tuned model. "exp-finetune" is passed as
# model_name to HuggingFaceEmbeddings -- presumably a local model directory; confirm.
finetuned = "exp-finetune"
val_results_finetuned = evaluate(test_dataset, finetuned)

Using Maximum Sequence Length:  512


100%|███████████████████████████████████████| 3084/3084 [03:56<00:00, 13.02it/s]


In [22]:
# NOTE(review): `device` is not referenced by any other visible cell; remove it if unused.
device = torch.device('cpu')

In [74]:
# Per-query results of the fine-tuned model as a frame (is_hit, retrieved, expected, query).
df_finetuned = pd.DataFrame(val_results_finetuned)

In [75]:
# Share of queries whose expected document was retrieved by the fine-tuned model.
hit_rate_finetuned = df_finetuned['is_hit'].mean()
hit_rate_finetuned

0.8913748378728924

In [21]:
# NOTE(review): this id ("local:exp_finetune", underscore, "local:" prefix) differs
# from the "exp-finetune" passed to evaluate() above -- confirm both refer to the
# same fine-tuned model and that SentenceTransformer can resolve this string.
evaluate_st(test_dataset, "local:exp_finetune", name='finetuned')

Using Maximum Sequence Length:  512


0.7875588064116248

In [22]:
# Read the evaluator CSVs from the directory evaluate_st() passes as output_path, so
# this cell no longer depends on the kernel's working directory.
# NOTE(review): this rebinds `df_instructor`, which earlier held the per-query
# hit results; cells that need that frame must rebuild it first.
ST_RESULTS_DIR = "/Users/lichenghu/Desktop/embedding_finetune/"
df_instructor = pd.read_csv(
    os.path.join(ST_RESULTS_DIR, "Information-Retrieval_evaluation_instructor_results.csv")
)
df_st_finetuned = pd.read_csv(
    os.path.join(ST_RESULTS_DIR, "Information-Retrieval_evaluation_finetuned_results.csv")
)

In [23]:
# Label each evaluator frame with its model, then stack both into one table keyed
# by model. `df_instructor` here is the frame read from the instructor results
# CSV in the preceding cell.
df_instructor["model"] = "instructor"
df_st_finetuned["model"] = "fine_tuned"
df_st_all = pd.concat([df_instructor, df_st_finetuned]).set_index("model")
df_st_all

Unnamed: 0_level_0,epoch,steps,cos_sim-Accuracy@1,cos_sim-Accuracy@3,cos_sim-Accuracy@5,cos_sim-Accuracy@10,cos_sim-Precision@1,cos_sim-Recall@1,cos_sim-Precision@3,cos_sim-Recall@3,...,dot_score-Recall@1,dot_score-Precision@3,dot_score-Recall@3,dot_score-Precision@5,dot_score-Recall@5,dot_score-Precision@10,dot_score-Recall@10,dot_score-MRR@10,dot_score-NDCG@10,dot_score-MAP@100
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
instructor,-1,-1,0.624514,0.778534,0.828145,0.873217,0.624514,0.624514,0.259511,0.778534,...,0.624514,0.259511,0.778534,0.165629,0.828145,0.087322,0.873217,0.71136,0.750888,0.715437
fine_tuned,-1,-1,0.705577,0.851167,0.891375,0.925422,0.705577,0.705577,0.283722,0.851167,...,0.705577,0.283722,0.851167,0.178275,0.891375,0.092542,0.925422,0.784921,0.81943,0.787559


In [76]:
# Tag the per-query hit frames with their model name (mutates both frames in place).
# NOTE(review): read top to bottom, `df_instructor` was last rebound by the
# read_csv cell above to the sentence-transformers results, not the hit frame.
# The execution counts (In [62] then In [76]) suggest the hit frame was
# re-created before this cell ran -- this will not survive Restart & Run All.
df_instructor["model"] = "instructor"
df_finetuned["model"] = "fine_tuned"

In [78]:
# Build the comparison straight from the raw per-query results, so the cell does
# not depend on `df_instructor`, which the read_csv cell rebinds to a different
# frame when the notebook is run top to bottom.
df_all = pd.concat(
    [
        pd.DataFrame(instructor_val_results).assign(model="instructor"),
        pd.DataFrame(val_results_finetuned).assign(model="fine_tuned"),
    ]
)
# Select the column explicitly: groupby(...).mean("is_hit") passes the string
# positionally as `numeric_only` rather than choosing a column.
df_all.groupby("model")[["is_hit"]].mean()

Unnamed: 0_level_0,is_hit
model,Unnamed: 1_level_1
fine_tuned,0.891375
instructor,0.828145
