In [1]:
import time

import numpy as np

from sentence_transformers import SentenceTransformer
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = SentenceTransformer('ggrn/e5-small-v2')

In [4]:
model = SentenceTransformer('BAAI/bge-small-en-v1.5')

In [5]:
with open('textbook.txt', 'r') as f:
    textbook = f.read()

In [6]:
chunk_size = 2048

chunks = [textbook[i:i+chunk_size] for i in range(0, len(textbook), chunk_size)]

print(len(chunks))

19


In [8]:
def get_text_embedding(input):
    embeddings_batch_response = model.encode(input, normalize_embeddings=True)
    return embeddings_batch_response

start = time.time()
text_embeddings = np.array([get_text_embedding(chunk) for chunk in chunks])
end = time.time()

print(f"Time taken to vectorize using E5-small-V2: {round(end-start, 2)}s")

Time taken to vectorize using E5-small-V2: 3.31s


In [7]:
def get_text_embedding(input):
    embeddings_batch_response = model.encode(input, normalize_embeddings=True)
    return embeddings_batch_response

start = time.time()
text_embeddings = np.array([get_text_embedding(chunk) for chunk in chunks])
end = time.time()

print(f"Time taken to vectorize using BGE-small: {round(end-start, 2)}s")

Time taken to vectorize using BGE-small: 3.57s


In [8]:
import faiss

d = text_embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(text_embeddings)

In [9]:
questions = []

with open("questions.txt") as f:
    for line in f:
        if line.strip():
            questions.append(line.strip())

In [10]:
chatAgent = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

In [12]:
predicted_answers = []

start = time.time()
for i, question in enumerate(questions):
    print(f'Answering question: {i + 1}')
    question_embeddings = np.array([get_text_embedding(question)])
    D, I = index.search(question_embeddings, k=2)
    retrieved_chunk = [chunks[i] for i in I.tolist()[0]]
    prompt = f"""
        Context information is below.
        ---------------------
        {retrieved_chunk}
        ---------------------
        Given the context information and not prior knowledge, answer the query.
        Query: {question}
        Answer:
        """
    output = chatAgent(prompt, max_new_tokens=256, do_sample=True, temperature=0.1, top_k=30, top_p=0.95)
    output = output[0]['generated_text']
    predicted_answers.append(output)

end = time.time()
print(f"Time taken to answer questions on E5-small-V2: {round(end-start, 2)}s")

Answering question: 1
Answering question: 2
Answering question: 3
Answering question: 4
Answering question: 5
Answering question: 6
Answering question: 7
Answering question: 8
Time taken to answer questions on E5-small-V2: 964.73s


In [11]:
predicted_answers = []

start = time.time()
for i, question in enumerate(questions):
    print(f'Answering question: {i + 1}')
    question_embeddings = np.array([get_text_embedding(question)])
    D, I = index.search(question_embeddings, k=2)
    retrieved_chunk = [chunks[i] for i in I.tolist()[0]]
    prompt = f"""
        Context information is below.
        ---------------------
        {retrieved_chunk}
        ---------------------
        Given the context information and not prior knowledge, answer the query.
        Query: {question}
        Answer:
        """
    output = chatAgent(prompt, max_new_tokens=256, do_sample=True, temperature=0.1, top_k=30, top_p=0.95)
    output = output[0]['generated_text']
    predicted_answers.append(output)

end = time.time()
print(f"Time taken to answer questions on BGE-small: {round(end-start, 2)}s")

Answering question: 1
Answering question: 2
Answering question: 3
Answering question: 4
Answering question: 5
Answering question: 6
Answering question: 7
Answering question: 8
Time taken to answer questions on BGE-small: 1086.67s


In [12]:
real_answers = []

with open('actual.txt', 'r') as f:
    text = f.read()

for t in text.split('---'):
    real_answers.append(t)

In [13]:
predicted_answers_processed = []

for answer in predicted_answers:
    for i, word in enumerate(answer.split()):
        if word == 'Answer:':
            predicted_answers_processed.append(' '.join(answer.split()[i+1:]))

In [14]:
real_answers_embeddings = np.array([get_text_embedding(answer) for answer in real_answers])
predicted_answers_embeddings = np.array([get_text_embedding(answer) for answer in predicted_answers_processed])

In [15]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarities = []
euclidean_similarities = []

for i in range(len(real_answers)):
    cosine_sim = cosine_similarity([real_answers_embeddings[i]], [predicted_answers_embeddings[i]])[0][0]
    cosine_similarities.append(cosine_sim)

cosine_similarities = np.array(cosine_similarities)

In [16]:
with open('predicted.txt', 'w') as f:
    for answer in predicted_answers_processed:
        f.write(f'{answer}\n\n\n')

In [17]:
cosine_similarities.mean()

0.84720886