In [None]:
# Create an index via OpenAI embeddings
import os
import json
import re
import numpy as np
from PyPDF2 import PdfReader
import faiss
import nltk
from openai import OpenAI
from config import config
from dotenv import load_dotenv
load_dotenv()

# Download NLTK punkt tokenizer models
# (get_sentences below calls nltk.tokenize.sent_tokenize).
nltk.download('punkt')
# Shared OpenAI client used by get_embedding.
# NOTE(review): presumably picks up its API key from the environment populated
# by load_dotenv() above — confirm.
client = OpenAI()

# Default number of whitespace-separated words per chunk (see chunk_text).
CHUNK_SIZE = 128
INDEX_FILE = 'utils/faiss_index.idx'  # Path to the index file
ID_MAP_FILE = 'utils/id_map.json'  # Path to the id_map file
# Folder scanned by process_folder for .pdf / .txt sources.
folder_path = 'files/Papers_FullText'

def clean_text(text):
    """Drop every character that is not an ASCII letter, digit or whitespace.

    Leading and trailing whitespace is trimmed from the result.
    """
    alnum_only = re.sub(r'[^A-Za-z0-9\s]', '', text)
    return alnum_only.strip()

def get_sentences(text):
    """Split *text* into sentences using NLTK's sentence tokenizer."""
    split_into_sentences = nltk.tokenize.sent_tokenize
    return split_into_sentences(text)

def get_embedding(sentence):
    """Embed *sentence* with ``text-embedding-3-small``.

    Returns the first embedding of the API response as a float32 NumPy array.
    """
    api_result = client.embeddings.create(input=sentence, model="text-embedding-3-small")
    first_vector = api_result.data[0].embedding
    return np.array(first_vector, dtype='float32')

def read_pdf(file_path):
    """Return the concatenated text of every page of the PDF at *file_path*.

    Pages whose extraction yields nothing (``None`` or an empty string)
    contribute an empty string instead of raising ``TypeError`` during
    concatenation.
    """
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
        # Extraction must finish while the file is still open; join() consumes
        # the generator before the ``with`` block exits.
        return "".join(page.extract_text() or "" for page in reader.pages)

def read_text_file(file_path):
    """Return the full contents of the UTF-8 text file at *file_path*."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def chunk_text(text, chunk_size=CHUNK_SIZE):
    """
    Yield consecutive pieces of *text*, each holding at most chunk_size
    whitespace-separated words re-joined with single spaces.
    """
    words = text.split()
    yield from (
        ' '.join(words[offset:offset + chunk_size])
        for offset in range(0, len(words), chunk_size)
    )

def process_text(text, faiss_index, id_map):
    """Embed every sentence of *text* and register it in the index.

    The text is cleaned, chunked, and each chunk split into sentences. Every
    sentence is embedded, appended to *faiss_index*, and stored in *id_map*
    under the position it received in the index.
    """
    # NOTE(review): clean_text removes punctuation before get_sentences runs,
    # so the sentence tokenizer has no sentence-ending marks to split on —
    # confirm this ordering is intended.
    cleaned = clean_text(text)
    for piece in chunk_text(cleaned):
        for sent in get_sentences(piece):
            vector = get_embedding(sent)
            # ntotal before the add is the id the new vector will receive.
            next_id = faiss_index.ntotal
            faiss_index.add(np.array([vector]))
            id_map[next_id] = sent

def process_folder(folder_path, faiss_index, id_map):
    """Index every ``.pdf`` and ``.txt`` file directly inside *folder_path*.

    Files are visited in sorted name order so that the ids assigned to the
    embedded sentences are reproducible between runs (``os.listdir`` makes no
    ordering promise). Extensions are matched case-insensitively; any other
    file type is skipped.
    """
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        lowered = file_name.lower()
        if lowered.endswith('.pdf'):
            text = read_pdf(file_path)
        elif lowered.endswith('.txt'):
            text = read_text_file(file_path)
        else:
            continue  # Skip other file formats
        process_text(text, faiss_index, id_map)

def save_faiss_index(faiss_index, file_name):
    """Write *faiss_index* to the file *file_name* via ``faiss.write_index``."""
    destination = file_name
    faiss.write_index(faiss_index, destination)

def load_faiss_index(file_name):
    """Read and return the FAISS index stored in *file_name*."""
    restored_index = faiss.read_index(file_name)
    return restored_index

def save_id_map(id_map, file_name):
    """Serialise *id_map* as JSON into *file_name*, overwriting the file."""
    with open(file_name, 'w') as handle:
        json.dump(id_map, handle)

def load_id_map(file_name):
    """Load the id map stored as JSON in *file_name*.

    JSON object keys are always strings, so the integer ids written when the
    map was saved come back as ``"0"``, ``"1"``, ... They are converted back
    to ``int`` here so the map can be indexed with the integer labels that
    FAISS returns.

    Raises:
        ValueError: if a stored key is not an integer literal.
    """
    with open(file_name, 'r') as file:
        stored = json.load(file)
    return {int(key): value for key, value in stored.items()}

def search_index(query_embedding, faiss_index, id_map, k):
    """Return up to *k* nearest stored texts for *query_embedding*.

    Args:
        query_embedding: 1-D vector; it is wrapped into a single-row batch.
        faiss_index: object exposing ``search(batch, k)`` that returns
            ``(distances, indices)``.
        id_map: mapping from index position to stored text. Keys may be ints
            or, for a map loaded straight from JSON, their string form.
        k: number of neighbours requested.

    Returns:
        A list of ``(text, distance)`` tuples in the order FAISS returned
        them. Slots FAISS filled with a negative label (fewer than *k*
        vectors available) are dropped.

    Raises:
        KeyError: if a returned position is absent from *id_map*.
    """
    distances, indices = faiss_index.search(np.array([query_embedding]), k)
    hits = []
    for distance, idx in zip(distances[0], indices[0]):
        position = int(idx)
        if position < 0:
            # FAISS pads missing neighbours with -1; there is no text for them.
            continue
        if position in id_map:
            text = id_map[position]
        else:
            # A JSON round trip turns integer keys into strings.
            text = id_map[str(position)]
        hits.append((text, distance))
    return hits

# Initialize FAISS index and ID map
dimension = 1536  # Adjust based on your model's output
if os.path.exists(INDEX_FILE) and os.path.exists(ID_MAP_FILE):
    # A saved index is reused as-is. Re-processing the folder here would pay
    # for every embedding again and append duplicate vectors to the loaded
    # index without ever persisting them.
    faiss_index = load_faiss_index(INDEX_FILE)
    id_map = load_id_map(ID_MAP_FILE)
else:
    faiss_index = faiss.IndexFlatL2(dimension)
    id_map = {}

    # Create the output directories up front so that a missing folder cannot
    # make the save fail after all embeddings have been computed.
    for target_path in (INDEX_FILE, ID_MAP_FILE):
        parent_dir = os.path.dirname(target_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    process_folder(folder_path, faiss_index, id_map)

    save_faiss_index(faiss_index, INDEX_FILE)
    save_id_map(id_map, ID_MAP_FILE)


In [None]:
# LLM checks to write RAG query

import replicate
import pandas as pd
import json
import os
import config
import time
import importlib
importlib.reload(config)
from config import config, reset_config
from dotenv import load_dotenv
load_dotenv()
from config import config
# Select the "rag" mode before any config attribute below is read.
# NOTE(review): presumably the mode changes the paths/instruction config
# exposes — confirm against the config module.
config.set_mode("rag")

# Rebinds folder_path to the output folder and creates it if missing.
folder_path = 'files'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

# Text placed in front of the task instruction in every prompt built below.
INSTRUCTION = config.INSTRUCTION
F_NAME = config.F_NAME

def load_file(file_path):
    """Parse the JSON document stored at *file_path* and return it."""
    with open(file_path, 'r') as handle:
        parsed = json.load(handle)
    return parsed

# Load the question sheet and immediately write a copy to config.q_original.
df = pd.read_excel(config.questions)
df.to_excel(config.q_original, index=False)

# Trim surrounding whitespace so equal questions compare equal.
df['Question'] = df['Question'].str.strip()

# Flag every row whose question text occurs more than once.
duplicates = df.duplicated(subset=['Question'], keep=False)
if not duplicates.any():
    print("No duplicates found.")
else:
    print("Duplicates found. Removing duplicates.")
    # Keep the first occurrence and write the cleaned sheet back in place.
    df = df.drop_duplicates(subset=['Question'], keep='first')
    df.to_excel(config.questions, index=False)

# DataFrame to store the results
# (the evaluation loop appends rows that also carry 'Category' and 'Type').
results_df = pd.DataFrame(columns=['Model', 'Question', 'Rewritten Question', 'Response', 'Latency'])

# Short model key -> model identifier passed to replicate.run.
# Commented-out entries are skipped by the evaluation loop.
models = {
    # "qwen-14b": "nomagick/qwen-14b-chat:f9e1ed25e2073f72ff9a3f46545d909b1078e674da543e791dec79218072ae70",
    # "falcon-40b": "joehoover/falcon-40b-instruct:7d58d6bddc53c23fa451c403b2b5373b1e0fa094e4e0d1b98c3d02931aa07173",
    "yi-34b": "01-ai/yi-34b-chat:914692bbe8a8e2b91a4e44203e70d170c9c5ccc1359b283c84b0ec8d47819a46",
    "mistral-7b": "mistralai/mistral-7b-instruct-v0.2:f5701ad84de5715051cb99d550539719f8a7fbcf65e0e62a3d1eb3f94720764e",
    # "llama2-70b": "meta/llama-2-70b-chat",
    "noushermes2": "nateraw/nous-hermes-2-solar-10.7b:1e918ab6ffd5872c21fba21a511f344fd12ac0edff6302c9cd260395c7707ff4",
    "mixtral-instruct": "mistralai/mixtral-8x7b-instruct-v0.1:2b56576fcfbe32fa0526897d8385dd3fb3d36ba6fd0dbe033c72886b81ade93e",
    # "deepseek_33bq": "kcaverly/deepseek-coder-33b-instruct-gguf:ea964345066a8868e43aca432f314822660b72e29cab6b4b904b779014fe58fd",
    }

# Task instruction for step 1: turn the sheet question into a search query.
REWRITE_QUESTION = "You are a question creator, to perform semantic search over several academic papers to get the best answer to the given query. As the first step, please write one very clear QUESTION to answer the following question. It should be succinct."
# Task instruction for step 2: condense the retrieved passages into an answer.
SUMMARISE_ANSWER = "Summarise the given information into a succinct response in an easy to read format, eg bullet points."

# str.format templates with the fields INSTRUCTION, REWRITE_QUESTION and
# question. The first wraps system/user/assistant turns in
# <|im_start|> ... <|im_end|> markers and is used for the "yi-34b" and
# "qwen-14b" keys.
prompt_for_qwen="""<|im_start|>system\n {INSTRUCTION}. {REWRITE_QUESTION} <|im_end|>\n<|im_start|>user\n {question}<|im_end|>\n<|im_start|>assistant\n"""
# Used for the "noushermes2" key: a list of role/content messages written as
# text. Doubled braces are literal braces after str.format. {question} is
# inserted without surrounding quotes, so the caller supplies any quoting.
prompt_for_hermes = """[
{{
  "role": "system",
  "content": "{INSTRUCTION}. {REWRITE_QUESTION}" 
}},
{{
  "role": "user",
  "content": {question}
}}
]"""

def _build_prompt(model_key, task_instruction, payload, plain_prompt):
    """Return the prompt for *model_key* in the template that model is given.

    *plain_prompt* is the already-formatted fallback used for every model
    without a dedicated template.
    """
    if model_key in ("yi-34b", "qwen-14b"):
        return prompt_for_qwen.format(INSTRUCTION=INSTRUCTION, REWRITE_QUESTION=task_instruction, question=payload)
    if model_key == "noushermes2":  # Hermes model
        return prompt_for_hermes.format(INSTRUCTION=INSTRUCTION, REWRITE_QUESTION=task_instruction, question=payload)
    return plain_prompt


def _generate(model_value, prompt, temperature):
    """Run one Replicate prediction and return its output pieces joined as text."""
    output = replicate.run(
        model_value,
        input={
            "debug": False,
            "top_p": 0.9,
            "prompt": prompt,
            "temperature": temperature,
            "max_new_tokens": 500,
            "min_new_tokens": -1,
        },
    )
    return "".join(str(item) for item in output)


# Iterate through each model
for model_key, model_value in models.items():
    for index, row in df.iterrows():
        qn = row['Question']
        question = json.dumps(qn)

        prompt = _build_prompt(
            model_key,
            REWRITE_QUESTION,
            question,
            f"{INSTRUCTION}. {REWRITE_QUESTION}. {question}.",
        )

        start_time = time.time()  # Record the start time

        # USING THE QUESTION IN DOC, COME UP WITH A GOOD QUERY
        rewrite_failed = False
        try:
            print(prompt)
            rewritten_qn = _generate(model_value, prompt, temperature=0.7)
        except Exception as e:
            rewritten_qn = f"Error: {e}"
            rewrite_failed = True

        if rewrite_failed:
            # Do not embed and "answer" an error message as if it were a query.
            response = rewritten_qn
        else:
            # USING THE QUESTION CREATED, NOW EMBED IT AND SUMMARISE THE ANSWER
            try:
                print(f"Rewritten question is: {rewritten_qn}")
                # Retrieval sits inside the try so that an embedding/search
                # failure is recorded for this row instead of ending the run.
                query_embedding = get_embedding(rewritten_qn)
                result = search_index(query_embedding, faiss_index, id_map, 3)

                # NOTE(review): `result` is interpolated via str(), i.e. a
                # Python list of (text, distance) tuples — confirm this is the
                # payload the models should see.
                prompt = _build_prompt(
                    model_key,
                    SUMMARISE_ANSWER,
                    result,
                    f"{INSTRUCTION}. {SUMMARISE_ANSWER}. Answer as asked:  {result}.",
                )
                response = _generate(model_value, prompt, temperature=0.1)
                print(f"Response is: {response}")
            except Exception as e:
                response = f"Error: {e}"

        end_time = time.time()  # Record the end time
        latency = end_time - start_time  # Covers rewrite, retrieval and summary

        new_row = pd.DataFrame({'Model': [model_key], 'Question': [qn],'Rewritten Question': [rewritten_qn], 'Response': [response], 'Latency': [latency], 'Category': [row['Category']] , 'Type': [row['Type']]})
        results_df = pd.concat([results_df, new_row], ignore_index=True)

        if index % 2 == 0:  # Checkpoint on every even row label; adjust as needed
            results_df.to_excel(config.llmresults_file_path, index=False, sheet_name='Sheet1')

# Final save once every model has been evaluated.
results_df.to_excel(config.llmresults_file_path, index=False, sheet_name='Sheet1')

In [None]:
# GPT-4 writes a document query

import pandas as pd
import json
import openai
import requests
from openai import OpenAI
import time
from dotenv import load_dotenv
load_dotenv()
import os
from config import config
# Select the "rag" mode before the config attributes below are read.
config.set_mode("rag")

# Make sure the output folder exists.
folder_path = 'files'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

# Values taken from the project config.
INSTRUCTION = config.INSTRUCTION  # used as the assistant's instructions
F_NAME = config.F_NAME  # used in the assistant's display name
GPT_MODEL = config.GPT_MODEL  # model of the assistant; also the 'Model' column value
INPUT_CSV_PATH = config.questions  # read with pd.read_excel despite the name
OUTPUT_CSV_PATH = config.gpt4results_csv_path  # written with DataFrame.to_excel
# Task instruction for step 1: turn the sheet question into a search query.
REWRITE_QUESTION = "You are a question creator, to perform semantic search over several academic papers to get the best answer to the given query. As the first step, please write one very clear QUESTION to answer the following question. It should be succinct."
# Task instruction for step 2: condense the retrieved passages into an answer.
SUMMARISE_ANSWER = "Summarise the given information into a succinct response in an easy to read format, eg bullet points."

# Client used for all Assistants API calls below.
client = OpenAI()
def show_json(obj):
    """Print *obj* as the Python structure decoded from its JSON dump."""
    serialised = obj.model_dump_json()
    print(json.loads(serialised))

# Create the assistant that serves both the rewrite and the summary step,
# then print its definition.
assistant_settings = {
    "name": f"{F_NAME} AI Evaluator from RAG over documents",
    "instructions": INSTRUCTION,
    "model": GPT_MODEL,
}
assistant = client.beta.assistants.create(**assistant_settings)
show_json(assistant)

# Utility functions
def read_csv(file_path):
    """Load the spreadsheet at *file_path* into a DataFrame.

    Despite the name, the file is read with ``pd.read_excel``.
    """
    sheet = pd.read_excel(file_path)
    return sheet

def process_data_for_gpt(data):
    """Build one rewrite prompt per row of *data*.

    Each prompt is the REWRITE_QUESTION instruction, a colon, the row's
    'Question' value and a trailing full stop.
    """
    return [
        f"{REWRITE_QUESTION}: {record['Question']}."
        for _, record in data.iterrows()
    ]

def submit_message_and_create_questions(assistant_id, prompt):
    """Start a run asking the assistant to rewrite *prompt* into a search query.

    The REWRITE_QUESTION instruction is prepended only when *prompt* does not
    already begin with it, so prompts that were built with the instruction
    included are not sent with the instruction twice.

    Returns:
        ``(run, thread)`` for the newly created thread.
    """
    # A fresh thread per prompt: sharing one thread would make every answer
    # build on the ones before it.
    thread = client.beta.threads.create()
    prefix = f"{REWRITE_QUESTION}: "
    content = prompt if prompt.startswith(prefix) else f"{prefix}{prompt}"
    client.beta.threads.messages.create(thread_id=thread.id, role="user", content=content)
    run = client.beta.threads.runs.create(thread_id=thread.id, assistant_id=assistant_id)
    return run, thread

def submit_message_and_create_run(assistant_id, prompt):
    """Retrieve passages for *prompt* and start a run that summarises them.

    *prompt* is embedded, the three nearest index entries are looked up, and
    they are posted to a new thread behind the SUMMARISE_ANSWER instruction.

    Returns:
        ``(run, thread)`` for the newly created thread.
    """
    # One thread per call keeps earlier answers out of this conversation.
    thread = client.beta.threads.create()
    nearest = search_index(get_embedding(prompt), faiss_index, id_map, 3)
    user_message = f"{SUMMARISE_ANSWER}: {nearest}"
    client.beta.threads.messages.create(thread_id=thread.id, role="user", content=user_message)
    started_run = client.beta.threads.runs.create(thread_id=thread.id, assistant_id=assistant_id)
    return started_run, thread

def wait_on_run_and_get_response(run, thread):
    """Poll *run* until it leaves the queued/in-progress states.

    Returns:
        The text of the first content part of every assistant message in
        *thread*, oldest first. The list is empty when the run produced no
        assistant message.
    """
    while run.status in ("queued", "in_progress"):
        # Sleep before polling rather than after, so the function returns as
        # soon as a poll reports a finished run; a sleep after the final poll
        # would add a fixed delay to every latency measured around this call.
        time.sleep(0.5)
        run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)
    messages = client.beta.threads.messages.list(thread_id=thread.id, order="asc")
    return [m.content[0].text.value for m in messages if m.role == 'assistant']

def create_output_csv(data, responses, rewritten_qn, latencies, model_name, interim_csv_path):
    """Write one result row per question to *interim_csv_path* as an Excel file.

    Rows are formed by zipping the 'Question', 'Category' and 'Type' columns
    of *data* with *responses*, *rewritten_qn* and *latencies*; every row is
    tagged with *model_name*.
    """
    aligned = zip(
        data['Question'], responses, rewritten_qn, latencies,
        data['Category'], data['Type'],
    )
    records = [
        {
            'Model': model_name,
            'Question': asked,
            'Rewritten Question': rewritten,
            'Response': answer,
            'Latency': seconds,
            'Category': category,
            'Type': kind,
        }
        for asked, answer, rewritten, seconds, category, kind in aligned
    ]
    pd.DataFrame(records).to_excel(interim_csv_path, index=False)

data = read_csv(INPUT_CSV_PATH)
prompts = process_data_for_gpt(data)
ASSISTANT_ID = assistant.id

responses = []
rewritten_qn = []
latencies = []  # Wall-clock seconds per question, covering both assistant runs

for prompt in prompts:
    start_time = time.time()  # Capture start time

    # Step 1: have the assistant rewrite the question into a search query.
    run1, thread1 = submit_message_and_create_questions(ASSISTANT_ID, prompt)
    response1 = wait_on_run_and_get_response(run1, thread1)
    # The helper returns a list of message texts; collapse it to one string so
    # that plain text (not a list) is stored and handed to the retrieval step.
    if isinstance(response1, list):
        response1 = ' '.join(map(str, response1))
    response1 = response1.strip()
    rewritten_qn.append(response1)

    if response1:
        # Step 2: retrieve passages for the rewritten query and summarise them.
        run, thread = submit_message_and_create_run(ASSISTANT_ID, response1)
        response = wait_on_run_and_get_response(run, thread)
        if isinstance(response, list):
            response = ' '.join(map(str, response))
        response = response.replace("\\\\n", "\\n")
        response = response.strip()
    else:
        # Step 1 produced no assistant message, so there is nothing to embed;
        # record the failure and keep the output rows aligned.
        response = "Error: assistant returned no rewritten question"
    print(response)
    responses.append(response)

    end_time = time.time()  # Capture end time
    latency = end_time - start_time  # Calculate latency
    latencies.append(latency)  # Store latency

create_output_csv(data, responses, rewritten_qn, latencies, GPT_MODEL, OUTPUT_CSV_PATH)