# Generate Dataset from Your Documentation

![image](./imgs/‎GenAIEnterprises.‎012.png)

![image](./imgs/‎GenAIEnterprises.‎013.png)

![image](./imgs/‎GenAIEnterprises.‎014.png)

# Initial Setup

In [7]:
import boto3
import os
import json
import openai
from llama_index import ServiceContext
from llama_index.llms import OpenAI
from IPython.display import display, Markdown
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

def get_api_key(ssm_client, parameter_path):
    """Fetch a decrypted parameter value (e.g. an API key) from SSM Parameter Store.

    Args:
        ssm_client: Client object exposing ``get_parameter`` and
            ``exceptions.ParameterNotFound`` (in this notebook it is created
            with ``boto3.client('ssm', ...)``).
        parameter_path: Name of the parameter to read.

    Returns:
        The ``['Parameter']['Value']`` entry of the ``get_parameter`` response.

    Raises:
        Exception: If the client raises ``ParameterNotFound``. The client's
            error is attached as ``__cause__`` so the root cause stays visible.
    """
    # Keep the try body down to the one call that can raise ParameterNotFound.
    try:
        response = ssm_client.get_parameter(
            Name=parameter_path,
            WithDecryption=True,
        )
    except ssm_client.exceptions.ParameterNotFound as err:
        raise Exception(
            f'Parameter {parameter_path} not found in SSM Parameter Store'
        ) from err
    return response['Parameter']['Value']

# Create an SSM client using Boto3. The region is read from AWS_REGION and
# falls back to 'us-east-1' when that variable is not set.
region_name = os.getenv('AWS_REGION', 'us-east-1') 
ssm = boto3.client('ssm', region_name=region_name)

# Read both API keys from SSM Parameter Store (decrypted by get_api_key).
openai_api_key = get_api_key(ssm_client=ssm, parameter_path='/openai/api_key')
langchain_api_key = get_api_key(ssm_client=ssm, parameter_path='/langchain/api_key')


# Export the LangChain tracing settings and both API keys as environment
# variables, and also set the key directly on the openai module.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.langchain.plus"
os.environ['OPENAI_API_KEY'] = openai_api_key
os.environ["LANGCHAIN_API_KEY"] = langchain_api_key
openai.api_key = openai_api_key

# Name of the OpenAI chat model. The value is hard-coded here; it is not
# derived from the current date.
llm_model = "gpt-3.5-turbo-16k"

# Create the embedding function and a Chroma vector store persisted under
# 'docs/chroma/'.
# NOTE(review): OpenAIEmbeddings() is given no key explicitly; presumably it
# picks up OPENAI_API_KEY set above, so keep this after the os.environ lines.
embedding = OpenAIEmbeddings()
vectordb = Chroma(
    persist_directory='docs/chroma/',
    embedding_function=embedding
)

In [3]:
from llama_index import SimpleDirectoryReader

from llama_index import Document

# Only Markdown files below ./handbook (searched recursively) are loaded.
required_exts = [".md"]
reader = SimpleDirectoryReader(
    input_dir="./handbook",
    required_exts=required_exts,
    recursive=True,
)

docs = reader.load_data()
print(f"Loaded {len(docs)} docs")

# Join the content of every loaded document, separated by blank lines, and
# wrap the result in one Document that carries the handbook title as metadata.
doc_text = "\n\n".join(d.get_content() for d in docs)
print(f"Total length of all docs: {len(doc_text)}")

metadata = {"paper_title": "The Made Tech open source company handbook"}
docs = [Document(text=doc_text, metadata=metadata)]


Loaded 1351 docs
Total length of all docs: 621737


# Generate Dataset

Code adapted from the LlamaIndex knowledge fine-tuning example:

https://gpt-index.readthedocs.io/en/latest/examples/finetuning/knowledge/finetune_knowledge.html

In [4]:
from llama_index.callbacks import CallbackManager

# One callback manager (with no handlers registered) is shared by both contexts.
callback_manager = CallbackManager([])

# GPT-3.5 context: used further down to answer the generated questions.
gpt_35_context = ServiceContext.from_defaults(
    callback_manager=callback_manager,
    llm=OpenAI(temperature=0.3, model="gpt-3.5-turbo-0613"),
)

# GPT-4 context: used further down to generate the questions.
gpt_4_context = ServiceContext.from_defaults(
    callback_manager=callback_manager,
    llm=OpenAI(temperature=0.3, model="gpt-4-0613"),
)

In [6]:
from llama_index.evaluation import DatasetGenerator
from llama_index.node_parser import SimpleNodeParser
from llama_index.indices.list import SummaryIndex

# try evaluation modules
from llama_index.evaluation import QueryResponseEvaluator, ResponseEvaluator
from llama_index import PromptTemplate

# Build a node parser with its default settings and split the loaded
# document(s) in `docs` into nodes; the question-generation loop below
# works on these nodes.
node_parser = SimpleNodeParser.from_defaults()
nodes = node_parser.get_nodes_from_documents(docs)

In [7]:
from tqdm.notebook import tqdm
import json

# Number of questions the LLM is asked to write for each chunk (node).
num_questions_per_chunk = 10

# Instruction prompt for question generation. The requested count is
# interpolated once, when this string is built. The original wording omitted
# the noun after the count ("formulate 10 that captures ..."); it now asks
# explicitly for N questions.
question_gen_query = (
    "You are a Teacher/ Professor. Your task is to setup "
    "a quiz/examination. Using the provided context, "
    f"formulate {num_questions_per_chunk} questions that each capture an important fact from the "
    "context. \n"
    "You MUST obey the following criteria:\n"
    "- Restrict the question to the context information provided.\n"
    "- Do NOT create a question that cannot be answered from the context.\n"
    "- Phrase the question so that it does NOT refer to specific context. "
    'For instance, do NOT put phrases like "given provided context" or "in this work" in the question, '
    "because if the question is asked elsewhere it wouldn't be provided specific context. Replace these terms "
    "with specific details.\n"
    "BAD questions:\n"
    "What did the author do in his childhood\n"
    "What were the main findings in this report\n\n"
    "GOOD questions:\n"
    "What did Barack Obama do in his childhood\n"
    "What were the main findings in the original Transformers paper by Vaswani et al.\n\n"
    "Generate the questions below:\n"
)

# Go through the selected nodes one at a time: generate questions with the
# GPT-4 context, answer each one with the GPT-3.5 context, and append every
# question/answer pair to the JSONL file. Filtering is not done here; it
# happens in a later step.
# NOTE: nodes[30:31] selects a single node; widen the slice to cover more of
# the handbook.
# The `with` block closes (and flushes) the file even if an LLM call fails
# part-way through.
with open("data/qa_pairs.jsonl", "a") as fp:
    for idx, node in enumerate(nodes[30:31]):
        dataset_generator = DatasetGenerator(
            [node],
            question_gen_query=question_gen_query,
            service_context=gpt_4_context,
            metadata_mode="all",
        )
        # Use the same count the prompt asks for instead of a second literal.
        node_questions_0 = dataset_generator.generate_questions_from_nodes(
            num=num_questions_per_chunk
        )
        print(f"[Node {idx}] Generated questions:\n {node_questions_0}")

        # The index and query engine depend only on the node, so build them
        # once per node rather than once per question.
        index = SummaryIndex([node], service_context=gpt_35_context)
        query_engine = index.as_query_engine()

        # for each question, get a response
        for question in tqdm(node_questions_0):
            response = query_engine.query(question)
            out_dict = {"query": question, "response": str(response)}
            print(f"[Node {idx}] Outputs: {out_dict}")
            fp.write(json.dumps(out_dict) + "\n")

[Node 0] Generated questions:
 ['What is the time frame within which employees at Made Tech will receive their rewards?', 'How soon will the TA team contact an individual after receiving a referral profile at Made Tech?', 'If the same candidate is referred by multiple employees at Made Tech, who will receive the reward?', 'What is the time limit for a referred candidate at Made Tech?', 'What happens to the reward if an employee leaves Made Tech?', 'Who is responsible for the IT infrastructure at Made Tech?', 'What are the standard laptop specifications for Engineers and UCD roles at Made Tech?', 'What are the standard laptop specifications for all other roles at Made Tech?', 'How often are laptops replaced at Made Tech?', 'What happens to the old laptops once they are returned at Made Tech?']


  0%|          | 0/10 [00:00<?, ?it/s]

[Node 0] Outputs: {'query': 'What is the time frame within which employees at Made Tech will receive their rewards?', 'response': 'Employees at Made Tech will receive their rewards within 30 days of each stage.'}
[Node 0] Outputs: {'query': 'How soon will the TA team contact an individual after receiving a referral profile at Made Tech?', 'response': 'The TA team at Made Tech will contact an individual within 72 hours of receiving a referral profile.'}
[Node 0] Outputs: {'query': 'If the same candidate is referred by multiple employees at Made Tech, who will receive the reward?', 'response': 'The first employee who refers the candidate will receive the reward.'}
[Node 0] Outputs: {'query': 'What is the time limit for a referred candidate at Made Tech?', 'response': 'The time limit for a referred candidate at Made Tech is 12 months.'}
[Node 0] Outputs: {'query': 'What happens to the reward if an employee leaves Made Tech?', 'response': 'The employee will not receive a payment if they le

# Filter out unanswered questions using a GPT-4 evaluation prompt

In [8]:
# try evaluation modules
from llama_index.evaluation import QueryResponseEvaluator, ResponseEvaluator
from llama_index import PromptTemplate
from llama_index.llms import OpenAI


# Prompt asking the evaluator LLM for a YES/NO verdict on whether a response
# actually answers its query. Placeholders: {query_str} and {response_str}.
# Wording fixes relative to the earlier text: the second sentence now refers
# to the response (not the query), a missing space between two sentences was
# added, and "You be given" became "You will be given".
query_eval_tmpl = PromptTemplate(
    "Your task is to evaluate the following: If the response for the query isn't able to answer the question provided.\n"
    "If the response isn't able to answer the question, answer NO.\n"
    "Otherwise answer YES.\n"
    "To elaborate, you might get an answer like the following: 'The context does not contain the answer to this question.' "
    "Please return NO in that case. "
    "You will be given the query and response. Return YES or NO as the answer.\n"
    "Query: \n {query_str}\n"
    "Response: \n {response_str}\n"
    "Answer: "
)

# LLM used to judge each QA pair.
eval_llm = OpenAI(model="gpt-4-0613")

def filter_data(path: str, out_path: str):
    """Copy the QA pairs that the evaluator LLM does not reject.

    Every non-blank line of *path* is parsed as a JSON object with ``"query"``
    and ``"response"`` keys, formatted into ``query_eval_tmpl`` and sent to
    ``eval_llm``. Lines whose verdict text contains ``"NO"`` are dropped; all
    other lines are written unchanged to *out_path*.

    Args:
        path: JSONL file of QA pairs to read.
        out_path: JSONL file that is overwritten with the kept lines.
    """
    # Context managers close both files (flushing the output) even when an
    # LLM call raises or the run is interrupted.
    with open(path, "r") as fp, open(out_path, "w") as out_fp:
        for idx, line in enumerate(fp):
            if not line.strip():
                # Tolerate blank lines instead of failing in json.loads.
                continue
            qa_pair = json.loads(line)
            verdict = eval_llm.complete(
                query_eval_tmpl.format(
                    query_str=qa_pair["query"], response_str=qa_pair["response"]
                )
            )

            print(f"[{idx}] QA Pair: {qa_pair} \n Eval: {verdict}")
            # Search the verdict's text; a membership test on the response
            # object itself is not a substring search.
            if "NO" in str(verdict):
                continue
            out_fp.write(line)

In [9]:
# Evaluate the generated QA pairs and keep the accepted ones in a new file.
filter_data(
    path="data/qa_pairs.jsonl",
    out_path="data/qa_pairs_2.jsonl",
)

[0] QA Pair: {'query': 'What is the mission of Made Tech as described in their handbook?', 'response': 'The mission of Made Tech, as described in their handbook, is to positively impact the future of the country by using technology to improve society. They work with public sector organizations to modernize technology and accelerate digital delivery so that citizens can benefit from better public services.'} 
 Eval: YES
[1] QA Pair: {'query': 'What is the purpose of the Made Tech Handbook for new team members?', 'response': 'The purpose of the Made Tech Handbook for new team members is to provide them with an overview of the company, including why it exists and its mission to positively impact society through technology. It also outlines the roles within the company, details the benefits and perks offered to employees, and provides information on staff welfare, team norms, and company processes, policies, and resources. Overall, the handbook serves as a starting point for new team membe

KeyboardInterrupt: 

# Split into Training and Validation Sets

In [9]:
from copy import deepcopy
import random


def split_train_val(
    path: str, out_train_path: str, out_val_path: str, train_split=0.7, seed=None
):
    """Shuffle the lines of *path* and split them into train and validation files.

    Args:
        path: Input file, one record per line (JSONL in this notebook).
        out_train_path: File that receives the first ``train_split`` share of
            the shuffled lines.
        out_val_path: File that receives the remaining lines.
        train_split: Fraction of lines (0..1) written to the training file.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used.

    Raises:
        ValueError: If ``train_split`` is outside the range 0..1.
    """
    if not 0.0 <= train_split <= 1.0:
        raise ValueError(f"train_split must be between 0 and 1, got {train_split}")

    # Read everything first so the input is closed before any output is opened.
    with open(path, "r") as fp:
        lines = fp.readlines()

    # A final line without a newline would be fused with its neighbour once
    # the order changes, so make every line newline-terminated.
    shuffled_lines = [ln if ln.endswith("\n") else ln + "\n" for ln in lines]

    # shuffle the lines to make sure that the "train questions" cover most of the context
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled_lines)

    split_idx = int(train_split * len(shuffled_lines))
    with open(out_train_path, "w") as out_fp:
        out_fp.writelines(shuffled_lines[:split_idx])

    with open(out_val_path, "w") as out_fp:
        out_fp.writelines(shuffled_lines[split_idx:])

In [13]:
# Split the filtered QA pairs into training and validation files, using the
# function's default train fraction.
split_train_val(
    path="data/qa_pairs_2.jsonl",
    out_train_path="data/qa_pairs_train.jsonl",
    out_val_path="data/qa_pairs_val.jsonl",
)

# Format into Training Data

In [15]:
# TODO: try with different system prompts
system_prompt = {
    "role": "system",
    "content": "Provide answers to questions based on the company handbook to help employees quickly find the information they need. Ensure that your responses are concise and directly address the questions asked without providing additional information.",
}

# Turn each training QA pair into one chat-style record,
# {"messages": [system, user, assistant]}, written as one JSON object per line.
# The `with` block closes both files, so the output is fully flushed to disk
# before anything else reads it.
with open("data/qa_pairs_train.jsonl", "r") as fp, open(
    "data/qa_pairs_openai.jsonl", "w"
) as out_fp:
    for line in fp:
        qa_pair = json.loads(line)
        user_prompt = {"role": "user", "content": qa_pair["query"]}
        assistant_prompt = {"role": "assistant", "content": qa_pair["response"]}
        out_dict = {
            "messages": [system_prompt, user_prompt, assistant_prompt],
        }
        out_fp.write(json.dumps(out_dict) + "\n")