In [6]:
import os
import re
import sys
import json
import glob
# from IPython.display import display, Markdown
sys.path.append("../")

from dotenv import load_dotenv
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter
from llama_index.core import PromptTemplate
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.settings import Settings
import pandas as pd

from src.prompts import QA_GENERATION_PROMPT


# Pull settings from a .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key is supplied — confirm.
load_dotenv()

# Regex used to pull individual JSON objects out of raw LLM output.
# It matches a "{", then any run of characters that are not "}" (newlines
# included), then the first "}" it meets. Because "}" is excluded from the
# middle, an object that nests another object, or whose string values contain
# "}", is cut off at that first "}" and will not parse as JSON.
JSON_PATTERN = r'\{[^}]*\}'

def read_file(file):
    """Return the entire contents of ``file`` as a single string.

    The file is opened in text mode and decoded as UTF-8; any error raised
    by ``open`` or by decoding propagates to the caller.
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        text = handle.read()
    return text

In [2]:
# Show the prompt template that is sent to the LLM for every document.
# print() is used so the embedded newlines are rendered as line breaks.
print(QA_GENERATION_PROMPT)

## Role
You are a Teacher/Professor. Your task is to setup ** Maximum 3 questions(0, 1, 2, 3) ** for an upcoming quiz/examination. The questions should be diverse in nature across the document. Restrict the questions to the context information provided.

## How to answer
1. Output in JSON format: [{"question": "", "A": "", "B": "", "C": "", "D": "", "answer": ""}]
2. Only write questions about the details of Dataiku or machine learning. If there are no questions to write, return an empty list (e.g., '[]').
3. All answers must be in capitalized English letters, such as A, B, C, D
4. Do not start with the markdown language, such as ```json, but with list opening brackets ([).
5. Don't write questions about things you think are unimportant or trivial. ** Ask questions as conservatively as possible**. Fewer questions are better.

## Examples
### Case1
[{"question": "Which of the following is not a possible way to categorize statistical tests?", "A": "1-sample tests vs. 2-sample tests.", "B

In [3]:
# Every regular file under the Dataiku docs folder, at any depth.
dataiku_files = [file for file in glob.glob("../data/dataiku/**/*", recursive=True) if os.path.isfile(file)]

llm = OpenAI(model="gpt-4o", temperature=0.1)


def extract_qa_items(response_text):
    """Pull the question dictionaries out of one raw LLM response.

    The prompt asks for a bare JSON list, so the whole response is parsed as
    JSON first. That keeps questions whose text contains "}" intact, which
    the regex fallback cannot do. If the response is not valid JSON as a
    whole (for example it is wrapped in a markdown fence), each
    ``JSON_PATTERN`` match is parsed on its own, so one malformed fragment
    no longer discards the other questions from the same response.

    Args:
        response_text: Raw text returned by the LLM (may be empty or None).

    Returns:
        A ``(items, skipped)`` tuple: ``items`` is the list of parsed
        question dicts and ``skipped`` is the number of regex fragments that
        could not be parsed as JSON.
    """
    if not response_text:
        return [], 0

    try:
        parsed = json.loads(response_text)
    except json.JSONDecodeError:
        parsed = None

    if isinstance(parsed, list):
        return [item for item in parsed if isinstance(item, dict)], 0
    if isinstance(parsed, dict):
        return [parsed], 0

    items = []
    skipped = 0
    for fragment in re.findall(JSON_PATTERN, response_text):
        try:
            items.append(json.loads(fragment))
        except json.JSONDecodeError:
            skipped += 1
    return items, skipped


qa_dataset = []

for file in dataiku_files:
    try:
        # Both steps can fail per file: the glob also matches files that are
        # not UTF-8 text, and the API call can error or time out. A failure
        # skips this file only instead of aborting the whole run.
        contents = read_file(file)
        response = llm.complete(QA_GENERATION_PROMPT.replace("{context}", contents))
    except Exception as e:
        print(e)
        print(f"Error occurred when generating QA from {file} ")
        continue

    qa_items, skipped = extract_qa_items(response.text)

    for qa in qa_items:
        # Record the source document so each question can be traced back.
        qa.update({"file": file})
    qa_dataset.extend(qa_items)

    print(f"{len(qa_items)} QA generated from {file}")
    if skipped:
        print(f"{skipped} malformed JSON fragment(s) skipped in {file}")


0 QA generated from ../data/dataiku\other.md
3 QA generated from ../data/dataiku\accessibility\index.md
1 QA generated from ../data/dataiku\api\index.md
3 QA generated from ../data/dataiku\api\js\index.md
1 QA generated from ../data/dataiku\api\scala\index.md
3 QA generated from ../data/dataiku\apinode\api-deployment-infrastructures.md
3 QA generated from ../data/dataiku\apinode\api-documentation.md
3 QA generated from ../data/dataiku\apinode\concepts.md
2 QA generated from ../data/dataiku\apinode\deploy-anywhere.md
3 QA generated from ../data/dataiku\apinode\endpoint-dataset-lookup.md
2 QA generated from ../data/dataiku\apinode\endpoint-mlflow.md
2 QA generated from ../data/dataiku\apinode\endpoint-python-function.md
3 QA generated from ../data/dataiku\apinode\endpoint-python-prediction.md
3 QA generated from ../data/dataiku\apinode\endpoint-r-function.md
3 QA generated from ../data/dataiku\apinode\endpoint-r-prediction.md
3 QA generated from ../data/dataiku\apinode\endpoint-sql-query

In [5]:
# Sanity check: how many entries qa_dataset holds in total.
len(qa_dataset)

1978

In [8]:
# Convert the collected records to a table and persist them as CSV.
# NOTE: this rebinds qa_dataset from the collected records to a DataFrame;
# later cells that reference qa_dataset see the DataFrame.
qa_dataset = pd.DataFrame(qa_dataset)

qa_csv_path = "../data/qa_dataset/dataiku_multiple_choice_qa.csv"
# to_csv does not create missing parent folders, so make sure the target
# directory exists; otherwise the export fails on a fresh checkout and the
# results of the (expensive) generation run are not saved.
os.makedirs(os.path.dirname(qa_csv_path), exist_ok=True)
qa_dataset.to_csv(qa_csv_path, index=False)

In [9]:
# Display qa_dataset as the cell output for a visual check of the result.
qa_dataset

Unnamed: 0,question,A,B,C,D,answer,file
0,Which keyboard shortcut in Dataiku DSS is used...,G + F,G + D,G + A,G + R,A,../data/dataiku\accessibility\index.md
1,"In Dataiku DSS, which shortcut allows you to t...",SHIFT + CLICK,SPACE,SHIFT + DRAG,Z,B,../data/dataiku\accessibility\index.md
2,What is the keyboard shortcut to validate a sc...,CTRL + ENTER,ALT + A,SHIFT + V,CTRL + F,A,../data/dataiku\accessibility\index.md
3,Which of the following APIs is NOT listed as a...,Javascript API,Scala API,Python API,R API,C,../data/dataiku\api\index.md
4,Which of the following sampling methods return...,HEAD,RANDOM,FULL,RANDOM-COLUMN,A,../data/dataiku\api\js\index.md
...,...,...,...,...,...,...,...
1973,Which of the following objects can be shared i...,Applications,Dashboards,Datasets,All of the above,D,../data/dataiku\workspaces\index.md
1974,Which of the following roles in a workspace ca...,Admins,Contributors,Members,All of the above,A,../data/dataiku\workspaces\managing.md
1975,What permission is granted to everyone in a wo...,Write,Read,Execute,Delete,B,../data/dataiku\workspaces\managing.md
1976,How can a user share DSS objects into a Dataik...,From the object’s Right Pane within the projec...,Only from the object’s Right Pane within the p...,Only from the (+) button within the workspace.,By sending an email to the workspace administr...,A,../data/dataiku\workspaces\sharing-to-workspac...
