In [None]:
import json
import random

from langchain_openai import AzureOpenAIEmbeddings
from langchain_openai import AzureChatOpenAI

from synthlume.pipeline.step import (
    DescriptionStep,
    GenerateQuestionStep,
    HumanifyQuestionStep,
    ScenarioQuestionStep,
    QuestionStyleSimpleStep,
    QuestionStyleCompleteSentenseStep,
    MultipleChoiceQuestionStep,
    GenerateMulticontextQuestionStep,
    GenerateQuestionWithEnhancedContextStep
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from synthlume.pipeline.utils.kmeans_sampling import KMeansSampling
from synthlume.pipeline.utils.spectral_sampling import SpectralSampling
from synthlume.pipeline.utils.one_cluster_sampling import OneClusterSampling


from dotenv import load_dotenv
import os

In [None]:
# Load variables from a .env file (if any) before reading the settings below.
load_dotenv()

# Azure OpenAI connection settings, read from the environment.
# Each value is None when the corresponding variable is not set.
AZURE_OPENAI_KEY = os.environ.get("AZURE_OPENAI_KEY")
AZURE_DEPLOYMENT_NAME = os.environ.get("AZURE_DEPLOYMENT_NAME")
AZURE_ENDPOINT = os.environ.get("AZURE_ENDPOINT")

In [None]:
def load_and_split(path, splitter):
    """Load the PDF at ``path`` and split its documents with ``splitter``.

    Args:
        path: Location of the PDF file, passed to ``PyPDFLoader``.
        splitter: Object exposing ``split_documents``; it receives the
            documents produced by the loader.

    Returns:
        The result of ``splitter.split_documents`` for the loaded documents.
    """
    loaded = PyPDFLoader(path).load()
    return splitter.split_documents(loaded)

def generate_description(documents, llm, max_length):
    """Ask a ``DescriptionStep`` to describe the leading text of ``documents``.

    The ``page_content`` of each document is concatenated (no separator) in
    order. Accumulation stops right after the running length first exceeds
    ``max_length``, so the text sent to the step can be longer than
    ``max_length`` by up to one document's content.

    Args:
        documents: Iterable of objects with a ``page_content`` string.
        llm: Language model handed to ``DescriptionStep``.
        max_length: Soft cap, in characters, on the concatenated text.

    Returns:
        Whatever ``DescriptionStep.generate`` returns for the collected text.
    """
    describer = DescriptionStep(llm=llm, language="en")

    pieces = []
    collected_chars = 0
    for doc in documents:
        content = doc.page_content
        pieces.append(content)
        collected_chars += len(content)
        # Soft cap: the piece that crosses the limit is still included.
        if collected_chars > max_length:
            break

    return describer.generate(document="".join(pieces))

def generate_questions(
    llm,
    description,
    documents,
    output_file,
    filename,
    n_chunks,
    base_question_step=None,
):
    """Generate a base question plus styled variants for a random sample of chunks.

    For each sampled chunk the base step produces a question; that response is
    then fed to the multiple-choice, scenario, human-like, simple and
    complete-sentence steps. All responses for a chunk are collected in one
    dict, written to ``output_file`` as a single JSON line, and appended to
    the returned list. Chunks for which the base step returns ``None`` are
    skipped entirely; a variant step returning ``None`` only omits that
    variant.

    Args:
        llm: Language model handed to every pipeline step built here.
        description: Corpus description passed to the base step as
            ``description``.
        documents: Sequence of chunks exposing ``page_content`` and a
            ``metadata`` mapping containing ``"source"``. The sequence is
            NOT modified; sampling works on a copy.
        output_file: Writable text file object receiving one JSON line per
            successfully processed chunk.
        filename: Label stored under the ``"filename"`` key of every record.
        n_chunks: Maximum number of chunks to sample from ``documents``.
        base_question_step: Step whose ``generate(context=..., description=...,
            current_document=...)`` produces the base question. Defaults to
            the notebook-level ``multicontext_generation_step`` so existing
            calls keep their behavior.

    Returns:
        List of the per-chunk record dicts that were written to
        ``output_file``.

    Raises:
        NameError: If ``base_question_step`` is omitted and the notebook-level
            ``multicontext_generation_step`` has not been defined yet.
        KeyError: If a chunk's metadata has no ``"source"`` entry.
    """
    if base_question_step is None:
        # Backward-compatible default: the global step created in a later
        # notebook cell. Resolved up front so a missing global fails fast.
        base_question_step = multicontext_generation_step

    # This step is only used for its ``name``, which keys the base response in
    # each record (keeps the output format unchanged).
    question_generation_step = GenerateQuestionStep(llm=llm, language="en")
    scenario_question_step = ScenarioQuestionStep(llm=llm, language="en")
    humanify_question_step = HumanifyQuestionStep(llm=llm, language="en")
    question_style_simple_step = QuestionStyleSimpleStep(llm=llm, language="en")
    complete_sentence_step = QuestionStyleCompleteSentenseStep(llm=llm, language="en")
    multiple_choice_step = MultipleChoiceQuestionStep(llm=llm, language="en")

    base_key = question_generation_step.name

    # (step, label) pairs in the order the variants are generated and logged.
    variant_steps = [
        (multiple_choice_step, "multiple choice"),
        (scenario_question_step, "scenario"),
        (humanify_question_step, "human-like"),
        (question_style_simple_step, "simple"),
        (complete_sentence_step, "complete sentence"),
    ]

    # Shuffle a copy: random.shuffle works in place and would otherwise
    # reorder the caller's list as a hidden side effect.
    sampled = list(documents)
    random.shuffle(sampled)
    sampled = sampled[:n_chunks]

    results = []

    for i, document in enumerate(sampled):
        print(f"Chunk {i+1}/{len(sampled)}")

        inputs = {
            "context": document.page_content,
            "description": description,
            "current_document": document.metadata["source"],
        }
        calls = {"input": inputs}

        base_response = base_question_step.generate(**inputs)
        if base_response is None:
            print("Could not generate question, skipping")
            continue

        calls[base_key] = base_response
        print(f"Base generated question: {base_response['question']}")

        for step, label in variant_steps:
            response = step.generate(**base_response)
            if response is None:
                print(f"Could not generate {label} question, skipping")
                continue

            calls[step.name] = response
            print(f"{label.capitalize()} generated question: {response['question']}")

            if step is multiple_choice_step:
                # The correct answer is always listed first, as option A.
                print(f"\tA) {response['answer']}")
                print(f"\tB) {response['wrong_answer_1']}")
                print(f"\tC) {response['wrong_answer_2']}")
                print(f"\tD) {response['wrong_answer_3']}")

        print()

        calls["filename"] = filename

        output_file.write(json.dumps(calls) + "\n")

        results.append(calls)

    return results

In [None]:

base_path = "../data/papers"
base_path_exclude = "../data/research_papers/exclude"
base_path_focus = "../data/research_papers/focus"

# os.listdir returns entries in arbitrary order, so sort them to keep the
# document order (and everything derived from it) stable between runs.
# The extension check is case-insensitive so "paper.PDF" is picked up too.
pdfs = [
    os.path.join(base_path, filename)
    for filename in sorted(os.listdir(base_path))
    if filename.lower().endswith(".pdf")
]

# Optional corpora for the samplers' documents_exclude / documents_focus
# arguments; uncomment to enable.
#pdfs_exclude = [
#    os.path.join(base_path_exclude, filename)
#    for filename in os.listdir(base_path_exclude)
#    if filename.endswith(".pdf")
#]

#pdfs_focus = [
#    os.path.join(base_path_focus, filename)
#    for filename in os.listdir(base_path_focus)
#    if filename.endswith(".pdf")
#]



print(f"Discovered {len(pdfs)} PDFs")

In [None]:
# Character-based splitting (length_function=len): chunk_size and
# chunk_overlap are measured in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2048,
    chunk_overlap=256,
    length_function=len,
    is_separator_regex=False,
)

# Flatten the per-PDF chunk lists in one pass; sum(lists, []) would re-copy
# the accumulated list for every PDF.
all_documents = [
    document
    for pdf in pdfs
    for document in load_and_split(pdf, text_splitter)
]
#all_documents_exclude = sum([load_and_split(pdf, text_splitter) for pdf in pdfs_exclude], [])
#all_documents_focus = sum([load_and_split(pdf, text_splitter) for pdf in pdfs_focus], [])

# Single source of truth for the API version shared by both Azure clients.
AZURE_OPENAI_API_VERSION = "2024-05-01-preview"

embeddings = AzureOpenAIEmbeddings(
    openai_api_key=AZURE_OPENAI_KEY,
    azure_endpoint=AZURE_ENDPOINT,
    azure_deployment="embedding",
    openai_api_version=AZURE_OPENAI_API_VERSION,
)

llm = AzureChatOpenAI(
    openai_api_key=AZURE_OPENAI_KEY,
    azure_endpoint=AZURE_ENDPOINT,
    openai_api_version=AZURE_OPENAI_API_VERSION,
    deployment_name=AZURE_DEPLOYMENT_NAME,
    temperature=0.9,
)


In [None]:
# Step used by generate_questions to produce the base question for each chunk
# (generate_questions calls its .generate method).
multicontext_generation_step = GenerateQuestionWithEnhancedContextStep(
    llm=llm,
    embeddings=embeddings,
    documents=all_documents,
    language="en",
    n_documents=4,
    # NOTE(review): presumably bounds on embedding distance/similarity used to
    # pick the extra context documents — confirm against the synthlume docs.
    min_distance=0.85,
    max_distance=0.98,
)

In [None]:
# K-means based sampler over all chunks; no exclude/focus corpora are used.
chunk_sampler = KMeansSampling(
    documents=all_documents,
    documents_exclude=None,
    documents_focus=None,
    embeddings=embeddings,
    sample_size=50,
    n_clusters=[2, 3, 4, 6, 8, 10],
    n_clusters_criterion="BIC",
    # sample_size_for_BIC=50,
    sampling_method="stratified_sequence",
)

# Show the metadata of every chunk the sampler yields.
chunk_iterator = iter(chunk_sampler)
for chunk in chunk_iterator:
    print(chunk.metadata)



# Spectral-clustering based sampler over all chunks.
chunk_sampler = SpectralSampling(
    documents=all_documents,
    documents_exclude=None,
    documents_focus=None,
    embeddings=embeddings,
    sample_size=50,
    n_clusters=[2, 3, 4, 6, 8, 10],
    knn_dim=[10, 20, 30, 40, 50, 60],
    n_clusters_criterion="eigen_gap",
    sampling_method="cluster_sequence",
)

chunk_iterator = iter(chunk_sampler)

# Keep only the sampled chunks whose metadata marks them as cluster 0.
cluster_chunks = [
    sampled_chunk
    for sampled_chunk in chunk_iterator
    if sampled_chunk.metadata["cluster_id"] == 0
]

description = generate_description(cluster_chunks, llm, max_length=4097)

# Append mode: records from earlier runs stay in the file.
with open("questions_cluster.jsonl", "a") as output_file:
    results = generate_questions(
        llm=llm,
        description=description,
        documents=cluster_chunks,
        output_file=output_file,
        filename="cluster",
        n_chunks=5,
    )
    print(f"Generated {len(results)} questions for cluster")


# Sampler built from all chunks with only a sample size configured.
chunk_sampler = OneClusterSampling(documents=all_documents, sample_size=50)

# Show the metadata of every chunk the sampler yields.
chunk_iterator = iter(chunk_sampler)
for chunk in chunk_iterator:
    print(chunk.metadata)