In [15]:
#!pip install anthropic

In [34]:
import os
from anthropic import Anthropic

In [17]:
import pandas as pd

In [11]:
#os.environ['ANTHROPIC_API_KEY'] = ''

In [21]:
from glob import glob

# Use ** to match all files in all subdirectories
files = glob('../data/latest-docs/**/*.md', recursive=True)

In [22]:
files = pd.Series(files).rename('fp').to_frame()

In [29]:
files['fn'] = files.fp.str.rsplit("/",n=1).str[-1]#,expand=True)

In [32]:
files['category'] = files.fp.str.split("/",expand=True)[3]

In [35]:

client = Anthropic(
    # This is the default and can be omitted
    api_key=os.environ.get("ANTHROPIC_API_KEY"),
)


In [47]:
# Prompt template
prompt_template = """
<prompt_template>
You will be acting as a coding assistant to help answer questions about the Gradio Python framework. I will provide you with a chunk of documentation about Gradio. Your task is to identify the core concepts covered in this documentation and generate questions that the documentation would be key in answering. Then, you will provide concise, reformatted answers to these questions based on the documentation.
You will only retun the jsonl question answer pair content no preamble.

Here is the chunk of Gradio documentation:
<documentation_chunk>
{{DOCUMENTATION_CHUNK}}
</documentation_chunk>

Please carefully read through the documentation chunk and identify the core concepts it covers. For each core concept, simulate a question that a user might ask where this part of the documentation would be essential to providing a good answer. Try to generate at least one question per core concept.

Once you have your list of questions, answer each one by reformatting the relevant parts of the documentation into an optimal, concise answer. Focus on capturing the key information needed to address the question, but don't simply copy and paste from the documentation - aim to rephrase things in a more accessible way.

Please return your output in JSONL format, with each line containing a JSON object representing a simulated question and answer pair. The JSON object should have a "question" field with the simulated question, and an "answer" field with the concise, reformatted answer you generated from the documentation.
</prompt_template>
"""



In [38]:
import re

def read_markdown(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def parse_markdown(content):
    # Regex pattern to match markdown headings
    heading_pattern = re.compile(r'^(#{1,6})\s+(.*)', re.MULTILINE)
    headings = []
    for match in heading_pattern.finditer(content):
        level = len(match.group(1))
        title = match.group(2).strip()
        start = match.start()
        headings.append((level, title, start))
    return headings

def get_text_chunks(content, headings):
    chunks = []
    for i in range(len(headings)):
        level, title, start = headings[i]
        if i + 1 < len(headings):
            end = headings[i + 1][2]
        else:
            end = len(content)
        chunk = content[start:end].strip()
        chunks.append((level, title, chunk))
    return chunks

def process_chunks(chunks):
    hierarchy = {1: None, 2: None, 3: None, 4: None, 5: None, 6: None}
    processed_chunks = []

    for level, title, chunk in chunks:
        hierarchy[level] = title
        upper_headings = [hierarchy[i] for i in range(1, level) if hierarchy[i] is not None]
        full_title = " > ".join(upper_headings + [title])
        processed_chunks.append((full_title, chunk))

    return processed_chunks



In [41]:
all_chunks = []
for index, row in files.iterrows():
    content = read_markdown(row.fp)
    headings = parse_markdown(content)
    chunks = get_text_chunks(content, headings)
    processed_chunks = process_chunks(chunks)
    
    for title, chunk in processed_chunks:
        all_chunks.append(f"Heading: {title}\n Content:\n{chunk}\n")
        #all_chunks.append(f"Category: {row.category}\n Heading: {title}\n Content:\n{chunk}\n") # maybe the folder name might be valueable context?
        

In [43]:
#chunk = all_chunks[0]

In [50]:
from tqdm import tqdm

In [None]:
# List to store the responses
responses = []
# Iterate through the documentation chunks
counter_incase_crash = 0

In [None]:
for chunk in tqdm(all_chunks):
    # Insert the chunk into the prompt template
    prompt = prompt_template.replace("{{DOCUMENTATION_CHUNK}}", chunk)
    
    # Send the prompt to Claude Opus
    response = client.messages.create(
    max_tokens=2048,
    messages=[
        {
            "role": "user",
            "content": prompt
        }
    ],
    model="claude-3-opus-20240229",
    )
    
    # Get the assistant's response
    assistant_response = response.content
    
    # Append the response to the list
    responses.append(assistant_response)
    counter_incase_crash+=1

In [None]:
# Print the responses
for response in responses:
    print(response)

In [63]:
progress=f'{counter_incase_crash}of{len(all_chunks)}'
progress

'306of553'

In [60]:
os.getcwd()

'/mnt/c/users/MitchellBaskerville/code/gradio-fine-tuning/notebooks'

In [105]:
header_names = pd.Series([all_chunks[i].split('\n')[0].replace('Heading: ','') for i in range(len(all_chunks))]).rename('header_names')

In [108]:
responses_df = pd.Series(responses).rename('all_data')
pd.concat([header_names,responses_df],axis=1).astype(str).to_parquet(os.getcwd() + f'/../datasets/raw/opus_doc_qas{progress}.parquet')

In [98]:
output_json_filepath = os.getcwd() + f'/../datasets/qa_pairs/opus_doc_qas_{progress}.jsonl'

In [99]:
with open(output_json_filepath, 'w') as jsonl_file:
    for index, opus_response in responses_df.items():
        for jsl in opus_response[0].text.split('\n'):
            jsl=jsl.replace('\\','').replace('"','`').replace("`question`: `",'"question": "').replace("`, `answer`: `",'", "answer": "').replace("`}",'"}')
            jsonl_file.write(jsl + '\n')