In [None]:
import os
import re
import openai
import csv
from tqdm import tqdm

# Configuration
# Azure OpenAI connection settings; fill these in before running.
endpoint = ""
api_key = ""
model = "gpt-4o-mini"


# Define the directory containing the markdown files
directory = 'repo/data/markdown'

# Define the maximum number of files to process
max_files = 5000  # Change this value to the desired maximum number of files

# Initialize a counter for processed files
processed_files = 0

# CSV file output
output_csv = 'questions.csv'
# Start every run with an empty output file. Truncate via `output_csv` (not a
# repeated literal) so that changing the setting above cannot leave the script
# truncating one file while appending to another.
with open(output_csv, 'w', encoding='utf-8'):
    pass

# Shared Azure OpenAI client, created on first use. Re-run this cell after
# changing `endpoint` or `api_key` so that a fresh client is built.
_client = None


def _get_client():
    """Return the shared Azure OpenAI client, creating it on first use."""
    global _client
    if _client is None:
        _client = openai.AzureOpenAI(
            azure_endpoint=endpoint,
            api_key=api_key,
            api_version="2024-08-01-preview",
        )
    return _client


def process_question(content):
    """Ask the chat model for exactly one question about *content*.

    Args:
        content: The text section the question should be about.

    Returns:
        The generated question with surrounding whitespace removed, or an
        empty string when the response carries no message content.

    Raises:
        Whatever the OpenAI client raises for a failed request; errors are
        not caught here.
    """
    message_text = [
        {"role": "system", "content": "Du bist ein AI Assistent zur Erstellung von Fragen zu Textabschnitten. Du erhälst als content den entsprechenden Text. Die Frage sollte so gestellt werden, dass das übergreifende Thema klar aus der Frage selbst hervorgeht."},
        {"role": "user", "content": "Stelle genau eine Frage zum Inhalt dieses Textes:" + content},
    ]

    # Reuse one client for all calls instead of building a new client (and a
    # new HTTP connection pool) for every chunk.
    completion = _get_client().chat.completions.create(
        model=model,  # Ensure the model name is correct
        messages=message_text,
        max_tokens=500,
    )

    # `message.content` is optional in the API response; normalise a missing
    # value to "" so callers always receive a string.
    answer = completion.choices[0].message.content
    return answer.strip() if answer else ""

def write_to_csv(question, chunk, output_csv):
    """Append one question/chunk pair to a tab-separated file.

    Args:
        question: Text stored in the ``Question`` column.
        chunk: Source text stored in the ``Chunk`` column; every ``#``
            character is removed before writing.
        output_csv: Path of the file to append to. A ``Question``/``Chunk``
            header row is written first when the file is still empty.
    """
    with open(output_csv, mode='a', newline='', encoding='utf-8') as handle:
        tsv_writer = csv.writer(handle, delimiter='\t')
        rows = [[question, chunk.replace('#', '')]]
        # In append mode the position is the end of the file, so a position
        # of 0 means nothing has been written yet and the header is due.
        if handle.tell() == 0:
            rows.insert(0, ['Question', 'Chunk'])
        tsv_writer.writerows(rows)

def _split_into_chunks(markdown_text):
    """Split markdown text into non-empty chunks, one per heading of level 2 or deeper."""
    # Anchor the split at the start of a line so that "## " appearing in the
    # middle of a line does not split, and so that deeper headings such as
    # "### " start a chunk as a whole instead of leaving a stray "#" at the
    # end of the previous chunk.
    pieces = re.split(r'^(?=#{2,} )', markdown_text, flags=re.MULTILINE)
    # Splitting yields an empty first piece when the text starts with a
    # heading; drop empty and whitespace-only pieces so they are never sent
    # to the model.
    return [piece for piece in pieces if piece.strip()]


# Iterate through each markdown file in the directory.
# os.listdir() returns entries in arbitrary order; sorting makes the run (and
# the set of files that fits under max_files) reproducible.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.md'):  # Only markdown files are processed
        continue
    if processed_files >= max_files:
        break  # Stop processing if the maximum number of files is reached

    file_path = os.path.join(directory, filename)

    # Open and read the file
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Generate one question per chunk and append the pair to the CSV file
    for chunk in tqdm(_split_into_chunks(content), desc=filename):
        question = process_question(chunk)
        write_to_csv(question, chunk, output_csv)

    # Increment the processed files counter
    processed_files += 1