## Installed packages

```bash
pip install PyMuPDF
pip install Ollama

```

## Extract Data From PDF

In [1]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """
    Extracts text from each page of the given PDF file and returns it as a list.

    The document is opened in a ``with`` block so that it is closed even if
    text extraction raises part-way through the file.

    :param pdf_path: The path to the PDF file.
    :return: A list of strings, each containing the text from a page of the PDF,
        in page order.
    """
    # Iterating the document yields its pages in order; the context manager
    # closes the document on both the success and the error path.
    with fitz.open(pdf_path) as document:
        return [page.get_text() for page in document]

# Location of the source PDF (machine-specific absolute path; adjust as needed).
pdf_path = (
    "/Users/mosleh/Documents/VSPROJECTS/"
    "Dataset Creation from PDF/Tutorial_EDIT.pdf"
)

# Text of the PDF, one string per page.
text_data = extract_text_from_pdf(pdf_path)


# Parsing Function

In [2]:
# Collects the parsed question/answer dicts produced by the inference loop.
QA_pair_list = list()

# Tallies: loop iterations run so far, and (intended) failed generations.
counter = failed_gen_counter = 0

In [8]:
import json


def _strip_leading_colon(text):
    """Trim surrounding whitespace and a single leading ':' left over from a section header."""
    text = text.strip()
    if text.startswith(':'):
        text = text[1:].strip()
    return text


def parse_coding_question_data(data):
    """
    Split a model reply into its question and answer sections.

    The reply is expected to contain a ``**Question**`` header followed by an
    ``**Answer**`` header. Everything before the first ``**Answer**`` marker
    (with ``**Question**`` removed) becomes the question; everything after it
    becomes the answer, even if the marker appears again later in the text.
    A colon directly following either header (``**Question**: ...``) is dropped.

    :param data: The raw reply text.
    :return: A dict ``{"user": <question>, "assistant": <answer>}``, or ``None``
        (after printing a message) when ``data`` is not a string or has no
        ``**Answer**`` section.
    """
    if not isinstance(data, str):
        print(f"An error occurred: expected a string, got {type(data).__name__}")
        return None

    # partition() splits on the first marker only, so the answer keeps any
    # later occurrences of "**Answer**" instead of being cut short.
    question_part, marker, answer_part = data.partition("**Answer**")
    if not marker:
        print("An error occurred: no **Answer** section found")
        return None

    # Both sections get the same clean-up so neither starts with a stray ':'.
    question_text = _strip_leading_colon(question_part.replace("**Question**", ""))
    answer_text = _strip_leading_colon(answer_part)

    return {
        "user": question_text,
        "assistant": answer_text,
    }



# Inference Loop

In [None]:
import ollama

# Ask the model for one question/answer pair per page and collect the ones
# that parse successfully.
# NOTE(review): iteration starts at the first page. If leading pages are meant
# to be skipped, slice text_data (e.g. text_data[5:]) -- confirm intent.
for page_text in text_data:
    context = page_text

    response = ollama.chat(model='llama3.1', messages=[
        {
            'role': 'user',
            'content': f'Based upon this context{context}',
        },
        {
            'role': 'user',
            'content': 'Make up a Coding Question & Answer?',
        },
        {
            'role': 'user',
            'content': 'Have standardized Sections like **Question** and **Answer**',
        },
    ])
    reply = response['message']['content']

    result = parse_coding_question_data(reply)
    print(reply)

    if result is not None:
        print("___________________")
        QA_pair_list.append(result)
        print(result)
        print("__________")
    else:
        # The reply could not be parsed into a question/answer pair.
        failed_gen_counter += 1

    counter += 1

    # Stop after two iterations. counter is initialised outside this loop and
    # is not reset here, so the check depends on its value on entry.
    if counter == 2:
        break
      

# Saving Data

In [None]:
# Destination JSONL file: one JSON object per line
file_path = 'data.jsonl'

# Serialize each collected Q/A pair and write it as its own newline-terminated line
with open(file_path, 'w') as out_file:
    out_file.writelines(json.dumps(pair) + '\n' for pair in QA_pair_list)

In [None]:
#print("Total tries for QA generation", counter)
#print("Failed tries for QA generation", failed_gen_counter)
        
        
     
     
     