<a href="https://colab.research.google.com/github/neomatrix369/learning-path-index/blob/advanced-rag/LPI_QA_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook demonstrates
* Authenticating a Colab notebook to a GCP account (with a specific project ID and region)
* Prompting Gemini models via Vertex AI
* Generating question-answer (QA) pairs from a text file sent together with the prompt


# Install required packages

In [None]:
%%time
# Install the packages imported by the following cell: google-cloud-aiplatform,
# google-colab and loguru. `-q` keeps pip's output short.
# NOTE(review): no versions are pinned, so a fresh run may resolve to newer
# releases than the ones this notebook was written against — consider pinning.
!pip install --upgrade -q google-cloud-aiplatform
!pip install -q google-colab
!pip install -q  loguru

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCPU times: user 180 ms, sys: 24.8 ms, total: 205 ms
Wall time: 22.4 s


# Import required packages

In [None]:
from google.colab import userdata
from google.colab import auth
from google.cloud import aiplatform
from pathlib import Path
import os
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part
from loguru import logger
import json
import time

# Authenticate to GCP account

In [None]:
%%time
# Authenticate this Colab session against Google Cloud, then echo the active
# gcloud account, project and region so the setup can be checked in the output.
# PROJECT_ID and REGION are module-level settings defined here.
PROJECT_ID = "generated-mote-434518-u9" # Set this to a valid project id in your google cloud account
REGION = 'us-central1' # Set this to the GCP region you want to use
auth.authenticate_user(project_id=PROJECT_ID) # First time, this might prompt you to login to your google cloud account
!gcloud config get core/account # This will validate if your authentication was successful and echo the login email
!gcloud config get-value project # Verify by echo'ing the project config
!gcloud config set compute/region {REGION} # Set the region
!gcloud config get-value compute/region # Verify by echo'ing the region config

hirejorgecampos@gmail.com
generated-mote-434518-u9
Updated property [compute/region].
us-central1
CPU times: user 405 ms, sys: 50.5 ms, total: 455 ms
Wall time: 29.1 s


# **Send a prompt and Text File To Gemini 1.5**

---



## Creating the Prompt and system instructions


In [None]:
# User prompt sent together with the course document on every request. It asks
# for 10 question-answer pairs and shows one example object whose keys are
# "Question" and "Answer" (capitalised) — downstream parsing must accept that casing.
prompt = """Using the following list of course files, generate a dataset of 10 question-answer pairs in JSON format. Each question should reflect a student\'s perspective, seeking courses to learn machine learning and AI concepts at various proficiency levels. Include questions from beginners, intermediates, and advanced learners interested in areas like LLMs, NLP, RAG pipelines, fine-tuning models, and vector databases. Ensure the answers provide a tailored set of recommended courses that match the student\'s learning objectives and skill level.
The format for each question-answer pair should be as follows:
{"Question":"I am a beginner and want to learn about NLP. Which courses should I take to understand the fundamentals?","Answer":"Given your beginner level and interest in NLP, I recommend the following courses and modules:\\n\\n| Course/Module | Source | Level | Duration (Estimate) | Keywords | Reason |\\n|---------------|--------|-------|----------------------|----------|--------|\\n| **NLP Basics** | Coursera | Beginner | 4-6 weeks | NLP, text preprocessing, tokenization, embeddings | Covers foundational NLP techniques for building text processing models. |\\n| **Python for Data Science** | DataCamp | Beginner | 4-8 weeks | Python, data manipulation, basic ML | Essential for understanding coding basics required in NLP workflows. |\"}"""

# System instruction passed to the model. The opening sentence previously read
# "You task is to create a syntenic ..."; the typos are fixed so the model is
# told unambiguously to build a *synthetic* dataset.
LLM_instructions = """Your task is to create a synthetic conversational dataset from a dataset containing Machine Learning courses from various sites.
Diversity in Questions:
Include questions at varying skill levels (beginner, intermediate, advanced).
Address specific interests (e.g., NLP, fine-tuning LLMs, vector databases, retrieval techniques).
Incorporate scenarios like wanting to focus on a specific machine learning concept or pipeline.
Answer Structure:
Use a table to list multiple recommended courses/modules.
For each course, include:
Course/Module Name
Source (e.g., Coursera, edX, official documentation)
Level (Beginner, Intermediate, Advanced)
Duration Estimate (e.g., 4-6 weeks, 10 hours)
Keywords (relevant topics covered)
Reason (brief explanation of why the course fits the student’s need)
Ensure Answer Completeness:
For each answer, list at least 3-5 relevant courses that progressively cover the required concepts.
Each course/module should be aligned with the student’s stated goals and background level.
JSON Output Format:
The final output should be a JSON array containing \'x\' amount of qa pairs asked by the user"""

# Generation settings forwarded unchanged to `send_message`.
generation_config = {
    "max_output_tokens": 8192,
    "temperature": 0.5,
    "top_p": 1,
}

## Creating a function to check the dataset size

In [None]:
def size_of_file(file: str) -> int:
    """
    Count the lines of a JSONL file (one line per row).

    Args:
        file (str): Path of the file to inspect.

    Returns:
        int: Number of lines in the file, or 0 when the path does not exist
        (in which case a "not found" message is printed).
    """
    if os.path.exists(file):
        # Every line counts as one row, blank lines included.
        with open(file, "r") as handle:
            return sum(1 for _ in handle)

    print(f"File '{file}' not found.")
    return 0


## Main generation function and call to Vertex AI


In [None]:
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part
from loguru import logger
import json
import time
def _strip_code_fences(text: str) -> str:
    """Remove Markdown code-fence lines (``` / ```json / ```jsonl) from a model reply."""
    kept = [line for line in text.splitlines() if not line.strip().startswith("```")]
    return "\n".join(kept).strip()


def _parse_qa_items(text: str) -> list:
    """Extract the JSON objects (q-a pairs) contained in a model reply.

    The reply is first parsed as one JSON document (a single object or an array
    of objects, possibly pretty-printed over many lines). If that fails, each
    line is parsed on its own (JSONL style); malformed lines are logged and
    skipped. Values that are not JSON objects are ignored.
    """
    cleaned = _strip_code_fences(text or "")
    if not cleaned:
        return []

    try:
        whole = json.loads(cleaned)
    except json.JSONDecodeError:
        whole = None
    if isinstance(whole, dict):
        return [whole]
    if isinstance(whole, list):
        return [entry for entry in whole if isinstance(entry, dict)]

    items = []
    for raw_line in cleaned.splitlines():
        # Tolerate array punctuation left over from a partially valid JSON array.
        candidate = raw_line.strip().rstrip(',')
        if not candidate or candidate in ("[", "]"):
            continue
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError as e:
            logger.error(f"Error decoding JSON: {e}. Skipping this q-a set: {candidate}")
            continue
        if isinstance(parsed, dict):
            items.append(parsed)
        else:
            logger.error(f"Expected a JSON object. Skipping this q-a set: {candidate}")
    return items


def multiturn_generate_content(
    output_path:str,
    num_sets:int,
    max_empty_iterations:int = 3,
    ) -> "GenerationResponse | None":
    """
    Generate a q-a dataset with a Gemini model and append it to a JSONL file.

    The model is called repeatedly (a fresh chat per call, so no history is
    re-sent) until at least `num_sets` q-a pairs have been written. Every valid
    pair of the last reply is kept, so the file may end up with more than
    `num_sets` new rows.

    Besides its arguments, the function reads these module-level names:
    `document` (the Part to send), `prompt`, `LLM_instructions`,
    `generation_config`, `PROJECT_ID` and `REGION`.

    Args:
        output_path (str): Path of the JSONL file to append to.
        num_sets (int): Minimum number of q-a pairs to generate.
        max_empty_iterations (int): Stop after this many consecutive replies
            that yield no valid q-a pair, instead of calling the API forever.

    Returns:
        The last model response, or None when no request was made
        (`num_sets` <= 0).
    """
    vertexai.init(project=PROJECT_ID, location=REGION)
    model = GenerativeModel(
        "gemini-1.5-pro-001",
        system_instruction=[LLM_instructions]
    )

    response = None  # stays None when the loop body never runs
    dataset_size = 0
    iteration = 0
    empty_streak = 0
    start_time = time.time()

    # Loop until the dataset size is num_sets
    while dataset_size < num_sets:
        chat = model.start_chat(response_validation=False)
        response = chat.send_message(
            [document, prompt],
            generation_config=generation_config,
        )
        iteration += 1

        # With response validation disabled a blocked/empty reply can still come
        # back; reading `.text` then raises ValueError.
        try:
            reply_text = response.text
        except ValueError as e:
            logger.error(f"Model returned no usable text: {e}")
            reply_text = ""

        q_a_data = _parse_qa_items(reply_text)
        logger.info(f'Iteration {iteration}, Amount of q-a sets generated: {len(q_a_data)} ')

        if not q_a_data:
            empty_streak += 1
            if empty_streak >= max_empty_iterations:
                logger.error(
                    f"No valid q-a sets in {empty_streak} consecutive responses. Stopping early."
                )
                break
            continue
        empty_streak = 0

        with open(output_path, "a", encoding="utf-8") as f:
            logger.debug('Appending data into file.....')
            for json_item in q_a_data:
                json.dump(json_item, f)
                f.write("\n")  # one JSON object per line (JSONL)
                dataset_size += 1
                logger.info(f"Successfully processs Q-A set: {json_item}")

    time_taken = time.time() - start_time
    logger.info(f"Final results \n dataset size: {dataset_size}\n time elapse: {time_taken}")

    # TODO: for long inputs, chunk the document (e.g. per chapter) and generate
    # q-a sets per chunk, since content in the middle of a long prompt tends to
    # be under-used by the model.
    return response


from pathlib import Path

# Folder holding the source documents to generate q-a pairs from
folder_path = Path('/content/qa-data')

# JSONL file every generated q-a pair is appended to
output_path = "/content/QA_LPI.jsonl"

# Record the start time
start_time = time.time()

# Only regular files are processed (directories such as `.ipynb_checkpoints`
# would make the read fail); sorting makes the processing order reproducible.
input_files = sorted(path for path in folder_path.iterdir() if path.is_file())

for file_path in input_files:

    logger.info(f'Processing file: {file_path.name}')

    # `multiturn_generate_content` reads this module-level `document`.
    # `Part.from_data` takes the raw bytes directly; `read_bytes` also closes
    # the file, which the previous `open(...).read()` did not.
    document = Part.from_data(
        data=file_path.read_bytes(),
        mime_type="text/plain", # change to application/pdf to process pdfs
    )
    multiturn_generate_content(output_path, num_sets=50)

    # Elapsed time is cumulative since the start of the whole run
    elapsed_time = time.time() - start_time
    logger.info(f"\nFinal results \n dataset size: {size_of_file(output_path)}\n time elapse: {elapsed_time:.2f} Seconds")

[32m2024-10-27 17:21:20.342[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 125>[0m:[36m127[0m - [1mProcessing file: LPI_index.txt[0m
[32m2024-10-27 17:22:51.908[0m | [1mINFO    [0m | [36m__main__[0m:[36mmultiturn_generate_content[0m:[36m48[0m - [1mIteration 1, Amount of q-a sets generated: 43 [0m
[32m2024-10-27 17:22:51.911[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mmultiturn_generate_content[0m:[36m56[0m - [34m[1mAppending data into file.....[0m
[32m2024-10-27 17:22:51.913[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36mmultiturn_generate_content[0m:[36m71[0m - [31m[1mError decoding JSON: Expecting value: line 1 column 1 (char 0). Skipping this q-a set[0m
[32m2024-10-27 17:22:51.915[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36mmultiturn_generate_content[0m:[36m71[0m - [31m[1mError decoding JSON: Expecting value: line 1 column 2 (char 1). Skipping this q-a set[0m
[32m2024-10-27 17:22:51.919[0m | [31m[1mERROR

## Data Postprocessing


In [None]:
import json
from loguru import logger


# Convert a question/answer JSONL file into instruction/context/output records.

# Open the original JSONL file and read the contents
with open("/content/Q_A (1).jsonl", "r") as f:
    lines = f.readlines()

# Create a new list to store the modified lines
modified_lines = []

# Loop through each line, parse it as JSON, modify, and store it
for line in lines:
    # Blank lines are not records
    if not line.strip():
        continue

    try:
        # Parse the line into a dictionary
        data = json.loads(line)

        # Accept both lower-case and capitalised keys; the generation prompt's
        # example uses "Question" / "Answer".
        question_key = "question" if "question" in data else "Question"
        answer_key = "answer" if "answer" in data else "Answer"

        # Modify the dictionary
        new_data = {
            "instruction": data[question_key],  # Change 'question' to 'instruction'
            "context": "",                      # Add a blank 'context' field
            "output": data[answer_key]          # Use the normalized 'answer' or 'Answer'
        }

    except json.JSONDecodeError as e:
        # A malformed line must not abort the whole conversion
        logger.warning(f"Skipping entry that is not valid JSON ({e}): {line}")
        continue

    except (KeyError, TypeError) as e:
        # Missing question/answer key, or the line is not a JSON object
        logger.warning(f"Skipping entry due to missing or invalid key ({e}): {line}")
        continue

    # Convert the modified dictionary back to a JSON-formatted string
    modified_lines.append(json.dumps(new_data))

# Write the modified lines to a new JSONL file
with open("/content/modified_Q_A.jsonl", "w") as f:
    for line in modified_lines:
        f.write(line + "\n")

logger.info("File transformation complete.")

In [None]:
import json

# Rewrite the generated dataset with normalised lower-case "question"/"answer" keys.

# Open the generated JSONL file and read the contents
with open("/content/QA_LPI.jsonl", "r") as f:
    lines = f.readlines()

# Open the output once, in write mode, so re-running the cell replaces the file
# instead of appending duplicate rows.
with open("/content/new_CAD_QA.jsonl", "w") as out_f:
    for line in lines:
        # Blank lines are not records
        if not line.strip():
            continue

        try:
            data = json.loads(line)

            # Accept both lower-case and capitalised keys; the generation
            # prompt's example uses "Question" / "Answer".
            question_key = "question" if "question" in data else "Question"
            answer_key = "answer" if "answer" in data else "Answer"

            new_data = {
                "question": data[question_key],
                "answer": data[answer_key],
            }

        except json.JSONDecodeError as e:
            logger.warning(f"Skipping entry that is not valid JSON ({e}): {line}")
            continue

        except (KeyError, TypeError) as e:
            # Missing question/answer key, or the line is not a JSON object
            logger.warning(f"Skipping entry due to missing or invalid key ({e}): {line}")
            continue

        json.dump(new_data, out_f)
        out_f.write("\n")  # Write a newline character to ensure each entry starts on a new line
