Generate questions for AWS SA exam

In [65]:
import requests
from dotenv import load_dotenv
import pymongo
import time
import uuid
import sys
import os
import importlib
import re
import json

# Get the current working directory of the notebook
notebook_dir = os.getcwd()
# Get the path to the parent directory
parent_dir = os.path.dirname(notebook_dir)

# Add the parent directory to sys.path if it's not already there, so that
# modules located one level above the notebook become importable.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# NOTE: keep this import after the sys.path update above -- presumably
# const.py lives in the parent directory (confirm against the repo layout).
import const
# Uncomment to pick up edits to const.py without restarting the kernel:
#importlib.reload(const)

In [56]:
# Load variables from a .env file into the process environment.
load_dotenv() 
# URL that post_request_generative_ai() sends its POST requests to.
# Raises KeyError right here if GENERATIVE_URI is not set.
GENERATIVE_URI = os.environ['GENERATIVE_URI']

In [57]:
# Create the MongoDB client from the connection string in the DB_URI
# environment variable (KeyError if it is not set).
db_client = pymongo.MongoClient(os.environ['DB_URI'])
# Database and collection used by upsert_questions() to store questions.
db = db_client['db_certificates']
collection = db['tb_aws_sa']

In [58]:
# Total question count that each domain's "percent" share is applied to
# in get_questions_each_domain().
TOTAL_QUESTIONS = 65
# Exam domains to generate questions for. Keys of each entry:
#   "name":    inserted into the prompt as "Design <name> architecture"
#   "percent": share of TOTAL_QUESTIONS, read by get_questions_each_domain()
#   "num":     number of questions requested per prompt in
#              generate_list_of_questions()
# Only "Secure" is currently enabled; the remaining entries are commented out.
DOMAINS = [
    {
        "name": "Secure",
        "percent": 30,
        "num": 2
    },
    # {
    #     "name": "Resilient",
    #     "percent": 26,
    #     "num": 16
    # },
    # {
    #     "name": "High-Performing",
    #     "percent": 24,
    #     "num": 15
    # },
    # {
    #     "name": "Cost-Optimized",
    #     "percent": 20,
    #     "num": 15
    # }
]

In [59]:
# Determine how many questions fall into each domain
def get_questions_each_domain():
    """Print the question count for every entry in DOMAINS.

    Each count is the entry's ``percent`` share of TOTAL_QUESTIONS,
    truncated to a whole number by ``int``. Nothing is returned.
    """
    shares = (
        int(entry['percent'] * TOTAL_QUESTIONS / 100)
        for entry in DOMAINS
    )
    for share in shares:
        print(share)
#test
#get_questions_each_domain()

In [60]:
#Send a POST request
def post_request_generative_ai(text_prompt, timeout=60):
    """Send a prompt to the generative AI endpoint and return its JSON reply.

    Args:
        text_prompt (str): Prompt text. A sentence asking for a structured
            JSON answer is appended before sending.
        timeout (float or tuple): Seconds to wait for the server, passed
            straight to ``requests.post``. Without it a stalled connection
            would block the notebook indefinitely.

    Returns:
        The decoded JSON body of the response (normally a dict). If the
        request fails or the body is not valid JSON, the exception is
        printed and ``{'error': <exception>}`` is returned instead.
    """
    headers = {'Content-Type': 'application/json'}
    payload = {
        "contents": [
            {
                "parts": [
                    {"text": text_prompt + " Please provide a response in a structured JSON format."},
                    # Alternative instruction kept for reference:
                    # {"text": text_prompt + " Please provide a plain string response."},
                ]
            }
        ]
    }

    try:
        response = requests.post(
            GENERATIVE_URI, headers=headers, json=payload, timeout=timeout
        )
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # RequestException covers connection, timeout and HTTP-layer errors;
        # ValueError covers a body that cannot be decoded as JSON.
        print(e)
        return {'error': e}

In [61]:
#insert questions first
def upsert_questions(domain, question):
    """Store a question unless a document with the same text already exists.

    The existence check and the insert are performed as one atomic
    ``update_one(..., upsert=True)`` call, so two concurrent callers cannot
    both insert the same question. ``$setOnInsert`` only applies when a new
    document is created; an existing document is left untouched.

    Args:
        domain: Domain name stored in the new document's ``domain`` field.
        question: Question text; used both as the lookup key and as the
            ``question`` field of the new document.

    Returns:
        None. Prints ``Inserted new doc`` when a document was created.
    """
    result = collection.update_one(
        {'question': question},
        {
            '$setOnInsert': {
                'uuid': const.generate_random_uuid(),
                'domain': domain,
            }
        },
        upsert=True,
    )
    # upserted_id is only set when the upsert actually created a document.
    if result.upserted_id is not None:
        print('Inserted new doc')

In [66]:
# Matches a ```json ... ``` fenced block. Whitespace around the payload
# (including CRLF line endings) and the case of the "json" tag are ignored.
_JSON_FENCE_RE = re.compile(r'```json\s*(.*?)\s*```', re.DOTALL | re.IGNORECASE)


def parse_candidate_content(data):
    """
    Parses the content from a dictionary in the 'candidates' list.
    Looks at the first non-empty 'text' part and tries to decode JSON
    from it, either from a ```json fenced block or from the text itself.

    Args:
        data (dict): A dictionary representing an item in the 'candidates' list.

    Returns:
        dict or list or str or None:
            - The decoded JSON value when the text contains a ```json fenced
              block that decodes successfully.
            - The decoded dict/list when the text has no fence but is itself
              a JSON object or array.
            - The stripped raw text when no JSON can be decoded (a decoding
              error inside a fenced block is printed first).
            - None if the expected structure is not found or no part
              carries non-empty text.
    """
    if not isinstance(data, dict):
        return None
    content = data.get('content')
    if not isinstance(content, dict):
        return None
    parts = content.get('parts')
    if not isinstance(parts, list):
        return None

    for part in parts:
        if not isinstance(part, dict):
            continue
        text = part.get('text')
        # Skip parts whose 'text' is missing or not a string instead of
        # failing on .strip().
        if not isinstance(text, str):
            continue
        text_content = text.strip()
        if not text_content:
            continue

        fenced = _JSON_FENCE_RE.search(text_content)
        payload = fenced.group(1) if fenced else text_content
        try:
            parsed = json.loads(payload)
        except json.JSONDecodeError as e:
            # Only a broken fenced block is worth reporting; plain prose
            # that is not JSON is an expected case.
            if fenced:
                print(f"Error decoding JSON: {e}")
            return text_content  # Return the raw text in case of an error

        # Un-fenced text is only treated as JSON when it is an object or an
        # array, so plain answers such as "42" or "true" stay strings.
        if fenced or isinstance(parsed, (dict, list)):
            return parsed
        return text_content

    return None

def extract_questions_from_candidates(response_data):
    """
    Returns the 'questions' list of the first candidate that has one.

    Every item of ``response_data['candidates']`` is passed through
    parse_candidate_content(); the first result that is a dict holding
    a list under 'questions' wins.

    Args:
        response_data (dict): The dictionary containing the response data.

    Returns:
        list or None: The list of questions from the first matching
                     candidate, or None when the response is malformed
                     or no candidate yields such a list.
    """
    if not isinstance(response_data, dict):
        return None

    candidates = response_data.get('candidates')
    if not isinstance(candidates, list):
        return None

    for entry in candidates:
        payload = parse_candidate_content(entry)
        if not isinstance(payload, dict):
            continue
        question_list = payload.get('questions')
        if isinstance(question_list, list):
            return question_list

    return None

# Example usage with your provided data:
#response_data = {'candidates': [{'content': {'parts': [{'text': '```json\n{\n  "questions": [\n    {\n      "question": "Your company is launching a new e-commerce platform on AWS.  The platform will handle sensitive customer data, including credit card information and personally identifiable information (PII). You need to design a secure architecture that meets PCI DSS compliance requirements.  Describe a comprehensive approach to securing the application, covering data at rest, data in transit, and access control. Consider the use of specific AWS services and explain your rationale for choosing them.  Focus on practical implementation details rather than just naming services.",\n      "topics": ["Data Security", "PCI DSS Compliance", "IAM", "KMS", "VPC", "Security Groups", "WAF", "S3", "Encryption"],\n      "difficulty": "Hard"\n    },\n    {\n      "question": "A client has an existing on-premises application that needs to be migrated to AWS. The application interacts with a legacy database that contains highly sensitive customer records. During the migration, you must ensure that the database remains secure and complies with data sovereignty regulations for Europe (GDPR).  How would you design the migration strategy to minimize downtime and maintain security throughout the process?  Be specific about your choices of AWS services and how you will address data encryption, network security, and compliance.  
Consider potential challenges and mitigation strategies.",\n      "topics": ["Data Sovereignty", "GDPR", "Database Migration", "VPN", "Direct Connect", "RDS", "Database Encryption", "IAM Roles", "Network Security", "Disaster Recovery"],\n      "difficulty": "Medium"\n    }\n  ]\n}\n```\n'}], 'role': 'model'}, 'finishReason': 'STOP', 'avgLogprobs': -0.3347730217091848}], 'usageMetadata': {'promptTokenCount': 55, 'candidatesTokenCount': 341, 'totalTokenCount': 396, 'promptTokensDetails': [{'modality': 'TEXT', 'tokenCount': 55}], 'candidatesTokensDetails': [{'modality': 'TEXT', 'tokenCount': 341}]}, 'modelVersion': 'gemini-1.5-flash'}

In [68]:
def generate_list_of_questions():
    """Request practice questions for every entry in DOMAINS and print them.

    For each domain a prompt is sent through post_request_generative_ai().
    The raw response is printed, followed by the questions that
    extract_questions_from_candidates() finds in it (or a notice that
    none were found). The text of the first candidate is then cleaned up
    for storage; the actual upsert call is currently disabled.

    Finally prints the number of domains whose response contained usable
    candidate text. Returns None.
    """
    no_of_questions = 0
    for domain in DOMAINS:
        #domain['num'] = 1 #testing
        text_prompt = (
            f"Generate {domain['num']} multiple-choice questions for me to practice "
            "AWS certified solution architect - associate exam in the domain: "
            f"Design {domain['name']} architecture. I want questions that focus more "
            "on real-world scenarios and problem-solving rather than just definitions "
            "of services."
        )
        raw_generated_text = post_request_generative_ai(text_prompt)
        print(raw_generated_text)
        questions = extract_questions_from_candidates(raw_generated_text)
        if questions:
            print("Extracted Questions:")
            for q in questions:
                print(q)
        else:
            print("No questions found in the parsed content.")

        # The response may be an error payload, have an empty 'candidates'
        # list, or a candidate without content/parts/text. Skip the domain
        # in those cases instead of raising on the chained lookup.
        try:
            question = raw_generated_text['candidates'][0]['content']['parts'][0]['text']
        except (KeyError, IndexError, TypeError):
            continue
        if not isinstance(question, str):
            continue
        question = question.replace('  ', ' ').replace('\n', '')
        #upsert_questions(domain['name'], question)
        no_of_questions += 1
    print(no_of_questions)
# Test run: sends one live request per entry in DOMAINS and prints the results.
generate_list_of_questions()

{'candidates': [{'content': {'parts': [{'text': '```json\n{\n  "questions": [\n    {\n      "question": "Your company is migrating a legacy on-premises application to AWS. This application processes sensitive customer data, including Personally Identifiable Information (PII) and financial data.  It requires high availability and needs to comply with PCI DSS standards. Which of the following architectural designs best addresses these requirements?",\n      "options": [\n        "A. Deploy the application on EC2 instances in a single Availability Zone, using security groups for network access control and encrypting data at rest with EBS encryption.",\n        "B. Deploy the application on EC2 instances across multiple Availability Zones within a single Region, utilizing an Elastic Load Balancer for high availability, implementing encryption at rest and in transit with KMS and SSL/TLS, and configuring appropriate IAM roles and policies.",\n        "C. Deploy the application as a serverles

In [63]:
#export to Udemy CSV
#Question,Question Type,Answer Option 1,Explanation 1,Answer Option 2,Explanation 2,Answer Option 3,Explanation 3,Answer Option 4,Explanation 4,Answer Option 5,Explanation 5,Answer Option 6,Explanation 6,Correct Answers,Overall Explanation,Domain