Generate questions for AWS SA exam

In [1]:
import requests
from dotenv import load_dotenv
import pymongo
import time
import uuid
import sys
import os
import importlib
import re
import json
import csv

# The notebook's own directory is taken to be the current working directory.
notebook_dir = os.getcwd()
# One level up from the notebook directory.
parent_dir = os.path.dirname(notebook_dir)

# Put the parent directory on the module search path exactly once, so that
# re-running this cell does not keep appending the same entry.
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)

import const
#importlib.reload(const)    #if we update the file const.py

In [2]:
# Load variables from a .env file into the process environment.
load_dotenv() 
# URL that post_request_generative_ai() sends its POST requests to.
# Indexing os.environ directly means a missing variable fails here with KeyError.
GENERATIVE_URI = os.environ['GENERATIVE_URI']

In [3]:
# Open a MongoDB client using the connection string stored in DB_URI.
db_client = pymongo.MongoClient(os.environ['DB_URI'])
db = db_client['db_certificates']
# Collection the functions in this notebook read questions from and write them to.
collection = db['tb_aws_sa']

In [4]:
# Empty placeholder; nothing in the visible part of this notebook reads or fills it.
AWS_SERVICES = []

In [5]:
# Number of questions in one complete practice exam.
TOTAL_QUESTIONS = 65

# Exam domains.
#   name    - domain label, inserted into the generation prompt and stored on each question
#   percent - weight of the domain in percent
#   num     - questions requested/sampled for the domain; the values sum to TOTAL_QUESTIONS
# NOTE(review): 20% of 65 is 13, yet "Cost-Optimized" uses 15 - it looks like the
# rounding remainder was assigned to this domain; confirm that is intended.
DOMAINS = [
    {"name": "Secure", "percent": 30, "num": 19},
    {"name": "Resilient", "percent": 26, "num": 16},
    {"name": "High-Performing", "percent": 24, "num": 15},
    {"name": "Cost-Optimized", "percent": 20, "num": 15},
]

In [6]:
# Determine how many questions fall in each domain.
def get_questions_each_domain():
    """Print the question count that each domain's percentage works out to.

    For every entry of DOMAINS, prints int(percent * TOTAL_QUESTIONS / 100)
    on its own line. The fractional part is truncated. Nothing is returned.
    """
    for entry in DOMAINS:
        share = entry['percent'] * TOTAL_QUESTIONS / 100
        print(int(share))
#test
#get_questions_each_domain()

In [20]:
#Send a POST request
def post_request_generative_ai(text_prompt, timeout=120):
    """Send a prompt to the generative endpoint and return the decoded reply.

    Args:
        text_prompt (str): Prompt text. The suffix
            " Please provide a plain string response." is appended before sending.
        timeout (float or tuple): Seconds to wait for the server, forwarded to
            requests.post. Without a timeout a stalled connection would block
            the notebook indefinitely.

    Returns:
        The JSON-decoded response body, or {'error': <exception>} when the
        request fails or the body cannot be decoded as JSON.
    """
    headers = {'Content-Type': 'application/json'}
    # Earlier prompt suffix, kept for reference:
    # " Please provide a response in a structured JSON format, including all explanations for each option as JSON format. Each explanation should have 40 to 60 words."
    # NOTE(review): extract_questions_from_candidates() only accepts a fenced JSON
    # reply with a 'questions' list, while the active suffix asks for a plain
    # string - confirm which suffix should be used when generating questions.
    payload = {
        "contents": [
            {"parts": [
                {"text": text_prompt + " Please provide a plain string response."}
            ]}
        ]
    }

    try:
        response = requests.post(GENERATIVE_URI, headers=headers, json=payload, timeout=timeout)
        return response.json()
    except Exception as e:
        # Best-effort boundary: callers print whatever comes back, so report the
        # failure and hand it over instead of raising.
        print(e)
        return {'error': e}

In [8]:
#insert questions first
def insert_questions(json_question):
    """Insert a question document unless the same question text is already stored.

    Args:
        json_question (dict): Question document. Its 'question' value is the
            key used to detect duplicates.

    Returns:
        bool: True when a new document was inserted; False when the input was
        unusable or a document with the same question text already exists.
    """
    # Generated content is not guaranteed to be well formed; skip such items
    # instead of raising KeyError/TypeError in the middle of a batch.
    if not isinstance(json_question, dict) or 'question' not in json_question:
        print(f"Skipped malformed question: {json_question!r}")
        return False

    existed_doc = collection.find_one({'question': json_question['question']})
    if existed_doc is not None:
        return False

    collection.insert_one(json_question)
    return True

In [9]:
# Matches a ```json ... ``` fenced block. Whitespace around the payload is
# optional so CRLF line endings and tightly packed fences are recognised too.
_JSON_FENCE_RE = re.compile(r'```json\s*(.*?)\s*```', re.DOTALL | re.IGNORECASE)


def parse_candidate_content(data):
    """
    Parses the content from a dictionary in the 'candidates' list.
    Looks for a fenced JSON block inside the first non-empty 'text' part
    and attempts to load it.

    Args:
        data (dict): A dictionary representing an item in the 'candidates' list.

    Returns:
        dict or list or str or None: The decoded JSON value when a fenced JSON
                             block is found and parses successfully.
                             The raw (stripped) text when no fenced block is
                             found or when decoding it fails.
                             None if the expected structure is not found or no
                             part carries non-empty text.
    """
    if not isinstance(data, dict):
        return None
    content = data.get('content')
    if not isinstance(content, dict):
        return None
    parts = content.get('parts')
    if not isinstance(parts, list):
        return None

    for part in parts:
        if not isinstance(part, dict):
            continue
        raw_text = part.get('text')
        # A missing or non-string 'text' cannot be stripped or searched; skip it
        # the same way other malformed input is tolerated above.
        if not isinstance(raw_text, str):
            continue

        text_content = raw_text.strip()
        json_match = _JSON_FENCE_RE.search(text_content)
        if json_match:
            try:
                return json.loads(json_match.group(1))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
                return text_content  # Return the raw text in case of an error
        if text_content:
            return text_content  # Return the raw text if no JSON block is found

    return None

def extract_questions_from_candidates(response_data):
    """
    Returns the first 'questions' list found among the response candidates.

    Each entry of response_data['candidates'] is passed to
    parse_candidate_content(); the first one that parses to a dictionary
    holding a list under 'questions' supplies the result.

    Args:
        response_data (dict): The dictionary containing the response data.

    Returns:
        list or None: The list stored under 'questions' of the first matching
                     candidate, otherwise None.
    """
    if not isinstance(response_data, dict):
        return None

    candidates = response_data.get('candidates')
    if not isinstance(candidates, list):
        return None

    for candidate in candidates:
        parsed = parse_candidate_content(candidate)
        if not isinstance(parsed, dict):
            continue
        questions = parsed.get('questions')
        if isinstance(questions, list):
            return questions

    return None

# Example usage with your provided data:
#response_data = {'candidates': [{'content': {'parts': [{'text': '```json\n{\n  "questions": [\n    {\n      "question": "Your company is launching a new e-commerce platform on AWS.  The platform will handle sensitive customer data, including credit card information and personally identifiable information (PII). You need to design a secure architecture that meets PCI DSS compliance requirements.  Describe a comprehensive approach to securing the application, covering data at rest, data in transit, and access control. Consider the use of specific AWS services and explain your rationale for choosing them.  Focus on practical implementation details rather than just naming services.",\n      "topics": ["Data Security", "PCI DSS Compliance", "IAM", "KMS", "VPC", "Security Groups", "WAF", "S3", "Encryption"],\n      "difficulty": "Hard"\n    },\n    {\n      "question": "A client has an existing on-premises application that needs to be migrated to AWS. The application interacts with a legacy database that contains highly sensitive customer records. During the migration, you must ensure that the database remains secure and complies with data sovereignty regulations for Europe (GDPR).  How would you design the migration strategy to minimize downtime and maintain security throughout the process?  Be specific about your choices of AWS services and how you will address data encryption, network security, and compliance.  
Consider potential challenges and mitigation strategies.",\n      "topics": ["Data Sovereignty", "GDPR", "Database Migration", "VPN", "Direct Connect", "RDS", "Database Encryption", "IAM Roles", "Network Security", "Disaster Recovery"],\n      "difficulty": "Medium"\n    }\n  ]\n}\n```\n'}], 'role': 'model'}, 'finishReason': 'STOP', 'avgLogprobs': -0.3347730217091848}], 'usageMetadata': {'promptTokenCount': 55, 'candidatesTokenCount': 341, 'totalTokenCount': 396, 'promptTokensDetails': [{'modality': 'TEXT', 'tokenCount': 55}], 'candidatesTokensDetails': [{'modality': 'TEXT', 'tokenCount': 341}]}, 'modelVersion': 'gemini-1.5-flash'}

In [10]:
def generate_list_of_questions():
    """Request questions for every domain and store them in the collection.

    For each entry of DOMAINS a prompt asking for domain['num'] questions is
    sent through post_request_generative_ai(). Every question extracted from
    the reply is tagged with its domain, exported=0 and a fresh uuid, then
    handed to insert_questions(). When nothing can be extracted, the raw reply
    and a notice are printed. After each domain the running total across all
    domains processed so far is printed next to the domain name.
    """
    total_so_far = 0
    for domain in DOMAINS:
        #domain['num'] = 1 #testing
        text_prompt = (
            f"Generate {domain['num']} multiple-choice questions for me to practice "
            "AWS certified solution architect - associate exam in the domain: "
            f"Design {domain['name']} architecture. "
            "I want questions that focus more on real-world scenarios and "
            "problem-solving rather than just definitions of services."
        )
        response = post_request_generative_ai(text_prompt)

        questions = extract_questions_from_candidates(response)
        if not questions:
            print(response)
            print("No questions found in the parsed content: " + domain['name'])
        else:
            # Tag and store each generated question.
            for question in questions:
                question['domain'] = domain['name']
                question['exported'] = 0
                question['uuid'] = const.generate_random_uuid()
                insert_questions(question)
                total_so_far += 1
        print(f"{domain['name']}: {total_so_far}")
#test
#for i in range(4):
    #generate_list_of_questions()    #approx. 1m22s for 1 loop

In [11]:
#map A - D to 1 - 4
def map_index(a_char):
    """Translate an answer letter into its 1-based option number.

    Args:
        a_char: Answer letter; only upper-case 'A', 'B', 'C' and 'D' are recognised.

    Returns:
        int or None: 1 for 'A' through 4 for 'D'; None for any other value.
    """
    for position, letter in enumerate('ABCD', start=1):
        if a_char == letter:
            return position
    return None

In [None]:
#export to the CSV file, each file has 65 questions
#Question,Question Type,Answer Option 1,Explanation 1,Answer Option 2,Explanation 2,Answer Option 3,Explanation 3,Answer Option 4,Explanation 4,Answer Option 5,Explanation 5,Answer Option 6,Explanation 6,Correct Answers,Overall Explanation,Domain
def _build_csv_row(doc):
    """Flatten one stored question document into a 17-column CSV row."""
    row = [doc['question'].replace('  ', ' ').replace('\n', ''), 'multiple-choice']
    for letter in ('A', 'B', 'C', 'D'):
        row.append(doc['options'][letter])
        row.append(doc['explanation'][letter])
    row.extend(['', '', '', ''])  # answer options 5 and 6 are not used
    row.append(map_index(doc['answer']))  # correct answer as option number
    row.append('')  # overall explanation is left empty
    row.append(doc['domain'])
    return row


def export_csv(filename):
    """Export a random set of not-yet-exported questions to a CSV file.

    For each entry of DOMAINS, up to domain['num'] documents with exported == 0
    are sampled at random from the collection and turned into CSV rows below a
    header line. The file is written first; only after a successful write are
    the selected documents marked exported=1 with the given filename, so a
    failed write (or a malformed document) leaves the database untouched.

    Args:
        filename (str): Path of the CSV file to create; also stored on every
            exported document.

    Returns:
        None
    """
    file_data = [[
        'Question', 'Question Type',
        'Answer Option 1', 'Explanation 1',
        'Answer Option 2', 'Explanation 2',
        'Answer Option 3', 'Explanation 3',
        'Answer Option 4', 'Explanation 4',
        'Answer Option 5', 'Explanation 5',
        'Answer Option 6', 'Explanation 6',
        'Correct Answers', 'Overall Explanation', 'Domain',
    ]]
    selected_uuids = []

    for domain in DOMAINS:
        # Random sample of questions of this domain that were not exported yet.
        pipeline = [
            {"$match": {'exported': 0, 'domain': domain['name']}},
            {"$sample": {"size": domain['num']}}
        ]
        for doc in collection.aggregate(pipeline):
            file_data.append(_build_csv_row(doc))
            selected_uuids.append(doc['uuid'])

    # Save all questions to csv. UTF-8 is set explicitly so the result does not
    # depend on the platform's default encoding.
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            csv.writer(csvfile).writerows(file_data)
    except (OSError, csv.Error, UnicodeError) as e:
        print(f"An error occurred while saving the array: {e}")
        return

    # The file exists now, so it is safe to flag the questions as exported.
    for question_uuid in selected_uuids:
        collection.update_one({'uuid': question_uuid}, {'$set': {'exported': 1, 'filename': filename}})
    print(f"Data successfully saved to '{filename}'")
#test
#for i in range(2,6):
    #export_csv('aws_sa_test_'+str(i+1)+'_20250504.csv')

Data successfully saved to 'aws_sa_test_3_20250504.csv'
Data successfully saved to 'aws_sa_test_4_20250504.csv'
Data successfully saved to 'aws_sa_test_5_20250504.csv'
Data successfully saved to 'aws_sa_test_6_20250504.csv'


In [35]:
#reset doc from file #3
# Files whose questions are reset when reset_data() is called without arguments.
_DEFAULT_RESET_FILES = (
    'aws_sa_test_3_20250504.csv',
    'aws_sa_test_4_20250504.csv',
    'aws_sa_test_5_20250504.csv',
    'aws_sa_test_6_20250504.csv',
)


def reset_data(filenames=_DEFAULT_RESET_FILES, limit=600):
    """Flag questions that were exported to the given files as not exported.

    Every matching document gets exported=0 and an empty filename, and its
    position and uuid are printed.

    Args:
        filenames (iterable of str): CSV file names whose questions are reset.
            Defaults to the four files that were hard-coded previously.
        limit (int): Maximum number of documents to reset in one call.

    Returns:
        None
    """
    # NOTE: an earlier, disabled variant sent each explanation to
    # post_request_generative_ai() with the prompt
    # 'Revise this sentence to 40 to 60 words: ' and skipped the document when
    # that failed. It was commented out and never ran, so it is not reproduced.
    docs = collection.find({'exported': 1, 'filename': {'$in': list(filenames)}})
    for i, doc in enumerate(docs):
        if i >= limit:
            break
        collection.update_one({'uuid': doc['uuid']}, {'$set': {'exported': 0, 'filename': ''}})
        print(str(i) + ': ' + doc['uuid'])  #revised doc
        
#test
#reset_data()