In [1]:
import requests
from dotenv import load_dotenv
import pymongo
import time
import uuid
import sys
import os
import importlib
import re
import json
import csv

# Get the current working directory of the notebook
# NOTE(review): os.getcwd() is the kernel's working directory; this assumes the
# kernel was started in the notebook's own folder - confirm before relying on it.
notebook_dir = os.getcwd()
# Get the path to the parent directory
parent_dir = os.path.dirname(notebook_dir)

# Add the parent directory to sys.path if it's not already there
# (presumably so that the `import const` below can find const.py one level up)
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

import const
#importlib.reload(const)    #if we update the file const.py

In [2]:
# Load variables from a .env file into the process environment
load_dotenv() 
# Endpoint URL that post_request_generative_ai posts to; raises KeyError here if the variable is not set
GENERATIVE_URI = os.environ['GENERATIVE_URI']

In [3]:
# MongoDB client; the connection string comes from the DB_URI environment variable (KeyError if unset)
db_client = pymongo.MongoClient(os.environ['DB_URI'])
db = db_client['db_certificates']
# Collection holding the question documents; used by insert_questions, unify_fields and export_csv
collection = db['tb_psm_1']

In [4]:
#Send a POST request
def post_request_generative_ai(text_prompt, timeout=120):
    """
    Send a text prompt to the generative-AI endpoint and return the decoded reply.

    Args:
        text_prompt (str): Prompt placed in the single "parts" entry of the request body.
        timeout (float or tuple): Timeout in seconds handed to requests.post. Without it
            a stalled connection would block the notebook indefinitely.

    Returns:
        The decoded JSON body of the response. If the request fails, times out or the
        body is not valid JSON, the exception is printed and {'error': <exception>}
        is returned instead, so callers never see an exception from this function.
    """
    headers = {'Content-Type': 'application/json'}
    payload = {
        "contents": [
            {"parts": [
                {"text": text_prompt}]
            }
        ]
    }

    try:
        response = requests.post(GENERATIVE_URI, headers=headers, json=payload, timeout=timeout)
        # The body is decoded even for non-2xx answers so that the API's own
        # error description reaches the caller.
        return response.json()
    except Exception as e:
        # Deliberately best-effort: report the problem and hand back a marker dict.
        print(e)
        return {'error': e}

In [5]:
#insert questions first
def insert_questions(json_question):
    """
    Store a question document unless one with the same 'question' text exists.

    Args:
        json_question (dict): Question document; ignored when it has no 'question' key.

    Returns:
        None
    """
    # Nothing to deduplicate on without the question text.
    if 'question' not in json_question:
        return

    duplicate_filter = {'question': json_question['question']}
    if collection.find_one(duplicate_filter) is not None:
        # Same question text is already stored: keep the existing document.
        return

    collection.insert_one(json_question)

In [6]:
#map answer letters A - E to 1 - 5
def map_index(a_char):
    """
    Translate an answer letter into its 1-based option number.

    Args:
        a_char: Answer letter; only the exact upper-case values 'A'..'E' are recognised.

    Returns:
        int or None: 1 for 'A' up to 5 for 'E'; None for any other value.
    """
    for position, letter in enumerate('ABCDE', start=1):
        if a_char == letter:
            return position
    return None

In [7]:
def parse_candidate_content(data):
    """
    Parses the content from a dictionary in the 'candidates' list.
    Looks for a fenced ```json block inside the first non-empty 'text' part
    and attempts to load it.

    Args:
        data (dict): A dictionary representing an item in the 'candidates' list.

    Returns:
        The parsed JSON value when a fenced JSON block is found and decodes
        successfully. The stripped raw text when there is no fenced block or
        the block cannot be decoded (the decode error is printed).
        None if the expected structure is not found or no part carries text.
    """
    if not isinstance(data, dict):
        return None
    content = data.get('content')
    if not isinstance(content, dict):
        return None
    parts = content.get('parts')
    if not isinstance(parts, list):
        return None

    for part in parts:
        if not isinstance(part, dict):
            continue
        raw_text = part.get('text')
        # A missing or non-string 'text' used to crash on .strip(); skip such parts.
        if not isinstance(raw_text, str):
            continue
        text_content = raw_text.strip()
        if not text_content:
            continue

        # Fenced JSON block; whitespace (e.g. a '\r') after the ```json tag or
        # before the closing fence is tolerated.
        json_match = re.search(r'```json\s*\n(.*?)\n\s*```', text_content, re.DOTALL)
        if json_match is None:
            return text_content  # Return the raw text if no JSON block is found
        try:
            return json.loads(json_match.group(1))
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            return text_content  # Return the raw text in case of an error

    return None

def extract_questions_from_candidates(response_data):
    """
    Pull the 'questions' list out of the first candidate that carries one.

    Each entry of response_data['candidates'] is run through
    parse_candidate_content; the first parsed result that is a dict with a
    list under 'questions' wins.

    Args:
        response_data (dict): The dictionary containing the response data.

    Returns:
        list or None: The list of questions from the first matching candidate,
                     otherwise None.
    """
    if not isinstance(response_data, dict):
        return None

    candidates = response_data.get('candidates')
    if not isinstance(candidates, list):
        return None

    for entry in candidates:
        parsed = parse_candidate_content(entry)
        if not isinstance(parsed, dict):
            continue
        question_list = parsed.get('questions')
        if isinstance(question_list, list):
            return question_list

    return None

# Example usage with your provided data:
#response_data = {'candidates': [{'content': {'parts': [{'text': '```json\n{\n  "questions": [\n    {\n      "question": "Your company is launching a new e-commerce platform on AWS.  The platform will handle sensitive customer data, including credit card information and personally identifiable information (PII). You need to design a secure architecture that meets PCI DSS compliance requirements.  Describe a comprehensive approach to securing the application, covering data at rest, data in transit, and access control. Consider the use of specific AWS services and explain your rationale for choosing them.  Focus on practical implementation details rather than just naming services.",\n      "topics": ["Data Security", "PCI DSS Compliance", "IAM", "KMS", "VPC", "Security Groups", "WAF", "S3", "Encryption"],\n      "difficulty": "Hard"\n    },\n    {\n      "question": "A client has an existing on-premises application that needs to be migrated to AWS. The application interacts with a legacy database that contains highly sensitive customer records. During the migration, you must ensure that the database remains secure and complies with data sovereignty regulations for Europe (GDPR).  How would you design the migration strategy to minimize downtime and maintain security throughout the process?  Be specific about your choices of AWS services and how you will address data encryption, network security, and compliance.  
Consider potential challenges and mitigation strategies.",\n      "topics": ["Data Sovereignty", "GDPR", "Database Migration", "VPN", "Direct Connect", "RDS", "Database Encryption", "IAM Roles", "Network Security", "Disaster Recovery"],\n      "difficulty": "Medium"\n    }\n  ]\n}\n```\n'}], 'role': 'model'}, 'finishReason': 'STOP', 'avgLogprobs': -0.3347730217091848}], 'usageMetadata': {'promptTokenCount': 55, 'candidatesTokenCount': 341, 'totalTokenCount': 396, 'promptTokensDetails': [{'modality': 'TEXT', 'tokenCount': 55}], 'candidatesTokensDetails': [{'modality': 'TEXT', 'tokenCount': 341}]}, 'modelVersion': 'gemini-1.5-flash'}

In [28]:
#PSM I: 80 questions (70 multi choice, 5 multi select, 5 true false)
def generate_questions(question_type='multiple-selection'):
    """
    Ask the generative endpoint for one batch of PSM I questions and store them.

    Each returned question is tagged with exported = 0 and a fresh uuid, then
    passed to insert_questions (which skips questions that are already stored).
    When no question list can be extracted, the raw reply is printed instead.

    Args:
        question_type (str): Which prompt to send: 'multiple-choice',
            'true-false' or 'multiple-selection' (the default, matching the
            prompt that used to be hard-coded).

    Raises:
        ValueError: If question_type is not one of the supported values.
    """
    prompts = {
        'multiple-choice': 'Generate 10 high-quality multiple-choice questions with answers and explanations for the PSM I examination. Each question has more than 60 words in length and should focus more on complex real-world scenarios rather than definitions. Please provide a response in a structured JSON format with the key name "questions", including all explanations for each answer as a JSON object. Each explanation has more than 50 words. Sample response structure should like this: { "question" : "xxx", "options" : { "A" : "a text", "B" : "a text", "C" : "a text", "D" : "a text"},"answer" : "B","explanation" : { "A" : "a text", "B" : "a text", "C" : "a text", "D" : "a text"},"type" : "multiple-choice"}. Response should avoid error while parsing JSON format "Error decoding JSON: Expecting property name enclosed in double quotes".',
        'true-false': 'Generate 4 high-quality true-false questions (50% true, 50% false) with answers and explanations for the PSM I examination. Each question has more than 60 words in length and should focus more on complex real-world scenarios rather than definitions. Please provide a response in a structured JSON format with the key name "questions", including all explanations for each answer as a JSON object. Each explanation has more than 50 words. Sample response structure should like this: { "question" : "xxx", "options" : { "A" : "True", "B" : "False"},"answer" : "A","explanation" : { "A" : "a text", "B" : "a text"},"type" : "true-false"}. Response should avoid error while parsing JSON format "Error decoding JSON: Expecting property name enclosed in double quotes".',
        'multiple-selection': 'Generate 6 high-quality multiple-selection questions with answers and explanations for the PSM I examination. Each question has more than 60 words in length and should focus more on complex real-world scenarios rather than definitions. Please provide a response in a structured JSON format with the key name "questions", including all explanations for each answer as a JSON object. Each explanation has more than 50 words. Sample response structure should like this: { "question" : "xxx", "options" : { "A" : "a text", "B" : "a text", "C" : "a text", "D" : "a text", "E": "a_text"},"answer" : [],"explanation" : { "A" : "a text", "B" : "a text", "C" : "a text", "D" : "a text", "E" : "a text"},"type" : "multiple-selection"}. "answer" should have 2 to 4 correct elements.Response should avoid error while parsing JSON format "Error decoding JSON: Expecting property name enclosed in double quotes".',
    }
    # Validate before any network call so a typo fails fast.
    if question_type not in prompts:
        raise ValueError(
            f"Unknown question_type {question_type!r}; expected one of {sorted(prompts)}"
        )

    context = 'PSM (Professional Scrum Master) I includes questions from the following Focus Areas as defined in the Professional Scrum Competencies: "Understanding and Applying the Scrum Framework: Empiricism, Scrum Values, Scrum Team, Events, Artifacts, Done. Developing People and Teams: Self-Managing Teams, Facilitation, Coaching. Managing Products with Agility: Forecasting & Release Planning, Product Value, Product Backlog Management, Stakeholders & Customers."'
    # A space keeps the context and the instruction from running together.
    raw_generated_text = post_request_generative_ai(context + ' ' + prompts[question_type])

    questions = extract_questions_from_candidates(raw_generated_text)
    if not questions:
        print(raw_generated_text)
        print("No questions found in the parsed content")
        return

    for q in questions:
        # The model output is untrusted: anything that is not an object cannot be tagged.
        if not isinstance(q, dict):
            print(f"Skipping malformed question entry: {q!r}")
            continue
        q['exported'] = 0
        q['uuid'] = const.generate_random_uuid()
        insert_questions(q)
#test: the generation call is disabled; per the original note, 10 sentences take about 30 secs
for i in range(10):
    #generate_questions()
    print(f'===== Finish loop: {i}')

===== Finish loop: 0
===== Finish loop: 1
===== Finish loop: 2
===== Finish loop: 3
===== Finish loop: 4
===== Finish loop: 5
===== Finish loop: 6
===== Finish loop: 7
===== Finish loop: 8
===== Finish loop: 9


In [18]:
#unify fields in all documents
def unify_fields():
    """
    Scratch helper for one-off normalisation of stored question documents.

    Intended renames, per the notes below:
    questionType / question_type -> type, correctAnswer / correctAnswers -> answer,
    answers -> options.

    As written, the function only queries documents with exported == 0,
    type 'multiple-choice' and an 'explanation' filter of None, then loops over
    them. The loop body merely resets two locals; every update statement is
    commented out, so nothing is written to the collection.
    """
    #questionType, question_type -> type
    # correctAnswer, correctAnswers -> answer
    # answers -> options
    all_docs = collection.find({'exported':0, 'type':'multiple-choice', 'explanation':None})
    for doc in all_docs:
        # Only used by the commented-out migration steps below.
        hasUpdated = False
        update_doc = {}
        #print(doc['uuid'])
        # if 'type' not in doc:
        #     if 'questionType' in doc:
        #         hasUpdated = True
        #         update_doc['type'] = doc['questionType']
        #     elif 'question_type' in doc:
        #         hasUpdated = True
        #         update_doc['type'] = doc['question_type']
        #     else:
        #         hasUpdated = True
        #         update_doc['type'] = 'multiple-choice'
        # if 'answer' not in doc:
        #     if 'correctAnswer' in doc:
        #         hasUpdated = True
        #         update_doc['answer'] = doc['correctAnswer']
        #     elif 'correctAnswers' in doc:
        #         hasUpdated = True
        #         update_doc['answer'] = doc['correctAnswers']
        # if 'options' not in doc:
        #     if 'answers' in doc:
        #         hasUpdated = True
        #         update_doc['options'] = doc['answers']

        #if hasUpdated:
        #print('a')
        #collection.update_one({'uuid': doc['uuid']}, {'$set': {'explanation': doc['explanations']}})
    #update 2: reset the export flag of everything written to one particular file
    # all_docs = collection.find({'filename': 'aws_pmp_test_6_20250506.csv'})
    # for doc in all_docs:
    #     collection.update_one({'uuid': doc['uuid']}, {'$set': {'exported': 0, 'filename': ''}})

#test
#unify_fields()

In [35]:
#Question,Question Type,Answer Option 1,Explanation 1,Answer Option 2,Explanation 2,Answer Option 3,Explanation 3,Answer Option 4,Explanation 4,Answer Option 5,Explanation 5,Answer Option 6,Explanation 6,Correct Answers,Overall Explanation,Domain
_EXPORT_HEADER = ['Question','Question Type','Answer Option 1','Explanation 1','Answer Option 2','Explanation 2','Answer Option 3','Explanation 3','Answer Option 4','Explanation 4','Answer Option 5','Explanation 5','Answer Option 6','Explanation 6','Correct Answers','Overall Explanation','Domain']


def _clean_text(text):
    """Collapse double spaces and drop newlines so the text stays on one CSV line."""
    return text.replace('  ', ' ').replace('\n', '')


def _sample_unexported(question_type, size):
    """Return up to `size` randomly sampled, not yet exported documents of one type."""
    if size <= 0:
        # $sample rejects a non-positive size, so skip the query entirely.
        return []
    pipeline = [
        {"$match": {'exported': 0, 'type': question_type}},
        {"$sample": {"size": size}}
    ]
    return list(collection.aggregate(pipeline))


def _build_choice_row(doc, letters):
    """Build one CSV row for a single-answer question that offers the given option letters."""
    row = [_clean_text(doc['question']), 'multiple-choice']
    for letter in letters:
        row.append(doc['options'][letter])
        row.append(_clean_text(doc['explanation'][letter]))
    # Pad the unused option/explanation pairs up to the six slots of the header.
    row.extend([''] * (2 * (6 - len(letters))))
    row.extend([map_index(doc['answer']), '', ''])  # correct answer, overall, domain
    return row


def export_csv(path, filename, multiple_choice_count=72, true_false_count=4, multiple_selection_count=4):
    """
    Export a random sample of not yet exported questions to a CSV file.

    Multiple-choice and true-false questions are written as rows (both with the
    'Question Type' value 'multiple-choice'). Multiple-selection questions are
    only sampled: their uuids are printed so they can be added manually, because
    they are not part of the bulk upload file. After the file has been written,
    every sampled question is flagged with exported = 1 and the file name.

    Args:
        path (str): Directory of the output file.
        filename (str): Name of the CSV file; also stored on the exported documents.
        multiple_choice_count (int): Maximum number of multiple-choice questions.
        true_false_count (int): Maximum number of true-false questions.
        multiple_selection_count (int): Maximum number of multiple-selection questions.

    Returns:
        None. Problems while writing the file or updating the database are
        printed, not raised; when the file cannot be written nothing is flagged.
    """
    file_data = [list(_EXPORT_HEADER)]
    exported_uuid = []
    manual_uuid = []

    #1. + 2. multiple-choice first, then true false questions
    row_sources = (
        ('multiple-choice', 'ABCD', multiple_choice_count),
        ('true-false', 'AB', true_false_count),
    )
    for question_type, letters, size in row_sources:
        for doc in _sample_unexported(question_type, size):
            try:
                row = _build_choice_row(doc, letters)
                doc_uuid = doc['uuid']
            except (KeyError, TypeError, AttributeError) as e:
                # One incomplete document (e.g. explanation is None) must not
                # abort the whole export; it stays un-exported for later repair.
                print(f"Skipping malformed {question_type} question {doc.get('uuid')!r}: {e!r}")
                continue
            file_data.append(row)
            exported_uuid.append(doc_uuid)

    #3. multi selection
    for doc in _sample_unexported('multiple-selection', multiple_selection_count):
        exported_uuid.append(doc['uuid'])
        manual_uuid.append(doc['uuid']) #they do not support bulk upload of this type of question, we need to manually add them
    print('","'.join(manual_uuid))

    #save all questions to csv
    target = os.path.join(path, filename)  # works with or without a trailing separator
    try:
        with open(target, 'w', newline='', encoding='utf-8') as csvfile:
            csv.writer(csvfile).writerows(file_data)
    except Exception as e:
        print(f"An error occurred while saving the array: {e}")
        return
    print(f"Data successfully saved to '{filename}'")

    # Flag the questions only after the file has been closed successfully.
    if not exported_uuid:
        return
    try:
        collection.update_many(
            {'uuid': {'$in': exported_uuid}},
            {'$set': {'exported': 1, 'filename': filename}}
        )
    except Exception as e:
        print(f"An error occurred while marking questions as exported: {e}")
    
#test: run one export (writes the CSV and flags the sampled questions as exported)
export_csv('./psm_1_data/', 'aws_psm_I_test_6_20250506.csv')

40b715ad-4ec8-47d9-87c8-430bcbbc1e65","cdfc62cc-c8fc-4ca4-8a1f-c24f89f6246f","d09fdd5e-02d7-46ae-9d8f-c3d40439094f","dc42598c-059f-43ce-b970-d1c03ada98f0
Data successfully saved to 'aws_psm_I_test_6_20250506.csv'
