In [1]:
%pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


## Import data

In [2]:
import pandas as pd

# Lookup table of knowledge types; later cells query its 'Abbrev' and 'Definition' columns.
knowledge_types = pd.read_csv('./data/knowledge_types.csv')

# Rows whose first field is a comma-separated list of knowledge-type abbreviations;
# one row is drawn at random for every generated example.
all_codes = pd.read_csv('./data/compatible_knowledge.csv')

In [3]:
# Load the course list; the 'Course' column supplies the topic of each generation run.
courses = pd.read_csv('./data/courses.csv')
# Preview the first five rows.
courses.head(n=5)

Unnamed: 0,Course,Textbook
0,CS 1114 – Intro to Software Design,OpenDSA CS 1114 Online Textbook – Intro to Sof...
1,CS 2114 – Software Design & Data Structures,OpenDSA CS 2114 Online Textbook – Software Des...
2,CS 2104 – Intro to Problem-Solving for CS,"Whimbey, A., Lochhead, J., & Narode, R. (2013)..."
3,CS 2505 – Intro to Computer Organization I,"Patt, Y. N., & Patel, S. J. (2004). Introducti..."
4,CS 2506 – Intro to Computer Organization II,"Patterson, D. A., & Hennessy, J. L. (2021). Co..."


In [4]:
# Read in the prompt template. It is consumed with str.format and the
# placeholders {subject}, {topic} and {knowledge_types}.
# The encoding is explicit so the text decodes the same way on every platform,
# and there is no empty-string pre-initialisation: if the file cannot be read
# this cell fails loudly instead of leaving an empty template in kernel state.
with open('./prompts/generate_example_1.txt', 'r', encoding='utf-8') as f:
    prompt_template = f.read()
prompt_template

'You are an expert engineering educator with extensive experience in curriculum design across various engineering disciplines. Your task is to generate a realistic textbook-style excerpt that exemplify one or more specific types of knowledge within a given engineering subject and topic. Your output will be in JSON format for easy parsing and processing.\nUse the following inputs to guide your content generation:\nSubject: {subject}\nTopic: {topic}\nKnowledge Types:\n{knowledge_types}\nCreate a textbook-like excerpt of approximately 100-150 words that demonstrates one or more of the specified knowledge types within the context of the given subject and topic. The excerpt should be substantive and realistic, as if taken from an actual engineering textbook.\nGuidelines for creating the excerpts:\n1. Ensure that each excerpt clearly represents at least one of the specified knowledge types, but may include multiple types if appropriate.\n2. Maintain authenticity by using appropriate technica

In [5]:
# Presumably the identifier of the model served by the local inference endpoint.
# NOTE(review): no code visible in this notebook reads MODEL; the request built in
# generate_response carries no model field. Confirm whether it should be sent.
MODEL = 'mlx-community/Qwen2.5-7B-Instruct-8bit'

In [6]:
# Use requests to POST a chat request to the local server at localhost:1234/v1/chat/completions
import requests
import json

def generate_response(prompt):
    """Send ``prompt`` as a single user message to the local chat-completions endpoint.

    Args:
        prompt: Text placed in the ``content`` of one ``user`` message. The
            payload carries no ``model`` field.

    Returns:
        The first entry of the reply's ``choices`` list: a dict whose
        ``['message']['content']`` holds the generated text. This is the shape
        ``extract_examples`` indexes into.

    Raises:
        requests.exceptions.RequestException: The connection failed or the
            server answered with a non-2xx status.
        ValueError: The reply body is not a JSON object with a non-empty
            ``choices`` list.
    """
    url = 'http://localhost:1234/v1/chat/completions'
    data = {
        "messages": [
            {"role": "user", "content": prompt}
        ]
    }
    # json= serialises the payload and sets the Content-Type header for us.
    response = requests.post(url, json=data)
    # Surface HTTP errors here rather than as a confusing key lookup failure below.
    response.raise_for_status()

    body = response.json()
    # OpenAI-style chat replies are keyed by "choices", not "completions".
    choices = body.get('choices') if isinstance(body, dict) else None
    if not choices:
        raise ValueError(f"Chat completion reply contains no choices: {body!r}")
    return choices[0]

## Setup

In [7]:
# Define the generate_prompt function
def generate_prompt(subject, topic, codes):
    """Fill the prompt template for one subject/topic and one row of knowledge codes.

    ``codes[0]`` is a comma-separated string of abbreviations. Each abbreviation
    is emitted as a numbered ``<CODEn>`` tag followed by a ``<DEFn>`` tag that
    holds the ``Definition`` of the first ``knowledge_types`` row whose
    ``Abbrev`` equals it. Numbering starts at 1.

    Returns the formatted prompt string.
    """
    abbreviations = [piece.strip() for piece in codes[0].split(',')]

    def lookup_definition(abbrev):
        # First matching row wins; an unknown abbreviation raises IndexError.
        matching_rows = knowledge_types[knowledge_types['Abbrev'] == abbrev]
        return matching_rows['Definition'].values[0]

    tagged_lines = []
    for number, abbrev in enumerate(abbreviations, start=1):
        tagged_lines.append(f'<CODE{number}>{abbrev}</CODE{number}>\n')
        tagged_lines.append(f'<DEF{number}>{lookup_definition(abbrev)}</DEF{number}>\n')

    return prompt_template.format(
        subject=subject,
        topic=topic,
        knowledge_types=''.join(tagged_lines),
    )

In [8]:
import json

def extract_examples(response):
    """Pull the ``example`` value out of a chat reply whose content is JSON.

    Args:
        response: Either a mapping whose ``['message']['content']`` is the
            reply text, or the reply text itself as a ``str``.

    Returns:
        The value stored under ``"example"`` in the decoded JSON object, or
        ``[]`` when the content is empty, is not valid JSON, or is not an
        object carrying that key. A short diagnostic is printed in each of
        those failure cases.
    """
    if isinstance(response, str):
        response_text = response
    else:
        response_text = response['message']['content']

    # Replies are often wrapped in Markdown code fences; drop the fence markers
    # (and tolerate a null content field) before parsing.
    response_text = (response_text or '').replace('```json', '').replace('```', '').strip()

    if not response_text:
        print("Response content is empty")
        return []

    try:
        parsed_content = json.loads(response_text)
    except json.JSONDecodeError as e:
        print(f"JSON decoding failed: {e}")
        return []

    # Valid JSON is not enough: it must be an object that has the expected key.
    if not isinstance(parsed_content, dict) or 'example' not in parsed_content:
        print("Response JSON has no 'example' field")
        return []
    return parsed_content['example']

## Main

In [9]:
# One plain tuple per row of all_codes (index excluded), ready for random.choice.
all_codes_list = [
    *all_codes.itertuples(index=False, name=None)
]

In [10]:
import time
import random

def generate_examples_not_parallel(subject, topic, n_gen):
    """Generate ``n_gen`` examples one after another and print the elapsed time.

    For each example a row is drawn at random from ``all_codes_list``, turned
    into a prompt, sent to the model, and the reply is parsed.

    Returns:
        A list of dicts with the keys ``Subject``, ``Topic``, ``Example`` and
        ``Codes`` (the first field of the sampled row), in generation order.
    """
    def build_record():
        codes_row = random.choice(all_codes_list)
        reply = generate_response(generate_prompt(subject, topic, codes_row))
        return {
            'Subject': subject,
            'Topic': topic,
            'Example': extract_examples(reply),
            'Codes': codes_row[0],
        }

    started_at = time.time()
    generated_examples = [build_record() for _ in range(n_gen)]
    elapsed_time = time.time() - started_at

    print(f"Elapsed time: {elapsed_time:.2f} seconds; {elapsed_time / 60:.2f} minutes for {n_gen} examples")
    return generated_examples

In [11]:
import time
import concurrent.futures
import random

def generate_examples_parallel(subject, topic, n_gen):
    """Generate ``n_gen`` examples concurrently on a thread pool and print the elapsed time.

    The knowledge-code rows are sampled up front in the calling thread, one per
    submitted task. Each task builds the prompt, queries the model, prints the
    raw reply and parses it.

    Returns:
        A list of dicts with the keys ``Subject``, ``Topic``, ``Example`` and
        ``Codes``, ordered by task completion rather than submission.
    """
    start_time = time.time()

    def process_row(row):
        response = generate_response(generate_prompt(subject, topic, row))
        print(response)
        return {
            'Subject': subject,
            'Topic': topic,
            'Example': extract_examples(response),
            'Codes': row[0],
        }

    generated_examples = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        pending = []
        for _ in range(n_gen):
            sampled_row = random.choice(all_codes_list)
            pending.append(executor.submit(process_row, sampled_row))

        # as_completed yields each future as soon as it finishes.
        generated_examples.extend(
            finished.result() for finished in concurrent.futures.as_completed(pending)
        )

    elapsed_time = time.time() - start_time
    print(f"Elapsed time: {elapsed_time:.2f} seconds; {elapsed_time / 60:.2f} minutes for {n_gen} examples")

    return generated_examples


In [12]:
def convert_to_snake_case(text):
    """Lower-case ``text`` and replace every space character with an underscore.

    Only the space character is replaced; punctuation and other whitespace are
    left as they are.
    """
    lowered = text.lower()
    return "_".join(lowered.split(" "))

In [16]:
PARALLEL = False
N_GEN = 1
SUBJECT = 'Computer Science'
OUTPUT_PATH = './new_samples.csv'

# Rows from every course are collected here so the output file holds all of
# them, not just the rows of whichever course happened to run last.
all_examples = []

for course in courses['Course']:
    # Course titles look like "CS 1114 – Intro to Software Design": the topic is
    # the text after the first en dash, or the whole title when there is none.
    _, dash, after_dash = course.partition("–")
    topic = (after_dash if dash else course).strip()
    print(f"Generating examples for {topic}")

    if PARALLEL:
        examples = generate_examples_parallel(SUBJECT, topic, N_GEN)
    else:
        examples = generate_examples_not_parallel(SUBJECT, topic, N_GEN)
    all_examples.extend(examples)

    # Rewrite the cumulative file after each course so a failure part-way
    # through (e.g. the model server dropping the connection) keeps the rows
    # generated so far.
    pd.DataFrame(all_examples).to_csv(OUTPUT_PATH, index=False)

Generating examples for Intro to Software Design


ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))