In [1]:
# imports

import os
import requests
import json
import time
import pandas as pd
from datetime import datetime
from dotenv import load_dotenv
from IPython.display import Markdown, display
from openai import OpenAI

## Connecting to OpenAI (or Ollama in case of open source)

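The next cell loads the environment variables from your `.env` file, checks that an OpenAI API key is present and looks valid, and selects the model to use. The OpenAI client itself is created later, in the "Bring it together" section.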

In [2]:
# Load environment variables in a file called .env
# (override=True lets values from .env replace ones already set in the process)

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Specify OpenAI model
# MODEL = "gpt-4.1-nano-2025-04-14"
MODEL = "gpt-5-nano"

# Check the key.
# The whitespace check must come before the prefix check: a key with leading
# whitespace fails startswith("sk-proj-") too, and would otherwise be reported
# as having the wrong prefix instead of stray whitespace.

if not api_key:
    print("No API key was found - please troubleshoot to identify & fix!")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key")
else:
    print("API key found and looks good so far!")


API key found and looks good so far!


## Create system and user prompts to input to the model
Models like GPT-4.1 have been trained to receive instructions in a particular way. They expect to receive:

**A system prompt** that tells them what task they are performing and what tone they should use

**A user prompt** -- the conversation starter that they should reply to

In [3]:
# Define our system prompt.
# It fixes the classifier's role, the output contract (a JSON object with
# 'topics' and 'verbatim_text') and the closed list of topics it may assign.
# NOTE: no trailing backslash after the second sentence - inside a triple-quoted
# string a backslash-newline is a line continuation and would glue the text
# together as "predefined list.You must adhere ...".

system_prompt = """You are an expert topic classifier for student feedback from a vocational education and training institution.
Your task is to analyze student verbatims and assign one or more topics from a predefined list.
You must adhere to the following rules:
1.  Match the verbatim to the most relevant topics from the provided topic list.
2.  If the verbatim is not relevant to any topic on the list, return 'No Match'.
3.  You can assign more than one topic if the verbatim covers multiple subjects.
4.  Your output must be a single JSON object.
5.  The JSON object must have two keys: 'topics' and 'verbatim_text'.
6.  The value for 'topics' should be a list of strings. Each string must be a topic from the provided list or the string 'No Match'.
7.  The value for 'verbatim_text' should be the original verbatim you are analyzing.

Below is a list of predefined topics and five examples of how to classify a verbatim.

**Topic List:**
- Enrolment Process
- Student Support Services
- Course Content and Relevance
- Trainer Quality and Engagement
- Facilities and Campus Environment
- Timetable and Scheduling
- Online Learning Platform
- Assessment and Feedback
- Career and Employment Services
- Technology and Equipment
- Communication and Information
- Student Welfare and Wellbeing
- Course Fees and Payments
- Recognition of Prior Learning (RPL)
- Work Placement
- Graduation and Completion"""


In [4]:
# Build the user prompt for one verbatim: five worked few-shot examples
# followed by the new verbatim the model should classify.

def user_prompt_for(verbatim):
    """Return the user-prompt text for a single student verbatim.

    The prompt contains five example verbatim/topic pairs and ends with the
    supplied ``verbatim`` interpolated under "New Verbatim to Classify".
    """
    return f"""
    You are looking at a verbatim from a student. Based on the list of topics provided, below are five examples of how to classify a verbatim.

    **Examples for Few-Shot Classification:**

    1. **Verbatim:** "I'm having trouble with the login for the online portal, and the Wi-Fi on campus is really slow."
       **Topics:** ["Online Learning Platform", "Technology and Equipment"]

    2. **Verbatim:** "John is great! He explains everything clearly and is always available to help after class."
       **Topics:** ["Trainer Quality and Engagement"]

    3. **Verbatim:** "I asked about my results from last semester, but nobody has gotten back to me. I've been waiting for weeks."
       **Topics:** ["Communication and Information", "Assessment and Feedback"]

    4. **Verbatim:** "The campus cafeteria has really limited options, and the library hours are not great for students who work."
       **Topics:** ["Facilities and Campus Environment"]

    5. **Verbatim:** "I've been working as a mechanic for 10 years, and I want to see if I can get credit for my experience towards this course."
       **Topics:** ["Recognition of Prior Learning (RPL)"]

    **New Verbatim to Classify:**
    {verbatim}
    """

**Create Message for the model** : The API from OpenAI expects to receive messages in a particular structure.

```python
[
    {"role": "system", "content": "system message goes here"},
    {"role": "user", "content": "user message goes here"}
]
```

In [5]:
# Assemble the chat payload (system message first, then user message)
def messages_for(verbatim):
    """Return the two-message list sent to the chat API for one verbatim.

    The system message carries ``system_prompt``; the user message carries
    the text produced by ``user_prompt_for(verbatim)``.
    """
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt_for(verbatim)}
    return [system_message, user_message]

## Bring it together

In [6]:
# Create an openai object instance
# NOTE(review): no api_key argument is passed; presumably the client reads
# OPENAI_API_KEY from the environment populated by load_dotenv above - confirm.
# This module-level client is used by every API helper defined below.
openai = OpenAI()

# And now: call the OpenAI API to perform the task.
def return_topics(verbatim):
    """Classify one verbatim and return the model's reply as a JSON string.

    Args:
        verbatim: The student feedback text to classify.

    Returns:
        The content of the first choice's message. Callers parse it with
        ``json.loads``.
    """
    # create message format to the LLM and request response
    response = openai.chat.completions.create(
        model=MODEL,
        messages=messages_for(verbatim),
        # Ask for JSON mode, exactly as the batch requests do, so the reply is
        # a bare JSON object that json.loads() can parse (no prose or fences).
        response_format={"type": "json_object"},
    )
    return response.choices[0].message.content

In [7]:
# Show the classification for one verbatim in the notebook output as Markdown
def display_summary(verbatim):
    """Classify ``verbatim`` via ``return_topics`` and render the reply with Markdown."""
    # Fetch the model's reply, wrap it as Markdown and hand it to the notebook display
    display(Markdown(return_topics(verbatim)))

In [8]:
# Let's try one out
# NOTE: the assignment must not end with a comma - a trailing comma turns the
# value into a one-element tuple, and the prompt would then contain the tuple's
# repr ("('The course ...',)") instead of the plain verbatim text.
test_verbatim = "The course material is outdated; we're still learning about software from five years ago."

# Return json
topics = return_topics(test_verbatim)

# Print result
parsed_output = json.loads(topics)
print(f"Verbatim: {parsed_output['verbatim_text']}")
print(f"Topic(s) Identified: {(', ').join(parsed_output['topics'])}")

Verbatim: The course material is outdated; we're still learning about software from five years ago.
Topic(s) Identified: Course Content and Relevance


## BATCH PROCESSING VERBATIMS

In [9]:
# Derive the batch input/output filenames from the current local time
now = datetime.now()
timestamp = now.strftime("%Y%m%d_%H%M%S")  # e.g., '20250728_164742'
input_filename = "batch_in_{}.jsonl".format(timestamp)    # requests sent to the Batch API
output_filename = "batch_out_{}.json".format(timestamp)   # parsed results saved locally

In [10]:
# 1. Create the JSONL file for the batch

def create_batch_input_input_json(verbatim_list, model=None):
    """Write one Batch API request per verbatim to a JSONL file.

    Args:
        verbatim_list: Iterable of verbatim strings to classify.
        model: Model name to put in every request body. When omitted, the
            notebook-wide ``MODEL`` setting is used so the batch path and the
            single-call path stay on the same model.

    Returns:
        Path of the JSONL file that was written
        (``input/topic_model_batch_inputs/<input_filename>``).
    """
    request_model = MODEL if model is None else model

    # Define a folder to save the input files
    input_folder = "input/topic_model_batch_inputs"
    os.makedirs(input_folder, exist_ok=True)
    input_json_path = os.path.join(input_folder, input_filename)

    with open(input_json_path, "w", encoding="utf-8") as f:
        # custom_id is 1-based ("verbatim_1", "verbatim_2", ...) so results can
        # be linked back to the position of the verbatim in the input list.
        for position, verbatim in enumerate(verbatim_list, start=1):
            batch_request = {
                "custom_id": f"verbatim_{position}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": request_model,
                    # Same system + user messages as the single-call path
                    "messages": messages_for(verbatim),
                    "response_format": {"type": "json_object"},
                },
            }

            # Write the JSON object to the file, followed by a newline
            f.write(json.dumps(batch_request) + '\n')

    print(f"Batch input file '{input_json_path}' created successfully.")

    return input_json_path


In [11]:
# 2. Upload the file to OpenAI
def upload_to_openai(batch_file_path):
    """Upload a batch input file and return the ID assigned to it.

    Args:
        batch_file_path: Path of the JSONL batch input file.

    Returns:
        The ``id`` of the uploaded file object.

    Raises:
        Exception: Any error from opening or uploading the file is reported
            and then re-raised, so the failing cell stops with a traceback
            instead of ending the whole session via ``exit()``.
    """
    try:
        # The context manager closes the file handle even if the upload fails
        with open(batch_file_path, "rb") as batch_file:
            batch_input_file = openai.files.create(
                file=batch_file,
                purpose="batch"
            )
        input_file_id = batch_input_file.id
        print(f"File uploaded with ID: {input_file_id}")
    except Exception as e:
        print(f"An error occurred during file upload: {e}")
        raise

    return input_file_id

In [12]:
# 3. Create a batch job
def create_batch_job(input_file_id):
    """Start a chat-completions batch job for an uploaded input file.

    Args:
        input_file_id: ID of the previously uploaded batch input file.

    Returns:
        The ``id`` of the created batch job.

    Raises:
        Exception: Any error from the API call is reported and then re-raised,
            so the failing cell stops with a traceback instead of ending the
            whole session via ``exit()``.
    """
    try:
        batch_job = openai.batches.create(
            input_file_id=input_file_id,
            endpoint="/v1/chat/completions",
            completion_window="24h" # Currently the only supported value
        )
        batch_job_id = batch_job.id
        print(f"Batch job created with ID: {batch_job_id}")
    except Exception as e:
        print(f"An error occurred during batch job creation: {e}")
        raise

    return batch_job_id


In [13]:
# 4. Poll for results (This is an async process)

def poll_for_results(batch_job_id, poll_interval=30):
    """Poll a batch job until it reaches a terminal status or a poll fails.

    Args:
        batch_job_id: ID of the batch job to watch.
        poll_interval: Seconds to sleep between polls (default 30).

    Returns:
        A ``(retrieved_job, status)`` tuple holding the most recently
        retrieved job object and its status. Polling is best-effort: if a
        retrieval raises, the error is printed and the last known values are
        returned - ``(None, None)`` when even the first retrieval failed.
    """
    print(f"Waiting for batch job {batch_job_id} to complete...\n")

    # Pre-bind the results so a failure on the very first poll cannot leave
    # them undefined at the return statement.
    retrieved_job, status = None, None
    terminal_statuses = {"completed", "failed", "cancelled", "expired"}
    attempt = 1

    while True:
        try:
            retrieved_job = openai.batches.retrieve(batch_job_id)
            status = retrieved_job.status
        except Exception as e:
            print(f"An error occurred while retrieving job status: {e}")
            break

        print(f"Attempt {attempt} : Current status: {status}")
        if status in terminal_statuses:
            break

        # Wait for a reasonable amount of time before polling again
        time.sleep(poll_interval)
        attempt += 1

    return retrieved_job, status

In [15]:
# 5. Download and display results as a DataFrame

def _parse_batch_output(output_text):
    """Parse Batch API output JSONL text into a list of result records.

    Each record has the keys 'id', 'verbatim' and 'topics'. Lines that cannot
    be parsed (invalid JSON, a request with no usable response, missing keys)
    are reported and skipped.
    """
    parsed_results = []

    for line in output_text.split('\n'):
        if not line.strip():
            continue  # blank lines (e.g. an empty output file) carry no result

        try:
            # Load the full JSON object for the line
            batch_result = json.loads(line)

            # Extract the custom_id to link it back to the original verbatim
            custom_id = batch_result.get('custom_id')

            # Get the model's response
            response_content = batch_result['response']['body']['choices'][0]['message']['content']

            # The response content is a JSON string, so we need to parse it again
            classification_data = json.loads(response_content)

            parsed_results.append({
                'id': custom_id,
                'verbatim': classification_data['verbatim_text'],
                'topics': classification_data['topics']
            })
        except (json.JSONDecodeError, KeyError, IndexError, TypeError, AttributeError) as e:
            # TypeError/IndexError/AttributeError cover lines whose 'response'
            # is null, whose 'choices' is empty, or whose payload is not an object.
            print(f"Error parsing line: {line}. Error: {e}")

    return parsed_results


def download_batch_output(retrieved_job, status):
    """Download a finished batch job's output, save it, and return a DataFrame.

    Args:
        retrieved_job: Batch job object from the last poll; may be ``None``
            when polling never succeeded.
        status: Final status string of the job.

    Returns:
        A DataFrame with the columns 'id', 'verbatim' and 'topics' (topics
        joined into a comma-separated string). The frame is empty when the job
        did not complete, produced no output file, or no line could be parsed.
    """
    result_columns = ['id', 'verbatim', 'topics']
    # Returned unchanged on every path that yields no results
    df = pd.DataFrame(columns=result_columns)

    if status != "completed":
        print(f"Batch job failed or was not completed. Final status: {status}")
        # retrieved_job can be None here, so look the attribute up defensively
        error_file_id = getattr(retrieved_job, "error_file_id", None)
        if error_file_id:
            error_content = openai.files.content(error_file_id).text
            print("\n--- Error Log ---")
            print(error_content)
        return df

    output_file_id = retrieved_job.output_file_id
    if not output_file_id:
        print("Batch job completed but no output file was found.")
        return df

    # Download the file content and parse it line by line
    output_content_string = openai.files.content(output_file_id).text
    parsed_results = _parse_batch_output(output_content_string)

    # Define a folder to save the output files
    output_folder = "output/topic_model_batch_outputs"
    os.makedirs(output_folder, exist_ok=True)
    output_json_path = os.path.join(output_folder, output_filename)

    with open(output_json_path, 'w', encoding='utf-8') as f:
        # Use json.dump to write the list of dictionaries to the file
        json.dump(parsed_results, f, indent=4)

    print(f"\nBatch results successfully saved to '{output_json_path}'.\n")

    # Passing the column list keeps the 'topics' column present even when
    # parsed_results is empty.
    df = pd.DataFrame(parsed_results, columns=result_columns)

    # Let's clean up the 'topics' column for better display
    df['topics'] = df['topics'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

    # Display the DataFrame
    print("\n--- Final Results DataFrame ---")
    display(df) # Using display to render the DataFrame nicely in Jupyter/IPython

    return df

## Create test set and call the functions

In [16]:
# Sample list of verbatims used as the batch test set.
# Each string becomes one batch request; its position in this list determines
# the request's custom_id ("verbatim_1" for the first entry, and so on).
# NOTE(review): the last entry presumably exists to exercise the 'No Match'
# rule in the system prompt - confirm.
verbatims_to_classify = [
    "My classes are all over the place. I have to come to campus three times a week for just one or two hours each time.",
    "The course material is outdated; we're still learning about software from five years ago.",
    "I tried to get help from the student welfare office, but they were closed.",
    "My final assignment feedback was very vague, and I don't know what to improve on.",
    "I'm not sure if this is the right career path for me after finishing this course.",
    "The computer labs have really old computers, and some don't even work properly.",
    "The process to apply for this course was so confusing and the website kept crashing.",
    "My work placement was very unorganized and I felt like I didn't learn anything.",
    "The person who was meant to help me with my enrolment never got back to me.",
    "The cost of the textbooks is way too high, and I'm not sure if I can afford them.",
    "This feedback is not about any of the topics.",
]

In [17]:
# Calling all functions in sequence for batch processing.
# Each step consumes the value produced by the step before it, so the order
# below must be preserved.

# Step 1: Create input batch json (one request line per verbatim); returns the file path
input_batch_json_filepath = create_batch_input_input_json(verbatims_to_classify)

# Step 2: Upload batch to openai; returns the uploaded file's ID
input_file_id = upload_to_openai(input_batch_json_filepath)

# Step 3: Create batch job for the uploaded file; returns the job ID
batch_job_id = create_batch_job(input_file_id)

# Step 4: Poll for results (async) - blocks this cell until polling stops
retrieved_job, status = poll_for_results(batch_job_id)

# Step 5: Download batch output, save it as JSON and display it as a DataFrame
result_df = download_batch_output(retrieved_job, status)

Batch input file 'input/topic_model_batch_inputs/batch_in_20250808_095158.jsonl' created successfully.
File uploaded with ID: file-NMJ6LCa2og86zN2MjjGEHZ
Batch job created with ID: batch_68953c43af608190bed5c16c2d64ff42
Waiting for batch job batch_68953c43af608190bed5c16c2d64ff42 to complete...

Attempt 1 : Current status: validating
Attempt 2 : Current status: in_progress
Attempt 3 : Current status: in_progress
Attempt 4 : Current status: completed

Batch results successfully saved to 'output/topic_model_batch_outputs/batch_out_20250808_095158.json'.


--- Final Results DataFrame ---


Unnamed: 0,id,verbatim,topics
0,verbatim_1,My classes are all over the place. I have to c...,Timetable and Scheduling
1,verbatim_2,The course material is outdated; we're still l...,Course Content and Relevance
2,verbatim_3,I tried to get help from the student welfare o...,Student Support Services
3,verbatim_4,"My final assignment feedback was very vague, a...",Assessment and Feedback
4,verbatim_5,I'm not sure if this is the right career path ...,Career and Employment Services
5,verbatim_6,"The computer labs have really old computers, a...",Facilities and Campus Environment
6,verbatim_7,The process to apply for this course was so co...,Enrolment Process
7,verbatim_8,My work placement was very unorganized and I f...,Work Placement
8,verbatim_9,The person who was meant to help me with my en...,Enrolment Process
9,verbatim_10,"The cost of the textbooks is way too high, and...",Course Fees and Payments
