In [1]:
import os
import json
import time
import pandas as pd
from datetime import datetime
from ollama import Client # Import Ollama client
from concurrent.futures import ThreadPoolExecutor, as_completed # For concurrent processing

## Connecting to Ollama and checking if model exists

If the model is not found locally, the next cell uses `subprocess` to run `ollama pull` and download it.

In [2]:
# Constants
OLLAMA_HOST = "http://localhost:11434" # Default Ollama host; passed to the Ollama Client created below
HEADERS = {"Content-Type": "application/json"} # NOTE(review): not referenced in the visible cells - confirm it is still needed
MODEL = "qwen3:30b" # Model tag checked/pulled via the ollama CLI and used for every chat request

In [3]:
import subprocess

# Check if the model is already downloaded, and pull it if it is not.
try:
    result = subprocess.run(['ollama', 'list'], capture_output=True, text=True, check=True)

    # `ollama list` prints one row per model with the model name (including its
    # tag) in the first whitespace-separated column. Compare whole names: a
    # substring test would treat e.g. 'qwen3:30b-a3b' as a match for 'qwen3:30b'.
    # The header row is harmless because its first column is never a model name.
    installed_models = {
        line.split()[0] for line in result.stdout.splitlines() if line.strip()
    }
    # A model requested without a tag is listed under the implicit ':latest' tag.
    wanted_names = {MODEL} if ':' in MODEL else {MODEL, f"{MODEL}:latest"}

    if installed_models & wanted_names:
        print(f"Model '{MODEL}' found.")
    else:
        print(f"Model '{MODEL}' not found. Downloading...")
        # Download the model (output is streamed to the console, not captured)
        subprocess.run(['ollama', 'pull', MODEL], check=True)
        print(f"Model '{MODEL}' downloaded successfully.")
except subprocess.CalledProcessError as e:
    print(f"Error checking or downloading Ollama model: {e}")
    # stderr is only captured for `ollama list`; skip the line when there is nothing to show
    if e.stderr:
        print(f"Stderr: {e.stderr}")
except FileNotFoundError:
    print("Ollama command not found. Please ensure Ollama is installed and in your system's PATH.")


Model 'qwen3:30b' found.


## Create system and user prompts to input to the model
Most models have been trained to receive instructions in a particular way. They expect to receive:

**A system prompt** that tells them what task they are performing and what tone they should use

**A user prompt** -- the conversation starter that they should reply to

In [4]:
# Define the system prompt.
# It fixes the classifier role, the allowed topics and sentiment labels, and the
# JSON output schema. messages_for() sends it as the "system" message of every request.
system_prompt = """You are an expert topic and sentiment classifier for student feedback from a vocational education and training institution.

**CRITICAL: You MUST respond with ONLY valid JSON. No explanations, no thinking process, no <think> tags.**

Your task is to analyse student verbatims and, for each identified topic from a predefined list, assign a sentiment label.

**Topic List:**
- Enrolment Process
- Student Support Services
- Course Content and Relevance
- Trainer Quality and Engagement
- Facilities and Campus Environment
- Timetable and Scheduling
- Online Learning Platform
- Assessment and Feedback
- Career and Employment Services
- Technology and Equipment
- Communication and Information
- Student Welfare and Wellbeing
- Course Fees and Payments
- Recognition of Prior Learning (RPL)
- Work Placement
- Graduation and Completion

**Sentiment Labels:**
- Positive
- Negative
- Neutral

**Classification Rules:**
1. Match the verbatim to the most relevant topics from the topic list above
2. If not relevant to any topic, use 'No Match' as the topic with 'N/A' sentiment
3. You can assign multiple topics if the verbatim covers multiple subjects
4. For each topic, assign: 'Positive', 'Negative', or 'Neutral'
5. If a topic is mentioned without clear sentiment (statement of fact), use 'Neutral'

**MANDATORY JSON OUTPUT FORMAT:**
{
  "verbatim_text": "the original student verbatim",
  "topics": [
    {"topic": "Topic Name From List", "sentiment": "Positive"},
    {"topic": "Another Topic Name", "sentiment": "Negative"}
  ]
}

**CRITICAL FORMAT RULES:**
- Each topic object MUST have EXACTLY two keys: "topic" and "sentiment"
- "topic" value MUST be a string from the Topic List (or "No Match")
- "sentiment" value MUST be one of: "Positive", "Negative", "Neutral", "N/A"
- NO extra keys allowed in topic objects
- NO nested structures"""


In [5]:
# Builds the user prompt that asks the model to classify a single verbatim.
def user_prompt_for(verbatim):
    """Return the few-shot user prompt with `verbatim` inserted as the text to classify.

    The prompt repeats the output-format instruction, shows five worked
    examples, and ends with the new verbatim followed by a JSON output cue.
    """
    return f"""You are looking at a verbatim from a student. Based on the list of topics and sentiment labels provided in the system prompt, classify the verbatim below.

**IMPORTANT: Respond with ONLY a JSON object in the exact format shown in the examples.**

**Examples for Few-Shot Classification:**

1. **Verbatim:** "My trainer is great, but the course content is outdated."
   **Output:**
   {{
     "verbatim_text": "My trainer is great, but the course content is outdated.",
     "topics": [
       {{"topic": "Trainer Quality and Engagement", "sentiment": "Positive"}},
       {{"topic": "Course Content and Relevance", "sentiment": "Negative"}}
     ]
   }}

2. **Verbatim:** "I'm having trouble with the login for the online portal, and the Wi-Fi on campus is really slow."
   **Output:**
   {{
     "verbatim_text": "I'm having trouble with the login for the online portal, and the Wi-Fi on campus is really slow.",
     "topics": [
       {{"topic": "Online Learning Platform", "sentiment": "Negative"}},
       {{"topic": "Technology and Equipment", "sentiment": "Negative"}}
     ]
   }}

3. **Verbatim:** "The new library on campus is fantastic, and the resources are excellent."
   **Output:**
   {{
     "verbatim_text": "The new library on campus is fantastic, and the resources are excellent.",
     "topics": [
       {{"topic": "Facilities and Campus Environment", "sentiment": "Positive"}}
     ]
   }}

4. **Verbatim:** "I received an email about my enrolment but it is useless information. I just want to know the status of my application."
   **Output:**
   {{
     "verbatim_text": "I received an email about my enrolment but it is useless information. I just want to know the status of my application.",
     "topics": [
       {{"topic": "Enrolment Process", "sentiment": "Neutral"}},
       {{"topic": "Communication and Information", "sentiment": "Negative"}}
     ]
   }}

5. **Verbatim:** "I've been working as a mechanic for 10 years, and I want to see if I can get credit for my experience towards this course."
   **Output:**
   {{
     "verbatim_text": "I've been working as a mechanic for 10 years, and I want to see if I can get credit for my experience towards this course.",
     "topics": [
       {{"topic": "Recognition of Prior Learning (RPL)", "sentiment": "Neutral"}}
     ]
   }}

**New Verbatim to Classify:**
{verbatim}

**Your JSON Output:**"""

**Create messages for the model**: The Ollama chat API (like the OpenAI chat API) expects to receive messages as a list of role/content dictionaries in this structure:

```python
[
    {"role": "system", "content": "system message goes here"},
    {"role": "user", "content": "user message goes here"}
]
```

In [6]:
# Assemble the chat messages sent to the model for one verbatim
def messages_for(verbatim):
    """Return the [system, user] message list for classifying `verbatim`."""
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt_for(verbatim)}
    return [system_message, user_message]

## Bring it together

In [7]:
import re

def strip_think_tags(response_content):
    """Return the response with every <think>...</think> section removed and outer whitespace trimmed."""
    # DOTALL lets a reasoning section span several lines; the lazy quantifier
    # stops at the first closing tag so separate sections are removed one by one.
    think_section = re.compile(r'<think>.*?</think>', re.DOTALL)
    without_reasoning = think_section.sub('', response_content)
    return without_reasoning.strip()

def _normalise_sentiment(raw_sentiment):
    """Map a free-form sentiment value onto 'Positive', 'Negative', 'Neutral' or 'N/A'."""
    lowered = str(raw_sentiment).strip().lower()

    # Exact, case-insensitive matches first so that 'n/a' keeps its meaning
    # instead of falling through to the 'Neutral' default.
    canonical = {
        'positive': 'Positive',
        'negative': 'Negative',
        'neutral': 'Neutral',
        'n/a': 'N/A',
    }
    if lowered in canonical:
        return canonical[lowered]

    # Otherwise look for a sentiment word inside a longer phrase.
    if 'positive' in lowered:
        return 'Positive'
    if 'negative' in lowered:
        return 'Negative'
    return 'Neutral'


def _salvage_topic_item(item):
    """Best-effort extraction of (topic, sentiment) from a dict that lacks the standard keys."""
    explicit_topic = explicit_sentiment = None
    guessed_topic = guessed_sentiment = None
    sentiment_words = ('positive', 'negative', 'neutral')

    for key, value in item.items():
        # Non-string keys cannot name a topic or sentiment; only their value is inspected.
        key_lower = key.lower() if isinstance(key, str) else None

        if key_lower == 'topic':
            explicit_topic = explicit_topic or value
        elif key_lower == 'sentiment':
            explicit_sentiment = explicit_sentiment or value
        elif key_lower in sentiment_words:
            # e.g. {"Positive": "..."}: the key itself is the sentiment.
            guessed_sentiment = guessed_sentiment or key.title()
        elif key_lower is not None and len(key) > 3:
            # e.g. {"Timetable and Scheduling": "Negative"}: the key is the topic.
            guessed_topic = guessed_topic or key

        # A sentiment word inside any string value is a fallback sentiment.
        if isinstance(value, str) and not guessed_sentiment:
            value_lower = value.lower()
            for word in sentiment_words:
                if word in value_lower:
                    guessed_sentiment = word.title()
                    break

    # Explicitly labelled values always win over guesses, whatever the key order.
    return explicit_topic or guessed_topic, explicit_sentiment or guessed_sentiment


def validate_and_fix_topics(topics_data):
    """Validate the model's topic list and repair malformed entries where possible.

    Args:
        topics_data: The value of the "topics" key from the model response;
            expected to be a list of {"topic": ..., "sentiment": ...} dicts.

    Returns:
        A non-empty list of {"topic": str, "sentiment": str} dicts, where
        sentiment is one of 'Positive', 'Negative', 'Neutral' or 'N/A'.
        If the input is not a list, or no entry yields both a topic and a
        sentiment, the single entry {"topic": "No Match", "sentiment": "N/A"}
        is returned.
    """
    if not isinstance(topics_data, list):
        return [{"topic": "No Match", "sentiment": "N/A"}]

    validated = []

    for item in topics_data:
        if not isinstance(item, dict):
            continue

        if 'topic' in item and 'sentiment' in item:
            # Standard format
            topic, sentiment = item['topic'], item['sentiment']
        else:
            # Try to salvage malformed structures
            topic, sentiment = _salvage_topic_item(item)

        if not topic or not sentiment:
            continue

        topic = str(topic).strip()
        if not topic:
            # A whitespace-only topic carries no information.
            continue

        validated.append({
            "topic": topic,
            "sentiment": _normalise_sentiment(sentiment)
        })

    if not validated:
        validated = [{"topic": "No Match", "sentiment": "N/A"}]

    return validated

def clean_and_parse_json(response_content):
    """Parse the model response as JSON, returning None when nothing parseable is found.

    <think>...</think> sections are stripped first. The cleaned text is parsed
    as-is; if that fails, the span from the first '{' to the last '}' is tried.
    """
    text = strip_think_tags(response_content)

    # First choice: the whole cleaned response is valid JSON.
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Second choice: a JSON object embedded in surrounding text.
    candidate = re.search(r'\{.*\}', text, re.DOTALL)
    if candidate is None:
        return None

    try:
        return json.loads(candidate.group(0))
    except Exception:
        return None

# Create one Ollama client pointed at OLLAMA_HOST; return_topics() uses it for every chat request
ollama_client = Client(host=OLLAMA_HOST)

def return_topics(verbatim_obj, max_retries=2):
    """Classify one verbatim with the Ollama model and return its topics and sentiments.

    Args:
        verbatim_obj: Mapping with the keys 'custom_id' and 'verbatim_text'.
        max_retries: Number of additional attempts made after a failed first attempt.

    Returns:
        A dict with 'custom_id', 'verbatim_text' and 'topics_and_sentiments'
        (a list of {"topic", "sentiment"} dicts). When every attempt fails, the
        list holds a single "Error: ..." placeholder and an extra 'error' key
        describes the failure.

    Raises:
        KeyError: If verbatim_obj lacks 'custom_id' or 'verbatim_text'.
    """
    verbatim = verbatim_obj['verbatim_text']
    custom_id = verbatim_obj['custom_id']
    total_attempts = max_retries + 1

    for attempt in range(total_attempts):
        attempt_label = f"Attempt {attempt + 1}/{total_attempts}"
        can_retry = attempt < max_retries

        try:
            response = ollama_client.chat(
                model=MODEL,
                messages=messages_for(verbatim),
                options={
                    "temperature": 0.0,
                    "top_p": 0.9,
                    "num_predict": 300,
                },
                format="json"
            )

            # Get Ollama response content
            response_content = response['message']['content']

            # Strips <think> tags and falls back to extracting an embedded JSON
            # object; returns None when nothing can be parsed.
            classification_data = clean_and_parse_json(response_content)

            # Anything other than a JSON object (None, list, string, ...) is an invalid response.
            if not isinstance(classification_data, dict):
                print(f"JSON decoding error for ID {custom_id} ({attempt_label}): response is not a JSON object")
                print(f"Raw response: {response_content[:200]}...")

                if can_retry:
                    time.sleep(0.5)
                    continue
                return {
                    'custom_id': custom_id,
                    'topics_and_sentiments': [{"topic": "Error: Invalid Response", "sentiment": "N/A"}],
                    'verbatim_text': verbatim,
                    'error': 'JSON decode failed after retries'
                }

            # Extract topics
            topics = classification_data.get('topics', [])

            # Check the RAW list here: validate_and_fix_topics() always returns
            # at least a "No Match" entry, so testing its result would never
            # trigger a retry.
            if not isinstance(topics, list) or not topics:
                print(f"Warning: Empty topics for ID {custom_id}. {attempt_label}")
                if can_retry:
                    time.sleep(0.5)
                    continue

            # Validate and fix malformed topics; falls back to "No Match" when
            # nothing usable remains after the final attempt.
            return {
                'custom_id': custom_id,
                'topics_and_sentiments': validate_and_fix_topics(topics),
                'verbatim_text': verbatim
            }

        except Exception as e:
            # Covers failed API calls and unexpected response shapes.
            print(f"Error processing verbatim ID {custom_id} ({attempt_label}): {e}")

            if can_retry:
                time.sleep(0.5)
                continue
            return {
                'custom_id': custom_id,
                'topics_and_sentiments': [{"topic": "Error: API Call Failed", "sentiment": "N/A"}],
                'verbatim_text': verbatim,
                'error': str(e)
            }

## Create Batch 

In [8]:
# Sample batch used to verify the model quality. For an actual implementation, replace it with
# verbatims fetched from the backend: use separate DB connection strings and queries to fetch the
# data and rearrange it into this format (a list of plain strings).

# Sample list of verbatims
verbatims_to_classify = [
    "My classes are all over the place. I have to come to campus three times a week for just one or two hours each time.",
    "The course material is outdated; we're still learning about software from five years ago.",
    "I tried to get help from the student welfare office, but they were closed.",
    "My final assignment feedback was very vague, and I don't know what to improve on.",
    "I'm not sure if this is the right career path for me after finishing this course.",
    "The computer labs have really old computers, and some don't even work properly.",
    "The process to apply for this course was so confusing, but the website is quite easy to navigate.",
    "My work placement was very unorganized and I felt like I didn't learn anything.",
    "The person who was meant to help me with my enrolment never got back to me.",
    "The cost of the textbooks is way too high, and I'm not sure if I can afford them.",
    "This feedback is not about any of the topics.",
    "I need help with my resume and job applications after I graduate.",
    "The fees for next semester seem to have increased without much warning. The student support staff is trying to be helpful but they cannot do much either",
    "The campus security could be better, I don't feel entirely safe at night.",
    "I really enjoy the practical exercises in this course; they are very relevant to industry.",
]

In [9]:
# Prepare data for concurrent processing

# Note: This is test code, so a custom ID is generated to track each verbatim. In proper
# environments, each verbatim will already have a unique ID. The suggested format of ID would be
# responseID_qid to get a unique verbatim and maintain traceability.
data_for_processing = [
    {"custom_id": f"verbatim_{position}", "verbatim_text": text}
    for position, text in enumerate(verbatims_to_classify, start=1)
]


In [10]:
# --- Concurrent Processing with ThreadPoolExecutor ---

# max_workers should be adjusted based on your system's resources (CPU cores, VRAM)
# and OLLAMA_NUM_PARALLEL setting. Start with a small number like 2-4.
# This value is passed as max_workers to the ThreadPoolExecutor in the batch-processing cell.
MAX_CONCURRENT_REQUESTS = 2

In [11]:
print(f"\nStarting batch processing with {MAX_CONCURRENT_REQUESTS} concurrent requests...")

with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_REQUESTS) as executor:
    # Submit all verbatims to the executor
    job_queue = {executor.submit(return_topics, item): item for item in data_for_processing}

    # as_completed is used only for the progress indicator; results are
    # collected afterwards so they do not end up in completion order.
    completed_count = 0
    for job in as_completed(job_queue):
        if job.result(): # Only count results that are not None (in case of total failure)
            completed_count += 1
        print(f"Processed {completed_count}/{len(data_for_processing)} verbatims...", end='\r')

# Dicts iterate in insertion order, i.e. the order the jobs were submitted, so
# parsed_results lines up with data_for_processing on every run.
parsed_results = []
for job in job_queue:
    result = job.result()
    if result: # Only add if result is not None (in case of total failure)
        parsed_results.append(result)

print("\nBatch processing complete.")


Starting batch processing with 2 concurrent requests...
Processed 15/15 verbatims...
Batch processing complete.


In [12]:
# Print every classified verbatim followed by each of its topic/sentiment pairs
for number, entry in enumerate(parsed_results, start=1):
    print(f"Verbatim {number} : {entry['verbatim_text']}")

    for pair in entry['topics_and_sentiments']:
        # Raw dict first, then the same data as labelled lines
        print(pair)
        print(f"\tTopic : {pair['topic']}")
        print(f"\tSentiment : {pair['sentiment']}\n")

    print("")

Verbatim 1 : My classes are all over the place. I have to come to campus three times a week for just one or two hours each time.
{'topic': 'Timetable and Scheduling', 'sentiment': 'Negative'}
	Topic : Timetable and Scheduling
	Sentiment : Negative


Verbatim 2 : The course material is outdated; we're still learning about software from five years ago.
{'topic': 'Course Content and Relevance', 'sentiment': 'Negative'}
	Topic : Course Content and Relevance
	Sentiment : Negative


Verbatim 3 : I tried to get help from the student welfare office, but they were closed.
{'topic': 'Student Welfare and Wellbeing', 'sentiment': 'Negative'}
	Topic : Student Welfare and Wellbeing
	Sentiment : Negative


Verbatim 4 : My final assignment feedback was very vague, and I don't know what to improve on.
{'topic': 'Assessment and Feedback', 'sentiment': 'Negative'}
	Topic : Assessment and Feedback
	Sentiment : Negative


Verbatim 5 : I'm not sure if this is the right career path for me after finishing thi

In [13]:
def format_topics_and_sentiments(topics_list):
    """Render topic/sentiment dicts as one 'Topic (Sentiment), ...' string.

    Returns an empty string when topics_list has no entries.
    """
    if len(topics_list) == 0:
        return ""

    # One "Topic (Sentiment)" fragment per entry, separated by comma and space
    return ", ".join(
        "{} ({})".format(entry['topic'], entry['sentiment']) for entry in topics_list
    )


In [14]:
# Convert the results to a dataframe, rendering each topics list as a readable
# string and showing 'No Match' where that string is blank
df = pd.DataFrame(parsed_results).assign(
    topics_and_sentiments=lambda frame: (
        frame['topics_and_sentiments']
        .apply(format_topics_and_sentiments)
        .apply(lambda text: 'No Match' if len(text)==0 else text)
    )
)

# To export the dataframe to CSV for further use, uncomment the 2 lines below. By default
# there is a custom ID for each verbatim; this will need to be tweaked based on the input structure.
# The suggested format of ID would be responseID_qid to get a unique verbatim and maintain traceability.

# df.to_csv('result_verbatims.csv', index=False)
# print("Data exported to result_verbatims.csv")

print("\n--- Final Results DataFrame ---")
display(df[['verbatim_text','topics_and_sentiments']])


--- Final Results DataFrame ---


Unnamed: 0,verbatim_text,topics_and_sentiments
0,My classes are all over the place. I have to c...,Timetable and Scheduling (Negative)
1,The course material is outdated; we're still l...,Course Content and Relevance (Negative)
2,I tried to get help from the student welfare o...,Student Welfare and Wellbeing (Negative)
3,"My final assignment feedback was very vague, a...",Assessment and Feedback (Negative)
4,I'm not sure if this is the right career path ...,Career and Employment Services (Neutral)
5,"The computer labs have really old computers, a...",Technology and Equipment (Negative)
6,The process to apply for this course was so co...,"Enrolment Process (Negative), Online Learning ..."
7,My work placement was very unorganized and I f...,Work Placement (Negative)
8,The person who was meant to help me with my en...,Enrolment Process (Negative)
9,"The cost of the textbooks is way too high, and...",Course Fees and Payments (Negative)


In [15]:
# Test 1: a verbatim covering two topics with opposite sentiments
test_verbatim = {
    "custom_id": "test_1",
    "verbatim_text": "My trainer is great, but the course content is outdated.",
}
result = return_topics(test_verbatim)

for number, pair in enumerate(result['topics_and_sentiments'], start=1):
    print(f"Topic {number}: {pair['topic']}")
    print(f"Sentiment: {pair['sentiment']}\n")


Topic 1: Trainer Quality and Engagement
Sentiment: Positive

Topic 2: Course Content and Relevance
Sentiment: Negative



In [16]:
# Test 2: a verbatim that mentions a topic without a clear sentiment
test_verbatim = {
    "custom_id": "test_2",
    "verbatim_text": "I'm not sure if this is the right career path for me after finishing this course.",
}
result = return_topics(test_verbatim)

for number, pair in enumerate(result['topics_and_sentiments'], start=1):
    print(f"Topic {number}: {pair['topic']}")
    print(f"Sentiment: {pair['sentiment']}\n")

Topic 1: Career and Employment Services
Sentiment: Neutral

