# Import Required Libraries
Import the necessary libraries, including requests, json, pandas, and IPython.display.

In [3]:
%pip install -q pandas IPython matplotlib numpy ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [4]:
# Import Required Libraries
import requests
import json
import pandas as pd
import argparse
from pathlib import Path
from typing import Dict, List, Optional
import logging
from datetime import datetime
from IPython.display import display

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Load Input Data and Prompt
Load the input text and prompt template from the specified file paths.

In [5]:
# Set the input file path
input_file_path = Path('../data/raw_text/raw_text_20241213_175758.txt')

# Set the prompt file path
prompt_file_path = Path('../backend/extraction/prompts/prompt_v1.txt')

# Set the system message file path
system_message_file_path = Path('../backend/extraction/prompts/system_message_v0.txt')

In [6]:
# Function to load text from a file
def load_text_file(file_path: str) -> str:
    """Load text from file"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    except Exception as e:
        logging.error(f"Error loading file {file_path}: {str(e)}")
        raise

# Load input text and prompt
input_text = load_text_file(input_file_path)
prompt = load_text_file(prompt_file_path)
system_message = load_text_file(system_message_file_path)

# Format full prompt
full_prompt = prompt.format(text=input_text)

print("Sample of input text:")
print(input_text[:200] + "...")

Sample of input text:
/* Metadata:
{
  "timestamp": "20241213_175758",
  "characters": 48692,
  "lines": 1
}
*/

<<< Return to selection page Campus: Blacksbur g - Term: Fall 2024 CRN Course TitleSchedule TypeModalityCr Hr...


# Test LLM Extraction
Call the Ollama API with the formatted prompt, parse the results, and save them to a CSV file.

In [11]:
def call_ollama(prompt: str, system_prompt: str, temperature: float = 0.2) -> str:
    """Call Ollama API with the given prompt"""
    try:
        response = requests.post(
            'http://localhost:11434/api/generate',
            json={
                'model': 'qwen2.5:14b-instruct',
                'system': system_prompt,
                'prompt': prompt,
                'stream': False,
                'temperature': temperature
            }
        )
        response.raise_for_status()
        return response.json()['response']
    except Exception as e:
        logging.error(f"Error calling Ollama API: {str(e)}")
        raise

def parse_course_data(llm_output: str) -> List[Dict]:
    """Parse LLM output into structured course data"""
    try:
        # Assuming LLM outputs JSON format
        courses = json.loads(llm_output)
        if not isinstance(courses, list):
            courses = [courses]
        return courses
    except json.JSONDecodeError as e:
        logging.error(f"Error parsing LLM output as JSON: {str(e)}")
        logging.error(f"Raw output: {llm_output}")
        return []

In [12]:
# Test LLM Extraction

# Call LLM
logging.info("Calling Ollama API...")
llm_output = call_ollama(full_prompt, system_message)

# Parse results
courses = parse_course_data(llm_output)

# Convert to DataFrame
df = pd.DataFrame(courses)

# Save results
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = f"extracted_courses_{timestamp}.csv"
df.to_csv(output_file, index=False)

print(f"\nExtracted {len(courses)} courses. Results saved to {output_file}")
display(df.head())

2024-12-13 14:53:19,258 - INFO - Calling Ollama API...
2024-12-13 14:54:16,050 - ERROR - Error parsing LLM output as JSON: Expecting value: line 1 column 1 (char 0)
2024-12-13 14:54:16,064 - ERROR - Raw output: Based on the provided course information from Virginia Tech, here are the key details for AOE-7994 (Research and Dissertation) offered both in-person and online during Fall 2023:

### In-Person Sections:
1. **CRN 80711:**
   - Instructor: SS Choi
   - Days/Time: Monday, Wednesday, Friday from 12:30 PM to 1:20 PM
   - Room: McBryde Hall (MB) 450

2. **CRN 80719:**
   - Instructor: EG Paterson
   - Days/Time: Tuesday and Thursday from 11:00 AM to 12:15 PM
   - Room: Randolph Hall (RA) 309

### Online Sections:
1. **CRN 80723:**
   - Instructor: AJ Brown, SL England, WJ Devinport, GD Seidel, and more
   - Format: Virtual Campus (VR)
   - Comments: "Virtual campus"

2. **CRN 80727:**
   - Instructor: CA Woolsey, G Young, JT Black
   - Format: Virtual Campus (VR)
   - Comments: "onli


Extracted 0 courses. Results saved to extracted_courses_20241213_145416.csv


# Evaluation Metrics
Calculate and display quality metrics for the extracted data.

In [None]:
# Evaluation Metrics

def calculate_metrics(df: pd.DataFrame) -> Dict:
    """Calculate quality metrics for extracted data"""
    metrics = {
        'total_courses': len(df),
        'completeness': {}
    }
    
    # Calculate completeness for each field
    for column in df.columns:
        non_null = df[column].notna().sum()
        completeness = (non_null / len(df)) * 100 if len(df) > 0 else 0
        metrics['completeness'][column] = round(completeness, 2)
    
    return metrics

# Calculate and display metrics
metrics = calculate_metrics(df)
print("\nExtraction Metrics:")
print(json.dumps(metrics, indent=2))