# B.3 Category Classification

This notebook contains our LLM pipeline for the category classification. It additionally incorporates our initial location classification (whether a specific location is mentioned or not), however, we decided to not utilize these results for our analysis, as we conducted an additional, more precise location classification in notebook B.4.

In [None]:
import requests
import pickle
import yaml
import os
import google.generativeai as genai
import pandas as pd
import time
from tqdm import tqdm
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
url = "https://raw.githubusercontent.com/nicosrp/The-Architecture-of-Aspiration-A-Network-Perspective-on-Human-Goals/main/Networks/Prior%20Network%20Versions/b1_network.pkl"
response = requests.get(url)
response.raise_for_status()

G = pickle.loads(response.content)

In [3]:
# show all node attribute names
print(next(iter(G.nodes(data=True)))[1].keys())

dict_keys(['title', 'description', 'wants_to_do', 'have_done', 'comments', 'tags', 'included_by_our_users', 'merged_goals'])


## Setup Classification Pipeline

Load configuration and set classification parameters

In [4]:
# Load configuration
with open('../Data/Classification Setup Data/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Load categories
with open('../Data/Classification Setup Data/categories.yaml', 'r') as f:
    categories_config = yaml.safe_load(f)
    
print(f"Loaded {len(categories_config['categories'])} categories for classification")
print(f"Loaded {len(categories_config['locations'])} location categories for classification")
print(f"Using model: {config['api']['model']}")

Loaded 15 categories for classification
Loaded 2 location categories for classification
Using model: models/gemma-3-27b-it


In [5]:
# CLASSIFICATION SETTINGS

# Load API key from config file
GEMINI_API_KEY = config['api'].get('api_key', None)
if not GEMINI_API_KEY:
    # Try environment variable as fallback
    GEMINI_API_KEY = os.getenv('GEMINI_API_KEY', None)
    
if not GEMINI_API_KEY:
    print("WARNING: API_KEY not found!")
    print("Please set it in config.yaml or as environment variable")
else:
    print("API key loaded successfully")

# Classification mode
MUTUALLY_EXCLUSIVE = True
# Justification settings
INCLUDE_REASONING = True  # Set to False to skip reasoning

# Processing options
MAX_NODES_TO_PROCESS = None  # Set to a number (e.g., 100) to test on subset, None for all nodes
START_FROM_NODE = 0  # Skip first N nodes
SAVE_PROGRESS_EVERY = 50  # Save intermediate results every N nodes

# Display settings
VERBOSE = True  # Show detailed progress
SHOW_SAMPLE_CLASSIFICATIONS = 5  # Number of sample results to display

print(f"Classification Mode: {'MUTUALLY EXCLUSIVE (single category)' if MUTUALLY_EXCLUSIVE else 'MULTI-LABEL (multiple categories)'}")
print(f"Include Reasoning: {'Yes' if INCLUDE_REASONING else 'No'}")
print(f"Nodes to process: {MAX_NODES_TO_PROCESS if MAX_NODES_TO_PROCESS else 'All'}")
print(f"Starting from node: {START_FROM_NODE}")

API key loaded successfully
Classification Mode: MUTUALLY EXCLUSIVE (single category)
Include Reasoning: Yes
Nodes to process: All
Starting from node: 0


## Initialize Gemini API and Classification Functions

In [6]:
# Configure Gemini API
if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
    
    # Initialize model with generation config
    generation_config = {
        "temperature": config['api']['temperature'],
        "top_p": config['api']['top_p'],
        "max_output_tokens": config['api']['max_tokens'],
    }
    
    # Only add JSON mode for models that support it (not Gemma)
    if 'gemma' not in config['api']['model'].lower():
        generation_config["response_mime_type"] = "application/json"
    
    model = genai.GenerativeModel(
        model_name=config['api']['model'],
        generation_config=generation_config,
    )
    
    print("Gemini model initialized successfully")
    if 'gemma' in config['api']['model'].lower():
        print("  Note: Using Gemma model - JSON extraction from text")
else:
    print("Cannot initialize model without API key")

Gemini model initialized successfully
  Note: Using Gemma model - JSON extraction from text


In [7]:
def create_classification_prompt(title: str, description: str, categories: list, locations: list, mutually_exclusive: bool, include_reasoning: bool) -> str:
    """
    Create a prompt for the LLM to classify a goal into both category and location.
    """
    categories_text = "\n".join([f"- {cat['name']}: {cat['description']}" for cat in categories])
    locations_text = "\n".join([f"- {loc['name']}: {loc['description']}" for loc in locations])
    
    if mutually_exclusive:
        category_instruction = "Select ONLY ONE category that best matches this goal."
        if include_reasoning:
            output_format = '''{"category": "Category Name", "category_confidence": 0.XX, "location": "Location Category", "location_confidence": 0.XX, "reasoning": "Brief explanation"}'''
        else:
            output_format = '''{"category": "Category Name", "category_confidence": 0.XX, "location": "Location Category", "location_confidence": 0.XX}'''
    else:
        category_instruction = "Select categories that truly fit this goal. Be selective - only include categories where you have HIGH confidence (≥ 0.75). Multiple categories should only be selected if the goal genuinely belongs to each of them with strong confidence."
        if include_reasoning:
            output_format = '''{"categories": ["Category 1", "Category 2"], "category_confidence": [0.XX, 0.XX], "location": "Location Category", "location_confidence": 0.XX, "reasoning": "Brief explanation"}'''
        else:
            output_format = '''{"categories": ["Category 1", "Category 2"], "category_confidence": [0.XX, 0.XX], "location": "Location Category", "location_confidence": 0.XX}'''
    
    location_instruction = "Additionally, classify the LOCATION MENTIONED in the goal into exactly ONE of the location categories below. Focus on any geographical references, places, or locations mentioned."
    
    prompt = f"""You are a goal classification expert. Classify the following goal into both a category and a location type.

Goal Title: {title}
Goal Description: {description}

Focus primarily on the Goal Title for classification. Use the Goal Description only if the title is unclear or ambiguous.

Available Categories:
{categories_text}

{category_instruction}

Available Location Categories:
{locations_text}

{location_instruction}

Return your response as JSON in this exact format:
{output_format}

Ensure:
1. Category names match exactly from the categories list above
2. Location category names match exactly from the locations list above
3. All confidence values are between 0 and 1
4. For location, select exactly ONE category that best describes the type of location mentioned (or "No Location mentioned" if none)
"""
    
    return prompt


def classify_goal(node_id: str, node_data: dict, model, categories: list, locations: list, mutually_exclusive: bool, include_reasoning: bool) -> dict:
    """
    Classify a single goal using the LLM into both category and location.
    """
    title = node_data.get('title', '')
    description = node_data.get('description', '')
    
    # Handle missing data
    if not title and not description:
        return {
            'node_id': node_id,
            'node_attributes': node_data,
            'classification': None,
            'error': 'No title or description available'
        }
    
    # Create prompt
    prompt = create_classification_prompt(title, description, categories, locations, mutually_exclusive, include_reasoning)
    
    try:
        # Call API
        response = model.generate_content(prompt)
        
        # Parse JSON response
        response_text = response.text.strip()
        
        # Try to extract JSON from response (handles both pure JSON and text with JSON)
        try:
            classification_data = json.loads(response_text)
        except json.JSONDecodeError:
            # If direct parsing fails, try to extract JSON from markdown code blocks
            import re
            json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response_text, re.DOTALL)
            if json_match:
                classification_data = json.loads(json_match.group(1))
            else:
                # Try to find any JSON object in the text
                json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
                if json_match:
                    classification_data = json.loads(json_match.group(0))
                else:
                    raise ValueError(f"Could not extract JSON from response: {response_text[:200]}")
        
        return {
            'node_id': node_id,
            'node_attributes': node_data,
            'classification': classification_data,
            'error': None
        }
    
    except Exception as e:
        return {
            'node_id': node_id,
            'node_attributes': node_data,
            'classification': None,
            'error': str(e)
        }

## Test Classification

Here, we run a quick test on a single node to verify everything works before processing all nodes.

In [8]:
# Test on a single node
test_node = list(G.nodes(data=True))[0]
test_node_id, test_node_data = test_node

print("Testing classification on a sample node...")
print(f"Node ID: {test_node_id}")
print(f"Title: {test_node_data.get('title', 'N/A')}")
print(f"Description: {test_node_data.get('description', 'N/A')[:200]}...")

# Run classification
test_result = classify_goal(
    node_id=test_node_id,
    node_data=test_node_data,
    model=model,
    categories=categories_config['categories'],
    locations=categories_config['locations'],
    mutually_exclusive=MUTUALLY_EXCLUSIVE,
    include_reasoning=INCLUDE_REASONING
)

print("Test Result:")
if test_result['error']:
    print(f"Error: {test_result['error']}")
else:
    print(f"Classification: {test_result['classification']}")
print(f"{'='*50}\n")
print("Test successful!")

Testing classification on a sample node...
Node ID: dQggEQQH
Title: Make ice cream from scratch
Description: Ice cream or ice-cream is a frozen dessert usually made from dairy products, such as milk and cream, and often combined with fruits or other ingredients and flavours. Most varieties contain sugar, alt...
Test Result:
Classification: {'category': 'Food', 'category_confidence': 0.95, 'location': 'No location mentioned', 'location_confidence': 0.98, 'reasoning': 'The goal explicitly involves making food (ice cream). The goal does not mention any specific location.'}

Test successful!
Test Result:
Classification: {'category': 'Food', 'category_confidence': 0.95, 'location': 'No location mentioned', 'location_confidence': 0.98, 'reasoning': 'The goal explicitly involves making food (ice cream). The goal does not mention any specific location.'}

Test successful!


## Run Classification Pipeline

This cell will process all nodes and classify them using the LLM. We did not run the classificatio cell again after getting the results form our first run, therefore there is no output displayed. However, we created a subsequent cell where we call our previous results.

In [None]:
# Initialize results storage
classification_results = []

# Get all nodes
all_nodes = list(G.nodes(data=True))
print(f"Total nodes in graph: {len(all_nodes)}")

# Determine which nodes to process
if MAX_NODES_TO_PROCESS:
    nodes_to_process = all_nodes[START_FROM_NODE:START_FROM_NODE + MAX_NODES_TO_PROCESS]
else:
    nodes_to_process = all_nodes[START_FROM_NODE:]

print(f"Processing {len(nodes_to_process)} nodes (starting from node {START_FROM_NODE})")

# Process each node
print("\nStarting classification...\n")
request_delay = config['rate_limit']['delay_between_requests']

for idx, (node_id, node_data) in enumerate(tqdm(nodes_to_process, desc="Classifying goals")):
    
    # Classify the goal
    result = classify_goal(
        node_id=node_id,
        node_data=node_data,
        model=model,
        categories=categories_config['categories'],
        locations=categories_config['locations'],
        mutually_exclusive=MUTUALLY_EXCLUSIVE,
        include_reasoning=INCLUDE_REASONING
    )
    
    classification_results.append(result)
    
    # Show sample results
    if VERBOSE and idx < SHOW_SAMPLE_CLASSIFICATIONS:
        print(f"\nSample {idx + 1}:")
        print(f"  Node ID: {result['node_id']}")
        print(f"  Title: {result['node_attributes'].get('title', '')[:60]}...")
        if result['error']:
            print(f"  Error: {result['error']}")
        else:
            print(f"  Classification: {result['classification']}")
    
    # Save progress periodically
    if SAVE_PROGRESS_EVERY and (idx + 1) % SAVE_PROGRESS_EVERY == 0:
        with open('../Data/Classification Results/classification1_results.json', 'w') as f:
            json.dump(classification_results, f, indent=2)
        if VERBOSE:
            print(f"\nProgress saved at {idx + 1} nodes")
    
    # Rate limiting
    time.sleep(request_delay)

print(f"\nTotal processed: {len(classification_results)}")
print(f"Errors: {sum(1 for r in classification_results if r['error'])}")

In [9]:
# Re-load previous classification results if a JSON file exists in this folder
json_fname = '../Data/Classification Results/classification1_results.json'
request_delay = config['rate_limit']['delay_between_requests']
if os.path.exists(json_fname):
    try:
        with open(json_fname, 'r', encoding='utf-8') as f:
            classification_results = json.load(f)
        print(f'Loaded classification_results from {json_fname} ({len(classification_results)} entries)')
    except Exception as e:
        print(f'Error loading {json_fname}: {e}')
        # fallback: keep existing in-memory variable if present
        if 'classification_results' not in globals():
            classification_results = []
else:
    if 'classification_results' in globals():
        print(f'No {json_fname} found. Using in-memory classification_results ({len(classification_results)} entries).')
    else:
        print(f'No {json_fname} found and no in-memory classification_results; creating empty list.')
        classification_results = []

# Ensure the rest of the notebook can run from here (counts, etc.)
print(f'Total classification_results entries available: {len(classification_results)}')

Loaded classification_results from ../Data/Classification Results/classification1_results.json (2890 entries)
Total classification_results entries available: 2890


## Retry Failed Classifications

This cell will retry classification for any goals that failed during the initial pipeline run. We run this as long as we get no errors anymore. Notably, the only remaining error we get is re-occuring due to "PROHIBITED_CONTENT". This goal describes reading the book "Lolita", which is controversial due to its plot where an adult is lusting after a minor, leading to this error message.

In [10]:
# Retry failed classifications
failed_indices = [i for i, result in enumerate(classification_results) if result['error']]
print(f"Found {len(failed_indices)} failed classifications to retry")

if failed_indices:
    print("Retrying failed classifications...")
    
    for idx in tqdm(failed_indices, desc="Retrying failed classifications"):
        result = classification_results[idx]
        node_id = result['node_id']
        node_data = result['node_attributes']
        
        # Retry classification
        retry_result = classify_goal(
            node_id=node_id,
            node_data=node_data,
            model=model,
            categories=categories_config['categories'],
            locations=categories_config['locations'],
            mutually_exclusive=MUTUALLY_EXCLUSIVE,
            include_reasoning=INCLUDE_REASONING
        )
        
        # Update the result if retry was successful
        if not retry_result['error']:
            classification_results[idx] = retry_result
            print(f"Fixed classification for node {node_id}")
        else:
            print(f"Still failed for node {node_id}: {retry_result['error']}")
        
        # Rate limiting
        time.sleep(request_delay)
    
    # Save updated results
    with open('../Data/Classification Results/classification1_results.json', 'w') as f:
        json.dump(classification_results, f, indent=2)
    
    final_errors = sum(1 for r in classification_results if r['error'])
    print(f"\nRetry complete!")
    print(f"Remaining errors: {final_errors}")
else:
    print("No failed classifications to retry.")

Found 1 failed classifications to retry
Retrying failed classifications...


Retrying failed classifications:   0%|          | 0/1 [00:00<?, ?it/s]

Still failed for node Csz8rzLw: Invalid operation: The `response.parts` quick accessor requires a single candidate, but but `response.candidates` is empty.
This appears to be caused by a blocked prompt, see `response.prompt_feedback`: block_reason: PROHIBITED_CONTENT



Retrying failed classifications: 100%|██████████| 1/1 [00:00<00:00,  1.27it/s]




Retry complete!
Remaining errors: 1


## Inspect Unique Classification Entries

Display all unique category and location entries to identify any incorrect formulations.

In [11]:
# Collect all unique categories and locations from successful classifications
unique_categories = set()
unique_locations = set()

for result in classification_results:
    if result['classification'] and not result['error']:
        if MUTUALLY_EXCLUSIVE:
            # Single category mode
            category = result['classification'].get('category', '')
            location = result['classification'].get('location', '')
            if category:
                unique_categories.add(category)
            if location:
                unique_locations.add(location)
        else:
            # Multi-label mode
            categories = result['classification'].get('categories', [])
            location = result['classification'].get('location', '')
            for cat in categories:
                if cat:
                    unique_categories.add(cat)
            if location:
                unique_locations.add(location)

print(f"{'='*50}")
print("UNIQUE CLASSIFICATION ENTRIES")
print(f"{'='*50}")
print(f"\nUnique Categories ({len(unique_categories)}):")
for cat in sorted(unique_categories):
    print(f"  - {cat}")

print(f"\nUnique Locations ({len(unique_locations)}):")
for loc in sorted(unique_locations):
    print(f"  - {loc}")

print(f"\n{'='*50}")

# Check for potential issues
expected_categories = {cat['name'] for cat in categories_config['categories']}
expected_locations = {loc['name'] for loc in categories_config['locations']}

unexpected_categories = unique_categories - expected_categories
unexpected_locations = unique_locations - expected_locations

if unexpected_categories:
    print(f"WARNING: Found {len(unexpected_categories)} unexpected category entries:")
    for cat in sorted(unexpected_categories):
        print(f"    - {cat}")

if unexpected_locations:
    print(f"WARNING: Found {len(unexpected_locations)} unexpected location entries:")
    for loc in sorted(unexpected_locations):
        print(f"    - {loc}")

if not unexpected_categories and not unexpected_locations:
    print("All classifications use expected category and location names.")
print(f"{'='*50}")

UNIQUE CLASSIFICATION ENTRIES

Unique Categories (15):
  - Academic and professional achievements
  - Acts of kindness/altruism
  - Creativity
  - Events & concerts
  - Experiences
  - Food
  - Health
  - Media consumption
  - Nature
  - New skills
  - Places of interest
  - Relationships & social life
  - Religion
  - Sports
  - Travel destinations

Unique Locations (2):
  - Location mentioned
  - No location mentioned

All classifications use expected category and location names.


Create a structured Excel file with all classification results for manual review.

In [12]:
# Prepare data for Excel export
excel_data = []

for result in classification_results:
    row = {
        'node_id': result['node_id'],
        'title': result['node_attributes'].get('title', ''),
        'description': result['node_attributes'].get('description', ''),
        'error': result['error']
    }
    
    if result['classification'] and not result['error']:
        if MUTUALLY_EXCLUSIVE:
            # Single category case
            row['category'] = result['classification'].get('category', '')
            row['category_confidence'] = result['classification'].get('category_confidence', '')
            row['location'] = result['classification'].get('location', '')
            row['location_confidence'] = result['classification'].get('location_confidence', '')
            row['reasoning'] = result['classification'].get('reasoning', '')
        else:
            # Multiple categories case
            categories_list = result['classification'].get('categories', [])
            confidences_list = result['classification'].get('category_confidence', [])
            row['categories'] = ', '.join(categories_list) if categories_list else ''
            row['category_confidences'] = ', '.join([str(c) for c in confidences_list]) if confidences_list else ''
            row['location'] = result['classification'].get('location', '')
            row['location_confidence'] = result['classification'].get('location_confidence', '')
            row['reasoning'] = result['classification'].get('reasoning', '')
    else:
        if MUTUALLY_EXCLUSIVE:
            row['category'] = ''
            row['category_confidence'] = ''
            row['location'] = ''
            row['location_confidence'] = ''
            row['reasoning'] = ''
        else:
            row['categories'] = ''
            row['category_confidences'] = ''
            row['location'] = ''
            row['location_confidence'] = ''
            row['reasoning'] = ''
    
    excel_data.append(row)

# Create DataFrame
df_results = pd.DataFrame(excel_data)

# Save to Excel with formatting
output_filename = f'../Data/Classification Results/classification2_results.xlsx'

with pd.ExcelWriter(output_filename, engine='openpyxl') as writer:
    df_results.to_excel(writer, sheet_name='Classifications', index=False)
    
    # Get the worksheet
    worksheet = writer.sheets['Classifications']
    
    # Adjust column widths
    for column in worksheet.columns:
        max_length = 0
        column_letter = column[0].column_letter
        for cell in column:
            try:
                if len(str(cell.value)) > max_length:
                    max_length = len(str(cell.value))
            except:
                pass
        adjusted_width = min(max_length + 2, 50)  # Cap at 50 characters
        worksheet.column_dimensions[column_letter].width = adjusted_width

print(f"Excel file saved: {output_filename}")
print(f"\nDataFrame shape: {df_results.shape}")
print(f"\nFirst few rows:")
print(df_results.head())

# Display summary statistics
print(f"\n{'='*50}")
print("Classification Summary:")
print(f"{'='*50}")

if MUTUALLY_EXCLUSIVE:
    category_counts = df_results['category'].value_counts()
    print(f"\nCategory distribution:")
    print(category_counts)
    
    # Calculate mean confidence, handling empty strings
    category_confidences = pd.to_numeric(df_results['category_confidence'], errors='coerce').dropna()
    if not category_confidences.empty:
        print(f"\nMean category confidence: {category_confidences.mean():.3f}")
    else:
        print(f"\nMean category confidence: N/A (no valid confidence values)")
    
    location_counts = df_results['location'].value_counts()
    print(f"\nLocation distribution:")
    print(location_counts)
    
    # Calculate mean location confidence, handling empty strings
    location_confidences = pd.to_numeric(df_results['location_confidence'], errors='coerce').dropna()
    if not location_confidences.empty:
        print(f"\nMean location confidence: {location_confidences.mean():.3f}")
    else:
        print(f"\nMean location confidence: N/A (no valid confidence values)")
else:
    # For multi-label, count how many goals have each category
    all_categories = []
    for cats in df_results['categories']:
        if cats:
            all_categories.extend([c.strip() for c in str(cats).split(',')])
    category_counts = pd.Series(all_categories).value_counts()
    print(f"\nCategory distribution (multi-label):")
    print(category_counts)
    
    location_counts = df_results['location'].value_counts()
    print(f"\nLocation distribution:")
    print(location_counts)

errors_count = df_results['error'].notna().sum()
print(f"\nErrors: {errors_count}")
print(f"{'='*50}")

Excel file saved: ../Data/Classification Results/classification2_results.xlsx

DataFrame shape: (2890, 9)

First few rows:
    node_id                                              title  \
0  dQggEQQH                        Make ice cream from scratch   
1  nnEnjr7O  Leave an inspirational note inside a book for ...   
2  nusyxQLs                                         Fly a kite   
3  MDTcjx4X                                       Go skydiving   
4  GM7VTx2U                      Visit the Library of Congress   

                                         description error  \
0  Ice cream or ice-cream is a frozen dessert usu...  None   
1  Imagine the joy of discovering a heartfelt mes...  None   
2  A kite is a tethered heavier-than-air or light...  None   
3  Parachuting, including also skydiving, is a me...  None   
4  The Library of Congress is the research librar...  None   

                    category category_confidence               location  \
0                       Food    

## Assign Classifications to Network Nodes

This will add the classification data as new node attributes in the network graph.

In [13]:
# Add classification attributes to nodes
nodes_updated = 0
nodes_failed = 0

for result in classification_results:
    node_id = result['node_id']
    
    # Check if node exists in graph
    if node_id not in G.nodes():
        nodes_failed += 1
        continue
    
    # Add classification data to node
    if result['error']:
        G.nodes[node_id]['classification_error'] = result['error']
        G.nodes[node_id]['classified'] = False
    elif result['classification']:
        if MUTUALLY_EXCLUSIVE:
            # Single category
            G.nodes[node_id]['category'] = result['classification'].get('category', '')
            G.nodes[node_id]['category_confidence'] = result['classification'].get('category_confidence', 0)
            G.nodes[node_id]['location'] = result['classification'].get('location', '')
            G.nodes[node_id]['location_confidence'] = result['classification'].get('location_confidence', 0)
            G.nodes[node_id]['classification_reasoning'] = result['classification'].get('reasoning', '')
            G.nodes[node_id]['classified'] = True
        else:
            # Multiple categories - store as list
            G.nodes[node_id]['categories'] = result['classification'].get('categories', [])
            G.nodes[node_id]['category_confidences'] = result['classification'].get('category_confidence', [])
            G.nodes[node_id]['location'] = result['classification'].get('location', '')
            G.nodes[node_id]['location_confidence'] = result['classification'].get('location_confidence', 0)
            G.nodes[node_id]['classification_reasoning'] = result['classification'].get('reasoning', '')
            G.nodes[node_id]['classified'] = True
        
        nodes_updated += 1
    else:
        G.nodes[node_id]['classified'] = False
        nodes_failed += 1

print(f"{'='*50}")
print(f"Node Attribute Assignment Complete")
print(f"{'='*50}")
print(f"Nodes updated: {nodes_updated}")
print(f"Nodes failed: {nodes_failed}")
print(f"Total nodes in graph: {G.number_of_nodes()}")

# Show sample node with new attributes
print(f"\n{'='*50}")
print("Sample Node with Classification:")
print(f"{'='*50}")
for node_id, node_data in list(G.nodes(data=True))[:1]:
    if node_data.get('classified'):
        print(f"Node ID: {node_id}")
        print(f"Title: {node_data.get('title', 'N/A')}")
        if MUTUALLY_EXCLUSIVE:
            print(f"Category: {node_data.get('category', 'N/A')}")
            print(f"Category Confidence: {node_data.get('category_confidence', 'N/A')}")
            print(f"Location: {node_data.get('location', 'N/A')}")
            print(f"Location Confidence: {node_data.get('location_confidence', 'N/A')}")
        else:
            print(f"Categories: {node_data.get('categories', 'N/A')}")
            print(f"Category Confidences: {node_data.get('category_confidences', 'N/A')}")
            print(f"Location: {node_data.get('location', 'N/A')}")
            print(f"Location Confidence: {node_data.get('location_confidence', 'N/A')}")
        print(f"Reasoning: {node_data.get('classification_reasoning', 'N/A')[:100]}...")
        break

Node Attribute Assignment Complete
Nodes updated: 2889
Nodes failed: 0
Total nodes in graph: 2890

Sample Node with Classification:
Node ID: dQggEQQH
Title: Make ice cream from scratch
Category: Food
Category Confidence: 0.95
Location: No location mentioned
Location Confidence: 0.98
Reasoning: The goal explicitly involves making food (ice cream). The goal does not mention any specific locatio...


## Final Save: Export Updated Network to Pickle

Save the network with all classifications to a pickle file for future use.

In [14]:
# show all node attribute names after update
print(f"\nAll node attribute names after classification:")
print(next(iter(G.nodes(data=True)))[1].keys())


All node attribute names after classification:
dict_keys(['title', 'description', 'wants_to_do', 'have_done', 'comments', 'tags', 'included_by_our_users', 'merged_goals', 'category', 'category_confidence', 'location', 'location_confidence', 'classification_reasoning', 'classified'])


In [15]:
# Save the final updated network with classifications to pickle
output_network_file = f'../Networks/Prior Network Versions/b3_network.pkl'

with open(output_network_file, 'wb') as f:
    pickle.dump(G, f)

print(f"Final classified network saved to: {output_network_file}")
print(f"Network contains {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")
print(f"All classification data has been preserved in the pickle file.")

Final classified network saved to: ../Networks/Prior Network Versions/b3_network.pkl
Network contains 2890 nodes and 219130 edges
All classification data has been preserved in the pickle file.
