**Using GPT4o**

In [3]:
# Cell 1: Install Required Libraries (Updated for GPT-4o and Data Validation)
!pip install langchain langchain-openai langchain-neo4j pandas pyvis langchain_experimental
!pip install openai  # For direct OpenAI API access
!pip install python-dotenv  # For environment variable management (optional)

print("✅ All required libraries installed successfully!")
print("📋 Installed packages:")
print("   - langchain (core framework)")
print("   - langchain-openai (GPT-4o integration)")
print("   - langchain-neo4j (Neo4j integration)")
print("   - pandas (data manipulation)")
print("   - pyvis (graph visualization)")
print("   - langchain_experimental (graph transformers)")
print("   - openai (direct API access)")

✅ All required libraries installed successfully!
📋 Installed packages:
   - langchain (core framework)
   - langchain-openai (GPT-4o integration)
   - langchain-neo4j (Neo4j integration)
   - pandas (data manipulation)
   - pyvis (graph visualization)
   - langchain_experimental (graph transformers)
   - openai (direct API access)


**Cell 2: Full Setup (API Key, LLM, Transformer, Neo4j)**

This is the main configuration cell. It will:

Import all necessary modules.
Load your Gemini API Key from Colab's secrets.
Initialize the Gemini LLM.
Configure the connection to your Neo4j database.
Set up the LLMGraphTransformer with the strict schema we defined for high-quality data extraction.


In [6]:
# Cell 2: Full Setup (GPT-4o-mini, Complete Clean Neo4j)

import os
import asyncio
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# LangChain Imports
from langchain_core.documents import Document
from langchain.prompts import PromptTemplate
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI  # Changed from Google Gemini to OpenAI
from langchain_neo4j import Neo4jGraph

# Visualization Imports
from pyvis.network import Network

print("🚀 Starting Full Setup...")

# --- 1. Load OpenAI API Key from Colab Secrets ---
try:
    from google.colab import userdata
    openai_api_key = userdata.get('OPENAI_API_KEY')
    print("✅ OpenAI API key retrieved from Colab secrets")
except Exception as e:
    print(f"❌ Error retrieving OpenAI API key: {e}")
    openai_api_key = None

# --- 2. Initialize GPT-4o-mini ---
if openai_api_key:
    try:
        llm = ChatOpenAI(
            model="gpt-4o-mini",  # Using gpt-4o-mini as requested
            temperature=0,        # Deterministic responses
            openai_api_key=openai_api_key,
            max_tokens=4000      # Sufficient for graph extraction
        )
        print("✅ GPT-4o-mini initialized successfully")
    except Exception as e:
        print(f"❌ Error initializing GPT-4o-mini: {e}")
        llm = None
else:
    print("❌ Cannot initialize GPT-4o-mini without API key")
    llm = None

# --- 3. Neo4j Database Connection ---
NEO4J_URI = "neo4j+s://15286d63.databases.neo4j.io"
NEO4J_USERNAME = "neo4j"
NEO4J_PASSWORD = "VvdcFiV8FfMzGFILRMuocittAww1QzMom1zfGPwVCuA"

os.environ["NEO4J_URI"] = NEO4J_URI
os.environ["NEO4J_USERNAME"] = NEO4J_USERNAME
os.environ["NEO4J_PASSWORD"] = NEO4J_PASSWORD

try:
    graph = Neo4jGraph()
    print("✅ Neo4j connection established")

    # Test connection
    result = graph.query("RETURN 'Connection successful' as status")
    print(f"✅ Neo4j connection verified: {result[0]['status']}")

except Exception as e:
    print(f"❌ Neo4j connection failed: {e}")
    graph = None

# --- 4. 🧹 COMPLETE ROOM CLEANING (Delete Everything) ---
if graph:
    try:
        print("\n🧹 Starting COMPLETE database cleaning...")

        # Count existing data before cleaning
        node_count = graph.query("MATCH (n) RETURN count(n) as count")[0]['count']
        rel_count = graph.query("MATCH ()-[r]-() RETURN count(r) as count")[0]['count']
        print(f"📊 Before cleaning: {node_count} nodes, {rel_count} relationships")

        # COMPLETE CLEANING - Delete everything
        graph.query("MATCH (n) DETACH DELETE n")

        # Verify cleaning worked
        node_count_after = graph.query("MATCH (n) RETURN count(n) as count")[0]['count']
        rel_count_after = graph.query("MATCH ()-[r]-() RETURN count(r) as count")[0]['count']

        print(f"✅ Cleaning complete: {node_count_after} nodes, {rel_count_after} relationships")
        print("🎉 Database is now completely clean and ready for fresh data!")

    except Exception as e:
        print(f"❌ Error during database cleaning: {e}")

# --- 5. Create Fresh Constraints for Data Integrity ---
if graph:
    try:
        print("\n🔧 Creating fresh data integrity constraints...")

        constraints = [
            "CREATE CONSTRAINT role_id_unique IF NOT EXISTS FOR (r:Role) REQUIRE r.id IS UNIQUE",
            "CREATE CONSTRAINT skill_id_unique IF NOT EXISTS FOR (s:`Technical skill`) REQUIRE s.id IS UNIQUE",
            "CREATE CONSTRAINT soft_skill_id_unique IF NOT EXISTS FOR (s:`Soft skill`) REQUIRE s.id IS UNIQUE",
            "CREATE CONSTRAINT tool_id_unique IF NOT EXISTS FOR (t:Tool) REQUIRE t.id IS UNIQUE",
            "CREATE CONSTRAINT task_id_unique IF NOT EXISTS FOR (t:Task) REQUIRE t.id IS UNIQUE"
        ]

        for constraint in constraints:
            try:
                graph.query(constraint)
                print(f"✅ Created constraint: {constraint.split('FOR')[1].split('REQUIRE')[0].strip()}")
            except Exception as e:
                if "already exists" in str(e).lower():
                    print(f"ℹ️  Constraint already exists (OK)")
                else:
                    print(f"⚠️  Constraint creation issue: {e}")

    except Exception as e:
        print(f"❌ Error creating constraints: {e}")

# --- 6. Configure Graph Transformer with Clean Schema ---
allowed_nodes_schema = ["Role", "Technical skill", "Soft skill", "Tool", "Task"]
allowed_relationships_schema = [
    "REQUIRES_SKILL",      # Role -> Technical Skill or Soft Skill
    "USES_TOOL",          # Role -> Tool
    "PERFORMS_TASK",      # Role -> Task
    "INVOLVES_SKILL",     # Task -> Technical Skill or Soft Skill
    "REQUIRES_TOOL",      # Task -> Tool
    "REQUIRES_EXPERIENCE" # Role -> Technical Skill/Tool (when experience is mentioned)
]

CYPHER_GENERATION_TEMPLATE = f"""
You are an expert in analyzing job descriptions to extract competency requirements.
Focus ONLY on roles, tasks, skills, and tools - ignore company names and locations completely.

From the provided text, extract entities and relationships that conform to this schema:

Schema:
- Allowed Node Labels: {', '.join(allowed_nodes_schema)}
- Allowed Relationship Types: {', '.join(allowed_relationships_schema)}

Node Definitions:
- Role: Job titles or positions (e.g., "Data Scientist", "ML Engineer", "AI Researcher")
- Task: Specific job responsibilities or activities (e.g., "Build machine learning models", "Analyze data", "Deploy solutions")
- Technical Skill: Programming languages, frameworks, technical methodologies (e.g., "Python", "TensorFlow", "Machine Learning")
- Soft Skill: Interpersonal or general professional skills (e.g., "Communication", "Leadership", "Problem-solving")
- Tool: Specific software, platforms, or applications (e.g., "AWS", "Docker", "Tableau", "Git")

Relationship Guidelines:
- REQUIRES_SKILL: When a role requires a specific technical or soft skill
- USES_TOOL: When a role uses specific tools or platforms
- PERFORMS_TASK: When a role is responsible for specific tasks
- INVOLVES_SKILL: When a task requires or involves specific skills
- REQUIRES_TOOL: When a task requires specific tools
- REQUIRES_EXPERIENCE: When specific years of experience with a skill/tool is mentioned

Important:
1. DO NOT create nodes for company names, locations, or any other entity types
2. Extract the actual job title as Role, not generic terms
3. Be specific with skills and tools - use the exact names mentioned
4. Distinguish between skills (knowledge/ability) and tools (software/platforms)
5. Each node must have a valid, non-null 'id' property

Text:
{{input}}

Cypher Query:
"""

if llm and graph:
    try:
        prompt = PromptTemplate.from_template(CYPHER_GENERATION_TEMPLATE)
        graph_transformer = LLMGraphTransformer(
            llm=llm,
            prompt=prompt,
            allowed_nodes=allowed_nodes_schema,
            allowed_relationships=allowed_relationships_schema
        )
        print("✅ Graph transformer configured with clean schema")
    except Exception as e:
        print(f"❌ Error configuring graph transformer: {e}")
        graph_transformer = None
else:
    print("❌ Cannot configure graph transformer - missing LLM or Graph connection")
    graph_transformer = None

# --- 7. Final Status Report ---
print("\n" + "="*60)
print("📋 SETUP COMPLETE - STATUS REPORT")
print("="*60)
print(f"🤖 GPT-4o-mini: {'✅ Ready' if llm else '❌ Failed'}")
print(f"🗄️  Neo4j Graph: {'✅ Connected & Clean' if graph else '❌ Failed'}")
print(f"🔧 Graph Transformer: {'✅ Configured' if graph_transformer else '❌ Failed'}")
print(f"🧹 Database Status: {'✅ Completely Clean' if graph else '❌ Unknown'}")

if llm and graph and graph_transformer:
    print("\n🎉 ALL SYSTEMS READY! You can proceed to Cell 3.")
else:
    print("\n⚠️  Some components failed. Check errors above before proceeding.")

🚀 Starting Full Setup...
✅ OpenAI API key retrieved from Colab secrets
✅ GPT-4o-mini initialized successfully
✅ Neo4j connection established
✅ Neo4j connection verified: Connection successful

🧹 Starting COMPLETE database cleaning...
📊 Before cleaning: 0 nodes, 0 relationships
✅ Cleaning complete: 0 nodes, 0 relationships
🎉 Database is now completely clean and ready for fresh data!

🔧 Creating fresh data integrity constraints...
✅ Created constraint: (r:Role)
✅ Created constraint: (s:`Technical skill`)
✅ Created constraint: (s:`Soft skill`)
✅ Created constraint: (t:Tool)
✅ Created constraint: (t:Task)
✅ Graph transformer configured with clean schema

📋 SETUP COMPLETE - STATUS REPORT
🤖 GPT-4o-mini: ✅ Ready
🗄️  Neo4j Graph: ✅ Connected & Clean
🔧 Graph Transformer: ✅ Configured
🧹 Database Status: ✅ Completely Clean

🎉 ALL SYSTEMS READY! You can proceed to Cell 3.


**Cell 3: Load Your CSV Data**

Please replace the placeholder values with the path to your CSV file and the name of the column that contains the job description text.

In [4]:
# Cell 3: Load Test CSV Data (4 samples)

import pandas as pd
from google.colab import drive
from langchain_core.documents import Document

print("📁 Loading your test CSV data...")

# --- 1. Mount Google Drive (if not already mounted) ---
try:
    drive.mount('/content/drive', force_remount=False)
    print("✅ Google Drive mounted successfully")
except Exception as e:
    print(f"⚠️  Drive mount issue (might already be mounted): {e}")

# --- 2. Load your test CSV file ---
try:
    # Update this path to match where you uploaded Book1.csv
    csv_path = "/content/drive/MyDrive/knowledge-graph-llms/Book1.csv"  # Adjust path as needed

    df = pd.read_csv(csv_path)
    print(f"✅ CSV loaded successfully: {len(df)} rows, {len(df.columns)} columns")

    # Show column names
    print(f"📋 Columns: {list(df.columns)}")

except Exception as e:
    print(f"❌ Error loading CSV: {e}")
    print("🔧 Please check the file path. Common locations:")
    print("   - /content/drive/MyDrive/knowledge-graph-llms/Book1.csv")
    print("   - /content/drive/MyDrive/YourFolderName/Book1.csv")
    df = None

# --- 3. Examine the data structure ---
if df is not None:
    print("\n" + "="*60)
    print("📊 DATA OVERVIEW")
    print("="*60)

    # Show basic info
    print(f"Dataset shape: {df.shape}")
    print(f"Key columns for graph extraction:")
    print(f"  - Title: {df['Title'].notna().sum()}/{len(df)} non-null values")
    print(f"  - JobText: {df['JobText'].notna().sum()}/{len(df)} non-null values")

    # Show sample of the data
    print("\n📝 SAMPLE DATA:")
    for i, row in df.head(4).iterrows():
        print(f"\n--- Job {i+1} ---")
        print(f"Title: {row['Title']}")
        print(f"Employer: {row['Employer']}")
        print(f"JobText (first 200 chars): {str(row['JobText'])[:200]}...")
        print(f"JobText length: {len(str(row['JobText']))} characters")

# --- 4. Create Documents for Graph Transformer ---
documents = []
if df is not None:
    try:
        print("\n🔄 Converting to LangChain Document format...")

        for i, row in df.iterrows():
            # Combine title and job text for better context
            content = f"Job Title: {row['Title']}\n\nJob Description:\n{row['JobText']}"

            # Create metadata
            metadata = {
                'job_id': row['JobID'],
                'title': row['Title'],
                'employer': row['Employer'],
                'city': row['City'],
                'state': row['State'],
                'source': 'test_csv'
            }

            # Create LangChain Document
            doc = Document(
                page_content=content,
                metadata=metadata
            )
            documents.append(doc)

        print(f"✅ Created {len(documents)} documents for graph extraction")

        # Show sample document
        print(f"\n📋 SAMPLE DOCUMENT FORMAT:")
        print(f"Content preview: {documents[0].page_content[:300]}...")
        print(f"Metadata: {documents[0].metadata}")

    except Exception as e:
        print(f"❌ Error creating documents: {e}")
        documents = []

# --- 5. Validation and Ready Check ---
print("\n" + "="*60)
print("🔍 VALIDATION REPORT")
print("="*60)

validation_passed = True

# Check 1: Data loaded
if df is None:
    print("❌ CSV data not loaded")
    validation_passed = False
else:
    print(f"✅ CSV data loaded: {len(df)} rows")

# Check 2: Required columns exist
if df is not None:
    required_columns = ['Title', 'JobText']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"❌ Missing required columns: {missing_columns}")
        validation_passed = False
    else:
        print("✅ All required columns present")

# Check 3: Documents created
if len(documents) == 0:
    print("❌ No documents created for graph extraction")
    validation_passed = False
else:
    print(f"✅ Documents ready: {len(documents)} job postings")

# Check 4: Content quality
if documents:
    avg_length = sum(len(doc.page_content) for doc in documents) / len(documents)
    if avg_length < 100:
        print(f"⚠️  Warning: Average content length is short ({avg_length:.0f} chars)")
    else:
        print(f"✅ Content quality good: Average {avg_length:.0f} characters per job")

# Final status
print("\n" + "="*60)
if validation_passed:
    print("🎉 CELL 3 COMPLETE - DATA READY FOR GRAPH EXTRACTION!")
    print("✅ You can proceed to Cell 4 to create the knowledge graph")
    print(f"📊 Ready to process: {len(documents)} job postings")
else:
    print("⚠️  Please fix the issues above before proceeding to Cell 4")

print("="*60)

📁 Loading your test CSV data...
Mounted at /content/drive
✅ Google Drive mounted successfully
✅ CSV loaded successfully: 4 rows, 11 columns
📋 Columns: ['Employer', 'JobID', 'City', 'State', 'Title', 'JobDate', 'ANZSIC CODE', 'ANZSCO CODE', 'LOT CODE', 'JobText', 'JobUrl']

📊 DATA OVERVIEW
Dataset shape: (4, 11)
Key columns for graph extraction:
  - Title: 4/4 non-null values
  - JobText: 4/4 non-null values

📝 SAMPLE DATA:

--- Job 1 ---
Title: Analytics and Insights Managers
Employer: Mirvac
JobText (first 200 chars): * Company Mirvac Group -
    Description
    
    Our customer data is a valued asset that blends internal and external data sources, Voice of Customer feedback and systemically records customer intel...
JobText length: 1604 characters

--- Job 2 ---
Title: Automation Managers
Employer: Mirvac
JobText (first 200 chars): 1006
    
    Solution Engineer, Intelligent Process Automation
    
    Sydney, AU-NSW, Australia
    
    |
    
    req791
    
    
    
    Mirvac i

**Cell 4: Process Documents and Create Knowledge Graph**
Here's what Cell 4 will do:
Process:

Process all 4 jobs simultaneously using GPT-4o-mini
Extract entities and relationships for each job posting
Add nodes and relationships to your clean Neo4j database
Show summary statistics and sample extractions
Create a simple visualization of the resulting graph
Validate the graph structure is correct



In [5]:
# Cell 4: Process Documents and Create Knowledge Graph

import time
from collections import Counter
import json

print("🚀 Starting Knowledge Graph Extraction...")
print("📋 Processing 4 job postings with GPT-4o-mini")

# --- 1. Verify Prerequisites ---
prerequisites_check = True

if 'documents' not in locals() or len(documents) == 0:
    print("❌ No documents found. Please run Cell 3 first.")
    prerequisites_check = False

if 'graph_transformer' not in locals() or graph_transformer is None:
    print("❌ Graph transformer not available. Please run Cell 2 first.")
    prerequisites_check = False

if 'graph' not in locals() or graph is None:
    print("❌ Neo4j graph not available. Please run Cell 2 first.")
    prerequisites_check = False

if not prerequisites_check:
    print("⚠️  Cannot proceed. Please run previous cells first.")
else:
    print("✅ All prerequisites met. Starting extraction...")

# --- 2. Process Documents and Extract Graph ---
if prerequisites_check:
    try:
        print(f"\n🔄 Extracting knowledge graph from {len(documents)} job postings...")
        start_time = time.time()

        # Extract graph documents using the transformer
        graph_documents = graph_transformer.convert_to_graph_documents(documents)

        extraction_time = time.time() - start_time
        print(f"✅ Graph extraction completed in {extraction_time:.1f} seconds")
        print(f"📊 Generated {len(graph_documents)} graph documents")

    except Exception as e:
        print(f"❌ Error during graph extraction: {e}")
        graph_documents = []

# --- 3. Analyze Extracted Entities (Before Adding to Neo4j) ---
if graph_documents:
    print("\n" + "="*60)
    print("📊 EXTRACTION ANALYSIS")
    print("="*60)

    # Collect all entities and relationships
    all_nodes = []
    all_relationships = []

    for graph_doc in graph_documents:
        all_nodes.extend(graph_doc.nodes)
        all_relationships.extend(graph_doc.relationships)

    # Count by type
    node_types = Counter([node.type for node in all_nodes])
    relationship_types = Counter([rel.type for rel in all_relationships])

    print(f"📈 EXTRACTED ENTITIES:")
    print(f"  Total Nodes: {len(all_nodes)}")
    for node_type, count in node_types.most_common():
        print(f"    - {node_type}: {count}")

    print(f"\n🔗 EXTRACTED RELATIONSHIPS:")
    print(f"  Total Relationships: {len(all_relationships)}")
    for rel_type, count in relationship_types.most_common():
        print(f"    - {rel_type}: {count}")

    # Show sample entities
    print(f"\n📝 SAMPLE EXTRACTED ENTITIES:")
    for node_type in ['Role', 'Technical skill', 'Soft skill', 'Tool', 'Task']:
        sample_nodes = [node.id for node in all_nodes if node.type == node_type][:3]
        if sample_nodes:
            print(f"  {node_type}: {', '.join(sample_nodes)}")

# --- 4. Add Graph Documents to Neo4j ---
if graph_documents:
    try:
        print(f"\n💾 Adding extracted graph to Neo4j database...")

        # Add to Neo4j
        graph.add_graph_documents(graph_documents)

        print("✅ Graph successfully added to Neo4j!")

        # Verify what was added
        verification_query = """
        MATCH (n)
        RETURN labels(n)[0] as node_type, count(n) as count
        ORDER BY count DESC
        """

        db_stats = graph.query(verification_query)

        print(f"\n📊 NEO4J DATABASE STATISTICS:")
        total_nodes = sum([stat['count'] for stat in db_stats])
        print(f"  Total Nodes in Database: {total_nodes}")

        for stat in db_stats:
            print(f"    - {stat['node_type']}: {stat['count']}")

        # Count relationships
        rel_count_query = "MATCH ()-[r]-() RETURN count(r) as count"
        rel_count = graph.query(rel_count_query)[0]['count']
        print(f"  Total Relationships: {rel_count}")

    except Exception as e:
        print(f"❌ Error adding graph to Neo4j: {e}")

# --- 5. Sample Graph Queries for Validation ---
if graph_documents:
    print(f"\n🔍 SAMPLE GRAPH QUERIES:")

    try:
        # Query 1: Find all roles and their required skills
        roles_skills_query = """
        MATCH (r:Role)-[:REQUIRES_SKILL]->(s)
        WHERE labels(s)[0] IN ['Technical skill', 'Soft skill']
        RETURN r.id as role, labels(s)[0] as skill_type, s.id as skill
        LIMIT 10
        """

        roles_skills = graph.query(roles_skills_query)
        if roles_skills:
            print("  🎯 Roles and Required Skills:")
            for item in roles_skills[:5]:  # Show first 5
                print(f"    - {item['role']} requires {item['skill_type']}: {item['skill']}")

        # Query 2: Find all tools used by roles
        roles_tools_query = """
        MATCH (r:Role)-[:USES_TOOL]->(t:Tool)
        RETURN r.id as role, t.id as tool
        LIMIT 5
        """

        roles_tools = graph.query(roles_tools_query)
        if roles_tools:
            print("  🛠️  Roles and Tools:")
            for item in roles_tools:
                print(f"    - {item['role']} uses {item['tool']}")

        # Query 3: Find tasks and their required skills
        tasks_skills_query = """
        MATCH (t:Task)-[:INVOLVES_SKILL]->(s)
        WHERE labels(s)[0] IN ['Technical skill', 'Soft skill']
        RETURN t.id as task, s.id as skill
        LIMIT 5
        """

        tasks_skills = graph.query(tasks_skills_query)
        if tasks_skills:
            print("  📋 Tasks and Skills:")
            for item in tasks_skills:
                print(f"    - Task '{item['task']}' involves {item['skill']}")

    except Exception as e:
        print(f"⚠️  Error running sample queries: {e}")

# --- 6. Create Simple Graph Visualization ---
if graph_documents:
    try:
        print(f"\n🎨 Creating graph visualization...")

        # Get sample data for visualization (limited to avoid clutter)
        viz_query = """
        MATCH (n)-[r]->(m)
        RETURN n.id as source, labels(n)[0] as source_type,
               type(r) as relation,
               m.id as target, labels(m)[0] as target_type
        LIMIT 20
        """

        viz_data = graph.query(viz_query)

        if viz_data:
            # Create network visualization
            net = Network(height="400px", width="100%", bgcolor="#222222", font_color="white")

            # Add nodes and edges
            added_nodes = set()

            for item in viz_data:
                # Add source node
                if item['source'] not in added_nodes:
                    color = {
                        'Role': '#FF6B6B',
                        'Technical skill': '#4ECDC4',
                        'Soft skill': '#45B7D1',
                        'Tool': '#96CEB4',
                        'Task': '#FECA57'
                    }.get(item['source_type'], '#DDA0DD')

                    net.add_node(item['source'],
                                label=item['source'][:30] + ('...' if len(item['source']) > 30 else ''),
                                color=color,
                                title=f"{item['source_type']}: {item['source']}")
                    added_nodes.add(item['source'])

                # Add target node
                if item['target'] not in added_nodes:
                    color = {
                        'Role': '#FF6B6B',
                        'Technical skill': '#4ECDC4',
                        'Soft skill': '#45B7D1',
                        'Tool': '#96CEB4',
                        'Task': '#FECA57'
                    }.get(item['target_type'], '#DDA0DD')

                    net.add_node(item['target'],
                                label=item['target'][:30] + ('...' if len(item['target']) > 30 else ''),
                                color=color,
                                title=f"{item['target_type']}: {item['target']}")
                    added_nodes.add(item['target'])

                # Add edge
                net.add_edge(item['source'], item['target'],
                            label=item['relation'],
                            title=item['relation'])

            # Save and display
            net.save_graph("/content/knowledge_graph_sample.html")
            print("✅ Graph visualization saved as 'knowledge_graph_sample.html'")
            print("🔗 You can download and open this file to view the interactive graph")

        else:
            print("⚠️  No relationships found for visualization")

    except Exception as e:
        print(f"⚠️  Visualization creation failed: {e}")

# --- 7. Final Summary Report ---
print("\n" + "="*60)
print("🎉 CELL 4 COMPLETE - KNOWLEDGE GRAPH CREATED!")
print("="*60)

if graph_documents:
    print(f"✅ Successfully processed {len(documents)} job postings")
    print(f"✅ Extracted {len(all_nodes)} entities and {len(all_relationships)} relationships")
    print(f"✅ Added all data to Neo4j database")
    print(f"✅ Created sample visualization")

    print(f"\n📊 QUICK STATS:")
    print(f"  - Roles: {node_types.get('Role', 0)}")
    print(f"  - Technical Skills: {node_types.get('Technical skill', 0)}")
    print(f"  - Soft Skills: {node_types.get('Soft skill', 0)}")
    print(f"  - Tools: {node_types.get('Tool', 0)}")
    print(f"  - Tasks: {node_types.get('Task', 0)}")

    print(f"\n🎯 NEXT STEPS:")
    print(f"  1. Review the sample queries above")
    print(f"  2. Download and view the graph visualization")
    print(f"  3. Test with larger datasets if results look good")
    print(f"  4. Run your LLM classification analysis")

else:
    print("❌ Graph creation failed. Check errors above.")

print("="*60)

🚀 Starting Knowledge Graph Extraction...
📋 Processing 4 job postings with GPT-4o-mini
✅ All prerequisites met. Starting extraction...

🔄 Extracting knowledge graph from 4 job postings...
✅ Graph extraction completed in 66.9 seconds
📊 Generated 4 graph documents

📊 EXTRACTION ANALYSIS
📈 EXTRACTED ENTITIES:
  Total Nodes: 73
    - Task: 24
    - Technical skill: 17
    - Tool: 15
    - Soft skill: 13
    - Role: 4

🔗 EXTRACTED RELATIONSHIPS:
  Total Relationships: 83
    - REQUIRES_SKILL: 26
    - PERFORMS_TASK: 24
    - INVOLVES_SKILL: 12
    - REQUIRES_TOOL: 11
    - USES_TOOL: 10

📝 SAMPLE EXTRACTED ENTITIES:
  Role: Analytics And Insights Manager, Automation Manager, Mechanical Service Technician
  Technical skill: Predictive Analytics, Machine Learning, C#
  Soft skill: Communication, Project Management, Attention To Detail
  Tool: Google Analytics, Tag Manager, Webmaster Tools
  Task: Market Research, Customer Data Analysis, Insight Generation

💾 Adding extracted graph to Neo4j dat

**Cell 5A: Pure LLM Construction Industry Classification**

In [6]:
# Cell 5A: GPT-4o Enhanced Classification (Fixed Syntax Error)

import pandas as pd
import time
import json
from datetime import datetime
from langchain_openai import ChatOpenAI

print("🧠 GPT-4o ENHANCED CONSTRUCTION + AI/ML ECOSYSTEM CLASSIFICATION")
print("🎯 Research Focus: AI/Automation/Data Science in Construction")
print("="*70)

# --- 1. Setup Enhanced GPT-4o for Classification ---
try:
    from google.colab import userdata
    openai_api_key = userdata.get('OPENAI_API_KEY')
    print("✅ OpenAI API key retrieved")
except:
    print("❌ Could not retrieve OpenAI API key")
    openai_api_key = None

if openai_api_key:
    try:
        # Upgrade to GPT-4o for superior reasoning
        llm_classification = ChatOpenAI(
            model="gpt-4o",  # Enhanced reasoning model
            temperature=0,   # Deterministic results
            openai_api_key=openai_api_key,
            max_tokens=150   # Enough for reasoning + classification
        )
        print("✅ GPT-4o initialized for enhanced classification")
    except Exception as e:
        print(f"❌ Error initializing GPT-4o: {e}")
        llm_classification = None
else:
    print("❌ OpenAI API key not found")
    llm_classification = None

# --- 2. Load Dataset ---
CSV_PATH = "/content/drive/MyDrive/knowledge-graph-llms/data_all.csv"
ENHANCED_CSV_PATH = "/content/drive/MyDrive/knowledge-graph-llms/construction_ai_research_dataset.csv"
ENHANCED_LOG_PATH = "/content/drive/MyDrive/knowledge-graph-llms/gpt4o_classification_log.json"

print(f"\n📁 Loading dataset: {CSV_PATH}")

try:
    df_full = pd.read_csv(CSV_PATH)
    print(f"✅ Dataset loaded: {len(df_full)} jobs")
except Exception as e:
    print(f"❌ Error loading dataset: {e}")
    df_full = None

# --- 3. Enhanced Classification Function ---
def classify_job_research_relevance(job_title, job_text, employer, job_id):
    """
    Use GPT-4o's superior reasoning to classify job relevance
    """

    content = f"Job Title: {job_title}\nEmployer: {employer}\n\nJob Description:\n{job_text}"

    prompt = f"""You are an expert researcher studying AI, machine learning, automation, and data science adoption in the construction industry ecosystem.

Your research question: "How are AI/ML/automation technologies transforming roles across the construction industry?"

Classify this job based on its relevance to this research:

RELEVANT includes:
1. Direct construction roles (may evolve with AI/automation)
2. Tech roles in construction companies (data scientists, AI specialists, automation engineers)
3. Support functions with AI potential (finance, HR, operations that may use AI/ML)
4. Construction ecosystem roles that could be impacted by digital transformation
5. Emerging hybrid roles combining traditional skills with tech

NOT RELEVANT:
- Healthcare/medical roles unrelated to construction
- Pure retail/hospitality with no construction connection
- Financial services unrelated to construction industry

Job to classify:
{content}

Provide brief reasoning, then classify as RELEVANT or NOT_RELEVANT.

Format:
REASONING: [Your reasoning]
CLASSIFICATION: RELEVANT or NOT_RELEVANT"""

    try:
        response = llm_classification.invoke(prompt)
        response_text = response.content.strip()

        # Parse response
        reasoning = ""
        classification = ""

        if 'REASONING:' in response_text and 'CLASSIFICATION:' in response_text:
            parts = response_text.split('CLASSIFICATION:')
            reasoning = parts[0].replace('REASONING:', '').strip()
            classification = parts[1].strip().upper()
        else:
            # Fallback parsing
            if 'RELEVANT' in response_text.upper() and 'NOT_RELEVANT' not in response_text.upper():
                classification = 'RELEVANT'
            elif 'NOT_RELEVANT' in response_text.upper():
                classification = 'NOT_RELEVANT'
            else:
                classification = 'UNCLEAR'
            reasoning = response_text[:200] + "..." if len(response_text) > 200 else response_text

        return classification, reasoning

    except Exception as e:
        print(f"❌ Error for {job_id}: {e}")
        return "ERROR", str(e)

# --- 4. Perform Classification ---
if df_full is not None and llm_classification is not None:
    print(f"\n🧠 Starting GPT-4o enhanced classification...")
    print(f"🎯 Research Focus: Construction + AI/ML/Automation ecosystem")
    print(f"⏱️  Estimated time: {len(df_full) * 4 / 60:.1f} minutes")
    print(f"💰 Estimated cost: ${len(df_full) * 0.06:.2f}")

    # Initialize tracking
    classification_results = []
    relevant_jobs = []
    not_relevant_jobs = []
    error_jobs = []

    start_time = time.time()

    # Process each job
    for index, row in df_full.iterrows():
        job_id = row['JobID']
        job_title = row['Title']
        job_text = str(row['JobText'])
        employer = row.get('Employer', 'Unknown')

        # Show progress
        if (index + 1) % 15 == 0:
            elapsed = time.time() - start_time
            avg_time_per_job = elapsed / (index + 1)
            remaining = (len(df_full) - index - 1) * avg_time_per_job
            print(f"   Progress: {index + 1}/{len(df_full)} ({(index + 1)/len(df_full)*100:.1f}%) - Remaining: {remaining/60:.1f}min")

        # Classify with reasoning
        classification, reasoning = classify_job_research_relevance(job_title, job_text, employer, job_id)

        # Store results
        result = {
            'index': index,
            'job_id': job_id,
            'title': job_title,
            'employer': employer,
            'classification': classification,
            'reasoning': reasoning,
            'text_length': len(job_text)
        }
        classification_results.append(result)

        # Categorize
        if classification == "RELEVANT":
            relevant_jobs.append(row)
        elif classification == "NOT_RELEVANT":
            not_relevant_jobs.append(row)
        else:
            error_jobs.append(row)

        # Rate limiting
        time.sleep(0.8)

    total_time = time.time() - start_time

    # --- 5. Results Analysis ---
    print(f"\n" + "="*70)
    print(f"🧠 GPT-4o CLASSIFICATION COMPLETE!")
    print(f"="*70)
    print(f"⏱️  Total time: {total_time/60:.1f} minutes")
    print(f"💰 Cost: ${len(df_full) * 0.06:.2f}")
    print(f"\n📊 RESEARCH RESULTS:")
    print(f"   ✅ Research relevant jobs: {len(relevant_jobs)}")
    print(f"   ❌ Not relevant jobs: {len(not_relevant_jobs)}")
    print(f"   ⚠️  Error/unclear jobs: {len(error_jobs)}")
    print(f"   📈 Research relevance rate: {len(relevant_jobs)/len(df_full)*100:.1f}%")

    # --- 6. Show Sample Reasoning ---
    print(f"\n🧠 SAMPLE REASONING:")

    relevant_samples = [r for r in classification_results if r['classification'] == 'RELEVANT'][:3]
    print(f"\n✅ RELEVANT JOBS:")
    for i, sample in enumerate(relevant_samples, 1):
        print(f"{i}. {sample['title']} at {sample['employer']}")
        print(f"   💭 {sample['reasoning'][:100]}...")
        print()

    not_relevant_samples = [r for r in classification_results if r['classification'] == 'NOT_RELEVANT'][:2]
    print(f"❌ NOT RELEVANT JOBS:")
    for i, sample in enumerate(not_relevant_samples, 1):
        print(f"{i}. {sample['title']} at {sample['employer']}")
        print(f"   💭 {sample['reasoning'][:100]}...")
        print()

    # --- 7. Save Research Dataset ---
    if relevant_jobs:
        try:
            research_df = pd.DataFrame(relevant_jobs)
            research_df.to_csv(ENHANCED_CSV_PATH, index=False)
            print(f"💾 RESEARCH DATASET SAVED:")
            print(f"   📁 Path: {ENHANCED_CSV_PATH}")
            print(f"   📊 Size: {len(research_df)} research-relevant jobs")

        except Exception as e:
            print(f"❌ Error saving dataset: {e}")

    # --- 8. Save Classification Log ---
    try:
        enhanced_log = {
            'timestamp': datetime.now().isoformat(),
            'model_used': 'gpt-4o',
            'total_jobs': len(df_full),
            'relevant_jobs': len(relevant_jobs),
            'not_relevant_jobs': len(not_relevant_jobs),
            'error_jobs': len(error_jobs),
            'processing_time_minutes': total_time / 60,
            'relevance_rate': len(relevant_jobs)/len(df_full)*100,
            'sample_results': classification_results[:20]  # First 20 with reasoning
        }

        with open(ENHANCED_LOG_PATH, 'w') as f:
            json.dump(enhanced_log, f, indent=2)

        print(f"📋 Classification log saved")

    except Exception as e:
        print(f"⚠️  Error saving log: {e}")

    # --- 9. Ready for Next Step ---
    print(f"\n" + "="*70)
    print(f"🎉 ENHANCED RESEARCH DATASET READY!")
    print(f"="*70)
    print(f"🧠 GPT-4o reasoning-based classification completed")
    print(f"🎯 {len(relevant_jobs)} jobs relevant to construction + AI/ML research")
    print(f"📁 Dataset ready for knowledge graph extraction")
    print(f"\n🚀 NEXT: Run Cell 5B to load the research dataset")
    print(f"="*70)

else:
    print("❌ Cannot proceed. Check prerequisites.")

🧠 GPT-4o ENHANCED CONSTRUCTION + AI/ML ECOSYSTEM CLASSIFICATION
🎯 Research Focus: AI/Automation/Data Science in Construction
✅ OpenAI API key retrieved
✅ GPT-4o initialized for enhanced classification

📁 Loading dataset: /content/drive/MyDrive/knowledge-graph-llms/data_all.csv
✅ Dataset loaded: 300 jobs

🧠 Starting GPT-4o enhanced classification...
🎯 Research Focus: Construction + AI/ML/Automation ecosystem
⏱️  Estimated time: 20.0 minutes
💰 Estimated cost: $18.00
   Progress: 15/300 (5.0%) - Remaining: 20.0min
   Progress: 30/300 (10.0%) - Remaining: 19.0min
   Progress: 45/300 (15.0%) - Remaining: 18.7min
   Progress: 60/300 (20.0%) - Remaining: 17.8min
   Progress: 75/300 (25.0%) - Remaining: 16.6min
   Progress: 90/300 (30.0%) - Remaining: 15.6min
   Progress: 105/300 (35.0%) - Remaining: 14.5min
   Progress: 120/300 (40.0%) - Remaining: 13.4min
   Progress: 135/300 (45.0%) - Remaining: 12.3min
   Progress: 150/300 (50.0%) - Remaining: 11.2min
   Progress: 165/300 (55.0%) - Remaini

In [7]:
# Show all NOT RELEVANT jobs with reasoning
not_relevant = [r for r in classification_results if r['classification'] == 'NOT_RELEVANT']
not_relevant_df = pd.DataFrame(not_relevant)
display(not_relevant_df[['job_id', 'title', 'employer', 'reasoning']])


Unnamed: 0,job_id,title,employer,reasoning
0,baf042245eb8b6edf6a67344793bd3147430bf80,Data Scientists,Lendlease,The job description for the Senior Data Scient...
1,d22b564d6d79b60ca19e5c931c5dd67c1d7da31b,,Heb Construction,The job description for the Kaikokiri - Te Reo...
2,1a813523f31150ae46f668e4d970f0e335f4af70,Patient Flow Managers,Kingswood Co.,The job description is for a skin cancer docto...
3,f8610fbd97a172d855732f30cbb54988963bef2e,Patient Flow Managers,Kingswood Co.,The job description provided is for a Patient ...
4,232d63479d9fb538034f9fb3afa03e878a98326c,Digital Marketing Account Executives,Australiance,The job of a Digital Marketing Account Executi...
5,8acee2ce6061e06c0194b21e262a75da013d18ed,Customer Service Team Leads,Laminex,The job description for the Customer Service T...
6,e78dd956d2dd44d2484bd2f995aa228fb6c49a05,Quantitative Trading Interns,Ventia,The job description for the Quantitative Tradi...
7,11ca65ea546a222b24fac4fa19d347233d26cd4a,Quantitative Trading Interns,The Infinity Group,The job description for the Quantitative Tradi...
8,4991352d4f812ee8f839ad10e832043962a69a6c,Quantitative Trading Interns,Clough,The job description for the Quantitative Tradi...
9,218e436e768ad49a122238d45f95eb7ef8524165,Marketing Coordinators,Mayfield And Co,The job description for the Marketing Coordina...


In [9]:
# Get reasoning for flagged jobs
flagged = [r for r in classification_results if r['job_id'] in job_ids]
for job in flagged:
    print(f"JobID: {job['job_id']}")
    print(f"Title: {job['title']}")
    print(f"Employer: {job['employer']}")
    print(f"Reasoning: {job['reasoning']}\n{'-'*50}\n")


JobID: baf042245eb8b6edf6a67344793bd3147430bf80
Title: Data Scientists
Employer: Lendlease
Reasoning: The job description for the Senior Data Scientist position at La Trobe University does not indicate any direct connection to the construction industry. The role is focused on using data science skills to improve student outcomes and university strategies, which are not related to construction. Therefore, it does not fall under any of the relevant categories for the research question about AI/ML/automation technologies transforming roles in the construction industry.
--------------------------------------------------

JobID: d22b564d6d79b60ca19e5c931c5dd67c1d7da31b
Title: nan
Employer: Heb Construction
Reasoning: The job description for the Kaikokiri - Te Reo me nga Tikanga Maori role at HEB Construction is primarily focused on cultural and language revitalization, stakeholder communications, and learning and development within the context of a construction project. While it involves si

In [None]:
# Quick check of the Lendlease Data Scientist job
df_research = pd.read_csv('/content/drive/MyDrive/knowledge-graph-llms/construction_ai_research_dataset.csv')
df_full = pd.read_csv('/content/drive/MyDrive/knowledge-graph-llms/data_all.csv')

# Find the Lendlease Data Scientist job that was excluded
lendlease_jobs = df_full[df_full['Employer'].str.contains('Lendlease', na=False, case=False)]
print("Lendlease jobs in original dataset:")
for i, row in lendlease_jobs.iterrows():
    in_research = row['JobID'] in df_research['JobID'].values
    status = "✅ INCLUDED" if in_research else "❌ EXCLUDED"
    print(f"{status} {row['Title']} - {row['Employer']}")

Lendlease jobs in original dataset:
✅ INCLUDED Analytics Developers - Lendlease
✅ INCLUDED Data Support Analysts - Lendlease
❌ EXCLUDED Data Scientists - Lendlease
✅ INCLUDED nan - Lendlease
✅ INCLUDED Workforce Managers - Lendlease
✅ INCLUDED Paralegals - Lendlease
✅ INCLUDED Sourcing Analysts - Lendlease
✅ INCLUDED Automation Engineers - Lendlease
✅ INCLUDED Digital Marketing Analysts - Lendlease
✅ INCLUDED Marketing Performance Analysts - Lendlease


In [None]:
# Investigate the Lendlease and "nan" employer issues

import pandas as pd
import json

print("🔍 INVESTIGATING DATA QUALITY ISSUES")
print("="*50)

# Load datasets
df_full = pd.read_csv('/content/drive/MyDrive/knowledge-graph-llms/data_all.csv')
df_research = pd.read_csv('/content/drive/MyDrive/knowledge-graph-llms/construction_ai_research_dataset.csv')

# Load classification log with reasoning
with open('/content/drive/MyDrive/knowledge-graph-llms/gpt4o_classification_log.json', 'r') as f:
    log_data = json.load(f)

print("1. CHECKING LENDLEASE JOBS:")
print("-" * 30)

# Find all Lendlease jobs
lendlease_jobs = df_full[df_full['Employer'].str.contains('Lendlease', na=False, case=False)]
print(f"Total Lendlease jobs found: {len(lendlease_jobs)}")

for i, row in lendlease_jobs.iterrows():
    in_research = row['JobID'] in df_research['JobID'].values
    status = "✅ INCLUDED" if in_research else "❌ EXCLUDED"
    print(f"{status} {row['Title']} at {row['Employer']}")

    # Show first 100 chars of job description for excluded ones
    if not in_research:
        print(f"   📝 Job description start: {str(row['JobText'])[:100]}...")

print("\n2. CHECKING 'nan' EMPLOYER ISSUES:")
print("-" * 30)

# Find jobs with nan/null employers
nan_employers = df_full[df_full['Employer'].isna() | (df_full['Employer'] == 'nan')]
print(f"Jobs with nan/null employers: {len(nan_employers)}")

for i, row in nan_employers.iterrows():
    in_research = row['JobID'] in df_research['JobID'].values
    status = "✅ INCLUDED" if in_research else "❌ EXCLUDED"
    employer_display = 'NULL/NaN' if pd.isna(row['Employer']) else row['Employer']
    print(f"{status} {row['Title']} at {employer_display}")

print("\n3. CHECKING NOT_RELEVANT REASONING:")
print("-" * 30)

# Look at the specific reasoning for excluded jobs
excluded_jobs = []
for result in log_data.get('sample_results', []):
    if result['classification'] == 'NOT_RELEVANT':
        excluded_jobs.append(result)

print("Sample NOT_RELEVANT jobs with reasoning:")
for i, job in enumerate(excluded_jobs[:5], 1):
    print(f"\n{i}. {job['title']} at {job['employer']}")
    print(f"   💭 Reasoning: {job['reasoning']}")

print("\n4. LOOKING FOR DATA CORRUPTION:")
print("-" * 30)

# Check if there's data mixing/corruption
problematic_jobs = df_full[
    (df_full['Title'].str.contains('Data Scientist', na=False, case=False)) &
    (df_full['Employer'].str.contains('Lendlease', na=False, case=False))
]

print(f"Data Scientist + Lendlease jobs: {len(problematic_jobs)}")
for i, row in problematic_jobs.iterrows():
    print(f"JobID: {row['JobID']}")
    print(f"Title: {row['Title']}")
    print(f"Employer: {row['Employer']}")
    print(f"Job text start: {str(row['JobText'])[:200]}...")
    print()

🔍 INVESTIGATING DATA QUALITY ISSUES
1. CHECKING LENDLEASE JOBS:
------------------------------
Total Lendlease jobs found: 10
✅ INCLUDED Analytics Developers at Lendlease
✅ INCLUDED Data Support Analysts at Lendlease
❌ EXCLUDED Data Scientists at Lendlease
   📝 Job description start: Sign In / Sign up
    Job Seeker
    
    Sign up
    
    Post your resume
    
    Employer
    
 ...
✅ INCLUDED nan at Lendlease
✅ INCLUDED Workforce Managers at Lendlease
✅ INCLUDED Paralegals at Lendlease
✅ INCLUDED Sourcing Analysts at Lendlease
✅ INCLUDED Automation Engineers at Lendlease
✅ INCLUDED Digital Marketing Analysts at Lendlease
✅ INCLUDED Marketing Performance Analysts at Lendlease

2. CHECKING 'nan' EMPLOYER ISSUES:
------------------------------
Jobs with nan/null employers: 0

3. CHECKING NOT_RELEVANT REASONING:
------------------------------
Sample NOT_RELEVANT jobs with reasoning:

1. Data Scientists at Lendlease
   💭 Reasoning: The job description for the Senior Data Scientist posit

**Cell 5B: Load Research Dataset and Setup Batching**
Now let's create Cell 5B to load your clean 282-job dataset and set up efficient batching for graph extraction:

In [4]:
# Cell 5B: Load Research Dataset and Setup Batching
# Load the GPT-4o validated construction + AI/ML dataset

import pandas as pd
import os
import json
from datetime import datetime
from langchain_core.documents import Document

print("🏗️ LOADING RESEARCH DATASET AND SETTING UP BATCHING")
print("🧠 GPT-4o Validated: Construction + AI/ML Ecosystem")
print("="*65)

# --- 1. Load the Research Dataset ---
RESEARCH_CSV_PATH = "/content/drive/MyDrive/knowledge-graph-llms/construction_ai_research_dataset.csv"
RESEARCH_CHECKPOINT_FOLDER = "/content/drive/MyDrive/knowledge-graph-llms/Research extraction batches"

print(f"📁 Loading research dataset: {RESEARCH_CSV_PATH}")

try:
    df_research = pd.read_csv(RESEARCH_CSV_PATH)
    print(f"✅ Research dataset loaded: {len(df_research)} jobs")
    print(f"   Columns: {list(df_research.columns)}")

    # Check required columns
    required_columns = ['Title', 'JobText', 'JobID']
    missing_columns = [col for col in required_columns if col not in df_research.columns]

    if missing_columns:
        print(f"❌ Missing required columns: {missing_columns}")
        df_research = None
    else:
        print("✅ All required columns present")

except Exception as e:
    print(f"❌ Error loading research dataset: {e}")
    df_research = None

# --- 2. Create Research Checkpoint Folder ---
try:
    os.makedirs(RESEARCH_CHECKPOINT_FOLDER, exist_ok=True)
    print(f"✅ Research checkpoint folder ready: {RESEARCH_CHECKPOINT_FOLDER}")
except Exception as e:
    print(f"❌ Error creating checkpoint folder: {e}")

# --- 3. Research Dataset Quality Analysis ---
if df_research is not None:
    print(f"\n🔍 RESEARCH DATASET ANALYSIS:")

    # Basic quality metrics
    null_titles = df_research['Title'].isnull().sum()
    null_jobtext = df_research['JobText'].isnull().sum()
    null_jobids = df_research['JobID'].isnull().sum()

    print(f"   Data completeness:")
    print(f"   - Title: {len(df_research) - null_titles}/{len(df_research)} complete")
    print(f"   - JobText: {len(df_research) - null_jobtext}/{len(df_research)} complete")
    print(f"   - JobID: {len(df_research) - null_jobids}/{len(df_research)} complete")

    # Clean any remaining null JobText
    if null_jobtext > 0:
        df_clean = df_research.dropna(subset=['JobText'])
        print(f"   🧹 Removed {len(df_research) - len(df_clean)} jobs with null JobText")
        df_research = df_clean

    # JobText quality analysis
    jobtext_lengths = df_research['JobText'].str.len()
    print(f"\n   JobText quality:")
    print(f"   - Average length: {jobtext_lengths.mean():.0f} characters")
    print(f"   - Minimum length: {jobtext_lengths.min():.0f} characters")
    print(f"   - Maximum length: {jobtext_lengths.max():.0f} characters")

    # Check for very short descriptions
    short_jobs = (jobtext_lengths < 100).sum()
    if short_jobs > 0:
        print(f"   ⚠️  {short_jobs} jobs with short descriptions (<100 chars)")

    # Show sample of research-relevant jobs
    print(f"\n📝 SAMPLE RESEARCH JOBS:")
    sample_size = min(5, len(df_research))
    for i in range(sample_size):
        row = df_research.iloc[i]
        employer = row.get('Employer', 'Unknown')
        print(f"   • {row['Title']} at {employer} ({len(str(row['JobText']))} chars)")

# --- 4. Calculate Optimal Batching ---
BATCH_SIZE = 20  # Smaller batches for research dataset
research_batches = []
research_batch_info = []

if df_research is not None:
    print(f"\n📦 CREATING RESEARCH BATCHES:")
    print(f"   Batch size: {BATCH_SIZE} jobs per batch")

    total_jobs = len(df_research)
    num_batches = (total_jobs + BATCH_SIZE - 1) // BATCH_SIZE  # Ceiling division

    for i in range(num_batches):
        start_idx = i * BATCH_SIZE
        end_idx = min((i + 1) * BATCH_SIZE, total_jobs)

        batch_df = df_research.iloc[start_idx:end_idx].copy()
        research_batches.append(batch_df)

        research_batch_info.append({
            'batch_number': i + 1,
            'start_index': start_idx,
            'end_index': end_idx - 1,
            'job_count': len(batch_df),
            'status': 'pending'
        })

        print(f"   Batch {i+1}: Jobs {start_idx+1}-{end_idx} ({len(batch_df)} jobs)")

    print(f"✅ Created {num_batches} research batches totaling {total_jobs} jobs")

# --- 5. Research Batch Processing Functions ---
def convert_research_batch_to_documents(batch_df, batch_number):
    """Convert research batch to LangChain Documents"""
    documents = []

    for i, row in batch_df.iterrows():
        # Enhanced content for research focus
        content = f"Job Title: {row['Title']}\n\nJob Description:\n{row['JobText']}"

        # Create metadata with research context
        metadata = {
            'job_id': row['JobID'],
            'title': row['Title'],
            'batch_number': batch_number,
            'original_index': i,
            'dataset_type': 'research_validated',
            'classification': 'construction_ai_ml_relevant'
        }

        # Add employer and location if available
        for col in ['Employer', 'City', 'State']:
            if col in row and pd.notna(row[col]):
                metadata[col.lower()] = row[col]

        # Create LangChain Document
        doc = Document(page_content=content, metadata=metadata)
        documents.append(doc)

    return documents

def save_research_checkpoint(batch_number, batch_info, extracted_stats=None):
    """Save research batch progress"""
    checkpoint_data = {
        'timestamp': datetime.now().isoformat(),
        'dataset_type': 'research_construction_ai_ml',
        'batch_number': batch_number,
        'total_batches': len(batch_info),
        'batch_info': batch_info,
        'extracted_stats': extracted_stats
    }

    checkpoint_file = os.path.join(RESEARCH_CHECKPOINT_FOLDER, f"research_checkpoint_batch_{batch_number}.json")

    try:
        with open(checkpoint_file, 'w') as f:
            json.dump(checkpoint_data, f, indent=2)
        print(f"✅ Research checkpoint saved: batch_{batch_number}.json")
        return True
    except Exception as e:
        print(f"❌ Error saving research checkpoint: {e}")
        return False

def load_latest_research_checkpoint():
    """Load latest research checkpoint"""
    try:
        checkpoint_files = [f for f in os.listdir(RESEARCH_CHECKPOINT_FOLDER)
                          if f.startswith('research_checkpoint_batch_')]
        if not checkpoint_files:
            return None

        # Find latest checkpoint
        latest_batch = max([int(f.split('_')[-1].split('.')[0]) for f in checkpoint_files])
        latest_file = os.path.join(RESEARCH_CHECKPOINT_FOLDER, f"research_checkpoint_batch_{latest_batch}.json")

        with open(latest_file, 'r') as f:
            return json.load(f)
    except Exception as e:
        print(f"⚠️  Error loading research checkpoint: {e}")
        return None

# --- 6. Check for Existing Research Progress ---
existing_research_checkpoint = load_latest_research_checkpoint()
if existing_research_checkpoint:
    print(f"\n🔄 EXISTING RESEARCH PROGRESS FOUND:")
    print(f"   Last completed batch: {existing_research_checkpoint['batch_number']}")
    print(f"   Timestamp: {existing_research_checkpoint['timestamp']}")

    completed_batches = existing_research_checkpoint['batch_number']
    remaining_batches = len(research_batches) - completed_batches
    print(f"   Remaining batches: {remaining_batches}")
else:
    print(f"\n🆕 NO EXISTING RESEARCH PROGRESS - Starting fresh")

# --- 7. Research Processing Summary ---
print("\n" + "="*65)
print("📋 RESEARCH BATCHING SETUP COMPLETE")
print("="*65)

if df_research is not None and research_batches:
    print(f"✅ Research dataset loaded: {len(df_research)} GPT-4o validated jobs")
    print(f"✅ Research batches created: {len(research_batches)} batches of {BATCH_SIZE} jobs each")
    print(f"✅ Research checkpoint system ready")
    print(f"✅ Save location: {RESEARCH_CHECKPOINT_FOLDER}")

    print(f"\n📊 RESEARCH PROCESSING ESTIMATES:")
    estimated_time_per_batch = 12.4 * BATCH_SIZE  # Based on previous performance
    print(f"   Time per batch: ~{estimated_time_per_batch/60:.1f} minutes")
    print(f"   Total estimated time: ~{(estimated_time_per_batch * len(research_batches))/60:.1f} minutes")

    print(f"\n🎯 RESEARCH FOCUS:")
    print(f"   🏗️  Construction ecosystem jobs")
    print(f"   🤖 AI/ML/automation relevant roles")
    print(f"   🧠 GPT-4o reasoning validated")
    print(f"   📊 94% relevance rate achieved")

    print(f"\n🚀 NEXT STEPS:")
    print(f"   1. Run Cell 6 to process research batches")
    print(f"   2. Enhanced graph extraction with clean data")
    print(f"   3. Focus on AI/ML transformation in construction")

    # Store variables for Cell 6
    print(f"\n📦 VARIABLES READY FOR CELL 6:")
    print(f"   - research_batches: {len(research_batches)} batch DataFrames")
    print(f"   - research_batch_info: Tracking information")
    print(f"   - Research checkpoint functions: Available")

else:
    print("❌ Research setup failed. Check errors above.")

print("="*65)

# Make research variables available for next cell
if df_research is not None and research_batches:
    # Create research batch status display
    print(f"\n📊 RESEARCH BATCH STATUS:")
    for i, info in enumerate(research_batch_info):
        status_icon = "⏳" if info['status'] == 'pending' else "✅"
        print(f"   {status_icon} Research Batch {info['batch_number']}: Jobs {info['start_index']+1}-{info['end_index']+1} ({info['job_count']} jobs)")

    print(f"\n🧠 Ready for research-focused graph extraction!")
    print(f"🎯 High-quality dataset: Construction + AI/ML ecosystem")

# Rename variables for clarity in Cell 6
batches = research_batches
batch_info = research_batch_info
convert_batch_to_documents = convert_research_batch_to_documents
save_batch_checkpoint = save_research_checkpoint

🏗️ LOADING RESEARCH DATASET AND SETTING UP BATCHING
🧠 GPT-4o Validated: Construction + AI/ML Ecosystem
📁 Loading research dataset: /content/drive/MyDrive/knowledge-graph-llms/construction_ai_research_dataset.csv
✅ Research dataset loaded: 283 jobs
   Columns: ['Employer', 'JobID', 'City', 'State', 'Title', 'JobDate', 'ANZSIC CODE', 'ANZSCO CODE', 'LOT CODE', 'JobText', 'JobUrl']
✅ All required columns present
✅ Research checkpoint folder ready: /content/drive/MyDrive/knowledge-graph-llms/Research extraction batches

🔍 RESEARCH DATASET ANALYSIS:
   Data completeness:
   - Title: 271/283 complete
   - JobText: 283/283 complete
   - JobID: 283/283 complete

   JobText quality:
   - Average length: 4365 characters
   - Minimum length: 96 characters
   - Maximum length: 11672 characters
   ⚠️  1 jobs with short descriptions (<100 chars)

📝 SAMPLE RESEARCH JOBS:
   • Analytics and Insights Managers at Mirvac (1604 chars)
   • Automation Managers at Mirvac (3979 chars)
   • Mechanical Service

In [8]:
# Checkpoint Folder Investigation Script
# Read-only analysis to understand current state

import os
import json
from datetime import datetime
import pandas as pd

print("🔍 INVESTIGATING CHECKPOINT FOLDER")
print("📁 Read-only analysis of existing files")
print("="*60)

# Define checkpoint folder path
CHECKPOINT_FOLDER = "/content/drive/MyDrive/knowledge-graph-llms/Research extraction batches"

# --- 1. Check if folder exists and list contents ---
print(f"📂 Checkpoint folder: {CHECKPOINT_FOLDER}")

try:
    if os.path.exists(CHECKPOINT_FOLDER):
        print("✅ Folder exists")

        # Get all files in the folder
        all_files = os.listdir(CHECKPOINT_FOLDER)

        if not all_files:
            print("📭 Folder is empty")
        else:
            print(f"📊 Found {len(all_files)} files")

            # --- 2. Categorize files by type ---
            file_categories = {
                'checkpoint_files': [],
                'extraction_files': [],
                'summary_files': [],
                'other_files': []
            }

            for file in all_files:
                if file.startswith('research_checkpoint_batch_'):
                    file_categories['checkpoint_files'].append(file)
                elif file.startswith('extracted_entities_batch_'):
                    file_categories['extraction_files'].append(file)
                elif file.startswith('extraction_summary_batch_'):
                    file_categories['summary_files'].append(file)
                else:
                    file_categories['other_files'].append(file)

            # --- 3. Display file analysis ---
            print(f"\n📋 FILE ANALYSIS:")
            for category, files in file_categories.items():
                if files:
                    print(f"   {category.replace('_', ' ').title()}: {len(files)} files")
                    for file in sorted(files):
                        file_path = os.path.join(CHECKPOINT_FOLDER, file)
                        file_size = os.path.getsize(file_path)
                        file_time = datetime.fromtimestamp(os.path.getmtime(file_path))
                        print(f"     • {file} ({file_size:,} bytes, {file_time.strftime('%Y-%m-%d %H:%M')})")

            # --- 4. Extract batch numbers from different file types ---
            print(f"\n🔢 BATCH NUMBER ANALYSIS:")

            # From checkpoint files
            checkpoint_batches = []
            for file in file_categories['checkpoint_files']:
                try:
                    batch_num = int(file.split('_')[-1].split('.')[0])
                    checkpoint_batches.append(batch_num)
                except:
                    pass

            # From extraction files
            extraction_batches = []
            for file in file_categories['extraction_files']:
                try:
                    batch_num = int(file.split('_')[-1].split('.')[0])
                    extraction_batches.append(batch_num)
                except:
                    pass

            # From summary files
            summary_batches = []
            for file in file_categories['summary_files']:
                try:
                    batch_num = int(file.split('_')[-1].split('.')[0])
                    summary_batches.append(batch_num)
                except:
                    pass

            print(f"   Checkpoint batches: {sorted(checkpoint_batches) if checkpoint_batches else 'None'}")
            print(f"   Extraction batches: {sorted(extraction_batches) if extraction_batches else 'None'}")
            print(f"   Summary batches: {sorted(summary_batches) if summary_batches else 'None'}")

            # --- 5. Sample file contents ---
            print(f"\n📄 SAMPLE FILE CONTENTS:")

            # Check latest checkpoint file
            if file_categories['checkpoint_files']:
                latest_checkpoint = max(file_categories['checkpoint_files'],
                                      key=lambda x: int(x.split('_')[-1].split('.')[0]))
                checkpoint_path = os.path.join(CHECKPOINT_FOLDER, latest_checkpoint)

                print(f"\n   📋 Latest Checkpoint: {latest_checkpoint}")
                try:
                    with open(checkpoint_path, 'r') as f:
                        checkpoint_data = json.load(f)

                    print(f"     • Timestamp: {checkpoint_data.get('timestamp', 'N/A')}")
                    print(f"     • Dataset type: {checkpoint_data.get('dataset_type', 'N/A')}")
                    print(f"     • Batch number: {checkpoint_data.get('batch_number', 'N/A')}")
                    print(f"     • Total batches: {checkpoint_data.get('total_batches', 'N/A')}")

                    if 'extracted_stats' in checkpoint_data:
                        print(f"     • Has extraction stats: ✅")
                    else:
                        print(f"     • Has extraction stats: ❌")

                except Exception as e:
                    print(f"     ❌ Error reading checkpoint: {e}")

            # Check an extraction file if exists
            if file_categories['extraction_files']:
                sample_extraction = file_categories['extraction_files'][0]
                extraction_path = os.path.join(CHECKPOINT_FOLDER, sample_extraction)

                print(f"\n   🔍 Sample Extraction: {sample_extraction}")
                try:
                    with open(extraction_path, 'r') as f:
                        extraction_data = json.load(f)

                    print(f"     • Batch number: {extraction_data.get('batch_number', 'N/A')}")
                    print(f"     • Jobs processed: {extraction_data.get('jobs_processed', 'N/A')}")
                    print(f"     • Total jobs: {extraction_data.get('total_jobs', 'N/A')}")

                    # Check extraction results
                    results = extraction_data.get('extraction_results', [])
                    print(f"     • Extraction results: {len(results)} jobs")

                    if results:
                        sample_result = results[0]
                        entities = sample_result.get('extraction_data', {}).get('entities', [])
                        relationships = sample_result.get('extraction_data', {}).get('relationships', [])
                        print(f"     • Sample job entities: {len(entities)}")
                        print(f"     • Sample job relationships: {len(relationships)}")

                    # Check insights
                    insights = extraction_data.get('batch_insights', {})
                    if insights:
                        print(f"     • Total entities: {insights.get('total_entities', 0)}")
                        print(f"     • Total relationships: {insights.get('total_relationships', 0)}")
                        print(f"     • AI/ML mentions: {insights.get('ai_ml_mentions', 0)}")

                except Exception as e:
                    print(f"     ❌ Error reading extraction file: {e}")

            # --- 6. Determine actual status ---
            print(f"\n🎯 STATUS DETERMINATION:")

            expected_batches = 15  # From your output

            if extraction_batches:
                completed_extractions = len(extraction_batches)
                missing_batches = set(range(1, expected_batches + 1)) - set(extraction_batches)

                print(f"   ✅ Completed extractions: {completed_extractions}/15")
                print(f"   📋 Completed batches: {sorted(extraction_batches)}")

                if missing_batches:
                    print(f"   ⏳ Missing batches: {sorted(missing_batches)}")
                else:
                    print(f"   🎉 ALL BATCHES COMPLETED!")
            else:
                print(f"   ❌ No extraction files found")
                print(f"   📝 Status: Need to start extraction from beginning")

            # --- 7. Recommendations ---
            print(f"\n💡 RECOMMENDATIONS:")

            if len(extraction_batches) == expected_batches:
                print(f"   🎯 All extractions complete - proceed to knowledge graph analysis")
                print(f"   📊 Skip extraction phase - move to graph construction")
            elif extraction_batches:
                missing_count = expected_batches - len(extraction_batches)
                print(f"   ⚡ Resume extraction from batch {max(extraction_batches) + 1}")
                print(f"   📈 {missing_count} batches remaining")
            else:
                print(f"   🚀 Start fresh extraction from batch 1")
                print(f"   📝 No previous extraction work found")

    else:
        print("❌ Folder does not exist")
        print("📝 Recommendation: Create folder and start fresh extraction")

except Exception as e:
    print(f"❌ Error investigating folder: {e}")

print("\n" + "="*60)
print("🔍 INVESTIGATION COMPLETE")
print("="*60)

🔍 INVESTIGATING CHECKPOINT FOLDER
📁 Read-only analysis of existing files
📂 Checkpoint folder: /content/drive/MyDrive/knowledge-graph-llms/Research extraction batches
✅ Folder exists
📊 Found 15 files

📋 FILE ANALYSIS:
   Checkpoint Files: 15 files
     • research_checkpoint_batch_1.json (3,435 bytes, 2025-06-26 21:42)
     • research_checkpoint_batch_10.json (9,506 bytes, 2025-06-27 00:02)
     • research_checkpoint_batch_11.json (10,182 bytes, 2025-06-27 00:09)
     • research_checkpoint_batch_12.json (10,866 bytes, 2025-06-27 00:15)
     • research_checkpoint_batch_13.json (11,545 bytes, 2025-06-27 00:19)
     • research_checkpoint_batch_14.json (12,223 bytes, 2025-06-27 00:29)
     • research_checkpoint_batch_15.json (12,757 bytes, 2025-06-27 00:29)
     • research_checkpoint_batch_2.json (4,110 bytes, 2025-06-26 23:06)
     • research_checkpoint_batch_3.json (4,787 bytes, 2025-06-26 23:10)
     • research_checkpoint_batch_4.json (5,469 bytes, 2025-06-26 23:16)
     • research_checkp

**New Cell 6**

In [24]:
# Cell 6: Production-Ready Research Graph Extraction
# Extract Roles, Tasks, Technical Skills, and Soft Skills with Production Fixes

import time
import logging
import tempfile
import shutil
from collections import Counter
import json
import os

print("🚀 PRODUCTION-READY RESEARCH GRAPH EXTRACTION")
print("🧠 Dataset: GPT-4o Validated | Extraction: Production-Quality")
print("🎯 Target: Roles → Tasks → Technical Skills → Soft Skills")
print("="*60)

# --- 1. Setup Logging (with handler management) ---
def setup_logging():
    """Setup structured logging for the extraction pipeline"""
    logger = logging.getLogger(__name__)

    # Clear existing handlers to prevent duplicates on re-execution
    if logger.handlers:
        logger.handlers.clear()

    # Also clear root logger handlers if they exist
    root_logger = logging.getLogger()
    if root_logger.handlers:
        root_logger.handlers.clear()

    # Ensure log directory exists
    log_dir = RESEARCH_CHECKPOINT_FOLDER if 'RESEARCH_CHECKPOINT_FOLDER' in globals() else '/tmp'
    os.makedirs(log_dir, exist_ok=True)

    log_format = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(
        level=logging.INFO,
        format=log_format,
        handlers=[
            logging.StreamHandler(),  # Console output
            logging.FileHandler(os.path.join(log_dir, 'extraction_pipeline.log'))
        ]
    )
    return logger

# --- 2. ResearchETL Class for Better Abstraction ---
class ResearchETL:
    """Lightweight ETL class for research extraction pipeline"""

    def __init__(self, checkpoint_folder, graph_transformer, neo4j_graph,
                 batches, batch_info, convert_fn, save_checkpoint_fn):
        self.checkpoint_folder = checkpoint_folder
        self.graph_transformer = graph_transformer
        self.neo4j_graph = neo4j_graph
        self.batches = batches
        self.batch_info = batch_info
        self.convert_batch_to_documents = convert_fn
        self.save_batch_checkpoint = save_checkpoint_fn
        self.logger = setup_logging()
        self.progress_tracker = ProgressTracker()

        # Entity type classification lookup table (refined for edge cases)
        self.entity_classifiers = {
            'role_keywords': ['manager', 'engineer', 'analyst', 'specialist', 'coordinator', 'director', 'lead', 'head'],
            'task_keywords': ['task', 'responsibility', 'duty', 'manage', 'develop', 'implement', 'coordinate', 'oversee'],
            'soft_skill_keywords': ['communication', 'leadership', 'teamwork', 'problem solving', 'analytical thinking', 'collaboration'],
            'technical_skill_keywords': ['software', 'platform', 'programming', 'technology', 'system', 'framework', 'database', 'cloud']
        }

    def get_next_batch_to_process(self, force_restart=False, auto_cleanup=False):
        """Find next research batch that needs extraction"""

        if force_restart:
            self.logger.info("Force restart: Starting fresh from batch 1")
            if auto_cleanup:
                self._cleanup_extraction_files()
            return 1

        # Check for actual extraction files
        actual_extractions = []
        try:
            extraction_files = [f for f in os.listdir(self.checkpoint_folder)
                              if f.startswith('extracted_entities_batch_') or
                                 f.startswith('extraction_results_batch_')]
            actual_extractions = [int(f.split('_')[-1].split('.')[0]) for f in extraction_files]
            actual_extractions.sort()
        except FileNotFoundError as e:
            self.logger.warning(f"Checkpoint folder not found: {e}")
            return 1
        except ValueError as e:
            self.logger.error(f"Error parsing extraction file numbers: {e}")
            return 1

        if not actual_extractions:
            self.logger.info("No actual extractions found - starting fresh from batch 1")
            return 1

        # Find first missing batch (handles non-contiguous gaps)
        total_batches = len(self.batches)
        for i in range(1, total_batches + 1):
            if i not in actual_extractions:
                self.logger.info(f"Found gap: processing batch {i} (completed: {actual_extractions})")
                return i

        # All batches completed
        return None

    def _cleanup_extraction_files(self):
        """Clean up previous extraction files"""
        try:
            extraction_files = [f for f in os.listdir(self.checkpoint_folder)
                              if f.startswith('extracted_entities_batch_') or
                                 f.startswith('extraction_results_batch_')]
            if extraction_files:
                self.logger.info(f"Cleaning up {len(extraction_files)} previous extraction files")
                for file in extraction_files:
                    file_path = os.path.join(self.checkpoint_folder, file)
                    os.remove(file_path)
                self.logger.info(f"Cleaned up {len(extraction_files)} files")
        except OSError as e:
            self.logger.error(f"Could not clean extraction files: {e}")

    def classify_entity_type(self, original_type):
        """Classify entity type using refined lookup table approach"""
        original_lower = original_type.lower()

        # Explicit role indicators first (most specific)
        if any(keyword in original_lower for keyword in self.entity_classifiers['role_keywords']):
            return 'Role'

        # Task indicators
        if any(keyword in original_lower for keyword in self.entity_classifiers['task_keywords']):
            return 'Task'

        # Soft skill (check before technical to handle "soft skill" properly)
        if 'soft skill' in original_lower or any(keyword in original_lower for keyword in self.entity_classifiers['soft_skill_keywords']):
            return 'Soft skill'

        # Technical skill and tool mapping
        if ('tool' in original_lower and 'lead' not in original_lower) or any(keyword in original_lower for keyword in self.entity_classifiers['technical_skill_keywords']):
            return 'Technical skill'

        # Keep original if no clear match
        return original_type.title()

    def classify_entity_type_with_context(self, node):
        """Classify entity type using both type string and node ID/name context"""
        original_type = node.type
        original_lower = original_type.lower()

        # Get node name/id for context-aware classification
        node_name = ""
        if hasattr(node, 'id') and node.id:
            node_name = str(node.id).lower()
        elif hasattr(node, 'name') and node.name:
            node_name = str(node.name).lower()

        # Known technical entities that often get misclassified as "Skill"
        tech_indicators = [
            # Programming languages
            'python', 'java', 'javascript', 'c#', 'c++', '.net', 'sql', 'r', 'scala', 'go', 'rust',
            # Frameworks/Libraries
            'react', 'angular', 'vue', 'django', 'flask', 'spring', 'tensorflow', 'pytorch',
            # Software/Tools
            'excel', 'powerbi', 'tableau', 'autocad', 'revit', 'solidworks', 'matlab', 'sap',
            # Cloud/Infrastructure
            'aws', 'azure', 'gcp', 'docker', 'kubernetes', 'jenkins', 'git', 'linux',
            # Databases
            'mysql', 'postgresql', 'mongodb', 'oracle', 'redis', 'elasticsearch'
        ]

        # Explicit role indicators first (most specific)
        if any(keyword in original_lower for keyword in self.entity_classifiers['role_keywords']):
            return 'Role'

        # Task indicators
        if any(keyword in original_lower for keyword in self.entity_classifiers['task_keywords']):
            return 'Task'

        # Handle generic "Skill" with context from node name
        if original_lower == 'skill':
            # Check if node name suggests technical skill
            if any(tech in node_name for tech in tech_indicators):
                return 'Technical skill'
            # Check if node name suggests soft skill
            elif any(keyword in node_name for keyword in self.entity_classifiers['soft_skill_keywords']):
                return 'Soft skill'
            # Default generic skills to Technical (most programming/tools end up here)
            else:
                return 'Technical skill'

        # Soft skill (check before technical to handle "soft skill" properly)
        if 'soft skill' in original_lower or any(keyword in original_lower for keyword in self.entity_classifiers['soft_skill_keywords']):
            return 'Soft skill'

        # Technical skill and tool mapping
        if ('tool' in original_lower and 'lead' not in original_lower) or any(keyword in original_lower for keyword in self.entity_classifiers['technical_skill_keywords']):
            return 'Technical skill'

        # Generic skill fallback (but avoid role titles with "skill" in them)
        if 'skill' in original_lower and not any(role_kw in original_lower for role_kw in self.entity_classifiers['role_keywords']):
            return 'Technical skill'  # Changed default from Soft to Technical

        # Keep original if no clear match
        return original_type.title()

    def process_and_clean_entities(self, batch_graph_documents):
        """Clean and standardize extracted entities using context-aware classification"""
        cleaned_documents = []

        for graph_doc in batch_graph_documents:
            cleaned_nodes = []
            for node in graph_doc.nodes:
                node.type = self.classify_entity_type_with_context(node)
                cleaned_nodes.append(node)

            graph_doc.nodes = cleaned_nodes
            cleaned_documents.append(graph_doc)

        return cleaned_documents

    def count_research_entities(self, node_types_counter):
        """Count research entity types from Counter object"""
        entity_counts = {
            'roles': node_types_counter.get('Role', 0),
            'tasks': node_types_counter.get('Task', 0),
            'technical_skills': node_types_counter.get('Technical skill', 0),
            'soft_skills': node_types_counter.get('Soft skill', 0),
            'other': sum(count for node_type, count in node_types_counter.items()
                        if node_type not in ['Role', 'Task', 'Technical skill', 'Soft skill'])
        }

        target_types = {'Role', 'Task', 'Technical skill', 'Soft skill'}
        return entity_counts, target_types

    def save_results_atomically(self, batch_number, extraction_results):
        """Save results with atomic write for resilience"""
        # Ensure checkpoint directory exists
        os.makedirs(self.checkpoint_folder, exist_ok=True)

        results_file = os.path.join(self.checkpoint_folder, f"extraction_results_batch_{batch_number}.json")

        try:
            # Write to temporary file first
            with tempfile.NamedTemporaryFile(mode='w', delete=False,
                                           dir=self.checkpoint_folder,
                                           prefix=f'tmp_batch_{batch_number}_') as tmp_file:
                json.dump(extraction_results, tmp_file, indent=2)
                tmp_path = tmp_file.name

            # Atomic move to final location
            shutil.move(tmp_path, results_file)
            self.logger.info(f"Extraction results saved atomically: extraction_results_batch_{batch_number}.json")
            return True

        except (OSError, IOError) as e:
            self.logger.error(f"Error saving extraction results: {e}")
            # Clean up temp file if it exists
            try:
                if 'tmp_path' in locals():
                    os.unlink(tmp_path)
            except OSError:
                pass
            return False

    def process_batch(self, batch_number):
        """Process a single batch with comprehensive error handling"""
        self.logger.info(f"Starting processing of batch {batch_number}")

        try:
            batch_df = self.batches[batch_number - 1]

            # Convert to documents
            self.logger.info(f"Converting batch {batch_number} to documents")
            batch_documents = self.convert_batch_to_documents(batch_df, batch_number)

            # Extract using graph transformer
            self.logger.info(f"Extracting entities from batch {batch_number}")
            start_time = time.time()

            batch_graph_documents = self.graph_transformer.convert_to_graph_documents(batch_documents)

            # Clean and standardize
            cleaned_graph_documents = self.process_and_clean_entities(batch_graph_documents)

            processing_time = time.time() - start_time
            self.progress_tracker.add_time(processing_time)

            # Analyze results (with safe attribute access)
            batch_nodes = []
            batch_relationships = []

            for graph_doc in cleaned_graph_documents:
                for node in graph_doc.nodes:
                    if hasattr(node, 'id') and hasattr(node, 'type'):
                        batch_nodes.append(node)
                    else:
                        self.logger.warning(f"Node missing id or type attributes: {node}")

                for rel in graph_doc.relationships:
                    if hasattr(rel, 'type'):
                        batch_relationships.append(rel)
                    else:
                        self.logger.warning(f"Relationship missing type attribute: {rel}")

            node_types = Counter([node.type for node in batch_nodes])
            relationship_types = Counter([rel.type for rel in batch_relationships])

            # Add to Neo4j
            self.logger.info(f"Adding batch {batch_number} to Neo4j")
            self.neo4j_graph.add_graph_documents(cleaned_graph_documents)

            # Prepare results
            research_entity_counts, target_types = self.count_research_entities(node_types)

            target_types_found = set(node_types.keys()) & target_types
            consistency_score = len(target_types_found) / len(target_types)

            extraction_results = {
                'batch_number': batch_number,
                'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
                'processing_time': processing_time,
                'documents_processed': len(batch_documents),
                'total_entities': len(batch_nodes),
                'total_relationships': len(batch_relationships),
                'entity_types': dict(node_types),
                'relationship_types': dict(relationship_types),
                'research_entity_counts': research_entity_counts,
                'quality_metrics': {
                    'entity_type_consistency': consistency_score,
                    'neo4j_insertion_success': True,
                    'tools_successfully_merged': node_types.get('Tool', 0) == 0
                },
                'sample_entities': {
                    'roles': list(set([str(n.id) for n in batch_nodes if n.type == 'Role']))[:5],
                    'tasks': list(set([str(n.id) for n in batch_nodes if n.type == 'Task']))[:5],
                    'technical_skills': list(set([str(n.id) for n in batch_nodes if n.type == 'Technical skill']))[:5],
                    'soft_skills': list(set([str(n.id) for n in batch_nodes if n.type == 'Soft skill']))[:5]
                }
            }

            # Save results atomically
            if self.save_results_atomically(batch_number, extraction_results):
                # Update batch info only if save succeeded
                self.batch_info[batch_number - 1]['status'] = 'completed'
                self.batch_info[batch_number - 1]['processing_time'] = processing_time

                # Save checkpoint
                try:
                    self.save_batch_checkpoint(batch_number, self.batch_info, extraction_results)
                except Exception as e:
                    self.logger.error(f"Error saving checkpoint: {e}")
                    # Don't fail the batch for checkpoint issues

                self.logger.info(f"Batch {batch_number} completed successfully")
                return extraction_results
            else:
                raise RuntimeError("Failed to save extraction results")

        except Exception as e:
            self.logger.error(f"Error processing batch {batch_number}: {e}")
            raise

# --- 3. Progress Tracking with Moving Average ---
class ProgressTracker:
    def __init__(self):
        self.processing_times = []
        self.max_history = 5

    def add_time(self, time_seconds):
        self.processing_times.append(time_seconds)
        if len(self.processing_times) > self.max_history:
            self.processing_times.pop(0)

    def estimate_remaining_time(self, remaining_batches):
        if not self.processing_times:
            return 0
        avg_time = sum(self.processing_times) / len(self.processing_times)
        return (avg_time * remaining_batches) / 60

# --- 4. Prerequisites Check ---
def check_prerequisites():
    """Check all required variables and return missing ones"""
    required_vars = {
        'batches': 'batches (run Cell 5B)',
        'graph_transformer': 'graph_transformer (run Cell 2)',
        'graph': 'Neo4j graph connection (run Cell 2)',
        'convert_batch_to_documents': 'convert_batch_to_documents function (run Cell 5B)',
        'save_batch_checkpoint': 'save_batch_checkpoint function (run Cell 5B)',
        'batch_info': 'batch_info variable (run Cell 5B)',
        'RESEARCH_CHECKPOINT_FOLDER': 'RESEARCH_CHECKPOINT_FOLDER (run Cell 5B)'
    }

    missing = []
    for var_name, description in required_vars.items():
        if var_name not in globals() or globals()[var_name] is None:
            missing.append(description)

    return missing

# --- 5. Main Execution ---

# Configuration
FORCE_RESTART = False  # Set to True to start from batch 1
AUTO_CLEANUP = True    # Automatically clean files on restart (no prompt)

# Check prerequisites
missing_prereqs = check_prerequisites()
if missing_prereqs:
    print("❌ Missing prerequisites:")
    for prereq in missing_prereqs:
        print(f"   - {prereq}")
    print("⚠️  Cannot proceed. Please run previous cells first.")
    raise RuntimeError(f"Missing prerequisites: {missing_prereqs}")

print("✅ All prerequisites available")

# Initialize ETL pipeline
try:
    etl = ResearchETL(
        checkpoint_folder=RESEARCH_CHECKPOINT_FOLDER,
        graph_transformer=graph_transformer,
        neo4j_graph=graph,
        batches=batches,
        batch_info=batch_info,
        convert_fn=convert_batch_to_documents,
        save_checkpoint_fn=save_batch_checkpoint
    )
    print("✅ ETL pipeline initialized")
except Exception as e:
    print(f"❌ Failed to initialize ETL pipeline: {e}")
    raise

# Determine current batch
current_batch_num = etl.get_next_batch_to_process(
    force_restart=FORCE_RESTART,
    auto_cleanup=AUTO_CLEANUP
)

if current_batch_num is None:
    print("🎉 ALL RESEARCH BATCHES COMPLETED!")
    print(f"✅ All {len(batches)} research batches processed successfully")
    print("📊 Ready for role and skills analysis!")

    # Show final statistics
    try:
        target_entity_stats_query = """
        MATCH (n)
        WHERE labels(n)[0] IN ['Role', 'Task', 'Technical skill', 'Soft skill']
        RETURN labels(n)[0] as entity_type, count(n) as count
        ORDER BY
            CASE labels(n)[0]
                WHEN 'Role' THEN 1
                WHEN 'Task' THEN 2
                WHEN 'Technical skill' THEN 3
                WHEN 'Soft skill' THEN 4
            END
        """
        target_stats = graph.query(target_entity_stats_query)
        rel_count = graph.query("MATCH ()-[r]-() RETURN count(r) as count")[0]['count']

        print(f"\n📊 FINAL RESEARCH EXTRACTION RESULTS:")
        total_target_entities = sum([stat['count'] for stat in target_stats])
        print(f"  Target Research Entities: {total_target_entities}")
        for stat in target_stats:
            print(f"    - {stat['entity_type']}: {stat['count']}")
        print(f"  Total Relationships: {rel_count}")

        jobs_processed = sum([len(batch) for batch in batches])
        print(f"\n🎯 RESEARCH PRODUCTIVITY:")
        print(f"  📋 Jobs processed: {jobs_processed}")
        print(f"  🏗️  Target entities per job: {total_target_entities/jobs_processed:.1f}")
        print(f"  🔗 Relationships per job: {rel_count/jobs_processed:.1f}")

        print(f"\n🚀 RESEARCH ANALYSIS READY:")
        print(f"  1. Role emergence analysis")
        print(f"  2. Skills requirement mapping")
        print(f"  3. Construction + AI/ML transformation insights")

    except Exception as e:
        etl.logger.error(f"Error getting final stats: {e}")
        print(f"⚠️  Error getting final stats: {e}")

else:
    current_batch_df = batches[current_batch_num - 1]

    print(f"📦 PROCESSING RESEARCH BATCH {current_batch_num}/{len(batches)}")
    print(f"   Jobs in this batch: {len(current_batch_df)}")
    print(f"   🎯 Target: Role, Task, Technical skill, Soft skill")
    print(f"   🔧 Quality: Production-ready with atomic saves")

    # Show sample jobs
    print(f"\n📝 SAMPLE JOBS IN THIS BATCH:")
    for i in range(min(3, len(current_batch_df))):
        row = current_batch_df.iloc[i]
        employer = row.get('Employer', 'Unknown')
        title_preview = row['Title'][:60] + "..." if len(row['Title']) > 60 else row['Title']
        print(f"   • {title_preview} @ {employer}")

    # Process the batch
    try:
        print(f"\n🤖 Processing batch {current_batch_num}...")
        results = etl.process_batch(current_batch_num)

        # Display results
        print(f"✅ Batch {current_batch_num} processed in {results['processing_time']:.1f} seconds")
        print(f"\n📈 EXTRACTION RESULTS:")
        print(f"  📊 Total entities: {results['total_entities']}")

        for entity_type in ['Role', 'Task', 'Technical skill', 'Soft skill']:
            count = results['entity_types'].get(entity_type, 0)
            print(f"    - {entity_type}: {count}")

        print(f"  🔗 Total relationships: {results['total_relationships']}")

        # Show samples
        print(f"\n📝 SAMPLE ENTITIES:")
        for category, samples in results['sample_entities'].items():
            if samples:
                print(f"  {category}: {', '.join(samples)}")

        # Progress summary (accurate completion counting)
        completed_batches = sum(1 for b in etl.batch_info if b.get('status') == 'completed')
        remaining_batches = len(batches) - completed_batches

        print(f"\n" + "="*60)
        print(f"🎯 BATCH {current_batch_num} COMPLETE!")
        print(f"="*60)
        print(f"✅ Progress: {completed_batches}/{len(batches)} batches")
        print(f"⏳ Remaining: {remaining_batches} batches")

        if remaining_batches > 0:
            estimated_time = etl.progress_tracker.estimate_remaining_time(remaining_batches)
            print(f"⏱️  Estimated remaining: {estimated_time:.1f} minutes")
            print(f"\n🚀 NEXT: Run this cell again for batch {current_batch_num + 1}")
        else:
            print(f"🎉 ALL BATCHES COMPLETE!")

        print(f"="*60)

    except Exception as e:
        etl.logger.error(f"Batch {current_batch_num} failed: {e}")
        print(f"❌ BATCH {current_batch_num} FAILED!")
        print(f"⚠️  Error: {e}")
        print(f"🔄 Fix the issue and run this cell again to retry")
        raise

2025-06-29 08:21:31,729 - INFO - Found gap: processing batch 15 (completed: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
2025-06-29 08:21:31,737 - INFO - Starting processing of batch 15
2025-06-29 08:21:31,739 - INFO - Converting batch 15 to documents
2025-06-29 08:21:31,743 - INFO - Extracting entities from batch 15


🚀 PRODUCTION-READY RESEARCH GRAPH EXTRACTION
🧠 Dataset: GPT-4o Validated | Extraction: Production-Quality
🎯 Target: Roles → Tasks → Technical Skills → Soft Skills
✅ All prerequisites available
✅ ETL pipeline initialized
📦 PROCESSING RESEARCH BATCH 15/15
   Jobs in this batch: 3
   🎯 Target: Role, Task, Technical skill, Soft skill
   🔧 Quality: Production-ready with atomic saves

📝 SAMPLE JOBS IN THIS BATCH:
   • Electricians @ Ventia
   • Directors of Business Intelligence @ V-Line Corporation
   • BIM Managers @ Metricon Homes

🤖 Processing batch 15...


2025-06-29 08:21:41,100 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-29 08:21:49,878 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-29 08:22:00,563 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-29 08:22:00,586 - INFO - Adding batch 15 to Neo4j
2025-06-29 08:22:02,998 - INFO - Extraction results saved atomically: extraction_results_batch_15.json
2025-06-29 08:22:03,019 - INFO - Batch 15 completed successfully


✅ Research checkpoint saved: batch_15.json
✅ Batch 15 processed in 28.8 seconds

📈 EXTRACTION RESULTS:
  📊 Total entities: 42
    - Role: 3
    - Task: 15
    - Technical skill: 17
    - Soft skill: 7
  🔗 Total relationships: 50

📝 SAMPLE ENTITIES:
  roles: Director Of Data Management And Business Intelligence, Electrician, Bim Manager
  tasks: System Implementation, Radio & Optic Fibre Technologies Fault Diagnostics And Rectification, Data Strategy, Content Library Management, Emergency Power Plants
  technical_skills: Bim Systems, Confined Space, Qld Electrical Licence, Maximo, Business Intelligence Systems
  soft_skills: Communication, Problem-Solving, Leadership, Project Management

🎯 BATCH 15 COMPLETE!
✅ Progress: 15/15 batches
⏳ Remaining: 0 batches
🎉 ALL BATCHES COMPLETE!


In [16]:
import pandas as pd   # already imported earlier

# Normalise every batch dataframe
for df in batches:
    df['Title']    = df['Title'].fillna('Untitled role').astype(str)
    df['Employer'] = df['Employer'].fillna('Unknown').astype(str)


**Cell 6: Interactive Research Graph Extraction**


In [7]:
# Cell 6: Interactive Research Graph Extraction
# Process GPT-4o validated construction + AI/ML jobs with GPT-4o-mini

import time
from collections import Counter
import json

print("🚀 INTERACTIVE RESEARCH GRAPH EXTRACTION")
print("🧠 Dataset: GPT-4o Validated | Extraction: GPT-4o-mini")
print("🎯 Focus: Construction + AI/ML Ecosystem")
print("="*60)

# --- 1. Determine Next Research Batch ---
def get_next_research_batch_to_process():
    """Find next research batch to process"""
    completed_batches = []

    try:
        checkpoint_files = [f for f in os.listdir(RESEARCH_CHECKPOINT_FOLDER)
                          if f.startswith('research_checkpoint_batch_')]
        completed_batches = [int(f.split('_')[-1].split('.')[0]) for f in checkpoint_files]
        completed_batches.sort()
    except:
        completed_batches = []

    if not completed_batches:
        return 1

    next_batch = max(completed_batches) + 1
    return next_batch if next_batch <= len(batches) else None

# --- 2. Check Prerequisites ---
prerequisites_ok = True

if 'batches' not in locals() or not batches:
    print("❌ No research batches found. Please run Cell 5B first.")
    prerequisites_ok = False

if 'graph_transformer' not in locals() or graph_transformer is None:
    print("❌ Graph transformer not available. Please run Cell 2 first.")
    prerequisites_ok = False

if 'graph' not in locals() or graph is None:
    print("❌ Neo4j graph not available. Please run Cell 2 first.")
    prerequisites_ok = False

if not prerequisites_ok:
    print("⚠️  Cannot proceed. Please run previous cells first.")
else:
    # --- 3. Determine Current Research Batch ---
    current_batch_num = get_next_research_batch_to_process()

    if current_batch_num is None:
        print("🎉 ALL RESEARCH BATCHES COMPLETED!")
        print("✅ All 15 research batches processed successfully")
        print("🧠 GPT-4o validated dataset → Clean knowledge graph")

        # Show final research graph statistics
        try:
            final_stats_query = """
            MATCH (n)
            RETURN labels(n)[0] as node_type, count(n) as count
            ORDER BY count DESC
            """
            final_stats = graph.query(final_stats_query)
            rel_count = graph.query("MATCH ()-[r]-() RETURN count(r) as count")[0]['count']

            print(f"\n📊 FINAL RESEARCH GRAPH STATISTICS:")
            total_nodes = sum([stat['count'] for stat in final_stats])
            print(f"  Total Nodes: {total_nodes}")
            for stat in final_stats:
                print(f"    - {stat['node_type']}: {stat['count']}")
            print(f"  Total Relationships: {rel_count}")

            # Calculate research metrics
            print(f"\n🎯 RESEARCH METRICS:")
            print(f"  📊 Jobs processed: 282 (GPT-4o validated)")
            print(f"  🏗️  Entities per job: {total_nodes/282:.1f}")
            print(f"  🔗 Relationships per job: {rel_count/282:.1f}")
            print(f"  🧠 Dataset quality: Research-grade")

        except Exception as e:
            print(f"⚠️  Error getting final stats: {e}")

    else:
        current_batch_df = batches[current_batch_num - 1]

        print(f"📦 PROCESSING RESEARCH BATCH {current_batch_num}/15")
        print(f"   Jobs in this batch: {len(current_batch_df)}")
        print(f"   Job range: {(current_batch_num-1)*20 + 1} to {min(current_batch_num*20, 282)}")
        print(f"   🧠 Dataset: GPT-4o validated construction + AI/ML")

        # Show sample research jobs in this batch
        print(f"\n📝 SAMPLE RESEARCH JOBS IN THIS BATCH:")
        for i in range(min(3, len(current_batch_df))):
            row = current_batch_df.iloc[i]
            employer = row.get('Employer', 'Unknown')
            print(f"   • {row['Title']} at {employer} ({len(str(row['JobText']))} chars)")

        # --- 4. Convert Research Batch to Documents ---
        try:
            print(f"\n🔄 Converting research batch to documents...")
            batch_documents = convert_batch_to_documents(current_batch_df, current_batch_num)
            print(f"✅ Created {len(batch_documents)} research documents")

        except Exception as e:
            print(f"❌ Error converting batch to documents: {e}")
            batch_documents = []

        # --- 5. Process with GPT-4o-mini Graph Transformer ---
        if batch_documents:
            try:
                print(f"\n🤖 Processing with GPT-4o-mini (research-focused extraction)...")
                start_time = time.time()

                # Extract graph documents using the transformer
                batch_graph_documents = graph_transformer.convert_to_graph_documents(batch_documents)

                processing_time = time.time() - start_time
                print(f"✅ Research batch processed in {processing_time:.1f} seconds")
                print(f"📊 Generated {len(batch_graph_documents)} graph documents")

                # --- 6. Analyze Research Extraction Results ---
                batch_nodes = []
                batch_relationships = []

                for graph_doc in batch_graph_documents:
                    batch_nodes.extend(graph_doc.nodes)
                    batch_relationships.extend(graph_doc.relationships)

                # Count by type
                node_types = Counter([node.type for node in batch_nodes])
                relationship_types = Counter([rel.type for rel in batch_relationships])

                print(f"\n📈 RESEARCH BATCH {current_batch_num} RESULTS:")
                print(f"  Research entities extracted: {len(batch_nodes)}")
                for node_type, count in node_types.most_common():
                    print(f"    - {node_type}: {count}")

                print(f"  Research relationships extracted: {len(batch_relationships)}")
                for rel_type, count in relationship_types.most_common():
                    print(f"    - {rel_type}: {count}")

                # Show sample entities for research validation
                print(f"\n📝 SAMPLE RESEARCH ENTITIES:")
                for node_type in ['Role', 'Technical skill', 'Soft skill', 'Tool']:
                    sample_nodes = [node.id for node in batch_nodes if node.type == node_type][:3]
                    if sample_nodes:
                        print(f"  {node_type}: {', '.join(sample_nodes)}")

                # --- 7. Add to Neo4j Research Graph ---
                try:
                    print(f"\n💾 Adding to research knowledge graph...")
                    graph.add_graph_documents(batch_graph_documents)
                    print(f"✅ Research batch {current_batch_num} successfully added to Neo4j")

                    # Get updated research graph statistics
                    db_stats_query = """
                    MATCH (n)
                    RETURN labels(n)[0] as node_type, count(n) as count
                    ORDER BY count DESC
                    """
                    db_stats = graph.query(db_stats_query)
                    rel_count = graph.query("MATCH ()-[r]-() RETURN count(r) as count")[0]['count']

                    print(f"\n📊 UPDATED RESEARCH GRAPH:")
                    total_nodes = sum([stat['count'] for stat in db_stats])
                    print(f"  Total Nodes: {total_nodes}")
                    for stat in db_stats[:5]:  # Show top 5 node types
                        print(f"    - {stat['node_type']}: {stat['count']}")
                    print(f"  Total Relationships: {rel_count}")

                except Exception as e:
                    print(f"❌ Error adding to Neo4j: {e}")
                    batch_graph_documents = []

                # --- 8. Save Research Checkpoint ---
                extraction_stats = {
                    'processing_time': processing_time,
                    'documents_processed': len(batch_documents),
                    'entities_extracted': len(batch_nodes),
                    'relationships_extracted': len(batch_relationships),
                    'node_types': dict(node_types),
                    'relationship_types': dict(relationship_types),
                    'dataset_type': 'research_validated'
                }

                # Update batch info
                batch_info[current_batch_num - 1]['status'] = 'completed'
                batch_info[current_batch_num - 1]['processing_time'] = processing_time
                batch_info[current_batch_num - 1]['extraction_stats'] = extraction_stats

                checkpoint_saved = save_batch_checkpoint(current_batch_num, batch_info, extraction_stats)

                # --- 9. Research Progress Summary ---
                completed_batches = current_batch_num
                remaining_batches = len(batches) - completed_batches

                print(f"\n" + "="*60)
                print(f"🎯 RESEARCH BATCH {current_batch_num} COMPLETE!")
                print(f"="*60)
                print(f"✅ Processed: {completed_batches}/{len(batches)} batches")
                print(f"⏳ Remaining: {remaining_batches} batches")
                print(f"🧠 Quality: GPT-4o validated → GPT-4o-mini extracted")

                if remaining_batches > 0:
                    estimated_remaining_time = remaining_batches * (processing_time / 60)
                    print(f"⏱️  Estimated remaining time: {estimated_remaining_time:.1f} minutes")
                    print(f"\n🚀 NEXT: Run this cell again for Research Batch {current_batch_num + 1}")
                else:
                    print(f"🎉 ALL RESEARCH BATCHES COMPLETED!")
                    print(f"📊 Ready for construction + AI/ML analysis!")

                print(f"="*60)

            except Exception as e:
                print(f"❌ Error processing research batch {current_batch_num}: {e}")
                print(f"⚠️  You can try running this cell again or continue to next batch")

        else:
            print(f"❌ No documents to process for research batch {current_batch_num}")

🚀 INTERACTIVE RESEARCH GRAPH EXTRACTION
🧠 Dataset: GPT-4o Validated | Extraction: GPT-4o-mini
🎯 Focus: Construction + AI/ML Ecosystem
🎉 ALL RESEARCH BATCHES COMPLETED!
✅ All 15 research batches processed successfully
🧠 GPT-4o validated dataset → Clean knowledge graph

📊 FINAL RESEARCH GRAPH STATISTICS:
  Total Nodes: 0
  Total Relationships: 0

🎯 RESEARCH METRICS:
  📊 Jobs processed: 282 (GPT-4o validated)
  🏗️  Entities per job: 0.0
  🔗 Relationships per job: 0.0
  🧠 Dataset quality: Research-grade
