# Analyse ESCO-Framework, Neo4j Graph DB

## Libraries and settings

In [None]:
# Libraries
import os
import json
import requests
import warnings
import pandas as pd
from neo4j import GraphDatabase
from pprint import pprint
from SPARQLWrapper import SPARQLWrapper, JSON

# Connection details
# Each value can be overridden through an environment variable; the literals
# below are only fallbacks so existing runs keep working unchanged.
# NOTE(review): a database password is committed in this file. Rotate it and
# provide NEO4J_PASSWORD from the environment instead of relying on the fallback.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
USER = os.environ.get("NEO4J_USER", "neo4j")
PASSWORD = os.environ.get("NEO4J_PASSWORD", "Xc54gH6*+?5gHt5")
CHUNK_SIZE = 1000  # number of rows sent to Neo4j per write transaction

# Path
path = '/home/ec2-user/SageMaker/skill_framework'

# Ignore warnings
# NOTE(review): this silences every warning category for the whole session.
warnings.filterwarnings("ignore")

# Show current working directory
print(os.getcwd())

## Import ESCO data

In [None]:
# Folder holding the ESCO v1.2.0 classification CSV files ('de' export)
path_data = path + '/data/esco/ESCO dataset - v1.2.0 - classification - de - csv/'

# Columns to keep from each table
ski_col = ['conceptUri', 'skillType', 'reuseLevel', 'preferredLabel', 'description']
occ_col = ['conceptUri', 'preferredLabel', 'description', 'code']
osr_col = ['occupationUri', 'relationType', 'skillType', 'skillUri']

# Load each CSV and reduce it to the selected columns in one step
ski = pd.read_csv(f'{path_data}skills_de.csv')[ski_col]
occ = pd.read_csv(f'{path_data}occupations_de.csv')[occ_col]
osr = pd.read_csv(f'{path_data}occupationSkillRelations_de.csv')[osr_col]

# Report the (rows, columns) shape of every table
for title, frame in (
    ('Skills:', ski),
    ('Occupations:', occ),
    ('Occupation-Skill Relations:', osr),
):
    print(title, frame.shape)


## Categories in the data

In [None]:
# Distinct values found in the two categorical skill columns
print('Skill types:', ski['skillType'].unique())
print('Reuse levels:', ski['reuseLevel'].unique())

## Clean data

In [None]:
# Relations without a skill type are kept and labelled 'Unknown'; rows that
# lack an occupation URI, a skill URI or a relation type are dropped.
osr = (
    osr
    .assign(skillType=osr['skillType'].fillna('Unknown'))
    .dropna(subset=['occupationUri', 'skillUri', 'relationType'])
)

# Count the remaining missing values per column
print('Cleaned osr DataFrame:')
print(osr.isna().sum())

## Store data in graph database

In [None]:
def batch_insert(tx, query, data):
    """Run ``query`` in transaction ``tx`` with ``data`` bound to the ``$rows`` parameter."""
    tx.run(query, rows=data)


def _to_records(frame, column_map):
    """Rename ``frame`` columns per ``column_map`` and return a list of row dicts.

    Missing values (NaN) are replaced by ``None`` so they reach the database
    as null instead of as float NaN values.
    """
    renamed = frame.rename(columns=column_map)
    return renamed.astype(object).where(renamed.notna(), None).to_dict('records')


def _insert_in_chunks(session, query, frame, column_map):
    """Send ``frame`` to the database in slices of ``CHUNK_SIZE`` rows.

    Each slice is written in its own write transaction via ``batch_insert``.
    """
    # Prefer execute_write where the session offers it; otherwise fall back to
    # write_transaction, which is what this cell called originally.
    write = getattr(session, "execute_write", None) or session.write_transaction
    for start in range(0, len(frame), CHUNK_SIZE):
        rows = _to_records(frame.iloc[start:start + CHUNK_SIZE], column_map)
        write(batch_insert, query, rows)


skill_query = """
    UNWIND $rows AS row
    MERGE (s:Skill {uri: row.uri})
    SET s.label = row.label, s.type = row.type, s.reuseLevel = row.reuse, s.description = row.desc
    """

occupation_query = """
    UNWIND $rows AS row
    MERGE (o:Occupation {uri: row.uri})
    SET o.label = row.label, o.description = row.desc, o.code = row.code
    """

relation_query = """
    UNWIND $rows AS row
    MATCH (o:Occupation {uri: row.occ_uri})
    MATCH (s:Skill {uri: row.skill_uri})
    MERGE (o)-[:REQUIRES {relationType: row.rel, skillType: row.stype}]->(s)
    """

# Open the driver as a context manager so it is closed even when an insert fails
with GraphDatabase.driver(URI, auth=(USER, PASSWORD)) as driver:
    with driver.session() as session:
        # Create indexes on the uri property used by every MERGE/MATCH below
        session.run("CREATE INDEX IF NOT EXISTS FOR (s:Skill) ON (s.uri)")
        session.run("CREATE INDEX IF NOT EXISTS FOR (o:Occupation) ON (o.uri)")

        # Insert Skills
        _insert_in_chunks(session, skill_query, ski, {
            'conceptUri': 'uri',
            'preferredLabel': 'label',
            'skillType': 'type',
            'reuseLevel': 'reuse',
            'description': 'desc',
        })

        # Insert Occupations
        _insert_in_chunks(session, occupation_query, occ, {
            'conceptUri': 'uri',
            'preferredLabel': 'label',
            'description': 'desc',
        })

        # Insert Relationships (both end nodes must already exist, so this runs last)
        _insert_in_chunks(session, relation_query, osr, {
            'occupationUri': 'occ_uri',
            'skillUri': 'skill_uri',
            'relationType': 'rel',
            'skillType': 'stype',
        })

## Functions

In [None]:
# Look up the skills attached to one occupation
def get_skills_for_job(session, job_uri):
    """Return the skills an occupation points to through REQUIRES.

    Args:
        session: Object whose ``run(query, **params)`` executes Cypher and
            yields records exposing ``data()``.
        job_uri: Value matched against the ``uri`` property of the Occupation.

    Returns:
        A list of dicts with the keys ``skillUri``, ``skillLabel`` and
        ``skillType``, in the order the query returns them (sorted by label).
    """
    cypher = """
    MATCH (o:Occupation {uri: $job_uri})-[:REQUIRES]->(s:Skill)
    RETURN s.uri AS skillUri, s.label AS skillLabel, s.type AS skillType
    ORDER BY s.label
    """
    rows = []
    for record in session.run(cypher, job_uri=job_uri):
        rows.append(record.data())
    return rows

# Look up the occupations that point to one skill
def get_jobs_for_skill(session, skill_uri):
    """Return the occupations linked to a skill through REQUIRES.

    Args:
        session: Object whose ``run(query, **params)`` executes Cypher and
            yields records exposing ``data()``.
        skill_uri: Value matched against the ``uri`` property of the Skill.

    Returns:
        A list of dicts with the keys ``jobUri``, ``jobLabel``, ``jobCode``
        and ``reuseLevel``, in the order the query returns them (sorted by
        occupation label).
    """
    cypher = """
    MATCH (s:Skill {uri: $skill_uri})<-[:REQUIRES]-(o:Occupation)
    RETURN o.uri AS jobUri,
           o.label AS jobLabel,
           o.code AS jobCode,
           s.reuseLevel AS reuseLevel
    ORDER BY o.label
    """
    rows = []
    for record in session.run(cypher, skill_uri=skill_uri):
        rows.append(record.data())
    return rows

# Query: Group skillTypes by reuseLevel
def get_skill_types_grouped_by_reuse(tx, occupation_uri):
    """Return the distinct skill types of an occupation's skills per reuse level.

    Skill nodes are written with the skill type in the ``type`` property, so
    the query reads ``s.type``. Reading ``s.skillType`` matched no node and
    always produced an empty result.

    Args:
        tx: Object whose ``run(query, **params)`` executes Cypher and yields
            records exposing ``data()``.
        occupation_uri: Value matched against the ``uri`` property of the
            Occupation.

    Returns:
        A list of dicts with the keys ``reuseLevel`` and ``skillTypes`` (a
        list of distinct values), ordered by reuse level. Skills lacking a
        reuse level or a type are excluded.
    """
    query = """
    MATCH (o:Occupation {uri: $occ_uri})-[:REQUIRES]->(s:Skill)
    WHERE s.reuseLevel IS NOT NULL AND s.type IS NOT NULL
    WITH s.reuseLevel AS reuseLevel, s.type AS skillType
    RETURN reuseLevel, collect(DISTINCT skillType) AS skillTypes
    ORDER BY reuseLevel
    """
    result = tx.run(query, occ_uri=occupation_uri)
    return [record.data() for record in result]


## Example usage

### Retrieve all skills for a specific job

In [None]:
# Define occupation uri
occupation_uri = "http://data.europa.eu/esco/occupation/000e93a3-d956-4e45-aacb-f12c83fedf84"

# Retrieve skills for a specific job.
# The driver is opened as a context manager so it is closed even if the
# query raises, instead of relying on a trailing close() call.
with GraphDatabase.driver(URI, auth=(USER, PASSWORD)) as driver:
    with driver.session() as session:
        skills = get_skills_for_job(session, occupation_uri)

# The result is a plain list of dicts, so it can be printed after closing
for skill in skills:
    print(skill)

### Retrieve all jobs for a specific skill

In [None]:
# Select skill uri
skill_uri = "http://data.europa.eu/esco/skill/9ea652f6-accf-45bb-b9a8-590f06f7bd51"

# Retrieve jobs for a specific skill.
# The driver is opened as a context manager so it is closed even if the
# query raises, instead of relying on a trailing close() call.
with GraphDatabase.driver(URI, auth=(USER, PASSWORD)) as driver:
    with driver.session() as session:
        jobs = get_jobs_for_skill(session, skill_uri)

# The result is a plain list of dicts, so it can be printed after closing
for job in jobs:
    print(job)