In [13]:
from neo4j import GraphDatabase
import json
import os
import random
from dotenv import load_dotenv

# Load settings from a local .env file into the process environment before the
# NEO4J_* variables are read below. NOTE(review): per python-dotenv,
# override=True lets .env values replace variables that are already set —
# confirm this is intended.
load_dotenv(override=True)

True

In [16]:
# Connection settings come from the environment; each value is None when the
# corresponding variable is not set.
URI = os.environ.get('NEO4J_CONNECTION_URI')
AUTH = tuple(os.environ.get(var_name) for var_name in ('NEO4J_USERNAME', 'NEO4J_PASSWORD'))

In [17]:
# Smoke test: open a driver and a session, run a constant query, and print
# either the returned message or a fallback notice. Any failure is printed
# rather than raised.
try:
    with GraphDatabase.driver(URI, auth=AUTH) as driver, driver.session() as session:
        result = session.run("RETURN 'That is right.' AS message")
        record = result.single()
        print(record["message"] if record else "Query returned no result.")
except Exception as e:
    print(f"Connection or query failed: {e}")

That is right.


In [9]:
# Show the entries of the cleaned-data directory (path is relative to the
# notebook's working directory).
os.listdir('clean_data')

['papers',
 'funders',
 'topics',
 'authors',
 'domains.json',
 'fields.json',
 'institutions',
 'sources',
 'subfields.json']

In [11]:
# Read the domain records from disk; the last expression displays how many
# were loaded.
with open('clean_data/domains.json', encoding='utf-8') as f:
    domains_data = json.loads(f.read())
len(domains_data)

4

In [13]:
# Pretty-print the first domain record to inspect its keys.
print(json.dumps(domains_data[0], indent=4))

{
    "id": "https://openalex.org/domains/3",
    "name": "Physical Sciences",
    "description": "branch of natural science that studies non-living systems",
    "fields": [
        "https://openalex.org/fields/15",
        "https://openalex.org/fields/16",
        "https://openalex.org/fields/17",
        "https://openalex.org/fields/19",
        "https://openalex.org/fields/21",
        "https://openalex.org/fields/22",
        "https://openalex.org/fields/23",
        "https://openalex.org/fields/25",
        "https://openalex.org/fields/26",
        "https://openalex.org/fields/31"
    ],
    "worksCount": 73237366,
    "citedByCount": 770120022
}


In [14]:
def import_domains(driver, domains_list):
    """Merge each domain dict into Neo4j as a `:Domain` node keyed by its `id`.

    Every key except `id` and `fields` is written as a node property.
    Progress, skipped items and per-item failures are reported with `print`;
    nothing is returned.

    Args:
        driver: Object whose `session()` returns a context manager exposing
            `run(query, **params)`; in this notebook, a Neo4j driver.
        domains_list: List of dicts. Items with a missing or falsy `id` are
            skipped.
    """
    print(f"Starting import of {len(domains_list)} domains...")

    total_imported = 0
    with driver.session() as session:
        for domain in domains_list:
            # `fields` is a list of ids, not a scalar property, so it is not
            # stored on the node.
            props = {key: value for key, value in domain.items() if key != 'fields'}
            domain_id = props.pop('id', None)

            if not domain_id:
                print(f"Skipping domain, missing 'id': {domain}")
                continue

            try:
                # consume() drains the result inside this try block, so a
                # server-side failure is attributed to this domain instead of
                # surfacing on the next run() or when the session closes.
                session.run("""
                    MERGE (d:Domain {id: $id_param})
                    SET d += $props_param
                    """, id_param=domain_id, props_param=props).consume()
                total_imported += 1
            except Exception as e:
                print(f"Failed to import domain {domain_id}: {e}")

    print(f"\nImport complete. {total_imported} domains merged/updated.")

In [15]:
# Open a short-lived driver, confirm the server is reachable, then load the
# domain nodes. Connection-level failures are printed by the except branch
# instead of being raised.
try:
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.verify_connectivity()
        print("✅ Connection successful!")
        
        import_domains(driver, domains_data)
        
except Exception as e:
    print(f"❌ An error occurred: {e}")

✅ Connection successful!
Starting import of 4 domains...

Import complete. 4 domains merged/updated.


In [29]:
query = """
MATCH (d:Domain)
RETURN d.name AS name, d.description AS description, d.worksCount AS worksCount
"""

# Reset the result names first: the except branch below only prints, so a
# failed query would otherwise leave `records` from an earlier cell in place
# and the next cell would display stale data as if it came from this query.
records, summary, keys = [], None, []

try:
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.verify_connectivity()

        records, summary, keys = driver.execute_query(query, database_="neo4j")

except Exception as e:
    print(f"❌ An error occurred: {e}")

In [30]:
# Convert every record returned by the previous query to a plain dict for display.
[rec.data() for rec in records]

[{'name': 'Physical Sciences',
  'description': 'branch of natural science that studies non-living systems',
  'worksCount': 73237366},
 {'name': 'Social Sciences',
  'description': 'branch of science focused on societies and the relationships among individuals within those societies',
  'worksCount': 64995390},
 {'name': 'Health Sciences',
  'description': 'branch of science focused on human health and disease prevention, diagnosis, treatment, and management',
  'worksCount': 43957129},
 {'name': 'Life Sciences',
  'description': 'branch of science that involve the scientific study of life – such as microorganisms, plants, and animals including human beings',
  'worksCount': 27536918}]

In [31]:
# Read the field records from disk; the last expression displays how many
# were loaded.
with open('clean_data/fields.json', encoding='utf-8') as f:
    fields_data = json.loads(f.read())
len(fields_data)

26

In [32]:
# Pretty-print the first field record to inspect its keys.
print(json.dumps(fields_data[0], indent=4))

{
    "id": "https://openalex.org/fields/18",
    "name": "Decision Sciences",
    "description": "branch of applied probability theory",
    "nameAlternatives": [
        "decision theory",
        "theory of choice",
        "operations research"
    ],
    "domainId": "https://openalex.org/domains/2",
    "subfields": [
        "https://openalex.org/subfields/1800",
        "https://openalex.org/subfields/1802",
        "https://openalex.org/subfields/1803",
        "https://openalex.org/subfields/1804"
    ],
    "worksCount": 2096477,
    "citedByCount": 29367741
}


In [34]:
def import_fields(driver, fields_list):
    """Merge each field dict into Neo4j as a `:Field` node keyed by its `id`.

    Every key except `id`, `subfields` and `domainId` is written as a node
    property. Progress, skipped items and per-item failures are reported with
    `print`; nothing is returned.

    Args:
        driver: Object whose `session()` returns a context manager exposing
            `run(query, **params)`; in this notebook, a Neo4j driver.
        fields_list: List of dicts. Items with a missing or falsy `id` are
            skipped.
    """
    print(f"Starting import of {len(fields_list)} fields...")

    total_imported = 0
    with driver.session() as session:
        for field in fields_list:
            # Keys that reference other entities are not stored on the node.
            props = {
                key: value for key, value in field.items()
                if key not in ('subfields', 'domainId')
            }
            field_id = props.pop('id', None)

            if not field_id:
                print(f"Skipping field, missing 'id': {field}")
                continue

            try:
                # consume() drains the result inside this try block, so a
                # server-side failure is attributed to this field instead of
                # surfacing on the next run() or when the session closes.
                session.run("""
                    MERGE (d:Field {id: $id_param})
                    SET d += $props_param
                    """, id_param=field_id, props_param=props).consume()
                total_imported += 1
            except Exception as e:
                # Report the failing id, not the entire input list.
                print(f"Failed to import field {field_id}: {e}")

    print(f"\nImport complete. {total_imported} field merged/updated.")

In [35]:
# Open a short-lived driver, confirm the server is reachable, then load the
# field nodes. Connection-level failures are printed by the except branch
# instead of being raised.
try:
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.verify_connectivity()
        print("✅ Connection successful!")
        
        import_fields(driver, fields_data)
        
except Exception as e:
    print(f"❌ An error occurred: {e}")

✅ Connection successful!
Starting import of 26 fields...

Import complete. 26 field merged/updated.


In [39]:
query = """
MATCH (f:Field)
RETURN f.name AS name, f.description as description
"""

# Reset the result names first: the except branch below only prints, so a
# failed query would otherwise leave `records` from an earlier cell in place
# and the next cell would display stale data as if it came from this query.
records, summary, keys = [], None, []

try:
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.verify_connectivity()

        records, summary, keys = driver.execute_query(query, database_="neo4j")

except Exception as e:
    print(f"❌ An error occurred: {e}")

In [44]:
# Spot-check a single record from the previous query as a plain dict.
records[4].data()

{'name': 'Engineering',
 'description': 'practice of using natural science, mathematics, and the engineering design process to solve technical problems'}

In [4]:
# Read the subfield records from disk; the last expression displays how many
# were loaded.
with open('clean_data/subfields.json', encoding='utf-8') as f:
    subfields_data = json.loads(f.read())
len(subfields_data)

248

In [49]:
# Pretty-print the first subfield record to inspect its keys.
print(json.dumps(subfields_data[0], indent=4))

{
    "id": "https://openalex.org/subfields/2103",
    "name": "Fuel Technology",
    "description": "technology used in the extraction, processing and consumption of fuel resources",
    "nameAlternatives": [],
    "domainId": "https://openalex.org/domains/3",
    "fieldId": "https://openalex.org/fields/21",
    "topics": [
        "https://openalex.org/T14428"
    ],
    "worksCount": 46298,
    "citedByCount": 107545
}


In [3]:
def import_subfields(driver, subfields_list):
    """Merge each subfield dict into Neo4j as a `:SubField` node keyed by its `id`.

    Every key except `id`, `fieldId`, `domainId` and `topics` is written as a
    node property. Progress, skipped items and per-item failures are reported
    with `print`; nothing is returned.

    Args:
        driver: Object whose `session()` returns a context manager exposing
            `run(query, **params)`; in this notebook, a Neo4j driver.
        subfields_list: List of dicts. Items with a missing or falsy `id` are
            skipped.
    """
    print(f"Starting import of {len(subfields_list)} subfields...")

    total_imported = 0
    with driver.session() as session:
        for subfield in subfields_list:
            # Keys that reference other entities are not stored on the node.
            props = {
                key: value for key, value in subfield.items()
                if key not in ('fieldId', 'domainId', 'topics')
            }
            subfield_id = props.pop('id', None)

            if not subfield_id:
                print(f"Skipping subfield, missing 'id': {subfield}")
                continue

            try:
                # consume() drains the result inside this try block, so a
                # server-side failure is attributed to this subfield instead
                # of surfacing on the next run() or when the session closes.
                session.run("""
                    MERGE (d:SubField {id: $id_param})
                    SET d += $props_param
                    """, id_param=subfield_id, props_param=props).consume()
                total_imported += 1
            except Exception as e:
                # Report the failing id, not the entire input list.
                print(f"Failed to import subfield {subfield_id}: {e}")

    print(f"\nImport complete. {total_imported} subfields merged/updated.")

In [5]:
# Open a short-lived driver, confirm the server is reachable, then load the
# subfield nodes. Connection-level failures are printed by the except branch
# instead of being raised.
try:
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.verify_connectivity()
        print("✅ Connection successful!")
        
        import_subfields(driver, subfields_data)
        
except Exception as e:
    print(f"❌ An error occurred: {e}")

✅ Connection successful!
Starting import of 248 subfields...

Import complete. 248 subfields merged/updated.


In [16]:
query = """
MATCH (s:SubField)
RETURN s.name AS name, s.description AS description
"""

# Reset the result names first: the except branch below only prints, so a
# failed query would otherwise leave `records` from an earlier cell in place
# and the next cell would display stale data as if it came from this query.
records, summary, keys = [], None, []

try:
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.verify_connectivity()

        records, summary, keys = driver.execute_query(query, database_="neo4j")

except Exception as e:
    print(f"❌ An error occurred: {e}")

In [20]:
# Spot-check a single record from the previous query as a plain dict.
records[50].data()

{'name': 'Information Systems',
 'description': 'academic study of systems with a specific reference to information and the complementary networks of hardware and software that people and organizations use to collect, filter, process, create and also distribute data'}

In [7]:
topics_dir = os.path.join('clean_data', 'topics')
topics_data = []

# Concatenate the contents of every file in the topics directory; each file
# is expected to hold a JSON list of topic records.
for fname in os.listdir(topics_dir):
    with open(os.path.join(topics_dir, fname), encoding='utf-8') as f:
        topics_data += json.loads(f.read())

# Unseeded shuffle: the resulting order differs from run to run.
random.shuffle(topics_data)
len(topics_data)

3655

In [8]:
# Pretty-print the first topic record (after shuffling) to inspect its keys.
print(json.dumps(topics_data[0], indent=4))

{
    "id": "https://openalex.org/T13722",
    "name": "Ergonomics and Human Factors",
    "description": "This cluster of papers explores the intersection of ergonomics and sustainability, focusing on human-compatible systems, work systems, sustainable development, environmental technology, and the application of agent-based simulation and digitalization. It delves into topics such as green ergonomics, polymer coatings, and the role of human factors in building design for sustainable work activities.",
    "keywords": [
        "Ergonomics",
        "Sustainability",
        "Human Factors",
        "Green Ergonomics",
        "Work Systems",
        "Sustainable Development",
        "Environmental Technology",
        "Agent-Based Simulation",
        "Digitalization",
        "Polymer Coatings"
    ],
    "domainId": "https://openalex.org/domains/3",
    "fieldId": "https://openalex.org/fields/22",
    "subfieldId": "https://openalex.org/subfields/2204",
    "worksCount": 12838,
  

In [None]:
def import_topics_batch(driver, topics_list, ignore_key):
    """Merge a batch of topic dicts into Neo4j as `:Topic` nodes in one query.

    Each topic is reduced to its `id` plus a property map containing every key
    not listed in `ignore_key`. The whole batch is sent as a single `UNWIND`
    statement. Progress, skipped items and a batch failure are reported with
    `print`; nothing is returned.

    Args:
        driver: Object whose `session()` returns a context manager exposing
            `run(query, **params)`; in this notebook, a Neo4j driver.
        topics_list: List of dicts. Items with a missing or falsy `id` are
            reported and left out of the batch.
        ignore_key: Iterable of keys to leave out of the node properties.
    """
    print(f"Starting import of {len(topics_list)} topics...")

    props_list = []
    for topic in topics_list:
        props = {key: value for key, value in topic.items() if key not in ignore_key}

        topic_id = props.pop('id', None)
        if not topic_id:
            # Report the drop, as the other importers do, so the merged count
            # printed below can be reconciled with the input size.
            print(f"Skipping topic, missing 'id': {topic}")
            continue

        props_list.append({
            'id_param': topic_id,
            'props_param': props
        })

    with driver.session() as session:
        try:
            result = session.run("""
                UNWIND $props_list AS item
                MERGE (t:Topic {id: item.id_param})
                SET t += item.props_param
                RETURN count(t) AS total_merged
                """, props_list=props_list)

            summary = result.single()
            print(f"Batch import complete. {summary['total_merged']} topics merged/updated.")

        except Exception as e:
            print(f"Failed to import batch: {e}")

In [None]:
# Send the topics to Neo4j in slices of `batch_size` records, one query per
# slice, all over a single driver connection.
N_topics = len(topics_data)
batch_size = 250

try:
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.verify_connectivity()
        print("✅ Connection successful!")
        
        # `count` is only the 1-based batch number shown in the log.
        count = 1
        for i in range(0, N_topics, batch_size):
            selected_records = topics_data[i: i+batch_size]
            print(f'{count}:')
            # The three id keys reference parent entities and are not stored
            # as topic properties.
            import_topics_batch(driver, selected_records, ["domainId", "fieldId", "subfieldId"])
            count += 1
            # Blank lines to separate the log of consecutive batches.
            print(2*'\n')
        
except Exception as e:
    print(f"❌ An error occurred: {e}")

In [21]:
# Alternative query returning only ids, kept for the (commented-out)
# uniqueness check in a later cell:
# query = """
# MATCH (t:Topic)
# RETURN t.id AS id
# """

query = """
MATCH (t:Topic)
RETURN t.name AS name, t.description AS description
"""

# Reset the result names first: the except branch below only prints, so a
# failed query would otherwise leave `records` from an earlier cell in place
# and the next cell would display stale data as if it came from this query.
records, summary, keys = [], None, []

try:
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.verify_connectivity()

        records, summary, keys = driver.execute_query(query, database_="neo4j")

except Exception as e:
    print(f"❌ An error occurred: {e}")

In [22]:
# len(set([t.data().get('id') for t in records]))

In [24]:
# Spot-check a single record from the previous query as a plain dict.
records[16].data()

{'name': 'Economic Theory and Institutions',
 'description': 'This cluster of papers explores the study of economic institutions, behavior, and evolution, encompassing topics such as institutional economics, evolutionary economics, behavioral economics, market evolution, property rights, macroeconomics, social evolution, neoliberalism, and Darwinism in economics.'}

In [3]:
# Read the domain records from disk (needed here to build the
# domain -> field links); the last expression displays how many were loaded.
with open('clean_data/domains.json', encoding='utf-8') as f:
    domains_data = json.loads(f.read())
len(domains_data)

4

In [4]:
# Display the first domain record, including its list of field ids.
domains_data[0]

{'id': 'https://openalex.org/domains/3',
 'name': 'Physical Sciences',
 'description': 'branch of natural science that studies non-living systems',
 'fields': ['https://openalex.org/fields/15',
  'https://openalex.org/fields/16',
  'https://openalex.org/fields/17',
  'https://openalex.org/fields/19',
  'https://openalex.org/fields/21',
  'https://openalex.org/fields/22',
  'https://openalex.org/fields/23',
  'https://openalex.org/fields/25',
  'https://openalex.org/fields/26',
  'https://openalex.org/fields/31'],
 'worksCount': 73237366,
 'citedByCount': 770120022}

In [5]:
# One entry per domain: its id and the list of field ids it contains. Either
# value is None when the key is absent from the record.
domain_fields_relation = [
    dict(domain_id_param=domain.get('id'), field_ids_param=domain.get('fields'))
    for domain in domains_data
]

In [7]:
# Create (Field)-[:PART_OF]->(Domain) relationships for every pair listed in
# `domain_fields_relation`.
try:
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.verify_connectivity()
        print("✅ Connection successful!")

        # Number of (domain, field) pairs in the input. The query below does
        # NOT fail when a node is missing: MATCH simply yields no row for that
        # pair, so the returned count is the only signal that something was
        # skipped.
        expected_links = sum(
            len(item['field_ids_param'] or []) for item in domain_fields_relation
        )

        with driver.session() as session:

            try:
                # NOTE(review): the Cypher comments mention a constraint on
                # `id`; no constraint is created in the visible cells —
                # confirm it exists.
                result = session.run("""
                    UNWIND $data_list AS item
                    
                    // 1. Find the existing Domain (fast due to constraint)
                    MATCH (d:Domain {id: item.domain_id_param})
                    
                    // 2. Unwind the list of field IDs for this domain
                    WITH d, item.field_ids_param AS field_ids
                    UNWIND field_ids AS field_id
                    
                    // 3. Find the existing Field (fast due to constraint)
                    MATCH (f:Field {id: field_id})
                    
                    // 4. Create the relationship (MERGE avoids duplicates)
                    MERGE (f)-[r:PART_OF]->(d)
                    
                    RETURN count(r) AS relationships_created
                    """, data_list=domain_fields_relation)

                summary = result.single()
                print(f"\nBatch import complete.")
                print(f"  - {summary['relationships_created']} relationships created/merged.")
                if summary['relationships_created'] != expected_links:
                    print(f"  - WARNING: expected {expected_links}; some :Domain or :Field nodes were not found.")

            except Exception as e:
                # Reached on driver/query errors (e.g. connection loss, Cypher
                # syntax), not on missing nodes.
                print(f"Failed to import batch: {e}")
                print("NOTE: This query assumes all :Domain and :Field nodes already exist.")

except Exception as e:
    print(f"❌ An error occurred: {e}")

✅ Connection successful!

Batch import complete.
  - 26 relationships created/merged.


In [10]:
query = """
MATCH (f:Field)-[:PART_OF]->(d:Domain {name: 'Physical Sciences'})
RETURN f.name AS field_name, f.description AS description
"""

# Reset the result names first: the except branch below only prints, so a
# failed query would otherwise leave `records` from an earlier cell in place
# and the next cell would display stale data as if it came from this query.
records, summary, keys = [], None, []

try:
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.verify_connectivity()

        records, summary, keys = driver.execute_query(query, database_="neo4j")

except Exception as e:
    print(f"❌ An error occurred: {e}")

In [11]:
# Convert every record returned by the previous query to a plain dict for display.
[rec.data() for rec in records]

[{'field_name': 'Physics and Astronomy',
  'description': 'study of matter and its motion, along with related concepts such as energy and force'},
 {'field_name': 'Mathematics',
  'description': 'area of knowledge that includes the topics of numbers, formulas and related structures, shapes and the spaces in which they are contained, and quantities and their changes'},
 {'field_name': 'Materials Science',
  'description': 'research, discovery and design of physical materials (especially solids)'},
 {'field_name': 'Environmental Science',
  'description': 'interdisciplinary field that studies human interaction with the environment'},
 {'field_name': 'Engineering',
  'description': 'practice of using natural science, mathematics, and the engineering design process to solve technical problems'},
 {'field_name': 'Energy',
  'description': 'study of energy in physical systems, its sources, technology, management, and impact on sustainability'},
 {'field_name': 'Earth and Planetary Sciences',

In [3]:
# Read the field records from disk (needed here to build the
# field -> subfield links); the last expression displays how many were loaded.
with open('clean_data/fields.json', encoding='utf-8') as f:
    fields_data = json.loads(f.read())
len(fields_data)

26

In [4]:
# Display one field record, including its list of subfield ids.
fields_data[16]

{'id': 'https://openalex.org/fields/19',
 'name': 'Earth and Planetary Sciences',
 'description': 'all fields of natural science related to the planet Earth and other planets',
 'nameAlternatives': ['geosciences', 'planetary sciences'],
 'domainId': 'https://openalex.org/domains/3',
 'subfields': ['https://openalex.org/subfields/1902',
  'https://openalex.org/subfields/1904',
  'https://openalex.org/subfields/1906',
  'https://openalex.org/subfields/1907',
  'https://openalex.org/subfields/1908',
  'https://openalex.org/subfields/1910',
  'https://openalex.org/subfields/1911',
  'https://openalex.org/subfields/1912'],
 'worksCount': 4334720,
 'citedByCount': 65830936}

In [5]:
# One entry per field: its id and the list of subfield ids it contains.
# Either value is None when the key is absent from the record.
fields_subfields_relation = [
    dict(field_id_param=field.get('id'), subfield_ids_param=field.get('subfields'))
    for field in fields_data
]
len(fields_subfields_relation)

26

In [8]:
# Create (SubField)-[:PART_OF]->(Field) relationships for every pair listed in
# `fields_subfields_relation`.
try:
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.verify_connectivity()
        print("✅ Connection successful!")

        # Number of (field, subfield) pairs in the input. The query below does
        # NOT fail when a node is missing: MATCH simply yields no row for that
        # pair, so the returned count is the only signal that something was
        # skipped.
        expected_links = sum(
            len(item['subfield_ids_param'] or []) for item in fields_subfields_relation
        )

        with driver.session() as session:

            try:
                result = session.run("""
                    UNWIND $data_list AS item
                    
                    MATCH (f:Field {id: item.field_id_param})
                    
                    WITH f, item.subfield_ids_param AS subfield_ids
                    UNWIND subfield_ids AS subfield_id
                    
                    MATCH (s:SubField {id: subfield_id})
                    
                    MERGE (s)-[r:PART_OF]->(f)
                    
                    RETURN count(r) AS relationships_created
                    """, data_list=fields_subfields_relation)

                summary = result.single()
                print(f"\nBatch import complete.")
                print(f"  - {summary['relationships_created']} relationships created/merged.")
                if summary['relationships_created'] != expected_links:
                    print(f"  - WARNING: expected {expected_links}; some :Field or :SubField nodes were not found.")

            except Exception as e:
                # Reached on driver/query errors (e.g. connection loss, Cypher
                # syntax), not on missing nodes.
                print(f"Failed to import batch: {e}")

except Exception as e:
    print(f"❌ An error occurred: {e}")

✅ Connection successful!

Batch import complete.
  - 248 relationships created/merged.


In [10]:
# NOTE(review): the `field_name` alias actually carries subfield names; it is
# left unchanged so the displayed keys stay the same.
query = """
MATCH (s:SubField)-[:PART_OF]->(f:Field {name: 'Engineering'})
RETURN s.name AS field_name, s.description AS description
"""

# Reset the result names first: the except branch below only prints, so a
# failed query would otherwise leave `records` from an earlier cell in place
# and the next cell would display stale data as if it came from this query.
records, summary, keys = [], None, []

try:
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.verify_connectivity()

        records, summary, keys = driver.execute_query(query, database_="neo4j")

except Exception as e:
    print(f"❌ An error occurred: {e}")

In [11]:
# Pretty-print every record returned by the previous query as JSON.
print(json.dumps([rec.data() for rec in records], indent=4))

[
    {
        "field_name": "Safety, Risk, Reliability and Quality",
        "description": "state of being secure from harm, injury, danger, or other non-desirable outcomes"
    },
    {
        "field_name": "Mechanics of Materials",
        "description": "behavior of solid objects subject to stresses and strains"
    },
    {
        "field_name": "Ocean Engineering",
        "description": "engineering of devices within the ocean environment"
    },
    {
        "field_name": "Building and Construction",
        "description": "former West German trade union (1949\u20131995)"
    },
    {
        "field_name": "Mechanical Engineering",
        "description": "engineering discipline"
    },
    {
        "field_name": "Computational Mechanics",
        "description": "discipline concerned with the use of computational methods to study mechanics"
    },
    {
        "field_name": "General Engineering",
        "description": "applied science"
    },
    {
        "field_name": "

In [18]:
# Read the subfield records from disk (needed here to build the
# subfield -> topic links); the last expression displays how many were loaded.
with open('clean_data/subfields.json', encoding='utf-8') as f:
    subfields_data = json.loads(f.read())
len(subfields_data)

248

In [21]:
# Display the topic ids listed for one subfield record.
subfields_data[2].get('topics')

['https://openalex.org/T13509',
 'https://openalex.org/T12378',
 'https://openalex.org/T13366',
 'https://openalex.org/T10081',
 'https://openalex.org/T14183',
 'https://openalex.org/T14419',
 'https://openalex.org/T13972',
 'https://openalex.org/T14239',
 'https://openalex.org/T10019',
 'https://openalex.org/T12509',
 'https://openalex.org/T11508',
 'https://openalex.org/T12123',
 'https://openalex.org/T11653',
 'https://openalex.org/T13744',
 'https://openalex.org/T10517',
 'https://openalex.org/T11365',
 'https://openalex.org/T14180',
 'https://openalex.org/T14138',
 'https://openalex.org/T11903',
 'https://openalex.org/T12276',
 'https://openalex.org/T14030',
 'https://openalex.org/T12544']

In [22]:
# One entry per subfield: its id and the list of topic ids it contains.
# Either value is None when the key is absent from the record.
subfields_topics_relation = [
    dict(subfields_id_param=subfield.get('id'), topics_ids_param=subfield.get('topics'))
    for subfield in subfields_data
]
len(subfields_topics_relation)

248

In [24]:
# Display one relation entry to check its shape before sending it to Neo4j.
subfields_topics_relation[2]

{'subfields_id_param': 'https://openalex.org/subfields/1402',
 'topics_ids_param': ['https://openalex.org/T13509',
  'https://openalex.org/T12378',
  'https://openalex.org/T13366',
  'https://openalex.org/T10081',
  'https://openalex.org/T14183',
  'https://openalex.org/T14419',
  'https://openalex.org/T13972',
  'https://openalex.org/T14239',
  'https://openalex.org/T10019',
  'https://openalex.org/T12509',
  'https://openalex.org/T11508',
  'https://openalex.org/T12123',
  'https://openalex.org/T11653',
  'https://openalex.org/T13744',
  'https://openalex.org/T10517',
  'https://openalex.org/T11365',
  'https://openalex.org/T14180',
  'https://openalex.org/T14138',
  'https://openalex.org/T11903',
  'https://openalex.org/T12276',
  'https://openalex.org/T14030',
  'https://openalex.org/T12544']}

In [25]:
# Create (Topic)-[:PART_OF]->(SubField) relationships for every pair listed in
# `subfields_topics_relation`.
try:
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.verify_connectivity()
        print("✅ Connection successful!")

        # Number of (subfield, topic) pairs in the input. The query below does
        # NOT fail when a node is missing: MATCH simply yields no row for that
        # pair, so the returned count is the only signal that something was
        # skipped.
        expected_links = sum(
            len(item['topics_ids_param'] or []) for item in subfields_topics_relation
        )

        with driver.session() as session:

            try:
                result = session.run("""
                    UNWIND $data_list AS item
                    
                    MATCH (s:SubField {id: item.subfields_id_param})
                    
                    WITH s, item.topics_ids_param AS topic_ids
                    UNWIND topic_ids AS topic_id
                    
                    MATCH (t:Topic {id: topic_id})
                    
                    MERGE (t)-[r:PART_OF]->(s)
                    
                    RETURN count(r) AS relationships_created
                    """, data_list=subfields_topics_relation)

                summary = result.single()
                print(f"\nBatch import complete.")
                print(f"  - {summary['relationships_created']} relationships created/merged.")
                if summary['relationships_created'] != expected_links:
                    print(f"  - WARNING: expected {expected_links}; some :SubField or :Topic nodes were not found.")

            except Exception as e:
                # Reached on driver/query errors (e.g. connection loss, Cypher
                # syntax), not on missing nodes.
                print(f"Failed to import batch: {e}")

except Exception as e:
    print(f"❌ An error occurred: {e}")

✅ Connection successful!

Batch import complete.
  - 3655 relationships created/merged.


In [26]:
query = """
MATCH (t:Topic)-[:PART_OF]->(s:SubField {name: 'Control and Systems Engineering'})
RETURN t.name AS name, t.description AS description
"""

# Reset the result names first: the except branch below only prints, so a
# failed query would otherwise leave `records` from an earlier cell in place
# and the next cell would display stale data as if it came from this query.
records, summary, keys = [], None, []

try:
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.verify_connectivity()

        records, summary, keys = driver.execute_query(query, database_="neo4j")

except Exception as e:
    print(f"❌ An error occurred: {e}")

In [27]:
# Pretty-print every record returned by the previous query as JSON.
print(json.dumps([rec.data() for rec in records], indent=4))

[
    {
        "name": "Magnetic Bearings and Levitation Dynamics",
        "description": "This cluster of papers focuses on the dynamics, control, and applications of magnetic levitation systems, including maglev trains, bearingless motors, active magnetic bearings, spacecraft attitude control using control moment gyros, vibration analysis of rotor systems, and fault diagnosis in rotating machinery."
    },
    {
        "name": "Extremum Seeking Control Systems",
        "description": "This cluster of papers focuses on extremum seeking control, a method for optimizing the performance of dynamic systems by iteratively adjusting control inputs to seek the extremum of a cost function. The papers cover various aspects such as stability analysis, adaptive control, stochastic optimization, and applications in fields like wind energy, bioreactors, and autonomous vehicles."
    },
    {
        "name": "Human Motion and Animation",
        "description": "This cluster of papers explores v