In [None]:
pip install neo4j pandas

In [1]:
import os
import re
import time

import pandas as pd
from neo4j import GraphDatabase

# Neo4j connection details.
# Each setting can be overridden with an environment variable of the same name so that real
# credentials never need to be saved in the notebook; the fallbacks are the local defaults.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
# Password of the local DBMS you created on Neo4j Desktop where you want to upload the data.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "password")

In [4]:
%%time

# Load the datasets (paths are relative to the notebook's working directory):
#   - citations_df: one row per citation; the upload reads its 'citing' and 'cited' columns.
#   - metadata_df:  one row per publication; the upload reads 'omid', 'title', 'pub_year',
#                   'pub_month', 'pub_day', 'venue', 'publisher' and 'author'.
citations_df = pd.read_csv('sampled_citations.csv')
metadata_df = pd.read_csv('sampled_citations_metadata_clean.csv')

# Initialize the Neo4j driver from the connection constants defined in the configuration cell.
# The driver is shared by the upload code below and is closed at the end of this cell.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# Function to clean author names
def clean_authors(authors_str):
    """Split a ';'-separated author string into a list of trimmed author names.

    Args:
        authors_str: Raw author value. Anything that is not a ``str`` (for example
            the float NaN pandas uses for a missing cell) is treated as "no authors".

    Returns:
        list[str]: Author names in their original order with surrounding whitespace
        removed. Empty entries (from a trailing ';', from ';;' or from a blank
        string) are dropped, so callers never receive an empty author name.
    """
    if not isinstance(authors_str, str):
        return []
    stripped_names = (name.strip() for name in authors_str.split(';'))
    # Skip blanks: an empty name would otherwise be merged into the graph as a bogus author.
    return [name for name in stripped_names if name]

# Replace each raw author string in the metadata with its list of cleaned author names
metadata_df['author'] = metadata_df['author'].map(clean_authors)

# Create the graph structure in Neo4j
def create_publication(tx, pub_omid, title, pub_year, pub_month, pub_day, venue, publisher):
    """Upsert a Publication node identified by its omid and (re)set its properties.

    Args:
        tx: Object exposing ``run(query, **parameters)`` — presumably the Neo4j
            transaction handed over by ``session.execute_write``.
        pub_omid: Identifier the node is merged on.
        title, pub_year, pub_month, pub_day, venue, publisher: Values written to the
            node's ``title``, ``year``, ``month``, ``day``, ``venue`` and ``publisher``
            properties.
    """
    upsert_query = """
    MERGE (p:Publication {omid: $pub_omid})
    SET p.title = $title, p.year = $pub_year, p.month = $pub_month, p.day = $pub_day, p.venue = $venue, p.publisher = $publisher
    """
    query_parameters = {
        "pub_omid": pub_omid,
        "title": title,
        "pub_year": pub_year,
        "pub_month": pub_month,
        "pub_day": pub_day,
        "venue": venue,
        "publisher": publisher,
    }
    tx.run(upsert_query, **query_parameters)

def create_author(tx, author_name):
    """Ensure an Author node with the given name exists.

    Args:
        tx: Object exposing ``run(query, **parameters)`` — presumably the Neo4j
            transaction handed over by ``session.execute_write``.
        author_name: Value of the ``name`` property the node is merged on.
    """
    merge_author_query = """
    MERGE (a:Author {name: $author_name})
    """
    # MERGE matches an existing node first, so repeated names do not create duplicates.
    tx.run(merge_author_query, author_name=author_name)

def create_authored_relationship(tx, author_name, pub_omid):
    """Link an existing Author to an existing Publication with an AUTHORED relationship.

    Both nodes are looked up with MATCH, so nothing is created when either one is
    missing; MERGE keeps the relationship unique on repeated calls.

    Args:
        tx: Object exposing ``run(query, **parameters)`` — presumably the Neo4j
            transaction handed over by ``session.execute_write``.
        author_name: ``name`` of the Author node.
        pub_omid: ``omid`` of the Publication node.
    """
    link_query = """
    MATCH (a:Author {name: $author_name})
    MATCH (p:Publication {omid: $pub_omid})
    MERGE (a)-[:AUTHORED]->(p)
    """
    link_parameters = {"author_name": author_name, "pub_omid": pub_omid}
    tx.run(link_query, **link_parameters)

def create_citation_relationship(tx, citing_pub_omid, cited_pub_omid):
    """Link two existing Publication nodes with a CITED relationship (citing -> cited).

    Both publications are looked up by omid with MATCH, so nothing is created when
    either one is missing; MERGE keeps the relationship unique on repeated calls.

    Args:
        tx: Object exposing ``run(query, **parameters)`` — presumably the Neo4j
            transaction handed over by ``session.execute_write``.
        citing_pub_omid: ``omid`` of the publication that cites.
        cited_pub_omid: ``omid`` of the publication being cited.
    """
    citation_query = """
    MATCH (citing:Publication {omid: $citing_pub_omid})
    MATCH (cited:Publication {omid: $cited_pub_omid})
    MERGE (citing)-[:CITED]->(cited)
    """
    citation_parameters = {
        "citing_pub_omid": citing_pub_omid,
        "cited_pub_omid": cited_pub_omid,
    }
    tx.run(citation_query, **citation_parameters)

def _print_progress(label, done, total, phase_start, previous_width=0):
    """Print an in-place progress line with an ETA for the current phase.

    Args:
        label: Plural noun shown in the message ("publications" or "citations").
        done: Number of items processed so far in this phase (must be >= 1).
        total: Number of items in this phase.
        phase_start: ``time.time()`` value taken when this phase started.
        previous_width: Width of the widest line printed so far in this phase.

    Returns:
        int: Width of the line just printed, to be passed back in on the next call.
    """
    elapsed = time.time() - phase_start
    remaining = (total - done) * (elapsed / done)
    message = f"Processed {done}/{total} {label}. Estimated time remaining: {format_time(remaining)}"
    # "\r" only moves the cursor, so pad to the widest line seen so far; otherwise a
    # shorter message leaves the tail of the previous one on screen (e.g. "secondss").
    width = max(previous_width, len(message))
    print(message.ljust(width), end="\r")
    return width


def _write_publication(tx, pub_omid, title, pub_year, pub_month, pub_day, venue, publisher, authors):
    """Write one publication, its authors and their AUTHORED links in a single transaction."""
    create_publication(tx, pub_omid, title, pub_year, pub_month, pub_day, venue, publisher)
    for author in authors:
        create_author(tx, author)
        create_authored_relationship(tx, author, pub_omid)


def upload_data():
    """Upload every publication (with its authors) and then every citation to Neo4j.

    Reads the module-level ``metadata_df`` and ``citations_df`` frames and writes
    through the module-level ``driver``. Publications are uploaded first because the
    citation query only links publications that already exist. Progress and an
    estimated remaining time are printed in place for each phase.

    Each publication is written together with its authors in one transaction, so a
    failure cannot leave a publication without its authors and the number of
    round trips to the database no longer grows with the number of authors.

    NOTE(review): missing CSV cells are passed through unchanged (pandas represents
    them as float NaN) — confirm whether they should be converted to None first.
    """
    total_publications = len(metadata_df)
    total_citations = len(citations_df)

    with driver.session() as session:
        # Upload publication data. The counter comes from enumerate(), not from the
        # DataFrame index, so it stays correct for any index (filtered, shuffled, ...).
        phase_start = time.time()
        line_width = 0
        for processed_publications, (_, row) in enumerate(metadata_df.iterrows(), start=1):
            authors = row['author']
            if not isinstance(authors, list):
                authors = []

            session.execute_write(
                _write_publication,
                row['omid'], row['title'], row['pub_year'], row['pub_month'], row['pub_day'],
                row['venue'], row['publisher'], authors,
            )

            line_width = _print_progress(
                "publications", processed_publications, total_publications, phase_start, line_width
            )

        print()  # for a new line after publications processing

        # Upload citation data. The timer restarts here so the citation ETA is not
        # inflated by the time already spent on publications.
        phase_start = time.time()
        line_width = 0
        for processed_citations, (_, row) in enumerate(citations_df.iterrows(), start=1):
            # 'citing' and 'cited' should both hold omid values
            session.execute_write(create_citation_relationship, row['citing'], row['cited'])

            line_width = _print_progress(
                "citations", processed_citations, total_citations, phase_start, line_width
            )

        print()  # for a new line after citations processing
        print("Data upload complete!")

# Function to format the time in a human-readable format
def format_time(seconds):
    """Render a duration in seconds using the largest unit it reaches.

    Args:
        seconds: Duration in seconds.

    Returns:
        str: The duration with two decimals followed by its unit — "seconds"
        below 60, "minutes" below 3600, "hours" below 86400, otherwise "days".
    """
    if seconds < 60:
        return f"{seconds:.2f} seconds"

    # (exclusive upper bound in seconds, seconds per unit, unit label)
    bounded_units = (
        (3600, 60, "minutes"),
        (86400, 3600, "hours"),
    )
    for upper_bound, unit_seconds, unit_label in bounded_units:
        if seconds < upper_bound:
            return f"{seconds / unit_seconds:.2f} {unit_label}"

    return f"{seconds / 86400:.2f} days"

# Run the upload; the finally clause releases the driver's connections even when the
# upload raises part-way through (otherwise they stay open for the life of the kernel).
try:
    upload_data()
finally:
    # Close the Neo4j driver
    driver.close()


Processed 6034/6034 publications. Estimated time remaining: 0.00 secondss
Processed 5719/5719 citations. Estimated time remaining: 0.00 secondss
Data upload complete!
CPU times: user 33.4 s, sys: 6.46 s, total: 39.9 s
Wall time: 11min 15s
