In [172]:
import pandas as pd
from neo4j import GraphDatabase

In [173]:
# Alternative input (JSON lines), kept for reference:
# df = pd.read_json('first_1000_rows.json', lines=True)

# Read the arXiv identifier as text. Left to type inference it is parsed as a
# float, which drops leading/trailing zeros (e.g. "0711.1971" -> 711.1971) and
# can make distinct papers collide on the same Publication key.
df = pd.read_csv('updated_data_new.csv', dtype={'id': str})

print(df.shape)

(500, 33)


In [174]:
# Show the column labels of the loaded DataFrame.
df.columns

Index(['Unnamed: 0', 'id', 'submitter', 'authors', 'title', 'comments',
       'journal-ref', 'doi', 'report-no', 'categories', 'license', 'versions',
       'update_date', 'authors_parsed', 'publisher', 'published-print',
       'published-online', 'reference-count', 'type', 'bibtex', 'subject',
       'publication_type', 'field_of_study', 'refined_authors',
       'refined_publication_title', 'resolved_venue', 'refined_venue',
       'authors_with_gender', 'publication_year', 'journal_name', 'volume',
       'issue', 'pages'],
      dtype='object')

In [175]:
# Inspect the raw `versions` value of the row labelled 0 (it is a string, not a list).
df.loc[0, 'versions']

"[{'version': 'v1', 'created': 'Tue, 13 Nov 2007 13:02:17 GMT'}]"

In this view, you need to establish what constitutes the nodes and the relationships (edges) of the graph. For example, authors, papers, and journals or other publication venues are good candidates for nodes. Edges can represent authorship between an author and a paper, co-authorship among authors, a works-for relationship between an author and an affiliation, a cites relationship among papers, etc. You need to come up with your own proposal. For the graph analytics tasks, you can find influential papers, for example by running PageRank over the citation relationship. You can also detect communities by finding strongly connected components in relationships such as co-authorship or membership of the same scientific domain. There are readily available libraries for graph analytics in Neo4j.

In [176]:
from neo4j import GraphDatabase
import re
import ast

class Neo4jDatabase:
    """Load publication metadata from a pandas DataFrame into a Neo4j graph.

    Graph model written by ``create_nodes_and_relationships``:

    * ``(:Publication {publication_id, title, doi})``
    * ``(:Journal {journal_id, journal_ref})``    <-[:PUBLISHED_IN]-  Publication
    * ``(:Author {author_id, name})``             -[:AUTHORED_BY]->   Publication
    * ``(:Author)-[:CO_AUTHORED_BY]->(:Author)``  (both directions per pair)
    * ``(:Category {category_id, name})``         <-[:HAS_CATEGORY]-  Publication
    * ``(:Version {version_id, version, year, created_date})`` <-[:HAS_VERSION]-
    * ``(:Date {date_id, update_date})``          <-[:UPDATED_ON]-    Publication
    * ``(:License {license_id, license})``        <-[:LICENSED_UNDER]- Publication

    NOTE(review): ``AUTHORED_BY`` points from Author to Publication, which reads
    backwards; it is kept as-is so existing queries keep working.
    NOTE(review): Journal, Date and License nodes are keyed per row/publication,
    so they are never shared between papers -- confirm this is intended.
    """

    def __init__(self, uri="bolt://localhost:7687", auth=None, clear=True):
        """Open a driver and, by default, wipe the target database.

        Args:
            uri: Bolt URI of the Neo4j server.
            auth: Optional credentials passed through to
                ``GraphDatabase.driver`` (e.g. ``("neo4j", password)``);
                ``None`` connects without authentication.
            clear: When True (the default), delete every node and relationship
                in the database right after connecting.
        """
        self._uri = uri
        self._driver = GraphDatabase.driver(self._uri, auth=auth)
        if clear:
            self.clear_database()

    def clear_database(self):
        """Delete all nodes (and their relationships) from the database."""
        with self._driver.session() as session:
            session.run("MATCH (n) DETACH DELETE n")

    def close(self):
        """Close the underlying driver if one is open."""
        if self._driver is not None:
            self._driver.close()

    @staticmethod
    def _is_missing(value):
        """Return True for None, NaN/NA and empty or whitespace-only strings."""
        if value is None:
            return True
        if isinstance(value, str):
            return not value.strip()
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            # Array-likes make pd.isna return an array; treat them as present.
            return False

    @classmethod
    def _value_or_none(cls, value):
        """Map missing values to None so Neo4j stores no property for them."""
        return None if cls._is_missing(value) else value

    @staticmethod
    def _split_tokens(pattern, text):
        """Split ``text`` on ``pattern``; strip, drop empties, de-duplicate in order."""
        stripped = (token.strip() for token in re.split(pattern, text))
        return list(dict.fromkeys(token for token in stripped if token))

    def create_nodes_and_relationships(self, df):
        """Write every row of ``df`` into the graph.

        Rows are loaded independently: a failure on one row is reported on
        stdout and the remaining rows are still processed.

        Args:
            df: DataFrame with the columns ``id``, ``title``, ``doi``,
                ``journal-ref``, ``authors``, ``categories``, ``versions``,
                ``publication_year``, ``update_date`` and ``license``.
        """
        with self._driver.session() as session:
            for index, row in df.iterrows():
                try:
                    self._load_row(session, index, row)
                except Exception as e:
                    # Best-effort load: report the row and keep going.
                    print(f"An error occurred while loading row {index}: {e}")

    def _load_row(self, session, index, row):
        """Create the publication for one row plus all nodes hanging off it."""
        publication_id = row['id']
        session.run(
            "MERGE (p:Publication {publication_id: $publication_id}) "
            "SET p.title = $title, p.doi = $doi",
            {
                "publication_id": publication_id,
                "title": self._value_or_none(row['title']),
                "doi": self._value_or_none(row['doi']),
            },
        )
        self._load_journal(session, publication_id, index, row['journal-ref'])
        self._load_authors(session, publication_id, row['authors'])
        self._load_categories(session, publication_id, row['categories'])
        self._load_versions(session, publication_id, row['versions'], row['publication_year'])
        self._load_date(session, publication_id, row['update_date'])
        self._load_license(session, publication_id, row['license'])

    def _load_journal(self, session, publication_id, index, journal_ref):
        """Create the Journal node (keyed by row index) and PUBLISHED_IN edge."""
        if self._is_missing(journal_ref):
            return
        session.run(
            "MERGE (j:Journal {journal_id: $journal_id}) "
            "SET j.journal_ref = $journal_ref",
            {"journal_id": index, "journal_ref": journal_ref},
        )
        session.run(
            "MATCH (p:Publication {publication_id: $publication_id}), (j:Journal {journal_id: $journal_id}) "
            "MERGE (p)-[:PUBLISHED_IN]->(j)",
            {"publication_id": publication_id, "journal_id": index},
        )

    def _load_authors(self, session, publication_id, raw_authors):
        """Create Author nodes, AUTHORED_BY edges and pairwise CO_AUTHORED_BY edges."""
        if self._is_missing(raw_authors):
            return
        authors = self._split_tokens(r',| and ', raw_authors)

        # First pass: make sure every author node exists.
        for author in authors:
            session.run(
                "MERGE (a:Author {author_id: $author_id}) "
                "SET a.name = $author_name",
                {"author_id": author, "author_name": author},
            )
            session.run(
                "MATCH (p:Publication {publication_id: $publication_id}), (a:Author {author_id: $author_id}) "
                "MERGE (a)-[:AUTHORED_BY]->(p)",
                {"publication_id": publication_id, "author_id": author},
            )

        # Second pass: link co-authors. This must run after all nodes exist,
        # because the MATCH below silently yields nothing for an author that
        # has not been merged yet, which would drop the edge.
        for author in authors:
            for co_author in authors:
                if co_author != author:
                    session.run(
                        "MATCH (a1:Author {author_id: $author_id1}), (a2:Author {author_id: $author_id2}) "
                        "MERGE (a1)-[:CO_AUTHORED_BY]->(a2)",
                        {"author_id1": author, "author_id2": co_author},
                    )

    def _load_categories(self, session, publication_id, raw_categories):
        """Create Category nodes and HAS_CATEGORY edges.

        The category string is split on whitespace and on '.', so "nlin.AO"
        yields the two categories "nlin" and "AO".
        """
        if self._is_missing(raw_categories):
            return
        for category in self._split_tokens(r'\s|\.', raw_categories):
            session.run(
                "MERGE (c:Category {category_id: $category_id}) "
                "SET c.name = $category_name",
                {"category_id": category, "category_name": category},
            )
            session.run(
                "MATCH (p:Publication {publication_id: $publication_id}), (c:Category {category_id: $category_id}) "
                "MERGE (p)-[:HAS_CATEGORY]->(c)",
                {"publication_id": publication_id, "category_id": category},
            )

    def _load_versions(self, session, publication_id, raw_versions, publication_year):
        """Create one Version node per (publication, version) and HAS_VERSION edges."""
        if self._is_missing(raw_versions):
            return
        # The CSV stores the list as its Python repr; literal_eval only parses
        # literals, it does not execute code. Already-parsed lists pass through.
        if isinstance(raw_versions, str):
            versions_list = ast.literal_eval(raw_versions)
        else:
            versions_list = raw_versions
        year = self._value_or_none(publication_year)
        for version_dict in versions_list:
            version = version_dict.get('version')
            if self._is_missing(version):
                continue
            # Key the node by publication AND version label: keying by the bare
            # label ("v1") would make every paper share and overwrite one node.
            version_id = f"{publication_id}_{version}"
            session.run(
                "MERGE (v:Version {version_id: $version_id}) "
                "SET v.version = $version, v.year = $publication_year, v.created_date = $created_date",
                {
                    "version_id": version_id,
                    "version": version,
                    "publication_year": year,
                    "created_date": version_dict.get('created'),
                },
            )
            session.run(
                "MATCH (p:Publication {publication_id: $publication_id}), (v:Version {version_id: $version_id}) "
                "MERGE (p)-[:HAS_VERSION]->(v)",
                {"publication_id": publication_id, "version_id": version_id},
            )

    def _load_date(self, session, publication_id, update_date):
        """Create the Date node (keyed by publication id) and UPDATED_ON edge."""
        if self._is_missing(update_date):
            return
        session.run(
            "MERGE (d:Date {date_id: $date_id}) "
            "SET d.update_date = $update_date",
            {"date_id": publication_id, "update_date": update_date},
        )
        session.run(
            "MATCH (p:Publication {publication_id: $publication_id}), (d:Date {date_id: $date_id}) "
            "MERGE (p)-[:UPDATED_ON]->(d)",
            {"publication_id": publication_id, "date_id": publication_id},
        )

    def _load_license(self, session, publication_id, license_value):
        """Create the License node (keyed by publication id) and LICENSED_UNDER edge."""
        if self._is_missing(license_value):
            return
        session.run(
            "MERGE (l:License {license_id: $license_id}) "
            "SET l.license = $license",
            {"license_id": publication_id, "license": license_value},
        )
        session.run(
            "MATCH (p:Publication {publication_id: $publication_id}), (l:License {license_id: $license_id}) "
            "MERGE (p)-[:LICENSED_UNDER]->(l)",
            {"publication_id": publication_id, "license_id": publication_id},
        )

In [177]:
# Show the DataFrame's column labels again (same expression as an earlier cell).
df.columns

Index(['Unnamed: 0', 'id', 'submitter', 'authors', 'title', 'comments',
       'journal-ref', 'doi', 'report-no', 'categories', 'license', 'versions',
       'update_date', 'authors_parsed', 'publisher', 'published-print',
       'published-online', 'reference-count', 'type', 'bibtex', 'subject',
       'publication_type', 'field_of_study', 'refined_authors',
       'refined_publication_title', 'resolved_venue', 'refined_venue',
       'authors_with_gender', 'publication_year', 'journal_name', 'volume',
       'issue', 'pages'],
      dtype='object')

In [178]:
# WARNING: constructing Neo4jDatabase connects to bolt://localhost:7687 and
# immediately deletes every node in that database (the constructor calls
# clear_database()).
neo4j_db = Neo4jDatabase()

In [179]:
# Preview the first rows of the DataFrame before loading it into Neo4j.
df.head()

Unnamed: 0.1,Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,...,refined_authors,refined_publication_title,resolved_venue,refined_venue,authors_with_gender,publication_year,journal_name,volume,issue,pages
0,21281,711.1971,Justin David R,Justin R. David,On the dyon partition function in N=2 theories,48 pages,"JHEP0802:025,2008",10.1088/1126-6708/2008/02/025,,hep-th,...,Justin R David,On the dyon partition function in 𝒩 = 2 theories,Unknown,Unknown,Justin R David (male),2008.0,Journal of High Energy Physics,2008,02,025-025
1,102616,1001.3382,Eiji Konishi,Eiji Konishi,Modeling M-Theory Vacua via Gauged S-Duality,"31 pages, version reflecting the erratum. arXi...","Int.J.Mod.Phys.A26:4785-4816,2011;\r\n Int.J....",10.1142/S0217751X11054693,,hep-th,...,EIJI KONISHI,MODELING M-THEORY VACUA VIA GAUGED S-DUALITY,Unknown,Unknown,EIJI KONISHI (male),2011.0,International Journal of Modern Physics A,26,27n28,4785-4816
2,121054,1007.0229,Marcelo Kuperman,M. N. Kuperman,A model for the emergence of geopolitical divi...,,"Phil. Trans. R. Soc. A 2010 368, 5695-5706",10.1098/rsta.2010.0263,,nlin.AO,...,M. N. Kuperman,A model for the emergence of geopolitical divi...,Unknown,Unknown,M. N. Kuperman (male),2010.0,Philosophical Transactions of the Royal Societ...,368,1933,5695-5706
3,23046,711.4945,Hrachya Nersisyan,"H.B. Nersisyan, D.A. Osipyan and G. Zwicknagel",Renormalized cluster expansion of the microfie...,"17 pages, 10 figures, submitted to Physical Re...",,10.1103/PhysRevE.77.056409,,physics.plasm-ph,...,"H. B. Nersisyan, D. A. Osipyan, G. Zwicknagel",Renormalized cluster expansion of the microfie...,Unknown,Unknown,"H. B. Nersisyan (male), D. A. Osipyan (male), ...",2008.0,Physical Review E,77,5,
4,69809,903.2896,Myung Joon Han,Myung Joon Han and Sergey Y. Savrasov,"Doping Driven ($\pi, 0$) Nesting and Magnetic ...",Table updated and discussion included,"Phys. Rev. Lett. 103, 067001 (2009)",10.1103/PhysRevLett.103.067001,,cond-mat.supr-con cond-mat.mtrl-sci,...,"Myung Joon Han, Sergey Y. Savrasov","Doping Driven (<mml:math xmlns:mml=""http://www...",Unknown,Unknown,"Myung Joon Han (female), Sergey Y. Savrasov (m...",2009.0,Physical Review Letters,103,6,


In [180]:
# Write every row of df into Neo4j as nodes and relationships.
# Errors are printed by the method rather than raised.
neo4j_db.create_nodes_and_relationships(df)

In [181]:
#neo4j_db.clear_database()

In [182]:
#neo4j_db.close()