In [35]:
import pandas as pd
from neo4j import GraphDatabase

In [36]:
df = pd.read_json('first_1000_rows.json', lines=True)

print(df.shape)

(1000, 14)


In [37]:
df.columns

Index(['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi',
       'report-no', 'categories', 'license', 'abstract', 'versions',
       'update_date', 'authors_parsed'],
      dtype='object')

In [38]:
df.head()

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,"[[Bal치zs, C., ], [Berger, E. L., ], [Nadolsky,..."
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,"[[Streinu, Ileana, ], [Theran, Louis, ]]"
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,"[[Pan, Hongjun, ]]"
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,"[[Callan, David, ]]"
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,"[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]"


In this view, you need to establish what constitutes nodes and relationships (edges). For example, authors, papers, and journals, or other publication venues are good candidates to act as nodes in the graph. Edges can represent authorship between an author and a paper, co authorship among authors, works-for between author and affiliation, cites relationship among papers, etc. You need to come up with your proposal. For graph analytics tasks, you can find influential papers. You can use Page rank for this using the citation relation. Detecting communities by finding strongly connected components in relationships like co-authorship or being a member of the same scientific domain. There are readily available libraries for graph analytics in Neo4J.

In [41]:
from neo4j import GraphDatabase

class Neo4jDatabase:
    def __init__(self):
        self._uri = "bolt://localhost:7687"
        self._driver = GraphDatabase.driver(self._uri)
        self.clear_database()

    
    def clear_database(self):
        with self._driver.session() as session:
            session.run("MATCH (n) DETACH DELETE n")

    def close(self):
        if self._driver is not None:
            self._driver.close()
    
    def create_nodes_and_relationships(self, df):
        with self._driver.session() as session:
            try:
                for index, row in df.iterrows():
                    # Publication node
                    session.run(
                        "MERGE (p:Publication {publication_id: $publication_id}) "
                        "SET p.title = $title, p.abstract = $abstract, p.doi = $doi",
                        {
                            "publication_id": row['id'],
                            "title": row['title'] if not pd.isna(row['title']) else None,
                            "abstract": row['abstract'] if not pd.isna(row['abstract']) else None,
                            "doi": row['doi'] if not pd.isna(row['doi']) else None
                        }
                    )

                    # Journal node
                    if row['journal-ref']:
                        session.run(
                            "MERGE (j:Journal {journal_id: $journal_id}) "
                            "SET j.journal_ref = $journal_ref",
                            {"journal_id": index, "journal_ref": row['journal-ref']}
                        )
                        # PUBLISHED_IN relationship
                        session.run(
                            "MATCH (p:Publication {publication_id: $publication_id}), (j:Journal {journal_id: $journal_id}) "
                            "MERGE (p)-[:PUBLISHED_IN]->(j)",
                            {"publication_id": row['id'], "journal_id": index}
                        )

            except Exception as e:
                print(f"An error occurred: {e}")

In [42]:
df.columns

Index(['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi',
       'report-no', 'categories', 'license', 'abstract', 'versions',
       'update_date', 'authors_parsed'],
      dtype='object')

In [43]:
neo4j_db = Neo4jDatabase()

In [44]:
df.head()

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,"[[Bal치zs, C., ], [Berger, E. L., ], [Nadolsky,..."
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,"[[Streinu, Ileana, ], [Theran, Louis, ]]"
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,"[[Pan, Hongjun, ]]"
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,"[[Callan, David, ]]"
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,"[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]"


In [45]:
neo4j_db.create_nodes_and_relationships(df.head())

In [46]:
neo4j_db.close()