In [6]:
import csv
import os

import ijson
import pandas as pd
import tqdm
from neo4j import GraphDatabase

In [None]:
from neo4j import GraphDatabase

class HelloWorldExample:
    """Small wrapper that owns a Neo4j driver and runs one write transaction.

    NOTE(review): despite the "hello world" naming, the Cypher statement issued
    by this class deletes nodes; it does not create or return a greeting.
    """

    def __init__(self, uri, user, password):
        """Create the driver for ``uri`` using basic ``(user, password)`` auth."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def print_greeting(self, message):
        """Run the write transaction and print whatever it returns.

        ``message`` is forwarded to the query as the ``message`` parameter.
        """
        with self.driver.session() as session:
            greeting = session.execute_write(self._create_and_return_greeting, message)
            print(greeting)

    @staticmethod
    def _create_and_return_greeting(tx, message):
        """Execute the Cypher statement inside ``tx``.

        Returns the first field of the single result record, or ``None`` when
        the statement produced no record.
        """
        # NOTE(review): a plain DELETE is rejected by Neo4j for nodes that still
        # have relationships; DETACH DELETE may be what is wanted - confirm.
        result = tx.run("match (a) delete a", message=message)
        # The statement has no RETURN clause, so there is no record to index
        # into; guard against None instead of raising TypeError.
        record = result.single()
        if record is None:
            return None
        return record[0]

In [None]:
# NOTE(review): the connection details, including the password, are hard-coded;
# consider reading them from the environment before sharing this notebook.
greeter = HelloWorldExample("bolt://localhost:7687", "neo4j", "Temp/123")
try:
    greeter.print_greeting("hello, world")
finally:
    # always release the driver, even when the transaction raises
    greeter.close()

In [10]:
# Row buffers: one list of plain tuples per table that is exported later.
authors = []
papers = []
conferences = []
journals = []
keywords = []
cites = []

# Upper bound on sampled conference papers and, separately, on journal papers.
MAX_PER_VENUE_TYPE = 10000

c_count = 0
j_count = 0


def _rebuild_abstract(inverted_index):
    """Turn a ``{word: [positions]}`` inverted index back into running text.

    Joining only the dictionary keys would emit every distinct word once, so
    the words are laid out by their stored positions to restore repeated words
    and the original word order.
    """
    placed = sorted(
        (position, word)
        for word, positions in inverted_index.items()
        for position in positions
    )
    return ' '.join(word for _, word in placed)


def _parse_edition(raw_volume):
    """Return ``raw_volume`` as an int, or None when absent, blank or non-numeric."""
    if raw_volume is None:
        return None
    try:
        return int(str(raw_volume).strip())
    except ValueError:
        return None


with open("dblp.v12.json", "rb") as f:
    # stream the top-level array item by item so the file is never fully loaded
    for element in tqdm.tqdm(ijson.items(f, "item")):

        p_id = element["id"]
        p_title = element["title"]
        p_year = element.get("year")

        #skip paper if year of publication not available
        if not p_year:
            continue

        #skip paper if n_citation or page information not available
        if ("n_citation" not in element) or ("page_start" not in element) or ("page_end" not in element):
            continue

        p_pages = element["page_start"] + "-" + element["page_end"]

        #skip paper if abstract not available
        if "indexed_abstract" not in element:
            continue

        p_abstract = _rebuild_abstract(element["indexed_abstract"]["InvertedIndex"])

        #skip paper if organization of any author not available
        if not all("org" in n for n in element['authors']):
            continue

        #skip paper if field of study is not available
        if "fos" not in element:
            continue

        #skip paper if reference is not available
        if "references" not in element:
            continue

        #skip paper if venue information is missing (no venue at all, or no type)
        venue = element.get("venue") or {}
        if "type" not in venue:
            continue
        venue_type = venue["type"]

        # "C" is handled as a conference and "J" as a journal.
        # NOTE(review): papers with any other venue type are still kept below,
        # without a conference/journal row - confirm this is intended.
        if venue_type in ("C", "J"):
            is_conference = venue_type == "C"
            sampled = c_count if is_conference else j_count
            edition = _parse_edition(element.get("volume"))
            venue_title = venue.get("raw")
            venue_id = venue.get("id")

            #skip paper if the cap for this venue type is reached or if any of
            #the conference/journal information is missing or unusable
            if (sampled >= MAX_PER_VENUE_TYPE or edition is None
                    or venue_title is None or venue_id is None):
                continue

            venue_row = (p_id, venue_title, edition, p_year, venue_id)
            if is_conference:
                conferences.append(venue_row)
                c_count += 1
            else:
                journals.append(venue_row)
                j_count += 1

        papers.append((p_id, p_title, p_year, element["n_citation"], p_pages, p_abstract))
        authors.extend([(p_id, n['name'], n['org'], n['id']) for n in element['authors']])
        keywords.extend([(p_id, n['name']) for n in element['fos']])
        # each tuple is (referenced paper, paper that references it)
        cites.extend([(n, p_id) for n in element['references']])

        # stop reading once both venue types have reached their cap
        if c_count >= MAX_PER_VENUE_TYPE and j_count >= MAX_PER_VENUE_TYPE:
            break
            

368322it [06:16, 978.50it/s] 


In [60]:
# Materialise each row buffer as a DataFrame; the column order mirrors the
# order of the fields inside the collected tuples.
paper_df = pd.DataFrame(
    data=papers,
    columns=['paper_id', 'title', 'year', 'n_citations', 'pages', 'abstract'],
)
author_df = pd.DataFrame(
    data=authors,
    columns=['paper_id', 'name', 'organization', 'author_id'],
)
conf_df = pd.DataFrame(
    data=conferences,
    columns=['paper_id', 'name', 'edition', 'year', 'conference_id'],
)
jour_df = pd.DataFrame(
    data=journals,
    columns=['paper_id', 'name', 'volume', 'year', 'journal_id'],
)
key_df = pd.DataFrame(
    data=keywords,
    columns=['paper_id', 'keyword'],
)
# first field is the referenced paper, second the paper that references it
cite_df = pd.DataFrame(
    data=cites,
    columns=['paper_id', 'cited_by'],
)

In [59]:
# Folder that receives every CSV of the sample export.
OUT_DIR = "data/10k_samples"
# to_csv does not create missing folders, so make sure the target exists first
os.makedirs(OUT_DIR, exist_ok=True)

# node table for papers
paper_df.to_csv(f"{OUT_DIR}/papers.csv", index=False)

# relationship tables: one row per (paper, venue) / (paper, author) pair
conf_df.drop_duplicates(subset=['paper_id', 'conference_id']).to_csv(f"{OUT_DIR}/paper_published_in_conference.csv", index=False)
jour_df.drop_duplicates(subset=['paper_id', 'journal_id']).to_csv(f"{OUT_DIR}/paper_published_in_journal.csv", index=False)
author_df.drop_duplicates(subset=['paper_id', 'author_id']).to_csv(f"{OUT_DIR}/paper_written_by_author.csv", index=False)

# node tables: the first row seen for each id wins
author_df[['author_id', 'name', 'organization']].drop_duplicates(subset=['author_id']).to_csv(f"{OUT_DIR}/authors.csv", index=False)
conf_df[['conference_id', 'name', 'edition', 'year']].drop_duplicates(subset=['conference_id']).to_csv(f"{OUT_DIR}/conference.csv", index=False)
jour_df[['journal_id', 'name', 'volume', 'year']].drop_duplicates(subset=['journal_id']).to_csv(f"{OUT_DIR}/journal.csv", index=False)

# Distinct keywords get a surrogate id derived from the row label of their
# first occurrence; .assign builds a new frame instead of writing into the
# result of drop_duplicates(), which can trigger SettingWithCopyWarning.
tkey_df = (
    key_df[['keyword']]
    .drop_duplicates()
    .assign(keyword_id=lambda frame: frame.index + 1)
)
tkey_df[['keyword_id', 'keyword']].to_csv(f"{OUT_DIR}/keywords.csv", index=False)
key_df.merge(tkey_df, on='keyword')[['paper_id', 'keyword_id']].to_csv(f"{OUT_DIR}/paper_has_keywords.csv", index=False)

# Keep only citations whose referenced paper also appears as a citing paper.
# index=False matches the other exports; without it the row labels are
# written as an extra unnamed first column.
# NOTE(review): this file name has no ".csv" extension, unlike the others;
# left unchanged in case a loader depends on it - confirm.
cite_df[cite_df.paper_id.isin(cite_df.cited_by.unique())].to_csv(f"{OUT_DIR}/paper_cited_paper", index=False)

Unnamed: 0,paper_id,cited_by
166,179660676,225523
298,244957858,422477
314,49034894,531080
315,80325070,531080
327,91859925,567418
...,...,...
242540,79637624,1507329928
242644,138290785,1507380329
242689,1491688952,1507383454
242707,1491987530,1507393849
