In [7]:
import csv
import tqdm
import ijson
import pandas as pd

In [None]:
# Row buffers, one list per output table, filled by the streaming pass below.
authors = []      # (paper_id, author name, author org, author id)
papers = []       # (paper_id, title, year, n_citation, "page_start-page_end", abstract words)
conferences = []  # (paper_id, venue raw title, edition, year, venue id) for venue type "C"
journals = []     # (paper_id, venue raw title, volume, year, venue id) for venue type "J"
keywords = []     # (paper_id, field-of-study name)
cites = []        # (referenced paper id, paper_id of the citing paper)

# Keys a paper must carry to be kept (only presence is checked here).
required_keys = ("n_citation", "page_start", "page_end", "indexed_abstract",
                 "authors", "fos", "references")

# Errors raised by a record with missing or malformed fields. Such records are
# skipped on purpose (best effort), but anything else -- KeyboardInterrupt in
# particular -- must propagate, so a bare `except:` is not used.
malformed_record_errors = (KeyError, TypeError, ValueError, AttributeError)

with open("dblp.v12.json", "rb") as f:

    #iterate over each paper, streaming the top-level JSON array item by item
    for element in tqdm.tqdm(ijson.items(f, "item")):
        try:
            p_id = element["id"]
            p_title = element["title"]
            p_year = element.get("year")

            #skip paper if year of publication not available
            if not p_year:
                continue

            #skip paper if citation count, pages, abstract, authors,
            #field of study or references are not available
            if any(key not in element for key in required_keys):
                continue

            #skip paper if organization of any author is not available
            if not all("org" in author for author in element["authors"]):
                continue

            #skip paper if venue information is missing
            venue = element["venue"]
            if "type" not in venue:
                continue

            #only conference ("C") and journal ("J") venues get a venue row;
            #papers with any other venue type are kept without one
            venue_type = venue["type"]
            if venue_type == "C":
                venue_table = conferences
            elif venue_type == "J":
                venue_table = journals
            else:
                venue_table = None

            venue_row = None
            if venue_table is not None:
                #skip paper if the edition/volume is blank; a non-numeric
                #volume raises ValueError and is skipped below
                volume = element["volume"].strip()
                if volume == '':
                    continue
                venue_row = (p_id, venue["raw"], int(volume), p_year, venue["id"])

            p_pages = element["page_start"] + "-" + element["page_end"]
            #the abstract is stored as an inverted index; its keys are the words
            p_abstract = ' '.join(element["indexed_abstract"]["InvertedIndex"].keys())

            #build every row before touching the buffers so that a malformed
            #author/keyword entry cannot leave a half-recorded paper behind
            paper_row = (p_id, p_title, p_year, element["n_citation"], p_pages, p_abstract)
            author_rows = [(p_id, a['name'], a['org'], a['id']) for a in element['authors']]
            keyword_rows = [(p_id, k['name']) for k in element['fos']]
            cite_rows = [(ref, p_id) for ref in element['references']]

        except malformed_record_errors:
            continue

        papers.append(paper_row)
        authors.extend(author_rows)
        keywords.extend(keyword_rows)
        cites.extend(cite_rows)
        #record the venue only in the table matching its type, so journal
        #papers no longer add blank conference rows (and vice versa)
        if venue_row is not None:
            venue_table.append(venue_row)
            

14664it [00:12, 1199.67it/s]

In [None]:
# Turn each row buffer into a DataFrame with named columns.
paper_columns = ['paper_id', 'title', 'year', 'n_citations', 'pages', 'abstract']
author_columns = ['paper_id', 'name', 'organization', 'author_id']
conference_columns = ['paper_id', 'name', 'edition', 'year', 'conference_id']
journal_columns = ['paper_id', 'name', 'volume', 'year', 'journal_id']
keyword_columns = ['paper_id', 'keyword']
# Each cites row is (referenced paper, citing paper), hence 'cited_by'.
citation_columns = ['paper_id', 'cited_by']

paper_df = pd.DataFrame(data=papers, columns=paper_columns)
author_df = pd.DataFrame(data=authors, columns=author_columns)
conf_df = pd.DataFrame(data=conferences, columns=conference_columns)
jour_df = pd.DataFrame(data=journals, columns=journal_columns)
key_df = pd.DataFrame(data=keywords, columns=keyword_columns)
cite_df = pd.DataFrame(data=cites, columns=citation_columns)

In [None]:
# Export the paper, venue and author tables as CSV files (no index column).

# Papers are written exactly as collected.
paper_df.to_csv("data/papers.csv", index=False)

# Relationship tables: one row per (paper, conference/journal/author) pair.
paper_conference = conf_df.drop_duplicates(subset=['paper_id', 'conference_id'])
paper_conference.to_csv('data/paper_published_in_conference.csv', index=False)

paper_journal = jour_df.drop_duplicates(subset=['paper_id', 'journal_id'])
paper_journal.to_csv('data/paper_published_in_journal.csv', index=False)

paper_author = author_df.drop_duplicates(subset=['paper_id', 'author_id'])
paper_author.to_csv('data/paper_written_by_author.csv', index=False)

# Entity tables: one row per id; the first occurrence of each id is kept.
author_table = author_df[['author_id', 'name', 'organization']]
author_table.drop_duplicates(subset=['author_id']).to_csv('data/authors.csv', index=False)

conference_table = conf_df[['conference_id', 'name', 'edition', 'year']]
conference_table.drop_duplicates(subset=['conference_id']).to_csv('data/conference.csv', index=False)

journal_table = jour_df[['journal_id', 'name', 'volume', 'year']]
journal_table.drop_duplicates(subset=['journal_id']).to_csv('data/journal.csv', index=False)

In [None]:
# Give every distinct keyword an id: the row label of its first occurrence in
# key_df plus one (ids are unique but not necessarily consecutive).
tkey_df = (
    key_df[['keyword']]
    .drop_duplicates()
    .assign(keyword_id=lambda frame: frame.index + 1)
)

# Keyword lookup table.
keyword_table = tkey_df[['keyword_id', 'keyword']]
keyword_table.to_csv('data/keywords.csv', index=False)

# Paper -> keyword relationship, expressed through the ids assigned above.
paper_keywords = key_df.merge(tkey_df, on='keyword')
paper_keywords[['paper_id', 'keyword_id']].to_csv('data/paper_has_keywords.csv', index=False)

In [None]:
# Keep only citation edges whose referenced paper (paper_id) also appears as a
# citing paper (cited_by) somewhere in cite_df, then export them.
# NOTE(review): a kept paper with an empty reference list never shows up in
# cited_by, so citations pointing at it are dropped here -- confirm intended.
citing_ids = cite_df['cited_by'].unique()
has_known_target = cite_df['paper_id'].isin(citing_ids)
cite_df.loc[has_known_target].to_csv('data/paper_cited_paper.csv', index=False)