In [1]:
from SPARQLWrapper import SPARQLWrapper, CSV
import pandas as pd
import doi
from datetime import datetime
import sys

In [13]:
# SPARQL endpoint of the Open Research Knowledge Graph (ORKG).
ENDPOINT_URL = "https://www.orkg.org/orkg/triplestore"

# Namespace prefixes prepended to the query before it is sent.
PREFIXES =  """
            PREFIX orkgr: <http://orkg.org/orkg/resource/>
            PREFIX orkgc: <http://orkg.org/orkg/class/>
            PREFIX orkgp: <http://orkg.org/orkg/predicate/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            """

# SPARQL query from the ORKG SPARQL endpoint
# gets papers and respective contributions
#
# The paper label, author (P27) and research field (P30) patterns are
# mandatory; DOI, URL, publication month/year, venue and contribution are
# OPTIONAL. Every combination of matching author and contribution bindings
# produces its own result row, so one paper usually spans several rows.
# NOTE(review): the saved output of the loading cell ends at index 99998,
# which may mean the endpoint caps the result size -- confirm before treating
# this dump as the complete set of papers.
query = """   
SELECT ?paper ?paper_title 
       ?url ?author
       ?venue ?venue_label
       ?doi ?publication_month ?publication_year
       ?research_field ?research_field_label
       ?contribution
  WHERE {  
    
    # comment out the BINDing line to get the metadata of all papers.
    # BIND(orkgr:R141003 AS ?paper)
    
    ?paper rdf:type orkgc:Paper;
           rdfs:label ?paper_title ;
           orkgp:P27 ?author ;
           orkgp:P30 ?research_field .
    
    ?research_field rdfs:label ?research_field_label .
    
    OPTIONAL { ?paper orkgp:P26 ?doi } .
    OPTIONAL { ?paper orkgp:url ?url } .
    OPTIONAL { ?paper orkgp:P28 ?publication_month } .
    OPTIONAL { ?paper orkgp:P29 ?publication_year } .
    OPTIONAL { ?paper orkgp:HAS_VENUE ?venue .
               ?venue rdfs:label ?venue_label } .
    OPTIONAL { ?paper orkgp:P31 ?contribution } .
}
        """

#Code from Oliver's metadata analysis notebook
user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])

sparql = SPARQLWrapper(ENDPOINT_URL, agent=user_agent)
sparql.setQuery(PREFIXES+query)
sparql.setReturnFormat(CSV)

try:
    results = sparql.queryAndConvert()
except Exception as e:
    # Stop the cell here. Merely printing the error would let the code below
    # run with `results` undefined (NameError) or, on a re-run, silently write
    # the results of an earlier query to today's file.
    raise RuntimeError("ORKG paper query failed; no CSV file was written") from e

# The dump is stamped with today's date; later cells rebuild the same file
# name from `now`.
now = datetime.now()
with open('paper_query_result_' + now.strftime('%Y-%m-%d') + '.csv', 'wb') as file:
    file.write(results)

In [15]:
# Load today's paper query dump, keeping only the four columns used by the
# rest of the notebook. Restricting the read with `usecols` avoids parsing and
# type-inferring the eight columns that would otherwise be dropped right away;
# the resulting columns keep the order they have in the file.
# NOTE(review): the SPARQL CSV result format is specified as UTF-8, so decoding
# as ISO-8859-1 never fails but may garble non-ASCII labels -- confirm whether
# utf-8 can be used here.
orkg_paper_contribution_df = pd.read_csv(
    'paper_query_result_' + now.strftime('%Y-%m-%d') + '.csv',
    encoding='ISO-8859-1',  # no utf-8 encoding ...
    usecols=['paper', 'doi', 'research_field_label', 'contribution'],
)
orkg_paper_contribution_df

  orkg_paper_contribution_df = pd.read_csv('paper_query_result_' + now.strftime('%Y-%m-%d') + '.csv', encoding='ISO-8859-1') # no utf-8 encoding ...


Unnamed: 0,paper,doi,research_field_label,contribution
0,http://orkg.org/orkg/resource/R38466,10.5808/gi.2019.17.2.e14,Bioinformatics,http://orkg.org/orkg/resource/R38472
1,http://orkg.org/orkg/resource/R38466,10.5808/gi.2019.17.2.e14,Bioinformatics,http://orkg.org/orkg/resource/R38472
2,http://orkg.org/orkg/resource/R38466,10.5808/gi.2019.17.2.e14,Bioinformatics,http://orkg.org/orkg/resource/R38472
3,http://orkg.org/orkg/resource/R38466,10.5808/gi.2019.17.2.e14,Bioinformatics,http://orkg.org/orkg/resource/R38472
4,http://orkg.org/orkg/resource/R148043,10.1093/bioinformatics/bth227,Bioinformatics,http://orkg.org/orkg/resource/R148045
...,...,...,...,...
99995,http://orkg.org/orkg/resource/R537121,,Computer Sciences,http://orkg.org/orkg/resource/R537129
99996,http://orkg.org/orkg/resource/R537121,,Computer Sciences,http://orkg.org/orkg/resource/R537133
99997,http://orkg.org/orkg/resource/R537121,,Computer Sciences,http://orkg.org/orkg/resource/R537122
99998,http://orkg.org/orkg/resource/R537121,,Computer Sciences,http://orkg.org/orkg/resource/R537129


In [19]:
# Collapse the per-contribution rows into a single row per paper. Each kept
# column is reduced with 'first', which by default takes the first non-null
# entry within a group.
aggregate_funcs = {
    column: 'first'
    for column in ('paper', 'doi', 'research_field_label')
}
orkg_paper_df = (
    orkg_paper_contribution_df
    .groupby('paper', as_index=False)
    .agg(aggregate_funcs)
)
orkg_paper_df

Unnamed: 0,paper,doi,research_field_label
0,http://orkg.org/orkg/resource/R1004,10.1186/s12866-016-0676-9,Bioinformatics
1,http://orkg.org/orkg/resource/R108458,,Science
2,http://orkg.org/orkg/resource/R108460,,Science
3,http://orkg.org/orkg/resource/R108462,,Science
4,http://orkg.org/orkg/resource/R108464,,Science
...,...,...,...
12988,http://orkg.org/orkg/resource/R8441,10.1016/j.jbiotec.2013.12.009,Systems Biology
12989,http://orkg.org/orkg/resource/R9154,10.1073/pnas.1913007117,Life Sciences
12990,http://orkg.org/orkg/resource/R9190,10.1007/978-3-319-54064-1_3,Biotechnology
12991,http://orkg.org/orkg/resource/R9295,10.1038/nprot.2016.182,Biotechnology


In [20]:
def check_doi(orkg_doi):
    """Return ``orkg_doi`` if it passes DOI validation, otherwise ``None``.

    Validation is delegated to ``doi.validate_doi``; any ordinary exception it
    raises is taken to mean that the DOI is not valid.

    Parameters
    ----------
    orkg_doi : object
        DOI value taken from the ORKG export. Missing values (``None`` or
        pandas' NaN) and blank strings are accepted and yield ``None``.

    Returns
    -------
    str or None
        The unchanged DOI string when validation succeeds, else ``None``.
    """
    # Missing DOIs show up as NaN (a float) or None. They can never be valid,
    # so skip the validator call for them altogether.
    if not isinstance(orkg_doi, str) or not orkg_doi.strip():
        return None
    try:
        doi.validate_doi(orkg_doi)
    except Exception:
        # Deliberately not a bare `except:` -- KeyboardInterrupt and SystemExit
        # must still be able to stop a long-running validation loop.
        return None
    return orkg_doi

# Replace every DOI that fails validation with a missing value, discard rows
# that contain any missing value, and renumber the remaining rows from 0.
orkg_paper_df = (
    orkg_paper_df
    .assign(doi=orkg_paper_df['doi'].map(check_doi))
    .dropna()
    .reset_index(drop=True)
)

# Persist the validated papers under a file name stamped with the date in `now`.
orkg_paper_df.to_pickle("./orkg_papers_"+now.strftime('%Y-%m-%d')+".pkl")
orkg_paper_df

Unnamed: 0,paper,doi,research_field_label
0,http://orkg.org/orkg/resource/R1004,10.1186/s12866-016-0676-9,Bioinformatics
1,http://orkg.org/orkg/resource/R108529,10.1145/3201064.3201083,Computer Sciences
2,http://orkg.org/orkg/resource/R108704,10.1094/pdis-02-15-0236-re,Plant Pathology
3,http://orkg.org/orkg/resource/R108713,10.1007/s10658-014-0409-6,Plant Pathology
4,http://orkg.org/orkg/resource/R108865,10.1186/s12859-019-3284-5,Bioinformatics
...,...,...,...
4430,http://orkg.org/orkg/resource/R8441,10.1016/j.jbiotec.2013.12.009,Systems Biology
4431,http://orkg.org/orkg/resource/R9154,10.1073/pnas.1913007117,Life Sciences
4432,http://orkg.org/orkg/resource/R9190,10.1007/978-3-319-54064-1_3,Biotechnology
4433,http://orkg.org/orkg/resource/R9295,10.1038/nprot.2016.182,Biotechnology


In [21]:
# SPARQL endpoint of the Open Research Knowledge Graph (ORKG).
ENDPOINT_URL = "https://www.orkg.org/orkg/triplestore"

# Namespace prefixes prepended to the query before it is sent.
PREFIXES =  """
            PREFIX orkgr: <http://orkg.org/orkg/resource/>
            PREFIX orkgc: <http://orkg.org/orkg/class/>
            PREFIX orkgp: <http://orkg.org/orkg/predicate/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            """

# SPARQL query from the ORKG SPARQL endpoint
# gets every resource typed as a Comparison together with its DOI (P26).
# The DOI is OPTIONAL, so comparisons without one come back with an empty
# `doi` field.
query = """   
SELECT ?comparisons ?doi
        WHERE {
            ?comparisons rdf:type orkgc:Comparison .   
            OPTIONAL { ?comparisons orkgp:P26 ?doi }
        }
        """

#Code from Oliver's metadata analysis notebook
user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])

sparql = SPARQLWrapper(ENDPOINT_URL, agent=user_agent)
sparql.setQuery(PREFIXES+query)
sparql.setReturnFormat(CSV)

try:
    results = sparql.queryAndConvert()
except Exception as e:
    # Stop the cell here. If the error were only printed, `results` would
    # still hold whatever an earlier query cell stored in it, and that
    # unrelated data would be written to the comparison CSV below.
    raise RuntimeError("ORKG comparison query failed; no CSV file was written") from e

# The dump is stamped with today's date; later cells rebuild the same file
# name from `now`.
now = datetime.now()
with open('comparison_query_result_' + now.strftime('%Y-%m-%d') + '.csv', 'wb') as file:
    file.write(results)

In [25]:
# Read the comparison query dump, discard rows that contain any missing value
# (comparisons for which no DOI was returned), and renumber the rows from 0.
orkg_comparison_df = (
    pd.read_csv(
        'comparison_query_result_' + now.strftime('%Y-%m-%d') + '.csv',
        encoding='ISO-8859-1',  # no utf-8 encoding ...
    )
    .dropna()
    .reset_index(drop=True)
)

# Persist the comparisons under a file name stamped with the date in `now`.
orkg_comparison_df.to_pickle("./orkg_comparisons_"+now.strftime('%Y-%m-%d')+".pkl")
orkg_comparison_df

Unnamed: 0,comparisons,doi
0,http://orkg.org/orkg/resource/R36099,10.48366/r36099
1,http://orkg.org/orkg/resource/R38484,10.48366/r38484
2,http://orkg.org/orkg/resource/R44930,10.48366/r44930
3,http://orkg.org/orkg/resource/R70212,10.48366/r70212
4,http://orkg.org/orkg/resource/R8342,10.48366/r8342
...,...,...
282,http://orkg.org/orkg/resource/R284515,10.48366/r284515
283,http://orkg.org/orkg/resource/R285347,10.48366/r285347
284,http://orkg.org/orkg/resource/R287928,10.48366/r287928
285,http://orkg.org/orkg/resource/R287929,10.48366/r287929
