In [1]:
from SPARQLWrapper import SPARQLWrapper, CSV
import pandas as pd
import doi
from datetime import datetime
import sys

In [2]:
ENDPOINT_URL = "https://www.orkg.org/orkg/triplestore"

# Namespace prefixes that are prepended to the query before it is sent.
PREFIXES =  """
            PREFIX orkgr: <http://orkg.org/orkg/resource/>
            PREFIX orkgc: <http://orkg.org/orkg/class/>
            PREFIX orkgp: <http://orkg.org/orkg/predicate/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            """

# SPARQL query for the ORKG SPARQL endpoint.
# Selects papers with their metadata and their contributions. Title, author
# and research field are required; DOI, URL, publication date, venue and
# contribution are OPTIONAL and may be empty in the result.
# NOTE(review): the displayed result of this notebook ends at row index 99998,
# which may indicate that the endpoint caps the number of returned rows --
# verify that the dump is complete before relying on paper counts.
query = """   
SELECT ?paper ?paper_title 
       ?url ?author
       ?venue ?venue_label
       ?doi ?publication_month ?publication_year
       ?research_field ?research_field_label
       ?contribution
  WHERE {  
    
    # comment out the BINDing line to get the metadata of all papers.
    # BIND(orkgr:R141003 AS ?paper)
    
    ?paper rdf:type orkgc:Paper;
           rdfs:label ?paper_title ;
           orkgp:P27 ?author ;
           orkgp:P30 ?research_field .
    
    ?research_field rdfs:label ?research_field_label .
    
    OPTIONAL { ?paper orkgp:P26 ?doi } .
    OPTIONAL { ?paper orkgp:url ?url } .
    OPTIONAL { ?paper orkgp:P28 ?publication_month } .
    OPTIONAL { ?paper orkgp:P29 ?publication_year } .
    OPTIONAL { ?paper orkgp:HAS_VENUE ?venue .
               ?venue rdfs:label ?venue_label } .
    OPTIONAL { ?paper orkgp:P31 ?contribution } .
}
        """

#Code from Oliver's metadata analysis notebook
user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])

sparql = SPARQLWrapper(ENDPOINT_URL, agent=user_agent)
sparql.setQuery(PREFIXES+query)
sparql.setReturnFormat(CSV)

try:
    results = sparql.queryAndConvert()
except Exception as e:
    # Report the failure and stop the cell. Continuing would either hit a
    # NameError on `results` below or, on a re-run, silently write the stale
    # `results` of an earlier successful query to today's file.
    print(e)
    raise

# `now` is reused by later cells to rebuild the dated file names.
now = datetime.now()
# The converted result is written in binary mode, exactly as returned.
with open('paper_query_result_' + now.strftime('%Y-%m-%d') + '.csv', 'wb') as csv_file:
    csv_file.write(results)

In [3]:
# Re-open today's paper query dump; the file is decoded as ISO-8859-1 rather
# than UTF-8.
paper_csv_path = 'paper_query_result_' + now.strftime('%Y-%m-%d') + '.csv'

# Metadata columns that the remaining cells do not use.
unused_paper_columns = [
    'paper_title',
    'url',
    'author',
    'venue',
    'venue_label',
    'publication_month',
    'publication_year',
    'research_field',
]

orkg_paper_contribution_df = (
    pd.read_csv(paper_csv_path, encoding='ISO-8859-1')
    .drop(columns=unused_paper_columns)
)
orkg_paper_contribution_df

Unnamed: 0,paper,doi,research_field_label,contribution
0,http://orkg.org/orkg/resource/R141003,10.18653/v1/2021.semeval-1.6,Natural Language Processing,http://orkg.org/orkg/resource/R141005
1,http://orkg.org/orkg/resource/R141003,10.18653/v1/2021.semeval-1.6,Natural Language Processing,http://orkg.org/orkg/resource/R141005
2,http://orkg.org/orkg/resource/R141003,10.18653/v1/2021.semeval-1.6,Natural Language Processing,http://orkg.org/orkg/resource/R141005
3,http://orkg.org/orkg/resource/R141003,10.18653/v1/2021.semeval-1.6,Natural Language Processing,http://orkg.org/orkg/resource/R141005
4,http://orkg.org/orkg/resource/R141010,10.18653/v1/2021.unimplicit-1.4,Natural Language Processing,http://orkg.org/orkg/resource/R141012
...,...,...,...,...
99995,http://orkg.org/orkg/resource/R522902,,Computer Sciences,http://orkg.org/orkg/resource/R522903
99996,http://orkg.org/orkg/resource/R522902,,Computer Sciences,http://orkg.org/orkg/resource/R522903
99997,http://orkg.org/orkg/resource/R522902,,Computer Sciences,http://orkg.org/orkg/resource/R522903
99998,http://orkg.org/orkg/resource/R522902,,Computer Sciences,http://orkg.org/orkg/resource/R522903


In [4]:
# Collapse the paper/contribution rows to a single row per paper, taking
# pandas' 'first' aggregate of every retained column within each group.
aggregate_funcs = dict.fromkeys(['paper', 'doi', 'research_field_label'], 'first')

rows_by_paper = orkg_paper_contribution_df.groupby('paper', as_index=False)
orkg_paper_df = rows_by_paper.agg(aggregate_funcs)
orkg_paper_df

Unnamed: 0,paper,doi,research_field_label
0,http://orkg.org/orkg/resource/R1004,10.1186/s12866-016-0676-9,Bioinformatics
1,http://orkg.org/orkg/resource/R107618,10.1007/s11423-020-09885-z,Learner-Interface Interaction
2,http://orkg.org/orkg/resource/R107663,10.1111/1467-8535.00213,Learner-Interface Interaction
3,http://orkg.org/orkg/resource/R107834,10.32870/Ap.v12n2.1910,Learner-Interface Interaction
4,http://orkg.org/orkg/resource/R107843,10.19173/irrodl.v4i2.149,Learner-Interface Interaction
...,...,...,...
12992,http://orkg.org/orkg/resource/R8441,10.1016/j.jbiotec.2013.12.009,Systems Biology
12993,http://orkg.org/orkg/resource/R9154,10.1073/pnas.1913007117,Life Sciences
12994,http://orkg.org/orkg/resource/R9190,10.1007/978-3-319-54064-1_3,Biotechnology
12995,http://orkg.org/orkg/resource/R9295,10.1038/nprot.2016.182,Biotechnology


In [5]:
def check_doi(orkg_doi):
    """Return ``orkg_doi`` if ``doi.validate_doi`` accepts it, else ``None``.

    Parameters
    ----------
    orkg_doi : object
        Candidate DOI taken from the ORKG query result. Non-string values
        (e.g. the NaN of an empty CSV field) are treated as "no DOI".

    Returns
    -------
    str or None
        The unchanged input when validation raises no error; ``None`` when
        the input is not a string or validation raises any ``Exception``.
    """
    # Only strings can be DOIs; skip the validator call for everything else.
    if not isinstance(orkg_doi, str):
        return None
    try:
        doi.validate_doi(orkg_doi)
    except Exception:
        # Deliberately best-effort: any validation error means "not usable".
        # `Exception` (instead of a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate so a long-running validation can be aborted.
        # NOTE(review): this presumably also hides transient errors raised by
        # the validator (e.g. network problems) -- confirm that is acceptable.
        return None
    return orkg_doi

In [6]:
# Replace every DOI that check_doi rejects with None ...
validated_paper_dois = orkg_paper_df['doi'].map(check_doi)
orkg_paper_df['doi'] = validated_paper_dois

# ... then discard all rows that contain a missing value and renumber the rest.
orkg_paper_df = orkg_paper_df.dropna()
orkg_paper_df.reset_index(drop=True, inplace=True)

# Persist the filtered papers under a file name dated with `now`.
papers_pickle_path = "./orkg_papers_doi_" + now.strftime('%Y-%m-%d') + ".pkl"
orkg_paper_df.to_pickle(papers_pickle_path)
orkg_paper_df

Unnamed: 0,paper,doi,research_field_label
0,http://orkg.org/orkg/resource/R1004,10.1186/s12866-016-0676-9,Bioinformatics
1,http://orkg.org/orkg/resource/R107618,10.1007/s11423-020-09885-z,Learner-Interface Interaction
2,http://orkg.org/orkg/resource/R107663,10.1111/1467-8535.00213,Learner-Interface Interaction
3,http://orkg.org/orkg/resource/R107834,10.32870/Ap.v12n2.1910,Learner-Interface Interaction
4,http://orkg.org/orkg/resource/R107843,10.19173/irrodl.v4i2.149,Learner-Interface Interaction
...,...,...,...
4944,http://orkg.org/orkg/resource/R8441,10.1016/j.jbiotec.2013.12.009,Systems Biology
4945,http://orkg.org/orkg/resource/R9154,10.1073/pnas.1913007117,Life Sciences
4946,http://orkg.org/orkg/resource/R9190,10.1007/978-3-319-54064-1_3,Biotechnology
4947,http://orkg.org/orkg/resource/R9295,10.1038/nprot.2016.182,Biotechnology


In [7]:
ENDPOINT_URL = "https://www.orkg.org/orkg/triplestore"

# Namespace prefixes that are prepended to the query before it is sent.
PREFIXES =  """
            PREFIX orkgr: <http://orkg.org/orkg/resource/>
            PREFIX orkgc: <http://orkg.org/orkg/class/>
            PREFIX orkgp: <http://orkg.org/orkg/predicate/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            """

# SPARQL query for the ORKG SPARQL endpoint.
# Selects every resource of class Comparison together with its DOI; the DOI
# is OPTIONAL, so comparisons without one are returned with an empty value.
query = """   
SELECT ?comparisons ?doi
        WHERE {
            ?comparisons rdf:type orkgc:Comparison .   
            OPTIONAL { ?comparisons orkgp:P26 ?doi }
        }
        """

#Code from Oliver's metadata analysis notebook
user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])

sparql = SPARQLWrapper(ENDPOINT_URL, agent=user_agent)
sparql.setQuery(PREFIXES+query)
sparql.setReturnFormat(CSV)

try:
    results = sparql.queryAndConvert()
except Exception as e:
    # Report the failure and stop the cell. Continuing would write whatever
    # `results` is left over from an earlier query (or fail with a NameError)
    # into the comparison file.
    print(e)
    raise

# `now` is reused by later cells to rebuild the dated file names.
now = datetime.now()
# The converted result is written in binary mode, exactly as returned.
with open('comparison_query_result_' + now.strftime('%Y-%m-%d') + '.csv', 'wb') as csv_file:
    csv_file.write(results)

In [8]:
# Re-open today's comparison query dump; the file is decoded as ISO-8859-1
# rather than UTF-8.
comparison_csv_path = 'comparison_query_result_' + now.strftime('%Y-%m-%d') + '.csv'
orkg_comparison_df = pd.read_csv(comparison_csv_path, encoding='ISO-8859-1')

# Replace every DOI that check_doi rejects with None ...
validated_comparison_dois = orkg_comparison_df['doi'].map(check_doi)
orkg_comparison_df['doi'] = validated_comparison_dois

# ... then discard all rows that contain a missing value and renumber the rest.
orkg_comparison_df = orkg_comparison_df.dropna()
orkg_comparison_df.reset_index(drop=True, inplace=True)

# Persist the filtered comparisons under a file name dated with `now`.
comparisons_pickle_path = "./orkg_comparisons_" + now.strftime('%Y-%m-%d') + ".pkl"
orkg_comparison_df.to_pickle(comparisons_pickle_path)
orkg_comparison_df

Unnamed: 0,comparisons,doi
0,http://orkg.org/orkg/resource/R140347,10.48366/r140347
1,http://orkg.org/orkg/resource/R140449,10.48366/r140449
2,http://orkg.org/orkg/resource/R140463,10.48366/r140463
3,http://orkg.org/orkg/resource/R140465,10.48366/r140465
4,http://orkg.org/orkg/resource/R140714,10.48366/r140714
...,...,...
289,http://orkg.org/orkg/resource/R576872,10.48366/r576872
290,http://orkg.org/orkg/resource/R576873,10.48366/r576873
291,http://orkg.org/orkg/resource/R576874,10.48366/r576874
292,http://orkg.org/orkg/resource/R576875,10.48366/r576875
