# Analyzing the Metadata of Publications in the Open Research Knowledge Graph 
This Jupyter notebook contains different analyses on the metadata of publications stored in the Open Research Knowledge Graph [ORKG](https://www.orkg.org/orkg/).

## Loading Data from ORKG SPAQRL endpoint

In [86]:
%matplotlib widget
from SPARQLWrapper import SPARQLWrapper, CSV
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys

ENDPOINT_URL = "https://www.orkg.org/orkg/triplestore"

PREFIXES =  """
            PREFIX orkgr: <http://orkg.org/orkg/resource/>
            PREFIX orkgc: <http://orkg.org/orkg/class/>
            PREFIX orkgp: <http://orkg.org/orkg/predicate/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            """

query = """
        SELECT ?paper, ?field_label, ?DOI, ?title, ?name, ?id, ?month_number, ?year_number, ?paper_url, ?venue_label
WHERE {
  ?paper a orkgc:Paper.
  OPTIONAL{?paper rdfs:label ?title.}
  OPTIONAL{?paper orkgp:P26 ?DOI.}
  OPTIONAL{?paper orkgp:P30 ?field.
           ?field rdfs:label ?field_label.}
  OPTIONAL{?paper orkgp:P27 ?author.
           BIND(IF(isLiteral(?author), ?author, "") AS ?name1)
           OPTIONAL{?author rdfs:label ?author_label;
                            orkgp:HAS_ORCID ?id.}
           BIND(IF(BOUND(?author_label),?author_label, "") AS ?name2)
           BIND(IF(?name1 = "", ?name2, ?name1) AS ?name)
          }
  
  OPTIONAL{?paper orkgp:P28 ?month.
           BIND(IF(isLiteral(?month), ?month, "") AS ?month1)
           OPTIONAL{?month rdfs:label ?month_label.}
           BIND(IF(BOUND(?month_label),?month_label, "") AS ?month2)
           BIND(IF(?month1 = "", ?month2, ?month1) AS ?month_number)
          }
  
  OPTIONAL{?paper orkgp:P29 ?year.
           BIND(IF(isLiteral(?year), ?year, "") AS ?year1)
           OPTIONAL{?year rdfs:label ?year_label.}
           BIND(IF(BOUND(?year_label),?year_label, "") AS ?year2)
           BIND(IF(?year1 = "", ?year2, ?year1) AS ?year_number)
          }
  
  OPTIONAL{?paper orkgp:url ?paper_url.}
  OPTIONAL{?paper orkgp:HAS_VENUE ?venue.
           ?venue rdfs:label ?venue_label.}
} ORDER BY ?paper
        """

user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])

sparql = SPARQLWrapper(ENDPOINT_URL, agent=user_agent)
sparql.setQuery(PREFIXES+query)
sparql.setReturnFormat(CSV)

try:
        results = sparql.queryAndConvert()
except Exception as e:
        print(e)

now = datetime.now()
with open('query_result_' + now.strftime('%Y-%m-%d') + '.csv', 'wb') as file:
        file.write(results)

#df = pd.read_csv('CLEAN-query-a_paper_and_all_metadata_optional_result_1-41364-2022.05.04.csv')
md_df = pd.read_csv('query_result_' + now.strftime('%Y-%m-%d') + '.csv', encoding='utf-8', encoding_errors='ignore')

print(md_df['paper'].nunique())

9880


## Data Cleaning

In [109]:
df = pd.DataFrame()

df['paper'] = md_df['paper'].astype('str')
df['field'] = md_df['field_label'].astype('category')
df['DOI'] = md_df['DOI'].astype('str')
df['title'] = md_df['title'].astype('str')
df['author'] = md_df['name'].astype('str')
df['orcid'] = md_df['id'].astype('str')
df['month'] = md_df['month_number'].fillna(0.0).astype('int')
df['year'] = md_df['year_number'].fillna(0.0).astype('int')
df['url'] = md_df['paper_url'].astype('str')
df['venue'] = md_df['venue_label'].astype('str')

field_df = md_df[['paper','field_label']].drop_duplicates()
print(field_df.isnull().sum())

field_df.fillna('No research field')
field_df[['field_label']].value_counts().head(15)

ValueError: invalid literal for int() with base 10: 'October 2020'