# Analyzing the Metadata of Publications in the Open Research Knowledge Graph 
This Jupyter notebook contains different analyses of the metadata of publications stored in the Open Research Knowledge Graph ([ORKG](https://www.orkg.org/orkg/)).

## Loading Data from the ORKG SPARQL endpoint

In [225]:
%matplotlib widget
from SPARQLWrapper import SPARQLWrapper, CSV
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys

# URL of the SPARQL endpoint that the query below is sent to.
ENDPOINT_URL = "https://www.orkg.org/orkg/triplestore"

# Namespace prefix declarations; they are prepended to the query text before it is submitted.
PREFIXES =  """
            PREFIX orkgr: <http://orkg.org/orkg/resource/>
            PREFIX orkgc: <http://orkg.org/orkg/class/>
            PREFIX orkgp: <http://orkg.org/orkg/predicate/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            """

# Selects every orkgc:Paper together with its optional metadata, ordered by paper.
# - Every metadata pattern is OPTIONAL, so any column except ?paper may be empty.
# - For P27 (?author), P28 (?month) and P29 (?year) the value is used directly when it
#   is a literal; otherwise the rdfs:label of the linked resource is used instead.
# - NOTE(review): the author label and HAS_ORCID sit in the same OPTIONAL group, so an
#   author resource without an ORCID also yields an empty ?author - confirm this is intended.
# - NOTE(review): the comma-separated SELECT list is not standard SPARQL 1.1 syntax;
#   presumably the endpoint tolerates it - verify before pointing this at another store.
query = """
        SELECT ?paper, ?research_field, ?doi, ?title, ?author, ?orcid, ?month, ?year, ?url, ?venue
WHERE {
  ?paper a orkgc:Paper.
  OPTIONAL{?paper rdfs:label ?title.}
  OPTIONAL{?paper orkgp:P26 ?doi.}
  OPTIONAL{?paper orkgp:P30 ?field.
           ?field rdfs:label ?research_field.}
  OPTIONAL{?paper orkgp:P27 ?author_resrc.
           BIND(IF(isLiteral(?author_resrc), ?author_resrc, "") AS ?name1)
           OPTIONAL{?author_resrc rdfs:label ?author_label;
                            orkgp:HAS_ORCID ?orcid.}
           BIND(IF(BOUND(?author_label),?author_label, "") AS ?name2)
           BIND(IF(?name1 = "", ?name2, ?name1) AS ?author)
          }
  
  OPTIONAL{?paper orkgp:P28 ?month_resrc.
           BIND(IF(isLiteral(?month_resrc), ?month_resrc, "") AS ?month1)
           OPTIONAL{?month_resrc rdfs:label ?month_label.}
           BIND(IF(BOUND(?month_label),?month_label, "") AS ?month2)
           BIND(IF(?month1 = "", ?month2, ?month1) AS ?month)
          }
  
  OPTIONAL{?paper orkgp:P29 ?year_resrc.
           BIND(IF(isLiteral(?year_resrc), ?year_resrc, "") AS ?year1)
           OPTIONAL{?year_resrc rdfs:label ?year_label.}
           BIND(IF(BOUND(?year_label),?year_label, "") AS ?year2)
           BIND(IF(?year1 = "", ?year2, ?year1) AS ?year)
          }
  
  OPTIONAL{?paper orkgp:url ?url.}
  OPTIONAL{?paper orkgp:HAS_VENUE ?venue_resrc.
           ?venue_resrc rdfs:label ?venue.}
} ORDER BY ?paper
        """

# Agent string sent with the request; it carries the major.minor version of the running Python.
user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])

# Configure the client: endpoint, full query text (prefixes + query) and CSV as return format.
sparql = SPARQLWrapper(ENDPOINT_URL, agent=user_agent)
sparql.setQuery(PREFIXES+query)
sparql.setReturnFormat(CSV)

# Fail loudly when the request does not succeed. Merely printing the error would leave
# `results` undefined and make the write below die with an unrelated NameError.
try:
    results = sparql.queryAndConvert()
except Exception as e:
    raise RuntimeError("SPARQL query against " + ENDPOINT_URL + " failed") from e

# Persist the raw response in a file named after today's date. The file is opened in
# binary mode and the response is written unchanged. `now` is kept as a notebook-level
# name because the cell that loads the data rebuilds the file name from it.
now = datetime.now()
with open('query_result_' + now.strftime('%Y-%m-%d') + '.csv', 'wb') as file:
    file.write(results)

## Data Validation
1. Reading the data and checking the shape and column names.

In [236]:
# Rebuild the name of the CSV file from `now`, which is set by the cell that runs the
# query, then load it. Bytes that cannot be decoded as UTF-8 are dropped silently.
csv_path = f"query_result_{now:%Y-%m-%d}.csv"
df = pd.read_csv(csv_path, encoding='utf-8', encoding_errors='ignore')

# Quick structural check: number of rows/columns and the column labels.
print(f'Shape of the dataframe: {df.shape}')
print(f'Column names: {df.columns}')

Shape of the dataframe: (41364, 10)
Column names: Index(['paper', 'research_field', 'doi', 'title', 'author', 'orcid', 'month',
       'year', 'url', 'venue'],
      dtype='object')


2. Checking for NaN values in each column for the entire dataframe.

In [276]:
# Lift the row limit so long outputs are printed in full. The option is addressed by its
# fully qualified name: the bare pattern "max_rows" also matches "styler.render.max_rows"
# in newer pandas releases and then raises an OptionError ("matched multiple keys").
pd.set_option("display.max_rows", None)

# For every column, print how many entries are missing (True) and present (False).
for column in df:
    print(df[column].isna().value_counts(dropna=False))

False    41364
Name: paper, dtype: int64
False    41176
True       188
Name: research_field, dtype: int64
False    34910
True      6454
Name: doi, dtype: int64
False    41354
True        10
Name: title, dtype: int64
False    40974
True       390
Name: author, dtype: int64
True     38415
False     2949
Name: orcid, dtype: int64
False    31442
True      9922
Name: month, dtype: int64
False    39601
True      1763
Name: year, dtype: int64
True     33192
False     8172
Name: url, dtype: int64
False    25966
True     15398
Name: venue, dtype: int64


3. Determining the number of NaN entries of the column **paper** and the number of unique papers.

In [296]:
# Count missing paper identifiers and the number of distinct papers in the result.
paper_ids = df['paper']
print(f'Number of NaN entries: {paper_ids.isna().sum()}')
print(f'Number of unique papers: {paper_ids.nunique()}')

Number of NaN entries: 0
Number of unique papers: 9880


4. Checking the number of NaN entries for the column **research_field** and the number of unique papers without a research field.

In [278]:
# One row per paper: drop_duplicates keeps the first row of every paper identifier.
df_unique_papers = df.drop_duplicates(subset='paper')

# Frequency of every research field over all rows, missing values included.
field_counts = df['research_field'].value_counts(dropna=False)
print(field_counts)

# Missing research fields counted once per paper rather than once per row.
missing_field_count = df_unique_papers['research_field'].isna().sum()
print(f'Number of unique papers without a research field: {missing_field_count}')

Science                                                                                       9907
Bioinformatics                                                                                7232
Ecology and Evolutionary Biology                                                              2781
Information Science                                                                           1379
Medicinal Chemistry and Pharmaceutics                                                         1243
Virology                                                                                      1150
Artificial Intelligence                                                                       1055
Toxicology                                                                                    1041
Computer Sciences                                                                             1031
Natural Language Processing                                                                    966
Oceanograp

5. Checking the number of NaN entries and of values that do not start with `10.` for the column **doi**, and the number of unique papers without a DOI.

In [293]:
# Rows whose DOI starts with "10."; missing DOIs are treated as not matching.
has_doi_prefix = df['doi'].str.startswith('10.', na=False)

# Everything else (missing values, URLs, values with stray characters, ...) is listed.
wrongDOIs = df[~has_doi_prefix]
print(wrongDOIs['doi'].value_counts(dropna=False))

# Uses the one-row-per-paper frame built in the research-field cell.
print(f"Number of unique papers without a DOI: {df_unique_papers['doi'].isna().sum()}")

NaN                                                   6454
https://doi.org/10.22499/2.6301.004                     28
https://doi.org/10.1016/j.jbusres.2011.08.018           20
 10.1128/JVI.00837-18                                   18
0.1016/j.ebiom.2020.102743                              13
http://orkg.org/orkg/resource/R109808                    8
https://doi.org/10.1016/j.fusengdes.2020.111504          8
https://doi.org/10.1016/j.procs.2016.09.123              7
doi.org/10.1016/j.jbi.2013.09.008                        7
https://pubmed.ncbi.nlm.nih.gov/29480735/                6
https://dblp.org/rec/conf/ijcai/TangLLCG09.bib           5
https://doi.org/10.1162/dint_a_00031                     5
doi.org/10.1016/j.artint.2012.03.006                     5
https://doi.org/10.1145/2872518.2889386                  4
https://doi.org/10.1145/2814864.2814887                  4
https://doi.org/10.1145/1255175.1255193                  4
https://dblp.org/rec/conf/semweb/CardilloFTG14.bib      

In [256]:
# Reduce to one row per paper, then count how often each title occurs; a count above 1
# means several paper resources share the same title.
duplicated_title = df.drop_duplicates(subset=['paper'])
title_counts = duplicated_title['title'].value_counts(dropna=False)
print(title_counts)

# Missing titles are counted over all rows of the full frame, not per paper.
print(f"Number of NaN entries: {df['title'].isna().sum()}")

Governing nonprofit platform ecosystems – an information platform for refugees                                                                                                                                                                                                                                                                                                                                                        14
Linking sea level rise and socioeconomic indicators under the Shared Socioeconomic Pathways                                                                                                                                                                                                                                                                                                                                           14
Creating the European Literary Text Collection (ELTeC): Challenges and Perspectives                                                                   

In [None]:
# Frequency of every month value, missing values included.
# NOTE(review): `md_df` is not defined in the visible cells - confirm where it is created.
month_counts = md_df['month_number'].value_counts(dropna=False)
print(month_counts)

In [None]:
# Frequency of every year value, missing values included.
# NOTE(review): `md_df` is not defined in the visible cells - confirm where it is created.
year_counts = md_df['year_number'].value_counts(dropna=False)
print(year_counts)

In [None]:
# One row per distinct (paper, research field) pair.
# NOTE(review): `md_df` is not defined in the visible cells - confirm where it is created.
field_df = md_df[['paper','field_label']].drop_duplicates()

# Fully qualified option name: the bare pattern "max_rows" is ambiguous in newer pandas
# releases (it also matches "styler.render.max_rows") and raises an OptionError there.
pd.set_option("display.max_rows", None)

# Print the counts explicitly: a bare expression in the middle of a cell is evaluated
# but never displayed, only the last expression of a cell is.
field_counts = field_df['field_label'].value_counts(dropna=False)
print(field_counts)

# Draw on a dedicated, labelled figure so the plot does not land on a figure left over
# from an earlier cell and can be understood on its own.
fig, ax = plt.subplots()
sns.countplot(y='field_label', data=field_df, ax=ax)
ax.set(title='Papers per research field', xlabel='Number of papers', ylabel='Research field');

In [None]:
# Build the typed analysis frame from `md_df` in a single constructor call; the columns
# keep the order in which they are listed here.
# NOTE(review): `md_df` is not defined in the visible cells - confirm where it is created.
# NOTE(review): this rebinds `df`, which earlier cells use for the raw CSV content.
# NOTE(review): astype('str') renders missing values as the text 'nan', and missing
# months/years become 0 - confirm that later cells expect these sentinels.
# The former `nsfg['nbrnaliv'].replace(...)` line was removed: `nsfg` is unrelated to this
# data and undefined in the visible cells, so the cell raised a NameError before building anything.
df = pd.DataFrame({
    'paper': md_df['paper'].astype('str'),
    'field': md_df['field_label'].astype('category'),
    'DOI': md_df['DOI'].astype('str'),
    'title': md_df['title'].astype('str'),
    'author': md_df['name'].astype('str'),
    'orcid': md_df['id'].astype('str'),
    'month': md_df['month_number'].fillna(0.0).astype('int'),
    'year': md_df['year_number'].fillna(0.0).astype('int'),
    'url': md_df['paper_url'].astype('str'),
    'venue': md_df['venue_label'].astype('str'),
})