# Analyzing the Metadata of Publications in the Open Research Knowledge Graph 
This Jupyter notebook contains different analyses on the metadata of publications stored in the Open Research Knowledge Graph [ORKG](https://www.orkg.org/orkg/).

## Loading Data from the ORKG SPAQRL endpoint

In [3]:
%matplotlib widget
from SPARQLWrapper import SPARQLWrapper, CSV
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys

ENDPOINT_URL = "https://www.orkg.org/orkg/triplestore"

PREFIXES =  """
            PREFIX orkgr: <http://orkg.org/orkg/resource/>
            PREFIX orkgc: <http://orkg.org/orkg/class/>
            PREFIX orkgp: <http://orkg.org/orkg/predicate/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            """

query = """
        SELECT ?paper, ?research_field, ?doi, ?title, ?author, ?orcid, ?month, ?year, ?url, ?venue
WHERE {
  ?paper a orkgc:Paper.
  OPTIONAL{?paper rdfs:label ?title.}
  OPTIONAL{?paper orkgp:P26 ?doi.}
  OPTIONAL{?paper orkgp:P30 ?field.
           ?field rdfs:label ?research_field.}
  OPTIONAL{?paper orkgp:P27 ?author_resrc.
           BIND(IF(isLiteral(?author_resrc), ?author_resrc, "") AS ?name1)
           OPTIONAL{?author_resrc rdfs:label ?author_label;
                            orkgp:HAS_ORCID ?orcid.}
           BIND(IF(BOUND(?author_label),?author_label, "") AS ?name2)
           BIND(IF(?name1 = "", ?name2, ?name1) AS ?author)
          }
  
  OPTIONAL{?paper orkgp:P28 ?month_resrc.
           BIND(IF(isLiteral(?month_resrc), ?month_resrc, "") AS ?month1)
           OPTIONAL{?month_resrc rdfs:label ?month_label.}
           BIND(IF(BOUND(?month_label),?month_label, "") AS ?month2)
           BIND(IF(?month1 = "", ?month2, ?month1) AS ?month)
          }
  
  OPTIONAL{?paper orkgp:P29 ?year_resrc.
           BIND(IF(isLiteral(?year_resrc), ?year_resrc, "") AS ?year1)
           OPTIONAL{?year_resrc rdfs:label ?year_label.}
           BIND(IF(BOUND(?year_label),?year_label, "") AS ?year2)
           BIND(IF(?year1 = "", ?year2, ?year1) AS ?year)
          }
  
  OPTIONAL{?paper orkgp:url ?url.}
  OPTIONAL{?paper orkgp:HAS_VENUE ?venue_resrc.
           ?venue_resrc rdfs:label ?venue.}
} ORDER BY ?paper
        """

user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])

sparql = SPARQLWrapper(ENDPOINT_URL, agent=user_agent)
sparql.setQuery(PREFIXES+query)
sparql.setReturnFormat(CSV)

try:
        results = sparql.queryAndConvert()
except Exception as e:
        print(e)

now = datetime.now()
with open('query_result_' + now.strftime('%Y-%m-%d') + '.csv', 'wb') as file:
        file.write(results)

## Initial Data Validation and Check
1. Reading the data and checking the shape and column names.

In [4]:
df = pd.read_csv('query_result_' + now.strftime('%Y-%m-%d') + '.csv', encoding='utf-8', encoding_errors='ignore')

print('Shape of the dataframe: '+ str(df.shape))
print('Column names: ' + str(df.columns))

Shape of the dataframe: (64260, 10)
Column names: Index(['paper', 'research_field', 'doi', 'title', 'author', 'orcid', 'month',
       'year', 'url', 'venue'],
      dtype='object')


2. Checking for NaN values in each column for the entire dataframe.

In [5]:
#pd.set_option("max_rows", None)
for column in df:
    print(df[column].isna().value_counts(dropna=False).to_frame())

       paper
False  64260
       research_field
False           63809
True              451
         doi
False  54981
True    9279
       title
False  64250
True      10
       author
False   63766
True      494
       orcid
True   59691
False   4569
       month
False  49030
True   15230
        year
False  62051
True    2209
         url
True   47410
False  16850
       venue
False  48074
True   16186


3. Derterming the number of NaN entries of the column **paper** and the number of unique papers.

In [6]:
#print(df.drop_duplicates(subset='paper')['paper'].value_counts(dropna=False).sum())
print('Number of NaN entries: ' + str(df['paper'].isna().sum()))
print('Number of unique papers: '+ str(df['paper'].nunique()))

Number of NaN entries: 0
Number of unique papers: 13995


4. Checking the number of unique papers per **research field** and the number of unique papers without a research field.

In [7]:
df_unique_papers = df.drop_duplicates(subset='paper')
print('Number of unique papers without a research field: ' + str(df_unique_papers['research_field'].isna().sum()))
df_unique_papers['research_field'].value_counts().to_frame().head(15)

Number of unique papers without a research field: 181


Unnamed: 0,research_field
Science,3141
Bioinformatics,1195
Ecology and Evolutionary Biology,976
Medicine,786
Artificial Intelligence,414
Software Engineering,410
Information Science,408
Natural Language Processing,387
Virology,361
"Operations Research, Systems Engineering and Industrial Engineering",346


5. Checking the number of incorrect entries for the column **doi** and the number of unique papers without a DOI.

In [8]:
incorrect_DOIs = df_unique_papers[~df_unique_papers['doi'].str.startswith('10', na=False)]
print('Number of incorrect DOIs :' + str(incorrect_DOIs['doi'].value_counts().sum()))
print(incorrect_DOIs['doi'].value_counts().to_frame().head(15))
print('Number of unique papers without a DOI: ' + str(df_unique_papers['doi'].isna().sum()))

Number of incorrect DOIs :850
                                              doi
https://doi.org/10.1126/science.aad5177         2
https://doi.org/10.1016/j.eswa.2019.05.052      1
https://doi.org/10.1038/sj.gt.3302887           1
https://doi.org/10.1177/0272989x19883631        1
https://doi.org/10.3390/molecules20046237       1
https://doi.org/10.1177/0300060515613223        1
https://doi.org/10.1186/s13395-017-0139-5       1
https://doi.org/10.1212/wnl.0000000000004570    1
https://doi.org/10.1089/hum.2013.210            1
https://doi.org/10.1096/fj.201802488r           1
https://doi.org/10.1016/j.nmd.2017.10.004       1
https://doi.org/10.1016/j.nmd.2017.06.557       1
https://doi.org/10.1016/j.nmd.2012.05.002       1
https://doi.org/10.1186/s13395-019-0207-0       1
https://doi.org/10.1038/nbt.4148                1
Number of unique papers without a DOI: 2986


6. Checking the number of unique papers with a specific **title** and the number of unique papers without a title.

In [9]:
print('Number of unique papers without a title: ' + str(df_unique_papers['title'].isna().sum()))
df_unique_papers['title'].value_counts().loc[lambda x : x >= 2].to_frame()

Number of unique papers without a title: 8


Unnamed: 0,title
An Intrusion Detection Model for Wireless Sensor Networks With an Improved V-Detector Algorithm,36
Governing nonprofit platform ecosystems – an information platform for refugees,14
Absolute measurement of the resonance lines in heliumlike vanadium on an electron-beam ion trap,11
A Two-Layer Dimension Reduction and Two-Tier Classification Model for Anomaly-Based Intrusion Detection in IoT Backbone Networks,9
Creating the European Literary Text Collection (ELTeC): Challenges and Perspectives,9
...,...
Stark broadening of spectral lines along the isoelectronic sequence of Li,2
"A semi-automated, KNIME-based workflow for biofilm assays",2
Stark broadening of resonance transitions in B III,2
دور الحكومة الإلكترونية في الحد من ظاهرة الفساد الإداري دراسة استطلاعية لآراء العاملين على الخدمات الإلكترونية في الإدارات العامة في لبنان,2


In [10]:
print('Number of unique papers without a month: ' + str(df_unique_papers['month'].isna().sum()))
print(df_unique_papers['month'].value_counts())

Number of unique papers without a month: 4312
1               1867
9                853
10               806
7                802
8                797
5                704
6                696
12               666
3                663
4                646
11               610
2                571
October 2020       1
0                  1
Name: month, dtype: int64


In [11]:
print(md_df['year_number'].value_counts(dropna=False))

NameError: name 'md_df' is not defined

In [None]:
field_df = md_df[['paper','field_label']].drop_duplicates()

pd.set_option("max_rows", None)
field_df[['field_label']].value_counts(dropna=False)

sns.countplot(y='field_label', data=field_df)

In [None]:
df = pd.DataFrame()
nsfg['nbrnaliv'].replace([98,99], np.nan, inplace=True)

df['paper'] = md_df['paper'].astype('str')
df['field'] = md_df['field_label'].astype('category')
df['DOI'] = md_df['DOI'].astype('str')
df['title'] = md_df['title'].astype('str')
df['author'] = md_df['name'].astype('str')
df['orcid'] = md_df['id'].astype('str')
df['month'] = md_df['month_number'].fillna(0.0).astype('int')
df['year'] = md_df['year_number'].fillna(0.0).astype('int')
df['url'] = md_df['paper_url'].astype('str')
df['venue'] = md_df['venue_label'].astype('str')