In [2]:
import os
import re
import glob
import tempfile
from zipfile import ZipFile

import polars as pl
from rapidfuzz import process, fuzz
from pathlib import Path
from tqdm import tqdm
import multiprocessing

from functools import lru_cache
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
ProgressBar().register()


pl.Config.set_fmt_str_lengths(100);



### IRIS DFs

In [4]:
df_iris_master = pl.read_csv('./data/iris-data-2024-03-14/ODS_L1_IR_ITEM_MASTER_ALL.csv', columns=['ITEM_ID', 'YEAR_PUBLISHED', 'TITLE'] ,dtypes={'ITEM_ID': pl.Int32, 'YEAR_PUBLISHED': pl.Utf8, 'TITLE': pl.Utf8})
df_iris_identifier = pl.read_csv('./data/iris-data-2024-03-14/ODS_L1_IR_ITEM_IDENTIFIER.csv', columns=['ITEM_ID', 'IDE_DOI', 'IDE_ISBN', 'IDE_PMID'] ,dtypes={'ITEM_ID': pl.Int32, 'IDE_DOI': pl.Utf8, 'IDE_ISBN': pl.Utf8, 'IDE_PMID': pl.Utf8})

df = df_iris_identifier.join(df_iris_master, on='ITEM_ID', how='inner')


df_filtered = df.filter(pl.col('IDE_DOI').is_not_null() | pl.col('IDE_ISBN').is_not_null() | pl.col('IDE_PMID').is_not_null())[
    ['ITEM_ID', 'IDE_DOI', 'IDE_ISBN', 'IDE_PMID']
]

df_filtered.head()

ITEM_ID,IDE_DOI,IDE_ISBN,IDE_PMID
i32,str,str,str
2766,"""10.14411/fp.2008.015""",,"""18666414"""
3349,"""10.1016/j.econlet.2011.12.096""",,
3867,,"""9788876655746""",
4342,,"""8821724038""",
4585,,"""9788861328112""",


---
### Sanitize IRIS DOIs and ISBNs

In [99]:
dois = df_filtered.select('ITEM_ID', 'IDE_DOI').drop_nulls('IDE_DOI').unique()

filtered_dois = (
    dois
    .with_columns(('doi:'+pl.col('IDE_DOI').str.extract(r'(10\.\d{4,}\/[^,\s;]*)').str.to_lowercase()).alias('id'))
    .drop_nulls('id')
    .drop('IDE_DOI')
    .rename({'ITEM_ID': 'iris_id'})
)

filtered_dois

iris_id,id
i32,str
62195,"""doi:10.1111/j.1743-3150.2004.00491.x"""
210842,"""doi:10.1177/0363546514538958"""
359878,"""doi:10.1093/aob/mcy043"""
173788,"""doi:10.1039/c3cp50891a"""
70450,"""doi:10.1379/csc-98r1.1"""
…,…
190048,"""doi:10.1189/jlb.3hi0713-406rr"""
326688,"""doi:10.1016/j.sleep.2017.08.021"""
689691,"""doi:10.1016/s0027-5107(03)00057-5"""
418216,"""doi:10.1080/15548627.2018.1507438"""


In [100]:
isbns = df_filtered.select('ITEM_ID', 'IDE_ISBN').drop_nulls('IDE_ISBN').unique()

filtered_isbns = (
    isbns
    .with_columns(
        ('isbn:'+pl.col('IDE_ISBN').str.extract_all(r'(ISBN[-]*(1[03])*[ ]*(: ){0,1})*(([0-9Xx][- ]*){13}|([0-9Xx][- ]*){10})').list.first().str.replace_all(r'[- ]', '').str.to_lowercase()).alias('id')
        #('doi:'+pl.col('IDE_DOI')).alias('id')
    )
    .drop_nulls('id')
    .drop('IDE_ISBN')
    .rename({'ITEM_ID': 'iris_id'})
)

filtered_isbns

iris_id,id
i32,str
121474,"""isbn:9788878472525"""
136016,"""isbn:9788890389528"""
122246,"""isbn:1579583903"""
92488,"""isbn:9788890296505"""
270570,"""isbn:9781450337397"""
…,…
154792,"""isbn:9789066051393"""
638939,"""isbn:9788815294227"""
167708,"""isbn:9788860450937"""
767135,"""isbn:9783110781168"""


In [101]:
pmids = df_filtered.select('ITEM_ID', 'IDE_PMID').drop_nulls('IDE_PMID').unique()

filtered_pmids = (
    pmids
    .filter(
        ~pl.col('IDE_PMID').str.contains('PMC')
        )
    .with_columns(('pmid:'+pl.col('IDE_PMID').str.extract(r'0*([1-9][0-9]{1,8})', 1).str.to_lowercase()).alias('id'))
    .drop('IDE_PMID')
    .rename({'ITEM_ID': 'iris_id'})
)

filtered_pmids

iris_id,id
i32,str
614586,"""pmid:23271369"""
331414,"""pmid:29249005"""
242031,"""pmid:27072626"""
725023,"""pmid:34818229"""
236215,"""pmid:25693761"""
…,…
117299,"""pmid:19523975"""
324350,"""pmid:29285324"""
734503,"""pmid:8779139"""
142552,"""pmid:20724077"""


##### old

In [13]:
dois = df_filtered.select('IDE_DOI').drop_nulls().unique()['IDE_DOI'].to_list()
print(len(dois), 'unique dois')
dois_q = df_filtered.select('ITEM_ID', 'IDE_DOI').drop_nulls('IDE_DOI').filter(pl.col('IDE_DOI').str.starts_with('10.1007/jhep02(2013)043'))

#filter and normalize the dois
doi_rule = re.compile(r'10\.\d{4,}\/[^,\s;]*')
not_doi = []
filtered_dois = []

for doi in dois:
    match = doi_rule.search(doi)
    if match:
        filtered_dois.append('doi:' + match.group())
    else:
        not_doi.append(doi)

print(len(filtered_dois), 'unique dois after filtering. ', len(not_doi), 'not dois discarded')

print('---'*10)

isbns = df_filtered.select('IDE_ISBN').drop_nulls().unique()['IDE_ISBN'].to_list()
print(len(isbns), 'unique isbns')

#filter and normalize the isbns
isbn_rule = re.compile(r'(ISBN[-]*(1[03])*[ ]*(: ){0,1})*(([0-9Xx][- ]*){13}|([0-9Xx][- ]*){10})') # ??? results to check
not_isbn = []
filtered_isbns = []

for isbn in isbns:
    if isbn_rule.search(isbn) is not None:
        filtered_isbns.append('isbn:' + isbn.replace('-', '').replace(' ', ''))
    else:
        not_isbn.append(isbn)

print(len(filtered_isbns), 'unique isbns after filtering. ', len(not_isbn), 'not isbns discarded')

print('---'*10)

#filter and normalize the pmids
pmids = df_filtered.select('IDE_PMID').drop_nulls().unique()['IDE_PMID'].to_list()
pmid_rule = re.compile(r'(?!0\d{0,7})\d{1,8}')
not_pmids = []
filtered_pmids = []

for pmid in pmids:
    match = pmid_rule.search(pmid)
    if pmid_rule.search(pmid) is not None and 'PMC' not in pmid: # questo da sottolineare in documentazione
        filtered_pmids.append('pmid:' + match.group())
    else:
        not_pmids.append(pmid)

print(len(pmids), 'unique pmids')
print(len(filtered_pmids), 'unique pmids after filtering. ', len(not_pmids), 'not pmids discarded')

dois_isbns_pmids = filtered_dois + filtered_isbns + filtered_pmids
dois_isbns_pmids = [id.lower() for id in dois_isbns_pmids]

print('==='*10)
print(len(dois_isbns_pmids), 'total identifiers')

130077 unique dois
129954 unique dois after filtering.  123 not dois discarded
------------------------------
49982 unique isbns
49405 unique isbns after filtering.  577 not isbns discarded
------------------------------
45687 unique pmids
45683 unique pmids after filtering.  4 not pmids discarded
225042 total identifiers


In [54]:
'doi:10.1002/widm.1511' in s
jhep02(2013)043

True

---
### Meta to Parquet

In [94]:
meta_path = Path('data/csv_openalex.zip')

In [104]:
def process_meta_zip(zip_path):
    zip_file = ZipFile(zip_path)
    files_list = [zipfile for zipfile in zip_file.namelist() if zipfile.endswith('.csv')]
    output_dir = "data/iris_in_meta_join"

    dois_isbns_pmids_lf = dois_isbns_pmids_df.lazy()

    for csv_file in tqdm(files_list):
        with zip_file.open(csv_file, 'r') as file:
            with tempfile.NamedTemporaryFile() as tf:
                tf.write(file.read()) # Write the csv file to the temporary file
                tf.seek(0)          # Start at the beginning of the temporary file
                os.makedirs(output_dir, exist_ok=True)
                df = (
                    pl.scan_csv(tf.name)
                    .select(['id', 'title', 'type'])
                    .with_columns(
                        #(pl.col('id').str.extract(r"((?:isbn):[^\s]+)")).alias('isbn'),
                        (pl.col('id').str.extract(r"(omid:[^\s]+)")).alias('omid'),
                        # if it has a doi we get the doi, otherwise we get the isbn - the right approach?
                        (pl.col('id').str.extract(r"((?:doi):[^\s\"]+)")).alias('doi'),
                        (pl.col('id').str.extract(r"((?:pmid):[^\s\"]+)")).alias('pmid'),
                        (pl.col('id').str.extract(r"((?:isbn):[^\s\"]+)")).alias('isbn'),
                    )
                    .with_columns(
                        pl.coalesce([pl.col('doi'), pl.col('isbn'), pl.col('pmid')]).alias('id')
                    )
                    .drop(['doi', 'pmid', 'isbn'])
                    .drop_nulls('id')
                    .join(dois_isbns_pmids_lf, on='id', how='inner')
                    .collect(streaming=True)
                )

                if not df.is_empty():
                    df.write_parquet(os.path.join(output_dir, os.path.basename(csv_file).replace('.csv', '.parquet')))


process_meta_zip(meta_path) # 35mins

100%|██████████| 28248/28248 [13:14<00:00, 35.55it/s]


### Title similarity

#### sparql

In [5]:
iris_noid_titles = (
    df
    .select('ITEM_ID', 'IDE_DOI', 'IDE_ISBN', 'IDE_PMID', 'TITLE')
    .filter(
        (pl.col('IDE_DOI').is_null() & pl.col('IDE_ISBN').is_null() & pl.col('IDE_PMID').is_null()),
    )
    .drop('IDE_DOI', 'IDE_ISBN', 'IDE_PMID')
)#['TITLE'].to_list()

len(iris_noid_titles)
#iris_noid_titles_clean = [tile.replace('"', "'") for title in iris_noid_titles if title is not None and len(title.split()) > 3]
#len(iris_noid_titles_clean)
iris_noid_titles

ITEM_ID,TITLE
i32,str
4837,"""Mutational spectrum of SHOX gene in 25 Italian pediatric patients with Lèri-Weill dyschondrosteosis…"
5279,"""Operational Fiscal and Monetary Policy with Staggered Wage and Price Dynamics"""
6196,"""Olivier Agard – Kracauer: le chiffonier mélancolique"""
5444,"""At the time of diagnosis, Ph cells from both chronic phase chronic myeloid leukemia and acute lymph…"
6457,"""A strategy to reduce the count of moment conditions in panel data GMM"""
…,…
724333,"""Dal pater familias alla multigenitorialità: cinquanta anni del diritto di famiglia italiano"""
724369,"""Capitolo quarantaduesimo Gli enti pubblici"""
724125,"""The Accelerated Access Initiative to Quality Formal Education for Syrian Refugee Children (AAI). Le…"
724571,"""La Costituzione di Dobbs v. Jackson Women’s Health Organization. Un’analisi critica"""


In [11]:
from string import Template
from SPARQLWrapper import SPARQLWrapper, CSV, JSON
from SPARQLWrapper.SPARQLExceptions import QueryBadFormed
from urllib.error import HTTPError
from requests import get


sparql = SPARQLWrapper("https://test.opencitations.net/meta/sparql")

findings = []

def get_type(doi):
    HTTP_HEADERS = {"authorization": "8c0f10ec-f033-4e81-a4ec-818a0232c1f8"}
    API_CALL = "https://w3id.org/oc/meta/api/v1/metadata/{}"

    response = get(API_CALL.format('doi:'+doi), headers=HTTP_HEADERS)

    return response.json()[0]['type']

titles_df = pl.DataFrame()


for iris_id, title in tqdm(iris_noid_titles[140:].iter_rows(), total=len(iris_noid_titles[140:])):
    title = title.replace('\r', ' ').replace('\n', '').replace('"', "'")
    if len(title.split()) < 3:
        continue
    try:
        sparql.setQuery(f"""
                        PREFIX datacite: <http://purl.org/spar/datacite/>
                        PREFIX dcterms: <http://purl.org/dc/terms/>
                        PREFIX literal: <http://www.essepuntato.it/2010/06/literalreification/>
                        PREFIX fabio: <http://purl.org/spar/fabio/>
                        SELECT ?entity ?doi ?type
                        WHERE {{
                            ?entity dcterms:title "{title}" ;
                                a ?type.
                            ?entity datacite:hasIdentifier ?identifier.
                            ?identifier datacite:usesIdentifierScheme datacite:doi.
                            ?identifier literal:hasLiteralValue ?doi.
                        FILTER (?type != fabio:Expression)
                        }}""")
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()

        if results["results"]["bindings"]:
            print(title, results["results"]["bindings"])
            for result in results["results"]["bindings"]:
                entity = result["entity"]["value"]
                doi = result["doi"]["value"]
                type = result["type"]["value"]
                findings.append({'title': title, 'omid': entity.replace("https://w3id.org/oc/meta/", 'omid:'), 'id': "doi:"+doi, 'type': get_type(doi), 'iris_id': iris_id})

    except (QueryBadFormed, HTTPError) as e:
        continue

  0%|          | 0/103341 [00:00<?, ?it/s]

  0%|          | 8/103341 [00:01<4:01:26,  7.13it/s]

Hybrid Modelling Of A Car Driveline For Servo-Actuated Gear Shift [{'entity': {'type': 'uri', 'value': 'https://w3id.org/oc/meta/br/061403332327'}, 'doi': {'type': 'literal', 'value': '10.1109/isie.2005.1528916'}, 'type': {'type': 'uri', 'value': 'http://purl.org/spar/fabio/ProceedingsPaper'}}]


  0%|          | 51/103341 [00:08<4:34:13,  6.28it/s] 


KeyboardInterrupt: 

In [7]:
findings

[{'title': 'Hybrid Modelling Of A Car Driveline For Servo-Actuated Gear Shift',
  'omid': 'omid:br/061403332327',
  'id': 'doi:10.1109/isie.2005.1528916',
  'type': 'proceedings article',
  'iris_id': 55620}]

In [8]:
findings_df = pl.DataFrame(findings)
findings_df

title,omid,id,type,iris_id
str,str,str,str,i64
"""Hybrid Modelling Of A Car Driveline For Servo-Actuated Gear Shift""","""omid:br/061403332327""","""doi:10.1109/isie.2005.1528916""","""proceedings article""",55620


In [10]:
lf_iim = pl.scan_parquet('data/iris_in_meta/*.parquet')
lf_iim.head().collect()

id,title,type,omid,iris_id
str,str,str,str,i32
"""doi:10.1002/widm.1511""","""A White Paper On Good Research Practices In Benchmarking: The Case Of Cluster Analysis""","""journal article""","""omid:br/06510108547""",776472
"""doi:10.1093/annonc/mdx033""","""European Cancer Mortality Predictions For The Year 2017, With Focus On Lung Cancer""","""journal article""","""omid:br/062503328173""",388298
"""doi:10.1016/j.jmatprotec.2017.10.019""","""High Throughput Hybrid Laser Assisted Machining Of Sintered Reaction Bonded Silicon Nitride""","""journal article""","""omid:br/062503328812""",320811
"""doi:10.1093/annonc/mdw306""","""Global Trends And Predictions In Ovarian Cancer Mortality""","""journal article""","""omid:br/062503328364""",632727
"""doi:10.1093/annonc/mdw430""","""Short, Full-Dose Adjuvant Chemotherapy (CT) In High-Risk Adult Soft Tissue Sarcomas (STS): Long-Ter…","""journal article""","""omid:br/062503328314""",301915


#### dask

In [91]:
iris_noid_titles = (
    df[['IDE_DOI', 'IDE_ISBN', 'TITLE']]
    .filter(
        pl.col('IDE_DOI').is_null() & pl.col('IDE_ISBN').is_null()
    )
)['TITLE'].to_list()

In [45]:
len(iris_noid_titles)

105327

In [9]:
zip_file = ZipFile('data/csv_openalex.zip')
csvs = ['zip://'+n for n in zip_file.namelist() if n.endswith('.csv')]

In [13]:
ddf = dd.read_csv(csvs, storage_options={'fo': zip_file.filename}, usecols=['title'])

In [30]:
ddf.partitions[0].head()

[########################################] | 100% Completed | 101.88 ms


Unnamed: 0,title
0,Expression Of Cyclin-Dependent Kinases Inhibit...
1,Effects Of Treatment Of Rats With Indole-3-Car...
2,Parathyroid Hormone-Related Peptide And Cardio...
3,Oral Immunisation Of Chickens Using Cholera To...
4,"Influence Of Stage Classification, Tumor Diffe..."


In [None]:
import numpy as np


def compare_title_similarity(title, list_):
    res = process.extractOne(title, list_, scorer=fuzz.token_sort_ratio, score_cutoff=70)
    return title, res


def find_fuzzy(list_from_user, list_from_DB, score_cutoff: int):
    score_matrix = process.cdist(
        list_from_user,
        list_from_DB,
        processor=str.lower,
        scorer=fuzz.ratio,
        dtype=np.uint8,  # Output the score as uint8, which is faster.
        workers=-1,  # Use multithreading. -1 means use all cores.
        score_cutoff=score_cutoff,
    )

    results = []
    user_indices, db_indices = np.nonzero(score_matrix)
    for user_index_of_match, db_index_of_match in zip(user_indices, db_indices):
        results.append(
            {
                "user_index_of_match": user_index_of_match,
                "db_index_of_match": db_index_of_match,
                "user_item_of_match": list_from_user[user_index_of_match],
                "db_item_of_match": list_from_DB[db_index_of_match],
                "score_of_match": score_matrix[user_index_of_match, db_index_of_match],
            }
        )
    return results


In [48]:
#def starts_with(df):


title_ddf = ddf.partitions[0].title.str.startswith('Expression').compute()
title_ddf

AttributeError: 'Series' object has no attribute 'query'

In [158]:
lf_iim_null = pl.scan_parquet('data/iris_in_meta_no_id/*.parquet')

#lf_iim_null.select(pl.len()).collect().head()
#lf_iim_null.collect().head(20)

lf_iim_null = (
    lf_iim_null
    .select('title')
    .filter(
        (pl.col('title') == "").not_()
    )
)

lf_iim_null.select(pl.len()).collect()

len
u32
12352039


In [177]:
116007 + 12352039

12468046

In [159]:
for title in iris_noid_titles:
    matches = process.extract(title, lf_iim_null['title'].to_list(), scorer=fuzz.token_sort_ratio, limit=1)
    if matches:
        print(title, matches[0])

TypeError: 'LazyFrame' object is not subscriptable (aside from slicing)

Use `select()` or `filter()` instead.

In [27]:
def compare_title_similarity(title):
    res = process.extractOne(title, l, scorer=fuzz.token_sort_ratio, score_cutoff=70)
    return title, res

In [5]:


lf_im_null_res = (
    lf_im_null
    .select(['title'])
    .with_columns(
        (pl.col('title').map_elements(compare_title_similarity, return_dtype=pl.Int64)).alias('title_similarity')
    )
    .drop_nulls('title_similarity')
)

NameError: name 'lf_im_null' is not defined

In [11]:
lf_im_null_res = (
    lf_im_null
    .select(['title'])
    .filter(
        pl.col('title') != ""
    )
)

#lf_im_null.select(pl.len()).collect().head()
l = lf_im_null_res.collect()['title']

In [12]:
type(l)

polars.series.series.Series

In [17]:
len(noid_titles_list)

105327

In [22]:
from tqdm.contrib.concurrent import process_map


core_num = multiprocessing.cpu_count()

r = process_map(compare_title_similarity, noid_titles_list[:10], max_workers=core_num, chunksize=1)

print(r)


  0%|          | 0/10 [00:00<?, ?it/s]

[('Applications of CeCl3.7H2O-NaI System Towards the Formation of Heterocyclic Structures', ('[The Interferon System: Structure, Biology, Applications].', 58.33333333333333, 4574416)), ('Radioattività naturale delle materie prime ceramiche e relative condizioni di esposizione occupazionale nell’industria', None), ('Gli istituti generali di semplificazione: la conferenza di servizi e le sue trasformazioni', ('Transformation Consistent Self-ensembling Model For Semi-supervised Medical Image Segmentation', 51.08695652173913, 5867090)), ("Commento all’art.8 d.lgs. 154/2004 - Procedimenti ai sensi dell'articolo 88 del Trattato istitutivo della Comunitaà europea", None), ('2005: agricoltura centro o fine del mondo?', ('Agricultural Policy Monitoring And Evaluation 2020', 54.347826086956516, 5804189)), ('Aspetti emotivi e comportamentali dell’improvvisazione jazzistica: il punto di vista del musicista', None), ('Phaseguide Structures for Pipette Actuated Laminar Flow Based Selective Sample Re

In [23]:
r

[('Applications of CeCl3.7H2O-NaI System Towards the Formation of Heterocyclic Structures',
  ('[The Interferon System: Structure, Biology, Applications].',
   58.33333333333333,
   4574416)),
 ('Radioattività naturale delle materie prime ceramiche e relative condizioni di esposizione occupazionale nell’industria',
  None),
 ('Gli istituti generali di semplificazione: la conferenza di servizi e le sue trasformazioni',
  ('Transformation Consistent Self-ensembling Model For Semi-supervised Medical Image Segmentation',
   51.08695652173913,
   5867090)),
 ("Commento all’art.8 d.lgs. 154/2004 - Procedimenti ai sensi dell'articolo 88 del Trattato istitutivo della Comunitaà europea",
  None),
 ('2005: agricoltura centro o fine del mondo?',
  ('Agricultural Policy Monitoring And Evaluation 2020',
   54.347826086956516,
   5804189)),
 ('Aspetti emotivi e comportamentali dell’improvvisazione jazzistica: il punto di vista del musicista',
  None),
 ('Phaseguide Structures for Pipette Actuated La

In [20]:
noid_titles_list[:10]

['Applications of CeCl3.7H2O-NaI System Towards the Formation of Heterocyclic Structures',
 'Radioattività naturale delle materie prime ceramiche e relative condizioni di esposizione occupazionale nell’industria',
 'Gli istituti generali di semplificazione: la conferenza di servizi e le sue trasformazioni',
 "Commento all’art.8 d.lgs. 154/2004 - Procedimenti ai sensi dell'articolo 88 del Trattato istitutivo della Comunitaà europea",
 '2005: agricoltura centro o fine del mondo?',
 'Aspetti emotivi e comportamentali dell’improvvisazione jazzistica: il punto di vista del musicista',
 'Phaseguide Structures for Pipette Actuated Laminar Flow Based Selective Sample Recovery',
 'Studio del ciclo biologico di Myxobolus lentisuturalis (Myxozoa, Myxobolidae), parassita di Carassius auratus auratus.',
 'The Other Side of the Timing Equation: a Result of Clock Faults',
 'Ritratto di Augusto capite velato']

In [19]:
l[4574416]

'[The Interferon System: Structure, Biology, Applications].'

### RQ1. What is the coverage of the publications available in IRIS (strictly concerning research conducted within the University of Bologna) in OpenCitations Meta?

In [16]:
lf_iim = pl.scan_parquet('data/iris_in_meta/*.parquet')
#lf_iim.head().collect()
print(lf_iim.select(pl.len()).collect())
print(lf_iim.filter(pl.col('id').str.contains('doi')).select(pl.len()).collect().item())
print(lf_iim.filter(pl.col('id').str.contains('pmid')).select(pl.len()).collect().item())
print(lf_iim.filter(pl.col('id').str.contains('isbn')).select(pl.len()).collect().item())

shape: (1, 1)
┌────────┐
│ len    │
│ ---    │
│ u32    │
╞════════╡
│ 117764 │
└────────┘
115776
1078
910


In [17]:
lf_iim.head().collect()

id,title,type,omid,iris_id
str,str,str,str,i32
"""doi:10.1002/widm.1511""","""A White Paper On Good Research Practices In Benchmarking: The Case Of Cluster Analysis""","""journal article""","""omid:br/06510108547""",776472
"""doi:10.1093/annonc/mdx033""","""European Cancer Mortality Predictions For The Year 2017, With Focus On Lung Cancer""","""journal article""","""omid:br/062503328173""",388298
"""doi:10.1016/j.jmatprotec.2017.10.019""","""High Throughput Hybrid Laser Assisted Machining Of Sintered Reaction Bonded Silicon Nitride""","""journal article""","""omid:br/062503328812""",320811
"""doi:10.1093/annonc/mdw306""","""Global Trends And Predictions In Ovarian Cancer Mortality""","""journal article""","""omid:br/062503328364""",632727
"""doi:10.1093/annonc/mdw430""","""Short, Full-Dose Adjuvant Chemotherapy (CT) In High-Risk Adult Soft Tissue Sarcomas (STS): Long-Ter…","""journal article""","""omid:br/062503328314""",301915


In [19]:
doi_join = lf_iim.filter(pl.col('id').str.contains('doi')).select('id').collect()['id'].to_list()

In [35]:
'doi:10.1007/jhep02(2013)043' in s

False

In [56]:
len(doi_join), len(doi_old)

# difference
dif = set(doi_old).difference(set(doi_join))
dif

{'doi:10.1007/jhep02(2013)043',
 'doi:10.1109/jestie.2022.3165985',
 'doi:10.1109/cseet.2017.37',
 'doi:10.1103/physrevlett.122.132003',
 'doi:10.1115/fedsm2014-21581',
 'doi:10.1007/jhep08(2020)047',
 'doi:10.1128/aac.01586-13',
 'doi:10.1109/tns.2006.874546',
 'doi:10.1007/jhep05(2021)093',
 'doi:10.1109/tdei.2007.4339484',
 'doi:10.1142/s0217751x15300227',
 'doi:10.1016/s0140-6736(94)92859-2',
 'doi:10.3280/fr2013-001006',
 'doi:10.1007/jhep07(2021)167',
 'doi:10.3167/cont.2020.080205',
 'doi:10.1142/s0219519405001631',
 'doi:10.1007/jhep09(2012)041',
 'doi:10.1158/0008-5472.can-09-2548',
 'doi:10.1016/b978-0-12-811865-8.00009-x',
 'doi:10.1103/physrevb.100.174416',
 'doi:10.1109/ursigass.2011.6050536',
 'doi:10.1056/nejmoa1402551',
 'doi:10.23736/s2784-8671.21.06928-5',
 'doi:10.1128/aac.01291-12',
 'doi:10.17660/actahortic.2009.831.16',
 'doi:10.1109/ijcb48548.2020.9304903',
 'doi:10.1109/cipls.2014.7007157',
 'doi:10.4230/lipics.concur.2019.37',
 'doi:10.1109/aicas.2019.8771527',

In [108]:
lf_iim.head(10).collect()

id,title,type,omid,iris_id
str,str,str,str,i32
"""doi:10.1002/widm.1511""","""A White Paper On Good Research Practices In Benchmarking: The Case Of Cluster Analysis""","""journal article""","""omid:br/06510108547""",776472
"""doi:10.1093/annonc/mdw430""","""Short, Full-Dose Adjuvant Chemotherapy (CT) In High-Risk Adult Soft Tissue Sarcomas (STS): Long-Ter…","""journal article""","""omid:br/062503328314""",301915
"""doi:10.1093/annonc/mdw618""","""Dietary Acrylamide And The Risk Of Pancreatic Cancer In The International Pancreatic Cancer Case–Co…","""journal article""","""omid:br/062503328269""",388290
"""doi:10.1093/annonc/mdx033""","""European Cancer Mortality Predictions For The Year 2017, With Focus On Lung Cancer""","""journal article""","""omid:br/062503328173""",388298
"""doi:10.1016/j.jmatprotec.2017.10.019""","""High Throughput Hybrid Laser Assisted Machining Of Sintered Reaction Bonded Silicon Nitride""","""journal article""","""omid:br/062503328812""",320811
"""doi:10.1093/annonc/mdw306""","""Global Trends And Predictions In Ovarian Cancer Mortality""","""journal article""","""omid:br/062503328364""",632727
"""doi:10.1093/annonc/mdw649""","""Nivolumab-induced Cholangitic Liver Disease: A Novel Form Of Serious Liver Injury""","""journal article""","""omid:br/062503328245""",295455
"""doi:10.1002/sim.6576""","""Exact Optimum Coin Bias In Efron's Randomization Procedure""","""journal article""","""omid:br/0690744489""",207606
"""doi:10.1057/s41310-020-00071-4""","""Is The Market Surprised By The Surprise?""","""journal article""","""omid:br/0690104274""",450807
"""doi:10.1002/jmor.10182""","""Immunocytochemical And Autoradiographic Studies On The Process Of Keratinization In Avian Epidermis…","""journal article""","""omid:br/0690102756""",67863


In [124]:
lf_iim = pl.scan_parquet('data/iris_in_meta_pmid/*.parquet')
#lf_iim.head().collect()
lf_iim.select(pl.len()).collect()
print(lf_iim.filter(pl.col('id').str.contains('doi')).select(pl.len()).collect().item())
print(lf_iim.filter(pl.col('id').str.contains('pmid')).select(pl.len()).collect().item())
print(lf_iim.filter(pl.col('id').str.contains('isbn')).select(pl.len()).collect().item())

115065
1076
357


In [131]:
lf_iim.head().collect()

id,title,type,omid
str,str,str,str
"""doi:10.1002/widm.1511""","""A White Paper On Good Research Practices In Benchmarking: The Case Of Cluster Analysis""","""journal article""","""omid:br/06510108547"""
"""doi:10.1016/j.jmatprotec.2017.10.019""","""High Throughput Hybrid Laser Assisted Machining Of Sintered Reaction Bonded Silicon Nitride""","""journal article""","""omid:br/062503328812"""
"""doi:10.1093/annonc/mdw618""","""Dietary Acrylamide And The Risk Of Pancreatic Cancer In The International Pancreatic Cancer Case–Co…","""journal article""","""omid:br/062503328269"""
"""doi:10.1093/annonc/mdw430""","""Short, Full-Dose Adjuvant Chemotherapy (CT) In High-Risk Adult Soft Tissue Sarcomas (STS): Long-Ter…","""journal article""","""omid:br/062503328314"""
"""doi:10.1093/annonc/mdw649""","""Nivolumab-induced Cholangitic Liver Disease: A Novel Form Of Serious Liver Injury""","""journal article""","""omid:br/062503328245"""


In [51]:
lf_iim = pl.scan_parquet('data/iris_in_meta/*.parquet')
lf_iim.head().collect()
lf_iim.select(pl.len()).collect()
print(lf_iim.filter(pl.col('id').str.contains('doi')).select(pl.len()).collect().item())
print(lf_iim.filter(pl.col('id').str.contains('pmid')).select(pl.len()).collect().item())
print(lf_iim.filter(pl.col('id').str.contains('isbn')).select(pl.len()).collect().item())
doi_old = lf_iim.filter(pl.col('id').str.contains('doi')).select('id').collect()['id'].to_list()

115065
1095
418


In [53]:
lf_iim.filter(pl.col('id') == 'doi:10.1007/jhep02(2013)043').collect()

id,title,type,omid
str,str,str,str
"""doi:10.1007/jhep02(2013)043""","""First Evidence For The Annihilation Decay Mode $ {B^{+}}\to D_s^{+}\phi $""","""journal article""","""omid:br/06301833411"""


In [758]:
join = lf_iim.join(dois_isbns_pmids_df.lazy(), on='id', how='inner')
join.select(pl.len()).collect()

len
u32
93159


In [341]:
lf_iim.head(10).collect()

id,title,type,omid
str,str,str,str
"""doi:10.1002/widm.1511""","""A White Paper On Good Research Practices In Benchmarking: The Case Of Cluster Analysis""","""journal article""","""omid:br/06510108547"""
"""doi:10.1016/j.jmatprotec.2017.10.019""","""High Throughput Hybrid Laser Assisted Machining Of Sintered Reaction Bonded Silicon Nitride""","""journal article""","""omid:br/062503328812"""
"""doi:10.1093/annonc/mdw618""","""Dietary Acrylamide And The Risk Of Pancreatic Cancer In The International Pancreatic Cancer Case–Co…","""journal article""","""omid:br/062503328269"""
"""doi:10.1093/annonc/mdw430""","""Short, Full-Dose Adjuvant Chemotherapy (CT) In High-Risk Adult Soft Tissue Sarcomas (STS): Long-Ter…","""journal article""","""omid:br/062503328314"""
"""doi:10.1093/annonc/mdw649""","""Nivolumab-induced Cholangitic Liver Disease: A Novel Form Of Serious Liver Injury""","""journal article""","""omid:br/062503328245"""
"""doi:10.1093/annonc/mdx033""","""European Cancer Mortality Predictions For The Year 2017, With Focus On Lung Cancer""","""journal article""","""omid:br/062503328173"""
"""doi:10.1093/annonc/mdw306""","""Global Trends And Predictions In Ovarian Cancer Mortality""","""journal article""","""omid:br/062503328364"""
"""doi:10.1002/sim.6576""","""Exact Optimum Coin Bias In Efron's Randomization Procedure""","""journal article""","""omid:br/0690744489"""
"""doi:10.1057/s41310-020-00071-4""","""Is The Market Surprised By The Surprise?""","""journal article""","""omid:br/0690104274"""
"""doi:10.1002/anie.202008644""","""Tailoring Spectral And Photochemical Properties Of Bioinspired Retinal Mimics By In Silico Engineer…","""journal article""","""omid:br/0690109790"""


In [347]:
dois_isbns_pmids_df.head()

iris_id,id
i32,str
60422,"""10.1127/0935-1221/2006/0018-0223"""
608164,"""10.1039/d1nr06053h"""
685347,"""10.1097/MEG.0000000000002208"""
177192,"""10.1002/anie.201308459"""
288610,"""10.1001/jama.2010.1910"""


In [328]:
lf_iim.head().collect()

id,title,type,omid
str,str,str,str
"""doi:10.1002/widm.1511""","""A White Paper On Good Research Practices In Benchmarking: The Case Of Cluster Analysis""","""journal article""","""omid:br/06510108547"""
"""doi:10.1016/j.jmatprotec.2017.10.019""","""High Throughput Hybrid Laser Assisted Machining Of Sintered Reaction Bonded Silicon Nitride""","""journal article""","""omid:br/062503328812"""
"""doi:10.1093/annonc/mdw618""","""Dietary Acrylamide And The Risk Of Pancreatic Cancer In The International Pancreatic Cancer Case–Co…","""journal article""","""omid:br/062503328269"""
"""doi:10.1093/annonc/mdw430""","""Short, Full-Dose Adjuvant Chemotherapy (CT) In High-Risk Adult Soft Tissue Sarcomas (STS): Long-Ter…","""journal article""","""omid:br/062503328314"""
"""doi:10.1093/annonc/mdw649""","""Nivolumab-induced Cholangitic Liver Disease: A Novel Form Of Serious Liver Injury""","""journal article""","""omid:br/062503328245"""


In [None]:
#116578 

In [107]:
rq1 = lf_iim.select(pl.len()).collect()

print(rq1.item(), 'articoli provenienti da IRIS su', len(dois_isbns_pmids), 'sono presenti in OM')
print("{:.1f}%".format(rq1.item() / len(dois_isbns_pmids) * 100))

117764 articoli provenienti da IRIS su 225042 sono presenti in OM
52.3%


In [103]:
ids_list = (
    lf_iim
    .select('id')
)

ids_list.head().collect()

id
str
"""doi:10.1002/widm.1511"""
"""doi:10.1016/j.jmatprotec.2017.10.019"""
"""doi:10.1093/annonc/mdw618"""
"""doi:10.1093/annonc/mdw430"""
"""doi:10.1093/annonc/mdw649"""


### RQ2. Which are the types of publications that are better covered in OpenCitations Meta?

In [150]:
rq2 = lf_iim.group_by('type').len().sort('len', descending=True).with_columns(pl.col('type').str.replace(r"^$", 'no type')).collect()

print(rq2)

shape: (16, 2)
┌─────────────────────┬────────┐
│ type                ┆ len    │
│ ---                 ┆ ---    │
│ str                 ┆ u32    │
╞═════════════════════╪════════╡
│ journal article     ┆ 103900 │
│ proceedings article ┆ 5584   │
│ book chapter        ┆ 4473   │
│ no type             ┆ 1424   │
│ book                ┆ 917    │
│ …                   ┆ …      │
│ dataset             ┆ 6      │
│ dissertation        ┆ 2      │
│ book series         ┆ 1      │
│ series              ┆ 1      │
│ computer program    ┆ 1      │
└─────────────────────┴────────┘


### Index to Parquet

In [15]:
omids_list = (
    lf_iim
    .select('omid')
    .collect()
)['omid'].to_list()

len(omids_list)

116007

In [16]:
index_path = Path('data/oc_index')
file_names = [Path(index_path / archive) for archive in os.listdir(index_path)]

In [27]:
c = 0
for archive in tqdm(file_names):
    print(archive)
    zip_file = ZipFile(archive)

    csvs = ['zip://'+n for n in zip_file.namelist() if n.endswith('.csv')]

    c += len(csvs)
    ddf = dd.read_csv(csvs, storage_options={'fo': zip_file.filename}, usecols=['id', 'citing', 'cited'])
    ddf = ddf[ddf['cited'].isin(omids_list) | ddf['citing'].isin(omids_list)]
    ddf.to_parquet('data/index_in_iris/' + zip_file.filename + '.parquet', write_index=False) # 154 mins

  0%|          | 0/25 [00:00<?, ?it/s]

data/oc_index/2023_08_03-05T_0_1.zip
[########################################] | 100% Completed | 10m 6ss


  4%|▍         | 1/25 [10:06<4:02:40, 606.67s/it]

data/oc_index/2023_11_25T_0_1.zip
[########################################] | 100% Completed | 38.91 s


  8%|▊         | 2/25 [10:46<1:44:40, 273.07s/it]

data/oc_index/2023_08_20-23T_0_1.zip
[########################################] | 100% Completed | 236.61 s


 12%|█▏        | 3/25 [14:43<1:34:04, 256.56s/it]

data/oc_index/2023_06_05-07T_0_1.zip
[########################################] | 100% Completed | 162.50 s


 16%|█▌        | 4/25 [17:26<1:16:51, 219.57s/it]

data/oc_index/2023_07_13T_0_1.zip
[########################################] | 100% Completed | 80.52 s


 20%|██        | 5/25 [18:46<56:31, 169.55s/it]  

data/oc_index/2023_05_09-14T_0_1.zip
[########################################] | 100% Completed | 10m 43s


 24%|██▍       | 6/25 [29:30<1:44:47, 330.90s/it]

data/oc_index/2023_08_14-16T_0_1.zip
[########################################] | 100% Completed | 461.42 s


 28%|██▊       | 7/25 [37:12<1:52:06, 373.69s/it]

data/oc_index/2023_06_15-18T_0_1.zip
[########################################] | 100% Completed | 485.30 s


 32%|███▏      | 8/25 [45:18<1:55:58, 409.32s/it]

data/oc_index/2023_07-08_31-02T_0_1.zip
[########################################] | 100% Completed | 549.97 s


 36%|███▌      | 9/25 [54:28<2:00:54, 453.38s/it]

data/oc_index/2023_07_17-20T_0_1.zip
[########################################] | 100% Completed | 10m 11s


 40%|████      | 10/25 [1:04:40<2:05:33, 502.26s/it]

data/oc_index/2023_06_04-07T_0_1.zip
[########################################] | 100% Completed | 70.17 s


 44%|████▍     | 11/25 [1:05:50<1:26:21, 370.12s/it]

data/oc_index/2023_09_27T_0_1.zip
[########################################] | 100% Completed | 103.17 ms


 48%|████▊     | 12/25 [1:05:51<55:49, 257.63s/it]  

data/oc_index/2023_05_18-20T_0_1.zip
[########################################] | 100% Completed | 237.38 s


 52%|█████▏    | 13/25 [1:09:48<50:18, 251.58s/it]

data/oc_index/2023_08_08-11T_0_1.zip
[########################################] | 100% Completed | 10m 5ss


 56%|█████▌    | 14/25 [1:19:54<1:05:44, 358.55s/it]

data/oc_index/2023_07_26-29T_0_1.zip
[########################################] | 100% Completed | 10m 55s


 60%|██████    | 15/25 [1:30:50<1:14:42, 448.29s/it]

data/oc_index/2023_05_20-21T_0_1.zip
[########################################] | 100% Completed | 57.86 s


 64%|██████▍   | 16/25 [1:31:49<49:37, 330.86s/it]  

data/oc_index/2023_07_05-08T_0_1.zip
[########################################] | 100% Completed | 472.23 s


 68%|██████▊   | 17/25 [1:39:41<49:47, 373.47s/it]

data/oc_index/2023_08_06-08T_0_1.zip
[########################################] | 100% Completed | 540.19 s


 72%|███████▏  | 18/25 [1:48:42<49:25, 423.65s/it]

data/oc_index/2023_07_24-26T_0_1.zip
[########################################] | 100% Completed | 496.67 s


 76%|███████▌  | 19/25 [1:56:59<44:34, 445.69s/it]

data/oc_index/2023_10_24-28T_0_1.zip
[########################################] | 100% Completed | 491.88 s


 80%|████████  | 20/25 [2:05:11<38:19, 459.86s/it]

data/oc_index/2023_05_22-31T_0_1.zip
[########################################] | 100% Completed | 118.35 s


 84%|████████▍ | 21/25 [2:07:10<23:49, 357.44s/it]

data/oc_index/2023_06_19-27T_0_1.zip
[########################################] | 100% Completed | 19m 2ss


 88%|████████▊ | 22/25 [2:26:12<29:39, 593.01s/it]

data/oc_index/2023_08_17-19T_0_1.zip
[########################################] | 100% Completed | 196.92 s


 92%|█████████▏| 23/25 [2:29:30<15:48, 474.25s/it]

data/oc_index/2023_08_24-27T_0_1.zip
[########################################] | 100% Completed | 37.13 s


 96%|█████████▌| 24/25 [2:30:07<05:43, 343.18s/it]

data/oc_index/2023_06_13-14T_0_1-10.zip
[########################################] | 100% Completed | 248.34 s


100%|██████████| 25/25 [2:34:16<00:00, 370.25s/it]


### RQ 3. What is the amount of citations (according to OpenCitations Index) included in the IRIS publications that are involved in OpenCitations Meta (as citing entity and as cited entity)?

In [24]:
iii_path = Path('data/index_in_iris/')

iii_glob = glob.glob(str(iii_path / '*' / '*.parquet'))

In [25]:
lf_iii = pl.scan_parquet(iii_glob)
print(lf_iii.select(pl.len()).collect())
lf_iii.head().collect()

shape: (1, 1)
┌─────────┐
│ len     │
│ ---     │
│ u32     │
╞═════════╡
│ 7859226 │
└─────────┘


id,citing,cited
str,str,str
"""oci:062102010944-06403538172""","""omid:br/062102010944""","""omid:br/06403538172"""
"""oci:062102010936-061901873468""","""omid:br/062102010936""","""omid:br/061901873468"""
"""oci:061302445980-0620582911""","""omid:br/061302445980""","""omid:br/0620582911"""
"""oci:062103078814-061903865447""","""omid:br/062103078814""","""omid:br/061903865447"""
"""oci:06403471424-06703722896""","""omid:br/06403471424""","""omid:br/06703722896"""


In [23]:
oc_omids_list = (
    lf_iim
    .select('omid')
    .collect()
)['omid'].to_list()

len(oc_omids_list)

110931

In [46]:
rq3a = (
    lf_iii
    .select('citing')
    .unique()
    .filter(
        pl.col('citing').is_in(oc_omids_list)
    )
)

citing_list = rq3a.collect()['citing'].to_list()

rq3a.head().collect()
rq3a.select(pl.len()).collect()

len
u32
107704


In [47]:
rq3b = (
    lf_iii
    .select('cited')
    .unique()
    .filter(
        pl.col('cited').is_in(oc_omids_list)
    )
)

cited_list = rq3b.collect()['cited'].to_list()

rq3b.head().collect()
rq3b.select(pl.len()).collect()

len
u32
105135


In [79]:
cit_coming_from_iris = (
    lf_iii
    .filter(
        pl.col('citing').is_in(oc_omids_list)
    )
)

print(cit_coming_from_iris.select(pl.len()).collect())

shape: (1, 1)
┌─────────┐
│ len     │
│ ---     │
│ u32     │
╞═════════╡
│ 4295973 │
└─────────┘


### RQ 4. How many of these citations come from and go to publications that are not included in IRIS?

In [52]:
len(oc_omids_list)

116007

In [26]:
rq4a = (
    lf_iii
    .select('citing')
    #.unique('cited') ???
    .filter(
        ~pl.col('citing').is_in(oc_omids_list)
    )
)

In [27]:
print(rq4a.select(pl.len()).collect())
rq4a.head().collect()

shape: (1, 1)
┌─────────┐
│ len     │
│ ---     │
│ u32     │
╞═════════╡
│ 3830647 │
└─────────┘


citing
str
"""omid:br/062102010944"""
"""omid:br/062102010936"""
"""omid:br/061302445980"""
"""omid:br/062103078814"""
"""omid:br/0660213594"""


In [None]:
rq4b = (
    lf_iii
    .select('cited')
    .filter(
        ~pl.col('cited').is_in(oc_omids_list)
    )
)

In [116]:
print(rq4b.select(pl.len()).collect())
rq4b.head().collect()

shape: (1, 1)
┌─────────┐
│ len     │
│ ---     │
│ u32     │
╞═════════╡
│ 3939554 │
└─────────┘


cited
str
"""omid:br/06703722896"""
"""omid:br/06703722887"""
"""omid:br/06703722545"""
"""omid:br/06703723039"""
"""omid:br/06803732245"""


In [162]:
rq4 = (
    lf_iii
    .select('cited', 'citing')
    .filter(
        ~pl.col('cited').is_in(oc_omids_list),
        ~pl.col('citing').is_in(oc_omids_list)
    )
)

In [164]:
print(rq4.select(pl.len()).collect())

shape: (1, 1)
┌─────┐
│ len │
│ --- │
│ u32 │
╞═════╡
│ 0   │
└─────┘


### RQ 5. How many of these citations involve publications in IRIS as both citing and cited entities?

In [129]:
rq5a = (
    lf_iii
    .select('citing')
    .filter(
        pl.col('citing').is_in(oc_omids_list)
    )
)

In [130]:
print(rq5a.select(pl.len()).collect())
rq5a.head().collect()

shape: (1, 1)
┌─────────┐
│ len     │
│ ---     │
│ u32     │
╞═════════╡
│ 4295973 │
└─────────┘


citing
str
"""omid:br/06403471424"""
"""omid:br/06403471424"""
"""omid:br/06403471424"""
"""omid:br/06403471424"""
"""omid:br/06403471424"""


In [121]:
rq5b = (
    lf_iii
    .select('cited')
    .filter(
        pl.col('cited').is_in(oc_omids_list)
    )
)

In [122]:
print(rq5b.select(pl.len()).collect())
rq5b.head().collect()

shape: (1, 1)
┌─────────┐
│ len     │
│ ---     │
│ u32     │
╞═════════╡
│ 3919672 │
└─────────┘


cited
str
"""omid:br/06403538172"""
"""omid:br/061901873468"""
"""omid:br/0620582911"""
"""omid:br/061903865447"""
"""omid:br/06502545532"""


In [133]:
rq5 = (
    lf_iii
    .select('citing', 'cited')
    .filter(
        pl.col('citing').is_in(oc_omids_list) & pl.col('cited').is_in(oc_omids_list)
    )
)

In [134]:
print(rq5.select(pl.len()).collect())
rq5.head().collect()

shape: (1, 1)
┌────────┐
│ len    │
│ ---    │
│ u32    │
╞════════╡
│ 356419 │
└────────┘


citing,cited
str,str
"""omid:br/06801833332""","""omid:br/06301833698"""
"""omid:br/06801833332""","""omid:br/06201835146"""
"""omid:br/06801833332""","""omid:br/06101850825"""
"""omid:br/06140651081""","""omid:br/061301899117"""
"""omid:br/061902028230""","""omid:br/06230928593"""


---

### Further usage

In [10]:
from src.read_meta import read_meta
from src.read_iris import read_iris

iim_df = read_meta('data/iris_in_meta')
iim_df.collect()

id,title,type,omid,iris_id
str,str,str,str,i32
"""doi:10.1002/wi…","""A White Paper …","""journal articl…","""omid:br/065101…",776472
"""doi:10.1093/an…","""European Cance…","""journal articl…","""omid:br/062503…",388298
"""doi:10.1016/j.…","""High Throughpu…","""journal articl…","""omid:br/062503…",320811
"""doi:10.1093/an…","""Global Trends …","""journal articl…","""omid:br/062503…",632727
"""doi:10.1093/an…","""Short, Full-Do…","""journal articl…","""omid:br/062503…",301915
…,…,…,…,…
"""doi:10.1002/si…","""Relational Eve…","""journal articl…","""omid:br/069074…",311099
"""doi:10.1016/j.…","""Effects Of New…","""journal articl…","""omid:br/069074…",54217
"""doi:10.1016/j.…","""Nonhypoellipti…","""journal articl…","""omid:br/069074…",130283
"""doi:10.1109/41…","""Matrix Convert…","""journal articl…","""omid:br/069074…",643741


In [8]:
iris_df

ITEM_ID,IDE_DOI,IDE_ISBN,IDE_PMID
i32,str,str,str
2766,"""10.14411/fp.20…",,"""18666414"""
3349,"""10.1016/j.econ…",,
3867,,"""9788876655746""",
4342,,"""8821724038""",
4585,,"""9788861328112""",
…,…,…,…
724609,"""10.1007/s11244…",,"""36405974"""
724546,,"""9788859622994""",
724623,"""10.3390/vetsci…",,"""36669046"""
724249,,"""9788898392117""",
