In [5]:
import glob
import os
import sys
import tempfile
from pathlib import Path
from zipfile import ZipFile

from dask.diagnostics import ProgressBar
import dask.dataframe as dd
from tqdm.auto import tqdm

import polars as pl
pl.Config.set_fmt_str_lengths(100)
pl.Config.set_tbl_rows(10)

sys.path.append('../src')
from iris import read_iris


from dask.diagnostics import ProgressBar
ProgressBar().register()

  from pandas.core import (


## Read Iris

In [6]:
iris_path = Path('/run/media/leo/b9827f00-f8da-4458-bc94-97b28f1a0989/iris-data-2025-05-30.zip')

We read the Master and Identifier tables of the Iris dataset and we join them by the ITEM_ID. We then filter this dataframe to keep only the entries that have at least a non-null DOI, ISBN or PMID. 

We also keep only the OWNING_COLLECTION column to denote the labels of the types of the entries, because of computational efficiency

In [7]:
df_iris_master = pl.read_csv(ZipFile(iris_path).open("POSTPROCESS-iris-data-2025-05-27/ODS_L1_IR_ITEM_MASTER_ALL.csv").read())
df_iris_identifier = pl.read_csv(ZipFile(iris_path).open("POSTPROCESS-iris-data-2025-05-27/ODS_L1_IR_ITEM_IDENTIFIER.csv").read(), columns=['ITEM_ID', 'IDE_DOI', 'IDE_ISBN', 'IDE_PMID'], ignore_errors=True, schema_overrides={'ITEM_ID': pl.Int64, 'IDE_DOI': pl.Utf8, 'IDE_ISBN': pl.Utf8, 'IDE_PMID': pl.Utf8})
df_iris_relation = pl.read_csv(ZipFile(iris_path).open('POSTPROCESS-iris-data-2025-05-27/ODS_L1_IR_ITEM_RELATION.csv').read(), columns=['ITEM_ID', 'REL_ISPARTOFBOOK', 'REL_ISPARTOFJOURNAL'],
                                infer_schema_length=None)
df_iris_description = pl.read_csv(ZipFile(iris_path).open('POSTPROCESS-iris-data-2025-05-27/ODS_L1_IR_ITEM_DESCRIPTION.csv').read(), columns=['ITEM_ID', 'DES_ALLPEOPLE', 'DES_NUMBEROFAUTHORS'])
df_iris_publisher = pl.read_csv(ZipFile(iris_path).open('POSTPROCESS-iris-data-2025-05-27/ODS_L1_IR_ITEM_PUBLISHER.csv').read(), columns=['ITEM_ID', 'PUB_NAME', 'PUB_PLACE', 'PUB_COUNTRY'])


df = df_iris_identifier.join(df_iris_master, on='ITEM_ID', how='inner')#.join(df_iris_relation, on='ITEM_ID', how='inner')
#df_authors = df.join(df_iris_description, on='ITEM_ID', how='inner')



# a df where for each br at least one of the identifiers is not null
df_filtered = df.filter(pl.col('IDE_DOI').is_not_null() | pl.col('IDE_ISBN').is_not_null() | pl.col('IDE_PMID').is_not_null())[
    ['ITEM_ID', 'IDE_DOI', 'IDE_ISBN', 'IDE_PMID', 'OWNING_COLLECTION']#, 'REL_ISPARTOFBOOK', 'REL_ISPARTOFJOURNAL']
]

df_noid =  df.filter(pl.col('IDE_DOI').is_null() & pl.col('IDE_ISBN').is_null() & pl.col('IDE_PMID').is_null())
print('original len: ', df.__len__(), '| len after filtering out entities without PIDs: ', df_filtered.__len__())
df_filtered.head()

original len:  402166 | len after filtering out entities without PIDs:  263133


ITEM_ID,IDE_DOI,IDE_ISBN,IDE_PMID,OWNING_COLLECTION
i64,str,str,str,i64
60479,,"""8883125150""",,50
82956,,"""88.387.3686.3; 88.387.3687.1""",,57
70006,"""10.1441/13328""",,,35
73478,"""10.2110/palo.2005.p05-020r""",,,35
81464,,""" 0769525881""",,57


We see that 3 entries do not have a type assigned to them:

In [8]:
df_iris_master.filter(pl.col('OWNING_COLLECTION').is_null())

ITEM_ID,DATE_ISSUED_YEAR,TITLE,OWNING_COLLECTION,OWNING_COLLECTION_DES
i64,i64,str,i64,str
813996,2023,"""Rare predicted loss-of-function variants of type I IFN immunity genes are associated with life-threa…",,
843497,2024,"""Quantifying Generalizations: Exploring the Divide Between Human and LLMs' Sensitivity to Quantificat…",,
718686,9999,"""Culture, Fashion, and Society Notebook""",,


## Extract the list of PIDs

We extract a list of all the PIDs for each entry of the filtered IRIS dataframe by extracting the values of the columns `IDE_DOI`, `IDE_ISBN`, and `IDE_PMID`. These ids are normalized and the rotten ones are filtered out. Finally, they are all stored in a single list that we'll use to filter the Meta dump.

In [9]:
dois = df_filtered.select('ITEM_ID', 'IDE_DOI', 'OWNING_COLLECTION').drop_nulls('IDE_DOI')

filtered_dois = (
    dois
    .with_columns(('doi:'+pl.col('IDE_DOI').str.extract(r'(10\.\d{4,}\/[^,\s;]*)').str.to_lowercase()).alias('id'))
    .drop_nulls('id')
    .drop('IDE_DOI')
    .rename({'ITEM_ID': 'iris_id'})
)

print(f'Final DOIs count: {filtered_dois.shape[0]}. Removed {dois.shape[0] - filtered_dois.shape[0]} DOIs without a valid DOI.')

Final DOIs count: 183908. Removed 238 DOIs without a valid DOI.


In [10]:
pmids = df_filtered.select('ITEM_ID', 'IDE_PMID', 'OWNING_COLLECTION').drop_nulls('IDE_PMID')

filtered_pmids = (
    pmids
    .filter(
        ~pl.col('IDE_PMID').str.contains('PMC')
        )
    .with_columns(('pmid:'+pl.col('IDE_PMID').str.extract(r'0*([1-9][0-9]{1,8})', 1).str.to_lowercase()).alias('id'))
    .drop_nulls('id')
    .drop('IDE_PMID')
    .rename({'ITEM_ID': 'iris_id'})
)

print(f'Final PMIDs count: {filtered_pmids.shape[0]}. Removed {pmids.shape[0] - filtered_pmids.shape[0]} PMIDs without a valid PMID.')

Final PMIDs count: 59815. Removed 6 PMIDs without a valid PMID.


In [11]:
isbns = df_filtered.select('ITEM_ID', 'IDE_ISBN', 'OWNING_COLLECTION').drop_nulls('IDE_ISBN')

filtered_isbns = (
    isbns
    .with_columns(
        ('isbn:'+pl.col('IDE_ISBN').str.extract_all(r'(ISBN[-]*(1[03])*[ ]*(: ){0,1})*(([0-9Xx][- ]*){13}|([0-9Xx][- ]*){10})').list.first().str.replace_all(r'[- ]', '').str.to_lowercase()).alias('id')
    )
    .drop_nulls('id')
    .drop('IDE_ISBN')
    .rename({'ITEM_ID': 'iris_id'})
)

print(f'Final ISBNs count: {filtered_isbns.shape[0]}. Removed {isbns.shape[0] - filtered_isbns.shape[0]} ISBNs without a valid ISBN.')

Final ISBNs count: 42849. Removed 50644 ISBNs without a valid ISBN.


In [12]:
dois_pmids_isbns_list = pl.concat([filtered_dois, filtered_pmids, filtered_isbns]).rename({'OWNING_COLLECTION': 'iris_type'})

print('number of total PIDs:', len(dois_pmids_isbns_list))
dois_pmids_isbns_list.head()

number of total PIDs: 286572


iris_id,iris_type,id
i64,i64,str
70006,35,"""doi:10.1441/13328"""
73478,35,"""doi:10.2110/palo.2005.p05-020r"""
157248,35,"""doi:10.1007/jhep07(2012)133"""
155300,40,"""doi:10.1016/s0021-9290(12)70228-2"""
156226,35,"""doi:10.1016/j.compstruct.2011.10.026"""


In [13]:
dois_pmids_isbns_list.filter(pl.col('id').is_null())

iris_id,iris_type,id
i64,i64,str


In [14]:
dois_pmids_isbns_list.group_by(pl.col('id').str.slice(0, 4)).agg(pl.count('id').alias('count')).sort('count').head()

id,count
str,u32
"""isbn""",42849
"""pmid""",59815
"""doi:""",183908


#### Examples of invalid PIDs

In [15]:
# find dois that contain uppercase letters
isbns.filter(pl.col('IDE_ISBN').str.contains(r'[A-Z]')).head()

ITEM_ID,IDE_ISBN,OWNING_COLLECTION
i64,str,i64
77859,"""9,78888E+12""",57
157294,"""9,78275E+12""",61
157237,"""9,78888E+12""",57
156509,"""9,78162E+12""",59
156810,"""9,78888E+12""",41


In [16]:
filtered_isbns.filter(pl.col('id').str.contains(r'888809556x')).head()

iris_id,OWNING_COLLECTION,id
i64,i64,str
55139,49,"""isbn:888809556x"""
79343,49,"""isbn:888809556x"""


In [17]:
# dois that got filtered out because invalid
doi_anti = dois.join(filtered_dois, left_on='ITEM_ID', right_on='iris_id', how='anti')
doi_anti

ITEM_ID,IDE_DOI,OWNING_COLLECTION
i64,str,i64
8576,"""WOS:000340333900012""",35
192899,"""10.193/infdis/jiu617""",35
174463,"""10.746/-fei-XI-02-13_11""",35
176326,"""10.978.886760/1448""",41
178153,"""10.978.886633/1223""",50
…,…,…
8575,"""WOS:000335369200043""",35
8579,"""2,99749E+11""",35
10144,"""10.978.88490/03376""",41
10211,"""p.2013.12.005""",35


In [18]:
pmid_anti = pmids.join(filtered_pmids, left_on='ITEM_ID', right_on='iris_id', how='anti')
pmid_anti

ITEM_ID,IDE_PMID,OWNING_COLLECTION
i64,str,i64
554064,"""PMC7261988""",35
516464,"""PMC7372170""",35
369655,"""PMC6186108""",35
153706,"""PMC 4874964""",35
380644,"""PMC2206475""",38
926697,"""4""",35


In [19]:
isbn_anti = isbns.join(filtered_isbns, left_on='ITEM_ID', right_on='iris_id', how='anti')
isbn_anti

ITEM_ID,IDE_ISBN,OWNING_COLLECTION
i64,str,i64
82956,"""88.387.3686.3; 88.387.3687.1""",57
77859,"""9,78888E+12""",57
157294,"""9,78275E+12""",61
157237,"""9,78888E+12""",57
156509,"""9,78162E+12""",59
…,…,…
940793,"""9,78889E+12""",49
940799,"""9,78291E+12""",47
940804,"""9,78291E+12""",47
941422,"""9,78887E+12""",41


---

## Deduplication

We now have a list of PIDs. However, we have lots of duplicates in the list, both from the iris dataset (e.g. an iris BR has multiple PIDs) and from the external ids (multiple iris BRs that share the same PID).

Let's deal with the former case first:

### 1. Deduplication of iris BRs

Our aim is to keep only a single PID for each iris BR. We will keep the DOI if it exists, otherwise the PMID, and finally the ISBN.

In [20]:
# number of BRs with multiple types of PIDs in the list
dois_pmids_isbns_list.filter(pl.col('iris_id').is_duplicated()).select(pl.len())

len
u32
136636


In [21]:
# example:

# take a look at the BR from IRIS with ID 360
dois_pmids_isbns_list.filter(pl.col('iris_id').is_duplicated()).filter(pl.col('iris_id') == 360).sort('iris_id')

iris_id,iris_type,id
i64,i64,str
360,41,"""doi:10.1007/0-387-30394-4_33"""
360,41,"""isbn:0387298118"""


In [22]:
# both the DOI and ISBN have been extracted from the BR, so it appears twice in the list of PIDs
df_filtered.filter(pl.col('ITEM_ID') == 360)

ITEM_ID,IDE_DOI,IDE_ISBN,IDE_PMID,OWNING_COLLECTION
i64,str,str,str,i64
360,"""10.1007/0-387-30394-4_33""","""0-387-29811-8""",,41


In [23]:
dpi_dupes_iris = dois_pmids_isbns_list.filter(pl.col("iris_id").is_duplicated()).sort("iris_id").with_columns(pl.col('iris_type'))#.replace_strict(type_dict))
dpi_dupes_iris_grouped = (dpi_dupes_iris.group_by('iris_id').len().sort('len'))
print('We found', dpi_dupes_iris_grouped.select(pl.len()).item(), 'iris entries that are tied to more than one external identifier')
dpi_dupes_iris_grouped.select(pl.col('len').value_counts()).unnest('len').rename({'len': '# of dupes', 'count': '# of ids'})

We found 68247 iris entries that are tied to more than one external identifier


# of dupes,# of ids
u32,u32
2,68105
3,142


The elements that share the same iris ID all have the same type, so there's no need to create heuristics to decide which one to keep:


In [24]:
same_type = dpi_dupes_iris.group_by("iris_id").agg([
    pl.col("iris_type").n_unique().alias("unique_count")
])
different_types = same_type.filter(pl.col("unique_count") != 1)
different_types

iris_id,unique_count
i64,u32


To resolve this issue, we keep only the first occurrence of each external id in the list. Given how the list has been constructed, this creates a hierarchical order of preference for the external ids: DOI > PMID > ISBN.

In [25]:
print('num. of ids:', dois_pmids_isbns_list.__len__())
dois_pmids_isbns_filtered = dois_pmids_isbns_list.unique('iris_id', keep='first', maintain_order=True)
print('num. of ids after unique id filtering:', dois_pmids_isbns_filtered.__len__())

num. of ids: 286572
num. of ids after unique id filtering: 218183


In [26]:
dois_pmids_isbns_filtered.group_by(pl.col('id').str.slice(0, 4)).agg(
    pl.len()
).sort('len', descending=True)

id,len
str,u32
"""doi:""",183908
"""isbn""",32034
"""pmid""",2241


### 2. filter out multiple iris entries with the same external id

Let's now deal with the duplicates from the external ids:

In [27]:
#examples:

pids_dupes_df = dois_pmids_isbns_filtered.filter(pl.col("id").is_duplicated()).sort("id").with_columns(pl.col('iris_type'))#.replace(type_dict))
print(pids_dupes_df.n_unique('id'), 'unique PIDs')
pids_dupes_df

# as you can see, the same PIDs are associated to different IRIS BRs

21351 unique PIDs


iris_id,iris_type,id
i64,i64,str
779900,35,"""doi:10.1001/archderm.130.4.522"""
780022,35,"""doi:10.1001/archderm.130.4.522"""
853731,35,"""doi:10.1001/archderm.130.4.522"""
779923,35,"""doi:10.1001/archderm.132.2.231"""
780060,35,"""doi:10.1001/archderm.132.2.231"""
…,…,…
735599,35,"""pmid:7692192"""
734698,35,"""pmid:7692192"""
733789,35,"""pmid:7692192"""
774883,35,"""pmid:9276009"""


In [28]:
pids_dupes_df_grouped = (pids_dupes_df.group_by('id').len().sort('len'))
print('We found', pids_dupes_df.select(pl.col('iris_id').unique().len()).item(), 'external ids that are tied to more than one iris entry')
print()
print('The external identifiers appear with the following distribution:')
pids_dupes_df_grouped.select(pl.col('len').value_counts()).unnest('len').rename({'len': '# of dupes', 'count': '# of ext. ids'})

We found 56543 external ids that are tied to more than one iris entry

The external identifiers appear with the following distribution:


# of dupes,# of ext. ids
u32,u32
2,15409
3,2936
4,1169
5,1060
6,267
…,…
50,1
51,1
59,1
66,1


As you can see from the table above, the majority of the external id duplicates are pairs, but we have cases in which the same external id is shared by up to 129 iris entries!! 

The vast majority of the duplicates seem to be coming from the ISBNs:

In [29]:
pids_dupes_df_grouped.with_columns(pl.col('id').str.extract(r'^(doi|pmid|isbn)').alias('id_type')).select(pl.col('id_type').value_counts()).unnest('id_type').sort('id_type', descending=False).rename({'id_type': 'id type', 'count': '# of ext. ids'})
#dpi_dupes_id_grouped

id type,# of ext. ids
str,u32
"""doi""",16049
"""isbn""",5191
"""pmid""",111


In [30]:
pids_dupes_df.with_columns(pl.col('id').str.extract(r'^(doi|pmid|isbn)').alias('id_type')).select(pl.col('id_type').value_counts()).unnest('id_type').rename({'id_type': 'id type', 'count': '# of duplicate ext. ids'})

id type,# of duplicate ext. ids
str,u32
"""isbn""",17583
"""doi""",38729
"""pmid""",231


We deal with deduplicating the three different types of id in three different ways:

##### DOI deduplication

In [31]:
doi_dupes_df = pids_dupes_df.filter(pl.col('id').str.starts_with('doi:'))
print(doi_dupes_df.__len__())
doi_dupes_df
#doi_metadata_search = df.join(doi_dupes_df, left_on='ITEM_ID', right_on='iris_id').join(df_iris_description, on='ITEM_ID', how='inner').join(df_iris_relation, on='ITEM_ID', how='inner').join(df_iris_publisher, on='ITEM_ID', how='inner')

38729


iris_id,iris_type,id
i64,i64,str
779900,35,"""doi:10.1001/archderm.130.4.522"""
780022,35,"""doi:10.1001/archderm.130.4.522"""
853731,35,"""doi:10.1001/archderm.130.4.522"""
779923,35,"""doi:10.1001/archderm.132.2.231"""
780060,35,"""doi:10.1001/archderm.132.2.231"""
…,…,…
809529,35,"""doi:10.7759/cureus.34804"""
6884,57,"""doi:10.7873/date.2014.155"""
7042,57,"""doi:10.7873/date.2014.155"""
542790,35,"""doi:10.9758/cpn.2020.18.4.484"""


For what concernes duplicates with the same type, we keep only the first one encountered.

In [32]:
keep_doi = doi_dupes_df.unique(['iris_type', 'id'], keep='first', maintain_order=True)
print(keep_doi.filter(pl.col('id').is_duplicated()).__len__(), 'duplicates remanining;', keep_doi.filter(pl.col('id').is_duplicated()).n_unique('id'), 'unique ids')

2521 duplicates remanining; 1234 unique ids


This leaves 140 duplicate elements (291 entities) with different types. Among each of these we try to pick what we consider the 'container' element by sorting each group of duplicates based on a priority mapping of the types of the elements as described in the IRIS dataset and finally picking the first one. 

We don't need to specify a priority for each and every type present in the dataset, as per our observations the most common source of problems are the ones tied to a restricted set of types.
These problems are also often not coherent among the dataset, so finding a container for each of them is not really feasible. In the harder cases we just keep the first one encountered.

In [33]:
def doi_heuristic(group):
    priority = {
        35: 1, 50: 2, 41: 3, 57: 4
    }

    sorted_iris_types = group.sort(pl.col("iris_type").replace_strict(priority, default=float('inf')))
    return sorted_iris_types.head(1)

keep_doi = keep_doi.group_by("id").map_groups(doi_heuristic)
print('number of doi duplicates remaining: ', keep_doi.filter(pl.col('id').is_duplicated()).__len__())

number of doi duplicates remaining:  0


In [34]:
doi_dupes_list = doi_dupes_df.select('iris_id')
drop_doi = doi_dupes_list.join(keep_doi, on='iris_id', how='anti')

Finally, we remove the dupicates DOIs from the original list of external ids.

In [35]:
dois_pmids_isbns_filtered = dois_pmids_isbns_filtered.join(drop_doi, on='iris_id', how='anti')
dois_pmids_isbns_filtered.__len__()

195503

#### PMID deduplication

In [36]:
pmid_dupes_df = pids_dupes_df.filter(pl.col('id').str.starts_with('pmid:'))
pmid_dupes_df

iris_id,iris_type,id
i64,i64,str
811923,35,"""pmid:10"""
679280,35,"""pmid:10"""
923510,35,"""pmid:10"""
936383,35,"""pmid:10"""
728542,35,"""pmid:14524611"""
…,…,…
735599,35,"""pmid:7692192"""
734698,35,"""pmid:7692192"""
733789,35,"""pmid:7692192"""
774883,35,"""pmid:9276009"""


In [37]:
pmid_dupes_df.group_by('iris_type').agg(pl.len())

iris_type,len
i64,u32
35,218
40,7
57,2
38,4


In [38]:
pmid_dupes_df.group_by(
    'id').agg(
        pl.len(),
        pl.col('iris_type').unique()
        ).sort('len', descending=True).filter(pl.col('iris_type').list.len() > 1)

# articles (35) are in all ambiguous duplicates

id,len,iris_type
str,u32,list[i64]
"""pmid:21341571""",2,"[35, 57]"
"""pmid:23749883""",2,"[35, 38]"
"""pmid:15614151""",2,"[35, 38]"
"""pmid:21273609""",2,"[35, 40]"
"""pmid:16437895""",2,"[35, 40]"
"""pmid:18560115""",2,"[35, 57]"


In [40]:
print(pmid_dupes_df.filter(pl.col('id').is_duplicated()).__len__(), 'duplicates remanining;', pmid_dupes_df.filter(pl.col('id').is_duplicated()).n_unique('id'), 'unique ids')
keep_pmid = pmid_dupes_df.unique(['iris_type', 'id'], keep='first', maintain_order=True)
print('----')
print(keep_pmid.filter(pl.col('id').is_duplicated()).__len__(), 'duplicates remanining;', keep_pmid.filter(pl.col('id').is_duplicated()).n_unique('id'), 'unique ids')

231 duplicates remanining; 111 unique ids
----
12 duplicates remanining; 6 unique ids


In [42]:
def pmid_heuristic(group):
    priority = {
        35: 1
    }

    sorted_iris_types = group.sort(pl.col("iris_type").replace_strict(priority, default=float('inf')))
    return sorted_iris_types.head(1)

keep_pmid = keep_pmid.group_by("id").map_groups(pmid_heuristic)
print('number of doi duplicates remaining: ', keep_pmid.filter(pl.col('id').is_duplicated()).__len__())

number of doi duplicates remaining:  0


In [43]:
pmid_dupes_list = pmid_dupes_df.select('iris_id')
drop_pmid = pmid_dupes_list.join(keep_pmid, on='iris_id', how='anti')

In [44]:
dois_pmids_isbns_filtered = dois_pmids_isbns_filtered.join(drop_pmid, on='iris_id', how='anti')
dois_pmids_isbns_filtered.__len__()

195383

#### Deduplicate ISBN

In [240]:
isbn_dupes_df = pids_dupes_df.filter(pl.col('id').str.starts_with('isbn:'))
print(isbn_dupes_df.__len__(), '| unique isbns: ', isbn_dupes_df.unique('id').__len__())

#isbn_search = df.join(doi_dupes_df, left_on='ITEM_ID', right_on='iris_id').join(df_iris_description, on='ITEM_ID', how='inner').join(df_iris_relation, on='ITEM_ID', how='inner').join(df_iris_publisher, on='ITEM_ID', how='inner')

#doi_search.sort('id').head(6)

17583 | unique isbns:  5191


We keep only the first occurrence of the entries with the same external id and type:

In [246]:
keep_isbn = isbn_dupes_df.unique(['iris_type', 'id'], keep='first', maintain_order=True)
print('the number of elements came down to', keep_isbn.__len__(), 'but there are still', keep_isbn.filter(pl.col('id').is_duplicated()).__len__(), 'duplicates')

the number of elements came down to 8326 but there are still 5693 duplicates


And then we pick the entries that have the same external id but different types based on a priority list. (note that, if not specified in the priority list, the heuristic will only keep the top-most entry like it has found it in the df.)

In [244]:
isbn_dupes_df.group_by(
    'id').agg(
        pl.len(),
        pl.col('iris_type').unique()
).sort('len', descending=True).filter(pl.col('iris_type').list.len() > 1)

id,len,iris_type
str,u32,list[i64]
"""isbn:9788812000326""",129,"[45, 50]"
"""isbn:8882947386""",66,"[41, 44]"
"""isbn:8843030086""",59,"[45, 50]"
"""isbn:8814124191""",51,"[41, 44, … 50]"
"""isbn:881414799x""",43,"[41, 50]"
…,…,…
"""isbn:8860260728""",2,"[50, 57]"
"""isbn:9783110228823""",2,"[41, 50]"
"""isbn:9788461736973""",2,"[57, 59]"
"""isbn:9783631733493""",2,"[41, 50]"


In [248]:
def isbn_heuristic(group):
    priority = {
        49: 1, 35: 2
    }

    sorted_iris_types = group.sort(pl.col("iris_type").replace_strict(priority, default=float('inf')))
    return sorted_iris_types.head(1)

keep_isbn = keep_isbn.group_by("id").map_groups(isbn_heuristic)
keep_isbn.__len__(), keep_isbn.filter(pl.col('id').is_duplicated()).__len__()

(5191, 0)

In [249]:
isbn_dupes_list = isbn_dupes_df.select('iris_id')
drop_isbn = isbn_dupes_list.join(keep_isbn, on='iris_id', how='anti')

In [250]:
dois_pmids_isbns_filtered = dois_pmids_isbns_filtered.join(drop_isbn, on='iris_id', how='anti')
dois_pmids_isbns_filtered.__len__()

182991

In [252]:
print('number of duplicates ext. ids: ', dois_pmids_isbns_filtered.filter(pl.col('id').is_duplicated()).__len__())

number of duplicates ext. ids:  0


## Create IIM

Now that we have a clean, deduplicated list of external ids, we can filter the Meta dump to keep only the entries that have at least one of these external ids. We then save this filtered dataframe to a parquet file.

In [None]:
meta_path = Path('../data/csv_openalex.zip')

In [None]:
dois_pmids_isbns_filtered

iris_id,iris_type,id
i64,i64,str
156671,41,"""doi:10.1688/9783866187337"""
148354,35,"""doi:10.1007/s00180-012-0319-z"""
146851,35,"""doi:10.1002/cmdc.201100471"""
147819,35,"""doi:10.1097/gme.0b013e318240fe3d"""
148141,57,"""doi:10.1109/aero.2012.6187311"""
…,…,…
684265,41,"""isbn:9781138935709"""
666452,41,"""isbn:9782753512344"""
708637,49,"""isbn:9788857553696"""
708664,41,"""isbn:9788822910172"""


In [None]:
def process_meta_zip(zip_path):
    zip_file = ZipFile(zip_path)
    files_list = [zipfile for zipfile in zip_file.namelist() if zipfile.endswith('.csv')]
    output_iim = Path("../data/iris_in_meta")

    #dois_pmids_isbns_lf = dois_pmids_isbns_list.lazy()
    dois_pmids_isbns_lf = dois_pmids_isbns_filtered.lazy()

    for csv_file in tqdm(files_list, desc="Processing Meta CSV files"):
        with zip_file.open(csv_file, 'r') as file:
            # kudos to https://vdavez.com/2024/01/how-to-use-scan_csv-with-a-file-like-object-in-polars/
            with tempfile.NamedTemporaryFile() as tf:
                tf.write(file.read())
                tf.seek(0)
                os.makedirs(output_iim, exist_ok=True)
                df = (
                    pl.scan_csv(tf.name)
                    .select(['id', 'title', 'type'])
                    .with_columns(
                        (pl.col('id').str.extract(r"(omid:[^\s]+)")).alias('omid'),
                        (pl.col('id').str.extract(r"((?:doi):[^\s\"]+)")).alias('doi'),
                        (pl.col('id').str.extract(r"((?:pmid):[^\s\"]+)")).alias('pmid'),
                        (pl.col('id').str.extract(r"((?:isbn):[^\s\"]+)")).alias('isbn'),
                    )
                    .with_columns(
                        pl.coalesce([pl.col('doi'), pl.col('pmid'), pl.col('isbn')]).alias('id')
                    )
                    .drop(['doi', 'pmid', 'isbn'])
                    .drop_nulls('id')
                    .join(dois_pmids_isbns_lf, on='id', how='inner')
                    .collect(streaming=True)
                )

            if not df.is_empty():
                df.write_parquet(os.path.join(output_iim, os.path.basename(csv_file).replace('.csv', '.parquet')))



process_meta_zip(meta_path)

Processing Meta CSV files:   0%|          | 0/28248 [00:00<?, ?it/s]

---

Analysing the Iris in Meta dataset, we see that another source of problems come from the Meta dataset itself, in which some of the entries have different OMID ids but share the same external id. Talking about this with pr. Peroni, we learned that this is in fact an issue in the Meta dump (that will be resolved in the near future).

In [None]:
lf_iim = pl.scan_parquet('../data/iris_in_meta/*.parquet')

lf_iim.select(pl.len()).collect()

len
u32
115083


In [None]:
#example of duplicates
lf_iim.filter(pl.col('id').str.contains('doi')).filter(pl.col('id').is_duplicated()).sort('id').head(6).collect(streaming=True)

id,title,meta_type,omid,iris_id,iris_type
str,str,str,str,i64,str


In [None]:
lf_iim = lf_iim_test#.filter(pl.col('iris_id').is_duplicated()).sort('iris_id').collect()

In [None]:
lf_iim_test.filter(pl.col('iris_id').is_duplicated()).sort('iris_id').collect()

id,title,type,omid,iris_id,iris_type
str,str,str,str,i64,i64
"""doi:10.1007/0-387-30394-4_33""","""The Teledoc2 Project: A Heterogeneous Infrastructure For International E-Learning""","""book chapter""","""omid:br/061503309586""",360,41
"""isbn:0387298118""","""Distributed Cooperative Laboratories: Networking, Instrumentation, And Measurements""","""book""","""omid:br/061503310035""",360,41
"""doi:10.1016/s0168-8278(10)61011-2""","""""","""""","""omid:br/06370122877""",2538,58
"""doi:10.1016/s0168-8278(10)61011-2""","""1010 Optib – A Multicenter Prospective Open Label Study On Tenofovir (Tdf) For Chronic Hepatitis B P…","""journal article""","""omid:br/06140390318""",2538,58
"""doi:10.1016/s2213-2600(14)70153-5""","""Non-invasive Positive Pressure Ventilation For The Treatment Of Severe Stable Chronic Obstructive Pu…","""journal article""","""omid:br/06804223745""",6791,35
…,…,…,…,…,…
"""isbn:9783031358968""","""Lecture Notes In Computer Science""","""book""","""omid:br/06330138899""",813390,57
"""doi:10.1007/978-3-031-12673-4_3""","""Foundation Models In Healthcare: Opportunities, Biases And Regulatory Prospects In Europe""","""book chapter""","""omid:br/06903455034""",814439,57
"""isbn:9783031126727""","""Electronic Government And The Information Systems Perspective""","""book""","""omid:br/06903455338""",814439,57
"""pmid:30575567""","""Psychometric Evaluation Of The Multidimensional Scale Of Perceived Social Support (MSPSS) In People …","""journal article""","""omid:br/06903912416""",816073,35


In [None]:
print(lf_iim.filter(pl.col('id').is_duplicated()).select(pl.len()).collect(streaming=True).item(), 'items are duplicates')

2121 items are duplicates


In [None]:
lf_iim.head().collect()

id,title,meta_type,omid,iris_id,iris_type
str,str,str,str,i64,str
"""doi:10.1007/s10334-004-0090-4""","""Versatile Coil Design And Positioning Of Transverse-Field RF Surface Coils For Clinical 1.5-T MRI Ap…","""journal article""","""omid:br/06101045684""",64745,"""1.01 Articolo in rivista"""
"""doi:10.1136/archdischild-2017-314663""","""Tricky Case Of Takayasu Arteritis In A Young Child Presenting With Heart Failure And Femoral Pulses""","""journal article""","""omid:br/061402023621""",349405,"""1.01 Articolo in rivista"""
"""doi:10.1016/j.jebo.2018.09.002""","""The Effect Of Experts’ Opinion On Prices Of Art Works: The Case Of Peter Brueghel The Younger""","""journal article""","""omid:br/06401411164""",624919,"""1.01 Articolo in rivista"""
"""doi:10.1126/sciadv.aar8195""","""The Baltic Sea As A Time Machine For The Future Coastal Ocean""","""journal article""","""omid:br/0620639739""",459905,"""1.01 Articolo in rivista"""
"""doi:10.1016/j.ccs.2011.11.008""","""The Rhetoric Of Cultural Policies And The Issue Of ‘Getting Things Done’: Bologna Cultural Capital 1…","""journal article""","""omid:br/062808746""",153851,"""1.01 Articolo in rivista"""


In [None]:
lf_iim.filter(pl.col('id').is_duplicated()).filter(pl.col('meta_type') == '').collect(streaming=True)

id,title,meta_type,omid,iris_id,iris_type
str,str,str,str,i64,str


Out of these duplicates, we see that all but 28 elements have the same type. Let us deal with these cases first:

In [None]:
same_id_multpile_type_count = lf_iim.filter(pl.col('id').is_duplicated()).collect(streaming=True).group_by("iris_id").agg([
    pl.col("meta_type").n_unique().alias("type_unique_count")
]).filter(pl.col("type_unique_count") != 1)
same_id_multpile_type_count.sort('type_unique_count')

iris_id,type_unique_count
i64,u32


Analyzing these 74 duplicates we see that there are a lot of entries with empty types and titles. Our first shot is to get rid of those entries.

In [None]:
different_omid_different_type = lf_iim.filter(pl.col('iris_id').is_in(same_id_multpile_type_count.get_column('iris_id'))).sort('id').collect(streaming=True)
different_omid_different_type

id,title,type,omid,iris_id,iris_type
str,str,str,str,i64,i64
"""doi:10.1007/jhep01(2019)113""","""New Axion Searches At Flavor Factories""","""report""","""omid:br/061903832621""",700389,35
"""doi:10.1007/jhep01(2019)113""","""New Axion Searches At Flavor Factories""","""journal article""","""omid:br/06601833597""",700389,35
"""doi:10.1007/jhep01(2019)186""","""Analytic Helicity Amplitudes For Two-Loop Five-Gluon Scattering: The Single-Minus Case""","""journal article""","""omid:br/06601833525""",560558,35
"""doi:10.1007/jhep01(2019)186""","""Analytic Helicity Amplitudes For Two-Loop Five-Gluon Scattering: The Single-Minus Case""","""""","""omid:br/06604294851""",560558,35
"""doi:10.1007/jhep09(2018)007""","""Search For Additional Neutral MSSM Higgs Bosons In The Τ Τ Final State In Proton-Proton Collisions A…","""""","""omid:br/06404321301""",391724,35
"""doi:10.1007/jhep09(2018)007""","""Search For Additional Neutral MSSM Higgs Bosons In The Τ Τ Final State In Proton-Proton Collisions A…","""journal article""","""omid:br/06701820208""",391724,35
"""doi:10.1007/jhep12(2018)019""","""Planar Master Integrals For The Two-Loop Light-Fermion Electroweak Corrections To Higgs Plus Jet Pro…","""journal article""","""omid:br/06701819583""",769982,35
"""doi:10.1007/jhep12(2018)019""","""Planar Master Integrals For The Two-Loop Light-Fermion Electroweak Corrections To Higgs Plus Jet Pro…","""""","""omid:br/06504258303""",769982,35
"""doi:10.1016/0277-5379(91)90071-k""","""""","""""","""omid:br/06510222862""",632062,35
"""doi:10.1016/0277-5379(91)90071-k""","""Smoking And Cancer With Emphasis On Europe.""","""journal article""","""omid:br/061503643868""",632062,35


In [None]:
to_drop = different_omid_different_type.filter(pl.col('type') == "")
print('bingo: the number of unique elements remains the same at', different_omid_different_type.join(to_drop, on='omid', how='anti').n_unique('id'), 'so we did not delete any unique value.')

bingo: the number of unique elements remains the same at 28 so we did not delete any unique value.


Analyzing the remaining entries, we see that there remain only 16 duplicates with different types.

In [None]:
different_omid_different_type2 = different_omid_different_type.join(to_drop, on='omid', how='anti')
same_id_multiple_type_count2 = different_omid_different_type2.filter(pl.col('id').is_duplicated()).group_by("iris_id").agg([
    pl.col("type").n_unique().alias("type_unique_count")
]).filter(pl.col("type_unique_count") != 1)
same_id_multiple_type_count2

iris_id,type_unique_count
i64,u32
74637,2
369184,2
134111,2
92438,2
666743,2
84288,2
700389,2
673035,2


In [None]:
different_omid_different_type2 = different_omid_different_type2.filter(pl.col('iris_id').is_in(same_id_multiple_type_count2.get_column('iris_id'))).sort('id')

We can see that 16 entries remain still. These have the same iris id, but different omid and most importantly different types. We decide to keep only the elements that align with the type of the iris dataset.

In [None]:
different_omid_different_type2.select(pl.col('iris_type').value_counts())

iris_type
struct[2]
"{35,4}"
"{41,10}"
"{42,2}"


In [None]:
dict_df = pl.DataFrame({
    "iris_type": [35, 41, 42],
    "type": ['journal article', 'book chapter', 'book chapter']
})

to_drop2 = different_omid_different_type2.join(dict_df, on=["iris_type", "type"], how='anti')
different_omid_different_type3 = different_omid_different_type2.join(to_drop2, on='omid', how='anti').filter(pl.col('id').is_duplicated()).sort('id')
different_omid_different_type3 # no duplicates !!

id,title,type,omid,iris_id,iris_type
str,str,str,str,i64,i64


In [None]:
to_drop = pl.concat([to_drop, to_drop2])
to_drop

id,title,type,omid,iris_id,iris_type
str,str,str,str,i64,i64
"""doi:10.1007/jhep01(2019)186""","""Analytic Helicity Amplitudes For Two-Loop Five-Gluon Scattering: The Single-Minus Case""","""""","""omid:br/06604294851""",560558,35
"""doi:10.1007/jhep09(2018)007""","""Search For Additional Neutral MSSM Higgs Bosons In The Τ Τ Final State In Proton-Proton Collisions A…","""""","""omid:br/06404321301""",391724,35
"""doi:10.1007/jhep12(2018)019""","""Planar Master Integrals For The Two-Loop Light-Fermion Electroweak Corrections To Higgs Plus Jet Pro…","""""","""omid:br/06504258303""",769982,35
"""doi:10.1016/0277-5379(91)90071-k""","""""","""""","""omid:br/06510222862""",632062,35
"""doi:10.1016/s0014-5793(97)01073-9""","""""","""""","""omid:br/06470234045""",660762,35
"""doi:10.1016/s0140-6736(06)68338-4""","""""","""""","""omid:br/06410213596""",113506,35
"""doi:10.1016/s0140-6736(16)31467-2""","""""","""""","""omid:br/06310328682""",287218,35
"""doi:10.1016/s0140-6736(17)32152-9""","""""","""""","""omid:br/06510324330""",336128,35
"""doi:10.1016/s0140-6736(18)32278-5""","""""","""""","""omid:br/061203943000""",405139,35
"""doi:10.1016/s0140-6736(18)32278-5""","""""","""""","""omid:br/0604396934""",405139,35


In [None]:
#remove the bunch of duplicates that have different types
lf_iim = lf_iim.join(to_drop.lazy(), on='omid', how='anti')

We can now merge all the entries that have the same external id and type. We will keep the first occurrence of each entry.

In [None]:
lf_iim = lf_iim.unique(['iris_type', 'id'], keep='first', maintain_order=True)
print(lf_iim.filter(pl.col('id').is_duplicated()).select(pl.len()).collect(streaming=True).item(), 'duplicates remanining')

0 duplicates remanining


In [None]:
lf_iim = lf_iim.unique('iris_id', keep='first')

In [None]:
lf_iim.filter(pl.col('iris_id').is_duplicated()).sort('iris_id').collect(streaming=True)

id,title,type,omid,iris_id,iris_type
str,str,str,str,i64,i64


In [None]:
lf_iim.select(pl.len()).collect()

len
u32
111510


---

## Create III

In [None]:
lf_iim = pl.scan_parquet('../data/iris_in_meta/iris_in_meta.parquet')

omids_list = (
    lf_iim
    .select('omid')
    .collect(streaming=True)
)['omid'].to_list()

len(omids_list)

115083

In [None]:
index_path = Path('../data/24356626')
#index_path = Path('/run/media/leo/0EAE24EE1463E70C/opencitations/index_dump/24356626')
output_dir = Path("../pp_data/iris_in_index")

In [None]:
file_names = [Path(index_path) / Path(archive) for archive in os.listdir(index_path)]

for archive in tqdm(file_names):
    zip_file = ZipFile(archive)

    csvs = ['zip://'+n for n in zip_file.namelist() if n.endswith('.csv')]

    ddf = dd.read_csv(csvs, storage_options={'fo': zip_file.filename}, usecols=['id', 'citing', 'cited'])
    ddf = ddf[ddf['cited'].isin(omids_list) | ddf['citing'].isin(omids_list)]
    ddf.to_parquet(output_dir / archive.stem, write_index=False)