In [98]:
import os
import re
import glob
import tempfile
from zipfile import ZipFile

import polars as pl
from rapidfuzz import process, fuzz
from pathlib import Path
from tqdm import tqdm
import multiprocessing

from functools import lru_cache
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
ProgressBar().register()


pl.Config.set_fmt_str_lengths(100);

### IRIS DFs

In [99]:
df_iris_master = pl.read_csv('./data/iris-data-2024-03-14/ODS_L1_IR_ITEM_MASTER_ALL.csv', columns=['ITEM_ID', 'YEAR_PUBLISHED', 'TITLE'] ,dtypes={'ITEM_ID': pl.Int32, 'YEAR_PUBLISHED': pl.Utf8, 'TITLE': pl.Utf8})
df_iris_identifier = pl.read_csv('./data/iris-data-2024-03-14/ODS_L1_IR_ITEM_IDENTIFIER.csv', columns=['ITEM_ID', 'IDE_DOI', 'IDE_ISBN'] ,dtypes={'ITEM_ID': pl.Int32, 'IDE_DOI': pl.Utf8, 'IDE_ISBN': pl.Utf8})

df = df_iris_identifier.join(df_iris_master, on='ITEM_ID', how='inner')


df_filtered = df.filter(pl.col('IDE_DOI').is_not_null() | pl.col('IDE_ISBN').is_not_null())[
    ['ITEM_ID', 'IDE_DOI', 'IDE_ISBN']
]

df_filtered.head()

ITEM_ID,IDE_DOI,IDE_ISBN
i32,str,str
2766,"""10.14411/fp.2008.015""",
3349,"""10.1016/j.econlet.2011.12.096""",
3867,,"""9788876655746"""
4342,,"""8821724038"""
4585,,"""9788861328112"""


---
### Sanitize IRIS DOIs and ISBNs

In [100]:
dois = df_filtered.select('IDE_DOI').drop_nulls().unique()['IDE_DOI'].to_list()
print(len(dois), 'unique dois')

#filter and normalize the dois
doi_rule = re.compile(r'10\.\d{4,}\/[^,\s;]*')
not_doi = []
filtered_dois = []

for doi in dois:
    match = doi_rule.search(doi)
    if match:
        filtered_dois.append('doi:' + match.group())
    else:
        not_doi.append(doi)

print(len(filtered_dois), 'unique dois after filtering. ', len(not_doi), 'not dois discarded')

print('---'*10)

isbns = df_filtered.select('IDE_ISBN').drop_nulls().unique()['IDE_ISBN'].to_list()
print(len(isbns), 'unique isbns')

#filter and normalize the isbns
isbn_rule = re.compile(r'(ISBN[-]*(1[03])*[ ]*(: ){0,1})*(([0-9Xx][- ]*){13}|([0-9Xx][- ]*){10})') # ??? results to check
not_isbn = []
filtered_isbns = []

for isbn in isbns:
    if isbn_rule.search(isbn) is not None:
        filtered_isbns.append('isbn:' + isbn.replace('-', '').replace(' ', ''))
    else:
        not_isbn.append(isbn)

print(len(filtered_isbns), 'unique isbns after filtering. ', len(not_isbn), 'not isbns discarded')

dois_isbns = filtered_dois + filtered_isbns
dois_isbns = [id.lower() for id in dois_isbns]

print('==='*10)
print(len(dois_isbns), 'total identifiers')

130077 unique dois
129954 unique dois after filtering.  123 not dois discarded
------------------------------
49982 unique isbns
49405 unique isbns after filtering.  577 not isbns discarded
179359 total identifiers


---
### Meta to Parquet

In [4]:
meta_path = Path('data/csv_openalex.zip')

In [38]:
def process_meta_zip(zip_path):
    zip_file = ZipFile(zip_path)
    files_list = [zipfile for zipfile in zip_file.namelist() if zipfile.endswith('.csv')]
    output_dir = "data/iris_in_meta"
    output_dir2 = "data/iris_in_meta_no_id"

    for csv_file in tqdm(files_list):
        with zip_file.open(csv_file, 'r') as file:
            with tempfile.NamedTemporaryFile() as tf:
                tf.write(file.read()) # Write the csv file to the temporary file
                tf.seek(0)          # Start at the beginning of the temporary file
                os.makedirs(output_dir, exist_ok=True)
                os.makedirs(output_dir2, exist_ok=True)
                df = (
                    pl.scan_csv(tf.name)
                    .select(['id', 'title', 'type'])
                    .with_columns(
                        #(pl.col('id').str.extract(r"((?:doi):[^\s]+)")).alias('doi'),
                        #(pl.col('id').str.extract(r"((?:isbn):[^\s]+)")).alias('isbn'),
                        (pl.col('id').str.extract(r"(omid:[^\s]+)")).alias('omid'),
                        # if it has a doi we get the doi, otherwise we get the isbn - the right approach?
                        (pl.col('id').str.extract(r"((?:doi|isbn):[^\s]+)"))
                    )
                )

                lf_im = (
                    df
                    #.drop('id')
                    .drop_nulls('id')
                    .filter(
                        #~pl.all_horizontal(pl.all('doi', 'isbn').is_null()),
                        #pl.col("doi").is_not_null() | pl.col("isbn").is_not_null(),
                        pl.col("id").is_in(dois_isbns)
                    )
                    .collect(streaming=True)
                )

                lf_im_null = (
                    df
                    .filter(
                        pl.col("id").is_null()
                    )
                    .collect(streaming=True)
                )

                if not lf_im.is_empty():
                    lf_im.write_parquet(os.path.join(output_dir, os.path.basename(csv_file).replace('.csv', '.parquet')))

                if not lf_im_null.is_empty():
                    lf_im_null.write_parquet(os.path.join(output_dir2, os.path.basename(csv_file).replace('.csv', '.parquet')))


process_meta_zip(meta_path) # 25mins

100%|██████████| 28248/28248 [34:05<00:00, 13.81it/s]


### Title similarity

#### sparql

In [89]:
from string import Template
from SPARQLWrapper import SPARQLWrapper, CSV, JSON

query_template = Template("""
    PREFIX datacite: <http://purl.org/spar/datacite/>
    PREFIX dcterms: <http://purl.org/dc/terms/>
    PREFIX literal: <http://www.essepuntato.it/2010/06/literalreification/>
    PREFIX prism: <http://prismstandard.org/namespaces/basic/2.0/>
    PREFIX fabio: <http://purl.org/spar/fabio/>

    SELECT ?id ?title {
            ?identifier literal:hasLiteralValue ?id.
            ?br datacite:hasIdentifier ?identifier;
        dcterms:title "{title}"
    }
    """)

In [None]:
sparql = SPARQLWrapper("https://test.opencitations.net/meta/sparql")

for el in tqdm(iris_noid_titles[63387:]):
    query = query_template.substitute(title=el)
    sparql.setReturnFormat(JSON)
    sparql.setQuery(query)
    ret = sparql.queryAndConvert()

    if ret['results']['bindings']:
        print(el, ret['results']['bindings'])

#### dask

In [91]:
iris_noid_titles = (
    df[['IDE_DOI', 'IDE_ISBN', 'TITLE']]
    .filter(
        pl.col('IDE_DOI').is_null() & pl.col('IDE_ISBN').is_null()
    )
)['TITLE'].to_list()

In [45]:
len(iris_noid_titles)

105327

In [9]:
zip_file = ZipFile('data/csv_openalex.zip')
csvs = ['zip://'+n for n in zip_file.namelist() if n.endswith('.csv')]

In [13]:
ddf = dd.read_csv(csvs, storage_options={'fo': zip_file.filename}, usecols=['title'])

In [30]:
ddf.partitions[0].head()

[########################################] | 100% Completed | 101.88 ms


Unnamed: 0,title
0,Expression Of Cyclin-Dependent Kinases Inhibit...
1,Effects Of Treatment Of Rats With Indole-3-Car...
2,Parathyroid Hormone-Related Peptide And Cardio...
3,Oral Immunisation Of Chickens Using Cholera To...
4,"Influence Of Stage Classification, Tumor Diffe..."


In [None]:
import numpy as np


def compare_title_similarity(title, list_):
    res = process.extractOne(title, list_, scorer=fuzz.token_sort_ratio, score_cutoff=70)
    return title, res


def find_fuzzy(list_from_user, list_from_DB, score_cutoff: int):
    score_matrix = process.cdist(
        list_from_user,
        list_from_DB,
        processor=str.lower,
        scorer=fuzz.ratio,
        dtype=np.uint8,  # Output the score as uint8, which is faster.
        workers=-1,  # Use multithreading. -1 means use all cores.
        score_cutoff=score_cutoff,
    )

    results = []
    user_indices, db_indices = np.nonzero(score_matrix)
    for user_index_of_match, db_index_of_match in zip(user_indices, db_indices):
        results.append(
            {
                "user_index_of_match": user_index_of_match,
                "db_index_of_match": db_index_of_match,
                "user_item_of_match": list_from_user[user_index_of_match],
                "db_item_of_match": list_from_DB[db_index_of_match],
                "score_of_match": score_matrix[user_index_of_match, db_index_of_match],
            }
        )
    return results


In [48]:
#def starts_with(df):


title_ddf = ddf.partitions[0].title.str.startswith('Expression').compute()
title_ddf

AttributeError: 'Series' object has no attribute 'query'

In [158]:
lf_iim_null = pl.scan_parquet('data/iris_in_meta_no_id/*.parquet')

#lf_iim_null.select(pl.len()).collect().head()
#lf_iim_null.collect().head(20)

lf_iim_null = (
    lf_iim_null
    .select('title')
    .filter(
        (pl.col('title') == "").not_()
    )
)

lf_iim_null.select(pl.len()).collect()

len
u32
12352039


In [177]:
116007 + 12352039

12468046

In [159]:
for title in iris_noid_titles:
    matches = process.extract(title, lf_iim_null['title'].to_list(), scorer=fuzz.token_sort_ratio, limit=1)
    if matches:
        print(title, matches[0])

TypeError: 'LazyFrame' object is not subscriptable (aside from slicing)

Use `select()` or `filter()` instead.

In [27]:
def compare_title_similarity(title):
    res = process.extractOne(title, l, scorer=fuzz.token_sort_ratio, score_cutoff=70)
    return title, res

In [5]:


lf_im_null_res = (
    lf_im_null
    .select(['title'])
    .with_columns(
        (pl.col('title').map_elements(compare_title_similarity, return_dtype=pl.Int64)).alias('title_similarity')
    )
    .drop_nulls('title_similarity')
)

NameError: name 'lf_im_null' is not defined

In [11]:
lf_im_null_res = (
    lf_im_null
    .select(['title'])
    .filter(
        pl.col('title') != ""
    )
)

#lf_im_null.select(pl.len()).collect().head()
l = lf_im_null_res.collect()['title']

In [12]:
type(l)

polars.series.series.Series

In [17]:
len(noid_titles_list)

105327

In [22]:
from tqdm.contrib.concurrent import process_map


core_num = multiprocessing.cpu_count()

r = process_map(compare_title_similarity, noid_titles_list[:10], max_workers=core_num, chunksize=1)

print(r)


  0%|          | 0/10 [00:00<?, ?it/s]

[('Applications of CeCl3.7H2O-NaI System Towards the Formation of Heterocyclic Structures', ('[The Interferon System: Structure, Biology, Applications].', 58.33333333333333, 4574416)), ('Radioattività naturale delle materie prime ceramiche e relative condizioni di esposizione occupazionale nell’industria', None), ('Gli istituti generali di semplificazione: la conferenza di servizi e le sue trasformazioni', ('Transformation Consistent Self-ensembling Model For Semi-supervised Medical Image Segmentation', 51.08695652173913, 5867090)), ("Commento all’art.8 d.lgs. 154/2004 - Procedimenti ai sensi dell'articolo 88 del Trattato istitutivo della Comunitaà europea", None), ('2005: agricoltura centro o fine del mondo?', ('Agricultural Policy Monitoring And Evaluation 2020', 54.347826086956516, 5804189)), ('Aspetti emotivi e comportamentali dell’improvvisazione jazzistica: il punto di vista del musicista', None), ('Phaseguide Structures for Pipette Actuated Laminar Flow Based Selective Sample Re

In [23]:
r

[('Applications of CeCl3.7H2O-NaI System Towards the Formation of Heterocyclic Structures',
  ('[The Interferon System: Structure, Biology, Applications].',
   58.33333333333333,
   4574416)),
 ('Radioattività naturale delle materie prime ceramiche e relative condizioni di esposizione occupazionale nell’industria',
  None),
 ('Gli istituti generali di semplificazione: la conferenza di servizi e le sue trasformazioni',
  ('Transformation Consistent Self-ensembling Model For Semi-supervised Medical Image Segmentation',
   51.08695652173913,
   5867090)),
 ("Commento all’art.8 d.lgs. 154/2004 - Procedimenti ai sensi dell'articolo 88 del Trattato istitutivo della Comunitaà europea",
  None),
 ('2005: agricoltura centro o fine del mondo?',
  ('Agricultural Policy Monitoring And Evaluation 2020',
   54.347826086956516,
   5804189)),
 ('Aspetti emotivi e comportamentali dell’improvvisazione jazzistica: il punto di vista del musicista',
  None),
 ('Phaseguide Structures for Pipette Actuated La

In [20]:
noid_titles_list[:10]

['Applications of CeCl3.7H2O-NaI System Towards the Formation of Heterocyclic Structures',
 'Radioattività naturale delle materie prime ceramiche e relative condizioni di esposizione occupazionale nell’industria',
 'Gli istituti generali di semplificazione: la conferenza di servizi e le sue trasformazioni',
 "Commento all’art.8 d.lgs. 154/2004 - Procedimenti ai sensi dell'articolo 88 del Trattato istitutivo della Comunitaà europea",
 '2005: agricoltura centro o fine del mondo?',
 'Aspetti emotivi e comportamentali dell’improvvisazione jazzistica: il punto di vista del musicista',
 'Phaseguide Structures for Pipette Actuated Laminar Flow Based Selective Sample Recovery',
 'Studio del ciclo biologico di Myxobolus lentisuturalis (Myxozoa, Myxobolidae), parassita di Carassius auratus auratus.',
 'The Other Side of the Timing Equation: a Result of Clock Faults',
 'Ritratto di Augusto capite velato']

In [19]:
l[4574416]

'[The Interferon System: Structure, Biology, Applications].'

### RQ1. What is the coverage of the publications available in IRIS (strictly concerning research conducted within the University of Bologna) in OpenCitations Meta?

In [101]:
lf_iim = pl.scan_parquet('data/iris_in_meta/*.parquet')
#lf_iim.head().collect()
lf_iim.select(pl.len()).collect()

len
u32
116007


In [102]:
rq1 = lf_iim.select(pl.len()).collect()

print(rq1.item(), 'articoli provenienti da IRIS su', len(dois_isbns), 'sono presenti in OM')
print("{:.1f}%".format(rq1.item() / len(dois_isbns) * 100))

116007 articoli provenienti da IRIS su 179359 sono presenti in OM
64.7%


In [103]:
ids_list = (
    lf_iim
    .select('id')
)

ids_list.head().collect()

id
str
"""doi:10.1002/widm.1511"""
"""doi:10.1016/j.jmatprotec.2017.10.019"""
"""doi:10.1093/annonc/mdw618"""
"""doi:10.1093/annonc/mdw430"""
"""doi:10.1093/annonc/mdw649"""


### RQ2. Which are the types of publications that are better covered in OpenCitations Meta?

In [7]:
rq2 = lf_iim.group_by('type').len().sort('len', descending=True).collect()

rq2

type,len
str,u32
"""journal article""",102828
"""proceedings article""",5584
"""book chapter""",4473
"""book""",1487
"""""",1420
…,…
"""dataset""",6
"""dissertation""",2
"""computer program""",1
"""book series""",1


### Index to Parquet

In [15]:
omids_list = (
    lf_iim
    .select('omid')
    .collect()
)['omid'].to_list()

len(omids_list)

116007

In [16]:
index_path = Path('data/oc_index')
file_names = [Path(index_path / archive) for archive in os.listdir(index_path)]

In [27]:
c = 0
for archive in tqdm(file_names):
    print(archive)
    zip_file = ZipFile(archive)

    csvs = ['zip://'+n for n in zip_file.namelist() if n.endswith('.csv')]

    c += len(csvs)
    ddf = dd.read_csv(csvs, storage_options={'fo': zip_file.filename}, usecols=['id', 'citing', 'cited'])
    ddf = ddf[ddf['cited'].isin(omids_list) | ddf['citing'].isin(omids_list)]
    ddf.to_parquet('data/index_in_iris/' + zip_file.filename + '.parquet', write_index=False) # 154 mins

  0%|          | 0/25 [00:00<?, ?it/s]

data/oc_index/2023_08_03-05T_0_1.zip
[########################################] | 100% Completed | 10m 6ss


  4%|▍         | 1/25 [10:06<4:02:40, 606.67s/it]

data/oc_index/2023_11_25T_0_1.zip
[########################################] | 100% Completed | 38.91 s


  8%|▊         | 2/25 [10:46<1:44:40, 273.07s/it]

data/oc_index/2023_08_20-23T_0_1.zip
[########################################] | 100% Completed | 236.61 s


 12%|█▏        | 3/25 [14:43<1:34:04, 256.56s/it]

data/oc_index/2023_06_05-07T_0_1.zip
[########################################] | 100% Completed | 162.50 s


 16%|█▌        | 4/25 [17:26<1:16:51, 219.57s/it]

data/oc_index/2023_07_13T_0_1.zip
[########################################] | 100% Completed | 80.52 s


 20%|██        | 5/25 [18:46<56:31, 169.55s/it]  

data/oc_index/2023_05_09-14T_0_1.zip
[########################################] | 100% Completed | 10m 43s


 24%|██▍       | 6/25 [29:30<1:44:47, 330.90s/it]

data/oc_index/2023_08_14-16T_0_1.zip
[########################################] | 100% Completed | 461.42 s


 28%|██▊       | 7/25 [37:12<1:52:06, 373.69s/it]

data/oc_index/2023_06_15-18T_0_1.zip
[########################################] | 100% Completed | 485.30 s


 32%|███▏      | 8/25 [45:18<1:55:58, 409.32s/it]

data/oc_index/2023_07-08_31-02T_0_1.zip
[########################################] | 100% Completed | 549.97 s


 36%|███▌      | 9/25 [54:28<2:00:54, 453.38s/it]

data/oc_index/2023_07_17-20T_0_1.zip
[########################################] | 100% Completed | 10m 11s


 40%|████      | 10/25 [1:04:40<2:05:33, 502.26s/it]

data/oc_index/2023_06_04-07T_0_1.zip
[########################################] | 100% Completed | 70.17 s


 44%|████▍     | 11/25 [1:05:50<1:26:21, 370.12s/it]

data/oc_index/2023_09_27T_0_1.zip
[########################################] | 100% Completed | 103.17 ms


 48%|████▊     | 12/25 [1:05:51<55:49, 257.63s/it]  

data/oc_index/2023_05_18-20T_0_1.zip
[########################################] | 100% Completed | 237.38 s


 52%|█████▏    | 13/25 [1:09:48<50:18, 251.58s/it]

data/oc_index/2023_08_08-11T_0_1.zip
[########################################] | 100% Completed | 10m 5ss


 56%|█████▌    | 14/25 [1:19:54<1:05:44, 358.55s/it]

data/oc_index/2023_07_26-29T_0_1.zip
[########################################] | 100% Completed | 10m 55s


 60%|██████    | 15/25 [1:30:50<1:14:42, 448.29s/it]

data/oc_index/2023_05_20-21T_0_1.zip
[########################################] | 100% Completed | 57.86 s


 64%|██████▍   | 16/25 [1:31:49<49:37, 330.86s/it]  

data/oc_index/2023_07_05-08T_0_1.zip
[########################################] | 100% Completed | 472.23 s


 68%|██████▊   | 17/25 [1:39:41<49:47, 373.47s/it]

data/oc_index/2023_08_06-08T_0_1.zip
[########################################] | 100% Completed | 540.19 s


 72%|███████▏  | 18/25 [1:48:42<49:25, 423.65s/it]

data/oc_index/2023_07_24-26T_0_1.zip
[########################################] | 100% Completed | 496.67 s


 76%|███████▌  | 19/25 [1:56:59<44:34, 445.69s/it]

data/oc_index/2023_10_24-28T_0_1.zip
[########################################] | 100% Completed | 491.88 s


 80%|████████  | 20/25 [2:05:11<38:19, 459.86s/it]

data/oc_index/2023_05_22-31T_0_1.zip
[########################################] | 100% Completed | 118.35 s


 84%|████████▍ | 21/25 [2:07:10<23:49, 357.44s/it]

data/oc_index/2023_06_19-27T_0_1.zip
[########################################] | 100% Completed | 19m 2ss


 88%|████████▊ | 22/25 [2:26:12<29:39, 593.01s/it]

data/oc_index/2023_08_17-19T_0_1.zip
[########################################] | 100% Completed | 196.92 s


 92%|█████████▏| 23/25 [2:29:30<15:48, 474.25s/it]

data/oc_index/2023_08_24-27T_0_1.zip
[########################################] | 100% Completed | 37.13 s


 96%|█████████▌| 24/25 [2:30:07<05:43, 343.18s/it]

data/oc_index/2023_06_13-14T_0_1-10.zip
[########################################] | 100% Completed | 248.34 s


100%|██████████| 25/25 [2:34:16<00:00, 370.25s/it]


### RQ 3. What is the amount of citations (according to OpenCitations Index) included in the IRIS publications that are involved in OpenCitations Meta (as citing entity and as cited entity)?

In [104]:
iii_path = Path('data/index_in_iris/')

iii_glob = glob.glob(str(iii_path / '*' / '*.parquet'))

In [105]:
lf_iii = pl.scan_parquet(iii_glob)
print(lf_iii.select(pl.len()).collect())
lf_iii.head().collect()

shape: (1, 1)
┌─────────┐
│ len     │
│ ---     │
│ u32     │
╞═════════╡
│ 7859226 │
└─────────┘


id,citing,cited
str,str,str
"""oci:062102010944-06403538172""","""omid:br/062102010944""","""omid:br/06403538172"""
"""oci:062102010936-061901873468""","""omid:br/062102010936""","""omid:br/061901873468"""
"""oci:061302445980-0620582911""","""omid:br/061302445980""","""omid:br/0620582911"""
"""oci:062103078814-061903865447""","""omid:br/062103078814""","""omid:br/061903865447"""
"""oci:06403471424-06703722896""","""omid:br/06403471424""","""omid:br/06703722896"""


In [107]:
oc_omids_list = (
    lf_iim
    .select('omid')
    .collect()
)['omid'].to_list()

len(oc_omids_list)

116007

In [46]:
rq3a = (
    lf_iii
    .select('citing')
    .unique()
    .filter(
        pl.col('citing').is_in(oc_omids_list)
    )
)

citing_list = rq3a.collect()['citing'].to_list()

rq3a.head().collect()
rq3a.select(pl.len()).collect()

len
u32
107704


In [47]:
rq3b = (
    lf_iii
    .select('cited')
    .unique()
    .filter(
        pl.col('cited').is_in(oc_omids_list)
    )
)

cited_list = rq3b.collect()['cited'].to_list()

rq3b.head().collect()
rq3b.select(pl.len()).collect()

len
u32
105135


In [79]:
cit_coming_from_iris = (
    lf_iii
    .filter(
        pl.col('citing').is_in(oc_omids_list)
    )
)

print(cit_coming_from_iris.select(pl.len()).collect())

shape: (1, 1)
┌─────────┐
│ len     │
│ ---     │
│ u32     │
╞═════════╡
│ 4295973 │
└─────────┘


In [None]:
rq3 = (

)

In [54]:
rq3.select(pl.len()).collect()

len
u32
356419


### RQ 4. How many of these citations come from and go to publications that are not included in IRIS?

In [52]:
len(oc_omids_list)

116007

In [117]:
rq4a = (
    lf_iii
    .select('citing')
    #.unique('cited') ???
    .filter(
        ~pl.col('citing').is_in(oc_omids_list)
    )
)

In [118]:
print(rq4a.select(pl.len()).collect())
rq4a.head().collect()

shape: (1, 1)
┌─────────┐
│ len     │
│ ---     │
│ u32     │
╞═════════╡
│ 3563253 │
└─────────┘


citing
str
"""omid:br/062102010944"""
"""omid:br/062102010936"""
"""omid:br/061302445980"""
"""omid:br/062103078814"""
"""omid:br/0660213594"""


In [None]:
rq4b = (
    lf_iii
    .select('cited')
    .filter(
        ~pl.col('cited').is_in(oc_omids_list)
    )
)

In [116]:
print(rq4b.select(pl.len()).collect())
rq4b.head().collect()

shape: (1, 1)
┌─────────┐
│ len     │
│ ---     │
│ u32     │
╞═════════╡
│ 3939554 │
└─────────┘


cited
str
"""omid:br/06703722896"""
"""omid:br/06703722887"""
"""omid:br/06703722545"""
"""omid:br/06703723039"""
"""omid:br/06803732245"""


### RQ 5. How many of these citations involve publications in IRIS as both citing and cited entities?

In [129]:
rq5a = (
    lf_iii
    .select('citing')
    .filter(
        pl.col('citing').is_in(oc_omids_list)
    )
)

In [130]:
print(rq5a.select(pl.len()).collect())
rq5a.head().collect()

shape: (1, 1)
┌─────────┐
│ len     │
│ ---     │
│ u32     │
╞═════════╡
│ 4295973 │
└─────────┘


citing
str
"""omid:br/06403471424"""
"""omid:br/06403471424"""
"""omid:br/06403471424"""
"""omid:br/06403471424"""
"""omid:br/06403471424"""


In [121]:
rq5b = (
    lf_iii
    .select('cited')
    .filter(
        pl.col('cited').is_in(oc_omids_list)
    )
)

In [122]:
print(rq5b.select(pl.len()).collect())
rq5b.head().collect()

shape: (1, 1)
┌─────────┐
│ len     │
│ ---     │
│ u32     │
╞═════════╡
│ 3919672 │
└─────────┘


cited
str
"""omid:br/06403538172"""
"""omid:br/061901873468"""
"""omid:br/0620582911"""
"""omid:br/061903865447"""
"""omid:br/06502545532"""


In [133]:
rq5 = (
    lf_iii
    .select('citing', 'cited')
    .filter(
        pl.col('citing').is_in(oc_omids_list) & pl.col('cited').is_in(oc_omids_list)
    )
)

In [134]:
print(rq5.select(pl.len()).collect())
rq5.head().collect()

shape: (1, 1)
┌────────┐
│ len    │
│ ---    │
│ u32    │
╞════════╡
│ 356419 │
└────────┘


citing,cited
str,str
"""omid:br/06801833332""","""omid:br/06301833698"""
"""omid:br/06801833332""","""omid:br/06201835146"""
"""omid:br/06801833332""","""omid:br/06101850825"""
"""omid:br/06140651081""","""omid:br/061301899117"""
"""omid:br/061902028230""","""omid:br/06230928593"""
