# update8 - TBD

In [1]:
import sys

In [2]:
# Add the directory two levels up to the import search path so the `app`,
# `models` and `cleanup` imports in the following cells can be resolved
# (presumably that directory is the repository root — verify).
sys.path.append('../..')

In [3]:
from datetime import datetime, timezone
import json
from tqdm import tqdm
from pathlib import Path
import gzip
import pickle

In [4]:
import pandas as pd
import numpy as np

In [5]:
import rapidfuzz

In [6]:
from app import db
from models import Source, ISSNtoISSNL
from models.source import DELETED_SOURCE_ID
from sqlalchemy import text
from sqlalchemy.orm import Load
from sqlalchemy.exc import MultipleResultsFound

140119199445376: loading merged_into_institutions_dict
140119199445376: loading valid concept IDs
140119199445376: loading valid topic IDs
140119199445376: loading valid keyword IDs


In [7]:
from cleanup.util import make_request, paginate_openalex


In [8]:
%%time
# Load the ISSN audit data file, left-joined to its "expanded" companion table
# to pick up `cluster_title`. The join key column is literally named
# "submitted_1348-0278"; it is renamed to `submitted_issn` in the next cell.
# NOTE(review): that column name looks like a data value — presumably the file
# was imported without a header row; confirm that no record was lost.
# NOTE(review): a left join repeats data-file rows if `ife.issns` is not
# unique — verify the row count against the source file.
sq = """select if.*, ife.cluster_title from issn_audit_20240321.issn_ic_datafile_202402 if
left join issn_audit_20240321.issn_ic_datafile_expanded_202402 ife
  on if."submitted_1348-0278"  = ife.issns;"""
df_issnl_file = pd.read_sql_query(sq, db.engine)

CPU times: user 698 ms, sys: 138 ms, total: 836 ms
Wall time: 1.96 s


In [9]:
# Give the join-key column a usable name; every later cell refers to it as
# `submitted_issn`.
df_issnl_file = df_issnl_file.rename(columns={"submitted_1348-0278": "submitted_issn"})

In [10]:
%%time
# Load every journal row that has not been merged into another journal
# (`merge_into_id is null`) and that has a non-null `issn`.
# The rest of this notebook treats the `issn` column as the journal's ISSN-L.
sq = """select journal_id, display_name, issn, issns, issns_text_array, type, country_code, alternate_titles, publisher_id
    from mid.journal
    where merge_into_id is null
    and issn is not null"""
df_midjournal = pd.read_sql_query(sq, db.engine)

CPU times: user 1.06 s, sys: 76.2 ms, total: 1.14 s
Wall time: 1.76 s


In [11]:
# Defensive re-check of the SQL filter: print the row count, drop any row with
# a missing `issn`, and print the count again so the two can be compared.
print(len(df_midjournal))
df_midjournal = df_midjournal.dropna(subset='issn')
print(len(df_midjournal))

141175
141175


Are there duplicate ISSN-Ls in `mid.journal`?

In [12]:
# Count the rows whose `issn` (ISSN-L) value already appeared in an earlier row.
num_duplicates = df_midjournal['issn'].duplicated().sum()
print(f"There are {num_duplicates} duplicates")

There are 0 duplicates


:party:

Is every ISSN-L in `mid.journal` also the first item in that journal's ISSN list?

In [13]:
%%time

def _is_first_in_issns(row):
    issnl = row['issn']
    issn_list = json.loads(row['issns'])
    return issnl == issn_list[0]

def _is_first_in_issns_text_array(row):
    issnl = row['issn']
    issn_list = row['issns_text_array']
    return issnl == issn_list[0]

# Run both checks row by row (axis=1) and store each result as a boolean
# column: one for the JSON `issns` column, one for `issns_text_array`.
df_midjournal['issnl_is_first_in_issns'] = df_midjournal.apply(_is_first_in_issns, axis=1)
df_midjournal['issnl_is_first_in_issns_text_array'] = df_midjournal.apply(_is_first_in_issns_text_array, axis=1)

CPU times: user 1.37 s, sys: 6.6 ms, total: 1.37 s
Wall time: 1.37 s


In [14]:
# Tally how many journals pass / fail the check against the JSON `issns` column.
df_midjournal['issnl_is_first_in_issns'].value_counts()

issnl_is_first_in_issns
True    141175
Name: count, dtype: int64

In [15]:
# Tally how many journals pass / fail the check against `issns_text_array`.
df_midjournal['issnl_is_first_in_issns_text_array'].value_counts()

issnl_is_first_in_issns_text_array
True    141175
Name: count, dtype: int64

:party:

In [16]:
# Load the whole `mid.journal_issn_to_issnl` table; later cells use its `issn`
# and `issnl` columns.
df_issn_to_issnl = pd.read_sql_query("""select * from mid.journal_issn_to_issnl""", db.engine)

In [17]:
# Series keyed by ISSN whose value is that ISSN's ISSN-L.
# `verify_integrity=True` makes this raise if any ISSN occurs more than once.
issn_to_issnl = df_issn_to_issnl.set_index('issn', verify_integrity=True)['issnl']

In [18]:
# Distinct ISSN-L values in the mapping (a missing value, if present, is
# included as one of the distinct entries).
issnls = issn_to_issnl.unique()

In [19]:
# Build a dict mapping each ISSN-L to the list of ISSNs that point to it.
ignore = [
    '0000-0000',  # NOTE(review): presumably a placeholder ISSN-L — confirm
]
# Rows with no ISSN-L, or whose ISSN-L is listed in `ignore`, are left out.
# (An earlier, disabled variant of this cell also restricted rows to the
# `category` values Register / Work / Free / Validation Request.)
x = df_issn_to_issnl.dropna(subset=['issnl'])
x = x[['issn', 'issnl']]
x = x[~(x['issnl'].isin(ignore))]
# The ignored values are already filtered out above, so every group is kept.
issnl_to_issn = {
    issnl: group['issn'].tolist()
    for issnl, group in x.groupby('issnl')
}
len(issnl_to_issn)

141008

Does every ISSN resolve to at most one Source?

In [20]:
# Invert "journal -> its ISSNs" into "ISSN -> list of journal ids" for the
# active (non-merged) journals, so we can see how many journals each ISSN
# resolves to. `verify_integrity=True` raises if a journal_id is repeated.
smap = {}
for source_id, issn_list in tqdm(df_midjournal.set_index('journal_id', verify_integrity=True)['issns_text_array'].items(), total=len(df_midjournal)):
    if not issn_list:
        # A journal with a missing or empty ISSN array contributes nothing
        # (same guard as the with-merged version of this loop).
        continue
    for issn in issn_list:
        smap.setdefault(issn, []).append(source_id)

  0%|          | 0/141175 [00:00<?, ?it/s]

100%|██████████| 141175/141175 [00:00<00:00, 336396.71it/s]


In [21]:
# Count how many active journals list each ISSN, then attach that count to
# every submitted ISSN in the audit file (0 when no journal lists it).
d = [
    {'issn': issn, 'num_sources': len(source_list)}
    for issn, source_list in smap.items()
]
# Naming the columns keeps the frame usable even when `smap` is empty.
_df = pd.DataFrame(d, columns=['issn', 'num_sources'])
num_sources_by_issn = _df.set_index('issn', verify_integrity=True)['num_sources']
# Assign the filled column back rather than calling fillna(inplace=True) on
# `df[col]`: that form is chained assignment and leaves the frame unchanged
# when pandas Copy-on-Write is active.
df_issnl_file['num_sources_resolve'] = (
    df_issnl_file['submitted_issn'].map(num_sources_by_issn).fillna(value=0)
)

In [22]:
%%time
# Load every row of mid.journal with no filter, so journals that were merged
# into another one are included, together with their merge target and date.
sq = """select journal_id, display_name, issn, issns, issns_text_array, type, merge_into_id, merge_into_date
    from mid.journal
    ;
    """
df_midjournal_withmerged = pd.read_sql_query(sq, db.engine)

CPU times: user 1.23 s, sys: 110 ms, total: 1.34 s
Wall time: 1.99 s


In [23]:
# ISSN -> list of journal ids, built over every row of mid.journal (merged
# journals included). Journals with a missing or empty ISSN array are skipped.
smap_withmerged = {}
for source_id, issn_list in tqdm(df_midjournal_withmerged.set_index('journal_id', verify_integrity=True)['issns_text_array'].items(), total=len(df_midjournal_withmerged)):
    if not issn_list:
        continue
    for issn in issn_list:
        smap_withmerged.setdefault(issn, []).append(source_id)

100%|██████████| 263991/263991 [00:00<00:00, 495400.39it/s] 


In [24]:
# Count how many journals (merged ones included) list each ISSN, then attach
# that count to every submitted ISSN in the audit file (0 when none do).
d = [
    {'issn': issn, 'num_sources': len(source_list)}
    for issn, source_list in smap_withmerged.items()
]
# Naming the columns keeps the frame usable even when the mapping is empty.
_df = pd.DataFrame(d, columns=['issn', 'num_sources'])
num_sources_by_issn_withmerged = _df.set_index('issn', verify_integrity=True)['num_sources']
# Assign the filled column back rather than calling fillna(inplace=True) on
# `df[col]`: that form is chained assignment and leaves the frame unchanged
# when pandas Copy-on-Write is active.
df_issnl_file['num_sources_resolve_withmerged'] = (
    df_issnl_file['submitted_issn'].map(num_sources_by_issn_withmerged).fillna(value=0)
)

In [25]:
# Distribution of how many active journals each submitted ISSN resolves to.
df_issnl_file['num_sources_resolve'].value_counts()

num_sources_resolve
1.0    213253
0.0      1353
Name: count, dtype: int64

In [26]:
# Same distribution, but counting merged journals as well as active ones.
df_issnl_file['num_sources_resolve_withmerged'].value_counts()

num_sources_resolve_withmerged
1.0    204936
2.0      8881
0.0       552
3.0       215
6.0        11
4.0         9
5.0         2
Name: count, dtype: int64

In [27]:
# Candidates to add: submitted ISSNs that resolve to no active journal.
# Of those, keep only the rows where the submitted ISSN equals the audit
# file's own `issn` value — these are the ones treated as valid ISSNs here;
# the remaining rows are not dealt with in this notebook.
df_to_add = df_issnl_file[df_issnl_file['num_sources_resolve']==0]
# Take an explicit copy: a later cell adds a column to `df_to_add`, which must
# not be a write onto a filtered slice of `df_issnl_file`.
df_to_add = df_to_add[(df_to_add['submitted_issn'] == df_to_add['issn'])].copy()
print(len(df_to_add))

1126


In [28]:
# Re-check the filter applied when `df_to_add` was built: count the rows where
# the submitted ISSN equals / differs from the file's `issn` value.
(df_to_add['submitted_issn'] == df_to_add['issn']).value_counts()

True    1126
Name: count, dtype: int64

In [29]:
# For the candidate rows, how many journals each ISSN resolves to once merged
# journals are counted too.
df_to_add['num_sources_resolve_withmerged'].value_counts()

num_sources_resolve_withmerged
1.0    710
0.0    395
2.0     21
Name: count, dtype: int64

In [30]:
# For every candidate ISSN that matches at least one journal once merged rows
# are included, emit one record per matching journal, carrying that journal's
# identifiers and merge status alongside the audit file's cluster title.
d = []
midjournal_lookup = df_midjournal_withmerged.set_index('journal_id', verify_integrity=True)
# `verify_integrity=True` raises if a submitted ISSN occurs on more than one row.
for submitted_issn, row in df_to_add[df_to_add['num_sources_resolve_withmerged']>=1].set_index('submitted_issn', verify_integrity=True).iterrows():
    for source_id in smap_withmerged[submitted_issn]:
        midjournal_row = midjournal_lookup.loc[source_id]
        d.append({
            'submitted_issn': submitted_issn,
            'source_id': source_id,
            'cluster_title': row['cluster_title'],
            'display_name': midjournal_row['display_name'],
            'midjournal_issn': midjournal_row['issn'],
            'midjournal_issns': midjournal_row['issns'],
            'midjournal_issns_text_array': midjournal_row['issns_text_array'],
            'midjournal_type': midjournal_row['type'],
            'merge_into_id': midjournal_row['merge_into_id'],
            'merge_into_date': midjournal_row['merge_into_date'],
            'num_sources_resolve_withmerged': row['num_sources_resolve_withmerged'],
        })
# Naming the columns (same order as the record keys) keeps the frame usable
# when no record was produced; otherwise the `merge_into_id` lookup below
# would raise KeyError on a column-less frame.
df_unmerge = pd.DataFrame(d, columns=[
    'submitted_issn',
    'source_id',
    'cluster_title',
    'display_name',
    'midjournal_issn',
    'midjournal_issns',
    'midjournal_issns_text_array',
    'midjournal_type',
    'merge_into_id',
    'merge_into_date',
    'num_sources_resolve_withmerged',
])
# Flag the journals whose `merge_into_id` equals DELETED_SOURCE_ID.
df_unmerge['deleted'] = df_unmerge['merge_into_id'] == DELETED_SOURCE_ID


In [31]:
# Look up each candidate's `issn` in the ISSN -> ISSN-L mapping; ISSNs absent
# from the mapping get a missing value.
df_to_add['issnl_lookup'] = df_to_add['issn'].map(issn_to_issnl)

In [32]:
# How many candidates lack an ISSN-L in the `issnl` column that came with the
# audit file (True = missing).
df_to_add['issnl'].isna().value_counts()

issnl
True     1098
False      28
Name: count, dtype: int64

In [33]:
# How many candidates have no ISSN-L in the `mid.journal_issn_to_issnl`
# lookup (True = missing).
df_to_add['issnl_lookup'].isna().value_counts()

issnl_lookup
True     1117
False       9
Name: count, dtype: int64

I think we're done