# update5 - TBD

In [1]:
import sys

In [2]:
sys.path.append('../..')

In [3]:
from datetime import datetime, timezone
import json
from tqdm import tqdm
from pathlib import Path
import gzip
import pickle

In [4]:
import pandas as pd
import numpy as np

In [5]:
import rapidfuzz

In [6]:
from app import db
from models import Source, ISSNtoISSNL
from models.source import DELETED_SOURCE_ID
from sqlalchemy import text
from sqlalchemy.orm import Load
from sqlalchemy.exc import MultipleResultsFound

140473786356096: loading merged_into_institutions_dict
140473786356096: loading valid concept IDs
140473786356096: loading valid topic IDs


In [7]:
from cleanup.util import make_request, paginate_openalex


In [8]:
%%time
# Pull every row of the ISSN data file and attach `cluster_title` from the
# expanded data file wherever its `issns` value equals the submitted-ISSN column.
sq = """select if.*, ife.cluster_title from issn_audit_20240321.issn_ic_datafile_202402 if
left join issn_audit_20240321.issn_ic_datafile_expanded_202402 ife
  on if."submitted_1348-0278"  = ife.issns;"""
df_issnl_file = pd.read_sql_query(sql=sq, con=db.engine)

CPU times: user 738 ms, sys: 203 ms, total: 941 ms
Wall time: 2.12 s


In [9]:
# Give the submitted-ISSN column a usable name.
# NOTE(review): the original header looks like a data value that ended up as
# the column name -- confirm against the source file.
df_issnl_file = df_issnl_file.rename(
    columns={"submitted_1348-0278": "submitted_issn"},
)

In [10]:
%%time
# Load every journal row that has not been merged into another journal
# (merge_into_id is null), restricted to the columns listed in the query.
sq = """select journal_id, display_name, issn, issns, issns_text_array, type, country_code, alternate_titles, publisher_id
    from mid.journal
    where merge_into_id is null"""
df_midjournal = pd.read_sql_query(sql=sq, con=db.engine)

CPU times: user 1.81 s, sys: 66.5 ms, total: 1.88 s
Wall time: 3.42 s


In [11]:
# Drop journals that have no primary ISSN, printing the row count before and after.
print(len(df_midjournal))
# Mutates df_midjournal in place; a later cell adds a works_count column to this same frame.
df_midjournal.dropna(subset='issn', inplace=True)
print(len(df_midjournal))

258555
145308


In [12]:
# Invert journal -> ISSN list into ISSN -> list of journal ids that carry it.
# verify_integrity makes set_index raise if a journal_id appears more than once.
smap = {}
issn_lists_by_journal = df_midjournal.set_index('journal_id', verify_integrity=True)['issns_text_array']
for source_id, issn_list in tqdm(issn_lists_by_journal.items(), total=len(df_midjournal)):
    for issn in issn_list:
        # setdefault creates the list the first time an ISSN is seen
        smap.setdefault(issn, []).append(source_id)

  0%|          | 0/145308 [00:00<?, ?it/s]

100%|██████████| 145308/145308 [00:00<00:00, 1269371.60it/s]


In [13]:
# For every ISSN in `smap`, count how many journal ids it resolves to, then
# attach that count to the data file keyed on the submitted ISSN.
d = []
for issn, source_list in smap.items():
    d.append({
        'issn': issn,
        'num_sources': len(source_list),
    })
# Explicit columns keep the lookups below valid even if `smap` is empty.
_df = pd.DataFrame(d, columns=['issn', 'num_sources'])
# verify_integrity raises if an ISSN appears twice in the lookup index.
num_sources_by_issn = _df.set_index('issn', verify_integrity=True)['num_sources']
# Submitted ISSNs that match no journal get 0. The filled Series is assigned
# back to the frame: calling fillna(inplace=True) on a column selection is
# chained assignment, which warns on pandas 2.x and has no effect under
# Copy-on-Write.
df_issnl_file['num_sources_resolve'] = (
    df_issnl_file['submitted_issn'].map(num_sources_by_issn).fillna(0)
)

In [14]:
# DISABLED CELL, kept for reference only.
# For every ISSN in `smap` that maps to more than one source, this loop asked
# the OpenAlex works API for the number of works located at each source and
# collected the counts into df_actual_works_count.
# NOTE(review): the next cell loads df_actual_works_count from a pickle --
# presumably the saved result of this loop; confirm before deleting.
# actual_works_count = []
# subset = {issn: source_list for issn, source_list in smap.items() if len(source_list) > 1}
# for issn, source_list in tqdm(subset.items()):
#     if len(source_list) > 1:
#         for source_id in source_list:
#             url = f'https://api.openalex.org/works?filter=locations.source.id:S{source_id}'
#             params = {'mailto': 'jportenoy@ourresearch.org',
#                       'select': 'id',
#                       'per-page': 1}
#             r = make_request(url, params=params)
#             this_c = r.json()['meta']['count']
#             actual_works_count.append({
#                 'issn': issn,
#                 'source_id': source_id,
#                 'works_count': this_c,
#             })
# df_actual_works_count = pd.DataFrame(actual_works_count)

In [15]:
# Load the per-source works counts from a local pickle.
# NOTE(review): presumably the saved output of the disabled API loop in the
# previous cell -- confirm. Unpickling can execute arbitrary code, so only
# load files produced by a trusted run.
df_actual_works_count = pd.read_pickle('../data/issn_audit_20240301/df_actual_works_count.pickle')

In [16]:
# Collapse the (source_id, works_count) pairs to one row per source and turn
# them into a lookup Series. verify_integrity raises if the same source_id
# survives de-duplication with two different counts.
x = (
    df_actual_works_count[['source_id', 'works_count']]
    .drop_duplicates()
    .set_index('source_id', verify_integrity=True)['works_count']
)
# Journals without an entry in the lookup get NaN.
df_midjournal['works_count'] = df_midjournal['journal_id'].map(x)

In [21]:
%%time
# Page through the OpenAlex sources endpoint grouped by ISSN and gather the
# `group_by` entries of every page into one flat list.
url = "https://api.openalex.org/sources"
params = dict(
    mailto='jportenoy@ourresearch.org',
    group_by='issn',
)
data = []
for r in paginate_openalex(url, params=params):
    data += r.json()['group_by']

CPU times: user 8.29 s, sys: 295 ms, total: 8.58 s
Wall time: 3min 8s


In [22]:
# Tabulate the group-by entries: one row per ISSN with the number of OpenAlex
# sources that carry it.
df_sources = pd.DataFrame(data)
_rename = dict(key='issn', count='num_sources_in_openalex')
df_openalex_issn_sources_count = (
    df_sources
    .drop(columns=['key_display_name'])
    .rename(columns=_rename)
)

In [23]:
# How many ISSNs are carried by 1, 2, 3, ... sources according to the group-by counts.
df_openalex_issn_sources_count['num_sources_in_openalex'].value_counts()

num_sources_in_openalex
1    209737
2      8239
3       149
4         6
5         2
Name: count, dtype: int64

In [17]:
# Load the full ISSN -> ISSN-L table.
df_issn_to_issnl = pd.read_sql_query(
    sql="select * from mid.journal_issn_to_issnl",
    con=db.engine,
)

In [18]:
# Series mapping each ISSN (index) to its ISSN-L (value).
# verify_integrity makes set_index raise if any ISSN appears more than once.
issn_to_issnl = df_issn_to_issnl.set_index('issn', verify_integrity=True)['issnl']

In [22]:
# Distinct ISSN-L values; used further down to keep only journals whose
# primary ISSN is itself an ISSN-L.
issnls = issn_to_issnl.unique()

In [19]:
def get_all_sources_db(issn, session):
    """Return the set of un-merged Source rows associated with `issn`.

    Two lookups are run against `session` and their results are unioned:
    sources whose `issn` column equals `issn`, and sources whose `issns`
    column contains `issn`. Only rows with `merge_into_id` unset are
    considered, and relationships are lazy-loaded.
    """
    # Shared starting point: Source rows that have not been merged away.
    unmerged = (
        session.query(Source)
        .options(Load(Source).lazyload('*'))
        .filter_by(merge_into_id=None)
    )
    primary_matches = unmerged.filter_by(issn=issn).all()
    issns_matches = unmerged.filter(Source.issns.contains(issn)).all()
    # A set removes sources returned by both lookups.
    return set(primary_matches + issns_matches)


In [20]:
# DISABLED CELL, kept for reference only.
# For each ISSN in df_mult it fetched the un-merged sources from the database,
# recorded how many were found next to the OpenAlex count, and scored each
# source's display name against the data-file title with rapidfuzz, storing
# the per-source records in dups_data[issn].
# NOTE(review): df_mult is not defined in any visible cell; the next cell
# loads dups_data from a pickle instead.
# try:
#     dups_data = {}
#     num_sources_verify = []
#     works_count_map = df_midjournal.set_index('journal_id', verify_integrity=True)['works_count']
#     for issn, row in tqdm(df_mult.set_index('issn', verify_integrity=True).iterrows(), total=len(df_mult)):
#         this_issn_data = []
#         sources = get_all_sources_db(issn, db.session)
#         num_sources_verify.append({
#             'issn': issn,
#             'num_sources_in_openalex': row['num_sources_in_openalex'],
#             'num_sources_db': len(sources),
#         })
#         for source in sources:
#             fuzzratio = rapidfuzz.fuzz.ratio(source.display_name, row['title'], processor=rapidfuzz.utils.default_process)
#             this_issn_data.append({
#                 'source_id': source.id,
#                 'fuzzratio': fuzzratio,
#                 'works_count': works_count_map[source.id]
#             })
#         dups_data[issn] = this_issn_data
# finally:
#     db.session.close()

In [21]:
# Load the per-ISSN duplicate records from a local pickle.
# NOTE(review): unpickling can execute arbitrary code -- only load files
# produced by a trusted run of this audit.
fp = Path('../data/issn_audit_20240301/dups_data.pickle')
with fp.open('rb') as fh:
    dups_data = pickle.load(fh)

In [25]:
# Journals that share their primary ISSN with at least one other journal...
issnldups = df_midjournal.dropna(subset='issn')
shares_issn = issnldups['issn'].duplicated(keep=False)
issnldups = issnldups[shares_issn]
print(len(issnldups))
# ...restricted to those whose primary ISSN is a known ISSN-L.
is_issnl = issnldups['issn'].isin(issnls)
issnldups = issnldups[is_issnl]
print(len(issnldups))
# Distinct ISSNs involved, then distinct journals involved.
print(issnldups['issn'].nunique())
print(issnldups['journal_id'].nunique())

3756
3736
1823
3736


In [113]:
# Plan:
# 1. Go through each of the groups above (grouped by issn, which should be the ISSN-L) and deduplicate.
#    Delete any that aren't ISSN-Ls (maybe check them for data later).
# 2. Add missing sources from the data file (any ISSN-L that doesn't have a source_id identified by that ISSN-L).
# 3. This should leave us with no duplicate ISSNs.
# 4. Then figure out how to resolve any ISSNs that still resolve to multiple sources.

Decision: just merge all of them (every duplicate source for an ISSN-L is merged into a single surviving source).

In [30]:
%%time
# Merge duplicate sources that share an ISSN-L.
#
# For each ISSN group in `issnldups`, the candidate sources recorded in
# `dups_data` are ranked by works_count (highest first, ties broken by the
# lowest source_id). The top-ranked source survives; every other source is
# pointed at it through merge_into_id, an audit row is written to
# issn_audit_20240321.update5, and the ISSN is flagged as resolved in the data
# file. Everything runs in one transaction: it is committed only if the whole
# loop succeeds, rolled back on any error, and the session is always closed.
#
# Populates:
#   no_dd_found       -- ISSNs with no entry in dups_data (skipped)
#   not_actually_dups -- ISSNs whose dups_data entry has fewer than two sources (skipped)
try:
    num_updated = 0
    n_issn = issnldups['issn'].nunique()
    no_dd_found = []
    not_actually_dups = []

    insert_audit_sq = """INSERT INTO issn_audit_20240321.update5
                    (source_id, updated_date, merge_into_id, note)
                    VALUES(:source_id, :now, :merge_into_id, :note);"""
    mark_resolved_sq = """update issn_audit_20240321.issn_ic_datafile_202402 set resolved = true
                where "submitted_1348-0278" = :issn"""

    # Index by journal_id so the row for a source can be looked up inside each group.
    groups = issnldups.set_index('journal_id', verify_integrity=True).groupby('issn')
    for issn, gbdf in tqdm(groups, total=n_issn):
        now = datetime.now(timezone.utc).isoformat()
        try:
            dd = dups_data[issn]
        except KeyError:
            no_dd_found.append(issn)
            continue
        if len(dd) < 2:
            # come back to these
            not_actually_dups.append(issn)
            continue
        # sort by [works_count:desc, source_id:asc]. list.sort is stable, so
        # sorting by the secondary key first and the primary key second
        # yields the combined ordering.
        dd.sort(key=lambda x: x['source_id'])
        dd.sort(key=lambda x: x['works_count'], reverse=True)

        merge_into_id = dd[0]['source_id']
        for item in dd[1:]:
            id_to_merge = item['source_id']
            # .one() raises if the source is missing or not unique.
            source = db.session.query(Source).filter_by(journal_id=id_to_merge).one()
            source.merge_into_id = merge_into_id
            source.merge_into_date = now
            source.updated_date = now

            db.session.add(source)

            note = "merged because duplicate issnl."
            # Missing values become 0 so they count as falsy below.
            row = gbdf.loc[id_to_merge].fillna(value=0)
            if row['publisher_id'] or row['alternate_titles']:
                note += " may have additional info."

            db.session.execute(text(insert_audit_sq), {
                'source_id': source.id,
                'now': now,
                'merge_into_id': merge_into_id,
                'note': note,
            })

            num_updated += 1

        # The flag depends only on the ISSN, so set it once per group rather
        # than once per merged source. dd[1:] is never empty at this point.
        db.session.execute(text(mark_resolved_sq), {
            'issn': issn,
        })
    db.session.commit()
    print(f"{num_updated} updated")
except Exception:
    # Make sure nothing from a failed run is left pending on the session.
    db.session.rollback()
    raise
finally:
    # Call close(): a bare `db.session.close` only references the method
    # and leaves the session open.
    db.session.close()

  0%|          | 0/1823 [00:00<?, ?it/s]

100%|██████████| 1823/1823 [19:02<00:00,  1.60it/s]


1897 updated
CPU times: user 38 s, sys: 1.02 s, total: 39.1 s
Wall time: 19min 41s


In [32]:

# ISSNs the merge loop skipped because dups_data had no entry for them.
no_dd_found

['0287-3478',
 '0342-5932',
 '0387-1185',
 '0392-5005',
 '0393-134X',
 '0716-2006',
 '0767-709X',
 '0950-5571',
 '1025-3076',
 '1121-4074',
 '1437-9309',
 '1727-1584',
 '1996-0042',
 '2084-7998',
 '2521-7119']

In [33]:
# ISSNs the merge loop skipped because their dups_data entry listed fewer than two sources.
not_actually_dups

[]

In [34]:
# For each ISSN that had no entry in dups_data, fetch its un-merged sources
# directly from the database.
no_dd_sources = {}
for issn in tqdm(no_dd_found):
    found_sources = get_all_sources_db(issn, db.session)
    no_dd_sources[issn] = found_sources

100%|██████████| 15/15 [00:02<00:00,  6.64it/s]


In [35]:
# Show the sources found in the database for each ISSN that was missing from dups_data.
no_dd_sources

{'0287-3478': {<Source ( http://localhost:5007/S2764753162?apiurls ) 2764753162 Crustacean research>,
  <Source ( http://localhost:5007/S4386621722?apiurls ) 4386621722 Crustacean Research>},
 '0342-5932': {<Source ( http://localhost:5007/S4306535007?apiurls ) 4306535007 Würzburger Jahrbücher für die Altertumswissenschaft>,
  <Source ( http://localhost:5007/S4393914634?apiurls ) 4393914634 Würzburger Jahrbücher für die Altertumswissenschaft (Internet)>},
 '0387-1185': {<Source ( http://localhost:5007/S2764951900?apiurls ) 2764951900 Transactions of the Architectural Institute of Japan>,
  <Source ( http://localhost:5007/S4386621727?apiurls ) 4386621727 Transactions of the Architectural Institute of Japan>},
 '0392-5005': {<Source ( http://localhost:5007/S4306533961?apiurls ) 4306533961 URBANISTICA INFORMAZIONI>,
  <Source ( http://localhost:5007/S4393915153?apiurls ) 4393915153 Urbanistica informazioni (Online)>},
 '0393-134X': {<Source ( http://localhost:5007/S4306518029?apiurls ) 430