# update3: similar to update1, but this time only match sources whose merge_into_id is null

In [1]:
import sys

In [2]:
sys.path.append('../..')

In [3]:
from datetime import datetime, timezone
import json
from tqdm import tqdm
from pathlib import Path
import gzip

In [4]:
import pandas as pd
import numpy as np

In [5]:
from app import db
from models import Source, ISSNtoISSNL
from sqlalchemy import text
from sqlalchemy.orm import Load
from sqlalchemy.exc import MultipleResultsFound

140357365961088: loading merged_into_institutions_dict
140357365961088: loading valid concept IDs
140357365961088: loading valid topic IDs


In [6]:
def get_source_from_db(issn, session) -> Source:
    """Look up the single un-merged ``Source`` that matches ``issn``.

    Two lookups are tried in order, both restricted to rows whose
    ``merge_into_id`` is null:

    1. an exact match on ``Source.issn``;
    2. if that finds nothing, a ``Source.issns.contains(issn)`` match.

    All relationships are lazy-loaded (``lazyload('*')``).

    Returns:
        The matching ``Source``, or ``None`` when neither lookup finds a row.

    Raises:
        sqlalchemy.exc.MultipleResultsFound: propagated from ``one_or_none()``
            when a lookup matches more than one row.
    """
    base_query = session.query(Source).options(Load(Source).lazyload('*'))

    exact_match = base_query.filter_by(issn=issn).filter_by(merge_into_id=None).one_or_none()
    if exact_match is not None:
        return exact_match

    # Fall back to searching the `issns` column for the value.
    return base_query.filter(Source.issns.contains(issn)).filter_by(merge_into_id=None).one_or_none()


In [7]:
# Load every (issn, issnl) pair from mid.journal_issn_to_issnl into a DataFrame.
issn_to_issnl = pd.read_sql_query(
    "select issn, issnl from mid.journal_issn_to_issnl",
    db.engine,
)
len(issn_to_issnl)

213489

In [8]:
# Build {issnl: [issn, ...]} from the ISSN -> ISSN-L table.
# Rows with a null issnl, or whose issnl is in `ignore`, are left out.
ignore = ['0000-0000']
issn_issnl_pairs = issn_to_issnl.dropna(subset=['issnl'])[['issn', 'issnl']]
issn_issnl_pairs = issn_issnl_pairs[~issn_issnl_pairs['issnl'].isin(ignore)]
issnl_to_issn = {
    issnl: group['issn'].tolist()
    for issnl, group in issn_issnl_pairs.groupby('issnl')
}
len(issnl_to_issn)

141006

In [9]:
%%time
# ISSN-Ls listed in the update1_issnl_multfound audit table
# (presumably the ones update1 skipped because several sources matched -- confirm).
multfound = pd.read_sql_query(
    "select issnl from issn_audit_20240321.update1_issnl_multfound",
    db.engine,
)

CPU times: user 3.82 ms, sys: 873 µs, total: 4.7 ms
Wall time: 133 ms


In [10]:
%%time
# Pick one title per ISSN-L from the issn_ic datafile.
#
# Preference order for each ISSN-L in `issnl_to_issn`:
#   1. the title of the datafile row whose `issn` equals the ISSN-L;
#   2. otherwise the title of the first datafile row whose `issnl` equals it;
#   3. otherwise None (update_source keeps the existing display name when the
#      title is falsy).
#
# Both lookups are built once as plain dicts. drop_duplicates keeps the first
# row per key, so a repeated `issn` yields a single title rather than a Series,
# and a miss no longer triggers a full scan of the DataFrame.
df_issn_ic = pd.read_sql_query("""select issn, issnl, title from issn_audit_20240321.issn_ic_datafile_202402""", db.engine)
title_by_issn = df_issn_ic.drop_duplicates(subset='issn').set_index('issn')['title'].to_dict()
title_by_issnl = df_issn_ic.drop_duplicates(subset='issnl').set_index('issnl')['title'].to_dict()
issnl_display_name = {
    issnl: title_by_issn[issnl] if issnl in title_by_issn else title_by_issnl.get(issnl)
    for issnl in issnl_to_issn
}

CPU times: user 8.64 s, sys: 85.3 ms, total: 8.72 s
Wall time: 9.17 s


In [11]:
def update_source(issnl, issn_list, title, updated_date=None, session=None):
    """Rewrite one source's ISSN fields and log the change to the update3 audit table.

    The source is fetched with ``get_source_from_db(issnl, session)``. Its
    ``issn`` is set to ``issnl``, ``issns`` to the JSON encoding of
    ``issn_list``, ``issns_text_array`` to ``issn_list`` and ``updated_date``
    to ``updated_date``. ``display_name`` is replaced only when ``title`` is
    truthy. A row with the old and new values is then inserted into
    ``issn_audit_20240321.update3``. Nothing is committed here; that is left
    to the caller.

    Args:
        issnl: value used for the lookup and written to ``source.issn``.
        issn_list: list written to the source's ISSN fields.
        title: new display name; a falsy value keeps the current one.
        updated_date: timestamp to store; defaults to the current UTC time
            as an ISO-8601 string.
        session: SQLAlchemy session to use; defaults to ``db.session``.

    Raises:
        KeyError: if no source is found for ``issnl``.
    """
    session = db.session if session is None else session
    if updated_date is None:
        updated_date = datetime.now(timezone.utc).isoformat()

    source = get_source_from_db(issnl, session)
    if source is None:
        raise KeyError(f"issnl {issnl} not found in db")

    # Capture the current values first: they are about to be overwritten and
    # are needed for the audit row.
    previous_issnl = source.issn
    previous_display_name = source.display_name
    previous_issns = source.issns

    issns_json = json.dumps(issn_list)
    audit_params = {
        'issnl': issnl,
        'old_issns': previous_issns,
        'new_issns': issns_json,
        'old_issnl': previous_issnl,
        'old_display_name': previous_display_name,
        'new_display_name': title if title else previous_display_name,
        'updated_date': updated_date,
        'source_id': source.id,
    }

    # Apply the new values to the ORM object.
    if title:
        source.display_name = title
    source.issn = issnl
    source.issns = issns_json
    source.issns_text_array = issn_list
    source.updated_date = updated_date
    session.add(source)

    # Record the before/after values in the audit log table.
    insert_log_sql = """INSERT INTO issn_audit_20240321.update3
        (issnl, old_issns, new_issns, old_issnl, old_display_name, new_display_name, updated_date, source_id)
        VALUES(:issnl, :old_issns, :new_issns, :old_issnl, :old_display_name, :new_display_name, :updated_date, :source_id);
        """
    session.execute(text(insert_log_sql), audit_params)

In [12]:
# Apply update_source to every ISSN-L in `multfound`, then flag the matching
# rows of the issn_ic datafile as resolved. Everything is committed once at the
# end; the session is closed whether or not the run succeeds.

# Built once: the statement is the same for every ISSN.
mark_resolved_sql = text(
    """update issn_audit_20240321.issn_ic_datafile_202402 set resolved = true
    where issn = :issn
    """
)

# ISSN-Ls that were skipped, kept so the outcome of the run can be inspected.
not_found_issnls = []       # update_source raised KeyError (no source in the db)
multiple_found_issnls = []  # the lookup still matched more than one source

try:
    for issnl in tqdm(multfound['issnl'].values, total=len(multfound)):
        now = datetime.now(timezone.utc).isoformat()
        # Build a new list with the ISSN-L first instead of reordering the list
        # stored in `issnl_to_issn`, so the shared dict is left untouched.
        # These two lookups stay outside the inner try on purpose: an ISSN-L
        # missing from either dict should stop the run, not be skipped silently.
        issn_list = [issnl] + [issn for issn in issnl_to_issn[issnl] if issn != issnl]
        title = issnl_display_name[issnl]
        try:
            update_source(issnl, issn_list, title, updated_date=now, session=db.session)
        except KeyError:
            not_found_issnls.append(issnl)
            continue
        except MultipleResultsFound:
            multiple_found_issnls.append(issnl)
            continue

        # One executemany call marks every ISSN of this ISSN-L as resolved.
        db.session.execute(mark_resolved_sql, [{'issn': issn} for issn in issn_list])

    db.session.commit()
    print(
        f"updated {len(multfound) - len(not_found_issnls) - len(multiple_found_issnls)} sources; "
        f"{len(not_found_issnls)} not found, {len(multiple_found_issnls)} still matched multiple sources"
    )
finally:
    db.session.close()

  0%|          | 0/4528 [00:00<?, ?it/s]

100%|██████████| 4528/4528 [02:10<00:00, 34.57it/s]
