# update5 - TBD

In [1]:
import sys

In [2]:
# Add the directory two levels up (relative to the working directory) to the
# import path; presumably the project root providing `app`, `models` and
# `cleanup` — confirm.
sys.path.append('../..')

In [3]:
from datetime import datetime, timezone
import json
from tqdm import tqdm
from pathlib import Path
import gzip
import pickle

In [4]:
import pandas as pd
import numpy as np

In [5]:
import rapidfuzz

In [6]:
from app import db
from models import Source, ISSNtoISSNL
from models.source import DELETED_SOURCE_ID
from sqlalchemy import text
from sqlalchemy.orm import Load
from sqlalchemy.exc import MultipleResultsFound

139824131264896: loading merged_into_institutions_dict
139824131264896: loading valid concept IDs
139824131264896: loading valid topic IDs
139824131264896: loading valid keyword IDs


In [7]:
from cleanup.util import make_request, paginate_openalex


In [8]:
%%time
# Load the ISSN data file, left-joined to its "expanded" companion table to
# pick up cluster_title. The left join keeps data-file rows that have no match
# in the expanded table.
sq = """select if.*, ife.cluster_title from issn_audit_20240321.issn_ic_datafile_202402 if
left join issn_audit_20240321.issn_ic_datafile_expanded_202402 ife
  on if."submitted_1348-0278"  = ife.issns;"""
df_issnl_file = pd.read_sql_query(sq, db.engine)

CPU times: user 571 ms, sys: 171 ms, total: 743 ms
Wall time: 1.94 s


In [9]:
# The table's ISSN column is literally named "submitted_1348-0278"; give it a
# readable name for the rest of the notebook.
df_issnl_file = df_issnl_file.rename(columns={"submitted_1348-0278": "submitted_issn"})

In [10]:
%%time
# Load every source from mid.journal that has not already been merged into
# another source (merge_into_id is null).
sq = """select journal_id, display_name, issn, issns, issns_text_array, type, country_code, alternate_titles, publisher_id
    from mid.journal
    where merge_into_id is null"""
df_midjournal = pd.read_sql_query(sq, db.engine)

CPU times: user 1.49 s, sys: 160 ms, total: 1.65 s
Wall time: 2.97 s


In [11]:
# Keep only sources that have a primary ISSN; print the row count before and
# after so the size of the drop is visible.
print(len(df_midjournal))
df_midjournal = df_midjournal.dropna(subset='issn')
print(len(df_midjournal))

256658
143411


In [12]:
# Reverse index: ISSN -> list of journal_ids whose issns_text_array contains it.
# verify_integrity makes set_index raise if a journal_id appears more than once.
smap = {}
issns_by_journal = df_midjournal.set_index('journal_id', verify_integrity=True)['issns_text_array']
for source_id, issn_list in tqdm(issns_by_journal.items(), total=len(df_midjournal)):
    for issn in issn_list:
        smap.setdefault(issn, []).append(source_id)

  0%|          | 0/143411 [00:00<?, ?it/s]

100%|██████████| 143411/143411 [00:00<00:00, 486816.76it/s]


In [13]:
# For every ISSN seen in mid.journal, count how many active sources list it,
# then attach that count to the data file keyed by the submitted ISSN.
d = [
    {'issn': issn, 'num_sources': len(source_list)}
    for issn, source_list in smap.items()
]
_df = pd.DataFrame(d)

# Submitted ISSNs that match no source get 0. The filled Series is assigned
# back explicitly: calling fillna(inplace=True) on a column selection is a
# chained assignment, which pandas deprecates and which does not modify the
# parent frame under copy-on-write.
df_issnl_file['num_sources_resolve'] = (
    df_issnl_file['submitted_issn']
    .map(_df.set_index('issn', verify_integrity=True)['num_sources'])
    .fillna(0)
)

In [14]:
# actual_works_count = []
# subset = {issn: source_list for issn, source_list in smap.items() if len(source_list) > 1}
# for issn, source_list in tqdm(subset.items()):
#     if len(source_list) > 1:
#         for source_id in source_list:
#             url = f'https://api.openalex.org/works?filter=locations.source.id:S{source_id}'
#             params = {'mailto': 'jportenoy@ourresearch.org',
#                       'select': 'id',
#                       'per-page': 1}
#             r = make_request(url, params=params)
#             this_c = r.json()['meta']['count']
#             actual_works_count.append({
#                 'issn': issn,
#                 'source_id': source_id,
#                 'works_count': this_c,
#             })
# df_actual_works_count = pd.DataFrame(actual_works_count)

In [15]:
# Load cached per-source works counts instead of re-querying the API
# (presumably saved from the commented-out loop in the previous cell — confirm).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df_actual_works_count = pd.read_pickle('../data/issn_audit_20240301/df_actual_works_count.pickle')

In [16]:
# One works_count per source_id. Exact duplicate rows are dropped first, and
# verify_integrity then raises if a source still has conflicting counts.
x = (
    df_actual_works_count[['source_id', 'works_count']]
    .drop_duplicates()
    .set_index('source_id', verify_integrity=True)['works_count']
)
# Sources absent from the cached counts end up with NaN.
df_midjournal['works_count'] = df_midjournal['journal_id'].map(x)

In [17]:
%%time
# Ask the OpenAlex API how many sources it reports per ISSN (group_by=issn).
url = "https://api.openalex.org/sources"
params = {
    'mailto': 'jportenoy@ourresearch.org',
    'group_by': 'issn',
    'bypass_cache': 'true',
}
data = []
# paginate_openalex presumably yields one response per page — confirm against
# cleanup.util. Each page's 'group_by' entries are accumulated into `data`.
for r in paginate_openalex(url, params=params):
    data.extend(r.json()['group_by'])

CPU times: user 7.87 s, sys: 394 ms, total: 8.26 s
Wall time: 4min 27s


In [18]:
# Turn the raw group_by records into a tidy (issn, num_sources_in_openalex)
# frame; key_display_name is discarded.
_rename = {'key': 'issn', 'count': 'num_sources_in_openalex'}
df_sources = pd.DataFrame(data)
df_openalex_issn_sources_count = (
    df_sources
    .drop(columns=['key_display_name'])
    .rename(columns=_rename)
)

In [19]:
# Distribution of how many OpenAlex sources share each ISSN.
df_openalex_issn_sources_count['num_sources_in_openalex'].value_counts()

num_sources_in_openalex
1    216119
2      1395
3        12
Name: count, dtype: int64

In [20]:
# Load the full ISSN -> ISSN-L lookup table.
df_issn_to_issnl = pd.read_sql_query("""select * from mid.journal_issn_to_issnl""", db.engine)

In [21]:
# Series indexed by ISSN whose value is the ISSN-L; verify_integrity raises if
# any ISSN appears more than once in the lookup table.
issn_to_issnl = df_issn_to_issnl.set_index('issn', verify_integrity=True)['issnl']

In [22]:
# Distinct ISSN-L values; used below to keep only duplicate groups whose
# shared primary ISSN is itself an ISSN-L.
issnls = issn_to_issnl.unique()

In [23]:
def get_all_sources_db(issn, session):
    """Return the set of un-merged Source rows associated with ``issn``.

    Two lookups are combined: sources whose ``issn`` column equals ``issn``,
    and sources whose ``issns`` column contains it. Rows already merged into
    another source (``merge_into_id`` set) are excluded, and relationship
    loading is deferred via ``lazyload('*')``.

    Args:
        issn: ISSN string to look up.
        session: SQLAlchemy session used to run the queries.

    Returns:
        A ``set`` of ``Source`` objects (the union of both lookups).
    """
    def _active_sources():
        # Fresh base query on every call so the two filters never share state.
        return (
            session.query(Source)
            .options(Load(Source).lazyload('*'))
            .filter_by(merge_into_id=None)
        )

    by_primary_issn = _active_sources().filter_by(issn=issn).all()
    by_listed_issn = _active_sources().filter(Source.issns.contains(issn)).all()
    return set(by_primary_issn + by_listed_issn)


In [24]:
# try:
#     dups_data = {}
#     num_sources_verify = []
#     works_count_map = df_midjournal.set_index('journal_id', verify_integrity=True)['works_count']
#     for issn, row in tqdm(df_mult.set_index('issn', verify_integrity=True).iterrows(), total=len(df_mult)):
#         this_issn_data = []
#         sources = get_all_sources_db(issn, db.session)
#         num_sources_verify.append({
#             'issn': issn,
#             'num_sources_in_openalex': row['num_sources_in_openalex'],
#             'num_sources_db': len(sources),
#         })
#         for source in sources:
#             fuzzratio = rapidfuzz.fuzz.ratio(source.display_name, row['title'], processor=rapidfuzz.utils.default_process)
#             this_issn_data.append({
#                 'source_id': source.id,
#                 'fuzzratio': fuzzratio,
#                 'works_count': works_count_map[source.id]
#             })
#         dups_data[issn] = this_issn_data
# finally:
#     db.session.close()

In [25]:
# Load the cached duplicate-source data from disk.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
fp = Path('../data/issn_audit_20240301/dups_data.pickle')
with fp.open('rb') as fh:
    dups_data = pickle.load(fh)

In [26]:
# Sources that share their primary ISSN with at least one other source ...
issnldups = df_midjournal.dropna(subset='issn')
issnldups = issnldups.loc[issnldups['issn'].duplicated(keep=False)]
print(len(issnldups))
# ... restricted to groups whose shared ISSN is itself an ISSN-L.
issnldups = issnldups.loc[issnldups['issn'].isin(issnls)]
print(len(issnldups))
# Number of duplicate groups, then number of distinct sources involved.
for _col in ('issn', 'journal_id'):
    print(issnldups[_col].nunique())

52
32
15
32


In [30]:
# Show the duplicate groups side by side, ordered by their shared ISSN.
issnldups.sort_values('issn')

Unnamed: 0,journal_id,display_name,issn,issns,issns_text_array,type,country_code,alternate_titles,publisher_id,works_count
59673,2764753162,Crustacean research,0287-3478,"[""2189-5317""]",[2189-5317],journal,JP,"[Crustacean research, Researches on crustacea,...",,27.0
46609,4386621722,Crustacean Research,0287-3478,"[""0287-3478"", ""2189-5317""]","[0287-3478, 2189-5317]",journal,,,,426.0
35994,4393914634,Würzburger Jahrbücher für die Altertumswissens...,0342-5932,"[""0342-5932"", ""2365-8444""]","[0342-5932, 2365-8444]",journal,,,,0.0
58128,4306535007,Würzburger Jahrbücher für die Altertumswissens...,0342-5932,"[""2365-8444""]",[2365-8444],journal,,[],,487.0
46678,4386621727,Transactions of the Architectural Institute of...,0387-1185,"[""0387-1185"", ""2433-0027""]","[0387-1185, 2433-0027]",journal,,,4320801000.0,
153308,2764951900,Transactions of the Architectural Institute of...,0387-1185,"[""2433-0035""]",[2433-0035],journal,JP,[Transactions of the Architectural Institute o...,4320801000.0,
36101,4393915153,Urbanistica informazioni (Online),0392-5005,"[""0392-5005"", ""2239-4222""]","[0392-5005, 2239-4222]",journal,,,,0.0
94499,4306533961,URBANISTICA INFORMAZIONI,0392-5005,"[""2239-4222""]",[2239-4222],journal,,[],4310318000.0,254.0
36096,4393915164,La Legislazione penale (Online),0393-134X,"[""0393-134X"", ""2421-552X""]","[0393-134X, 2421-552X]",journal,,,,0.0
50374,4306518029,LA LEGISLAZIONE PENALE,0393-134X,"[""2421-552X""]",[2421-552X],journal,,[],,138.0


In [31]:
# try:
#     dups_data2 = {}
#     num_sources_verify = []
#     works_count_map = df_midjournal.set_index('journal_id', verify_integrity=True)['works_count']
#     for issn, row in tqdm(df_mult.set_index('issn', verify_integrity=True).iterrows(), total=len(df_mult)):
#         this_issn_data = []
#         sources = get_all_sources_db(issn, db.session)
#         num_sources_verify.append({
#             'issn': issn,
#             'num_sources_in_openalex': row['num_sources_in_openalex'],
#             'num_sources_db': len(sources),
#         })
#         for source in sources:
#             fuzzratio = rapidfuzz.fuzz.ratio(source.display_name, row['title'], processor=rapidfuzz.utils.default_process)
#             this_issn_data.append({
#                 'source_id': source.id,
#                 'fuzzratio': fuzzratio,
#                 'works_count': works_count_map[source.id]
#             })
#         dups_data2[issn] = this_issn_data
# finally:
#     db.session.close()

In [32]:
#go through each of these above (groupby issn, which should be the issnl), and dedup. delete any which aren't issnls (maybe check them for data later).
#add missing sources from data file (any issnl that doesn't have a source_id identified by issnl)
#this should leave us with no duplicate issns
#then figure out how to resolve any issns that resolve to multiple

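Decision: just merge all of them — within each group of sources sharing an ISSN-L, keep one source and merge the others into it.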

In [44]:
%%time
# Merge every group of sources that share the same ISSN-L as primary ISSN.
# Within each group the survivor is the source with the most works (ties broken
# by lowest journal_id); every other source is pointed at it via merge_into_id
# and logged in issn_audit_20240321.update5. Everything is committed in one
# transaction at the end.
try:
    num_updated = 0
    n_issn = issnldups['issn'].nunique()
    not_actually_dups = []
    for issn, gbdf in tqdm(issnldups.set_index('journal_id', verify_integrity=True).groupby('issn'), total=n_issn):
        now = datetime.now(timezone.utc).isoformat()
        if len(gbdf) < 2:
            # come back to these
            not_actually_dups.append(issn)
            continue
        # sort by [works_count:desc, source_id:asc]; rows with a missing
        # works_count sort last (pandas' default na_position).
        gbdf = gbdf.sort_values(['works_count', 'journal_id'], ascending=[False, True])

        # After set_index the row label (.name / .index) is the journal_id.
        merge_into_id = int(gbdf.iloc[0].name)
        for id_to_merge in gbdf.iloc[1:].index:
            id_to_merge = int(id_to_merge)
            source = db.session.query(Source).filter_by(journal_id=id_to_merge).one()
            source.merge_into_id = merge_into_id
            source.merge_into_date = now
            source.updated_date = now

            db.session.add(source)

            note = "merged because duplicate issnl."
            # Flag merged-away sources that carry a publisher or alternate
            # titles, since that information may be worth copying to the
            # survivor. Missing values are filled with 0 so they are falsy.
            row = gbdf.loc[id_to_merge].fillna(value=0)
            if row['publisher_id'] or row['alternate_titles']:
                note += " may have additional info."

            sq = """INSERT INTO issn_audit_20240321.update5
                    (source_id, updated_date, merge_into_id, note)
                    VALUES(:source_id, :now, :merge_into_id, :note);"""
            db.session.execute(text(sq), {
                'source_id': source.id,
                'now': now,
                'merge_into_id': merge_into_id,
                'note': note,
            })

            # sq = """update issn_audit_20240321.issn_ic_datafile_202402 set resolved = true
            #     where "submitted_1348-0278" = :issn"""
            # db.session.execute(text(sq), {
            #     'issn': issn,
            # })

            num_updated += 1
    db.session.commit()
    print(f"{num_updated} updated")
finally:
    # close() must actually be called: it releases the connection and discards
    # anything left uncommitted if the loop failed part-way.
    db.session.close()

100%|██████████| 15/15 [00:08<00:00,  1.78it/s]


17 updated
CPU times: user 240 ms, sys: 37.7 ms, total: 278 ms
Wall time: 8.79 s


In [45]:
# ISSNs the merge loop skipped because their group held fewer than two sources.
not_actually_dups

[]

In [30]:
def manual_merge(id_to_merge, merge_into_id, issn, note="manual merge."):
    """Stage the merge of one source into another and record it for the audit.

    Sets the merge fields on the source ``id_to_merge``, inserts a row into
    ``issn_audit_20240321.update5`` and marks the data-file row(s) submitted
    for ``issn`` as resolved. Nothing is committed here: the caller must
    commit ``db.session`` for the changes to persist.

    Args:
        id_to_merge: journal_id of the source being merged away.
        merge_into_id: journal_id of the source that survives.
        issn: submitted ISSN whose data-file row(s) are flagged resolved.
        note: free-text note stored with the audit row.

    Raises:
        Whatever ``Query.one()`` raises when ``id_to_merge`` does not match
        exactly one source.
    """
    timestamp = datetime.now(timezone.utc).isoformat()

    merged_source = db.session.query(Source).filter_by(journal_id=id_to_merge).one()
    merged_source.merge_into_id = merge_into_id
    merged_source.merge_into_date = timestamp
    merged_source.updated_date = timestamp
    db.session.add(merged_source)

    # Audit trail: one row per merged-away source.
    audit_insert = """INSERT INTO issn_audit_20240321.update5
            (source_id, updated_date, merge_into_id, note)
            VALUES(:source_id, :now, :merge_into_id, :note);"""
    audit_params = {
        'source_id': merged_source.id,
        'now': timestamp,
        'merge_into_id': merge_into_id,
        'note': note,
    }
    db.session.execute(text(audit_insert), audit_params)

    # Flag the corresponding data-file row(s) as handled.
    mark_resolved = """update issn_audit_20240321.issn_ic_datafile_202402 set resolved = true
        where "submitted_1348-0278" = :issn"""
    db.session.execute(text(mark_resolved), {'issn': issn})

In [None]:
# Inspect the duplicate sources whose primary ISSN is 0387-1185.
issnldups[issnldups['issn']=='0387-1185']

Unnamed: 0,journal_id,display_name,issn,issns,issns_text_array,type,country_code,alternate_titles,publisher_id,works_count
46595,4386621727,Transactions of the Architectural Institute of...,0387-1185,"[""0387-1185"", ""2433-0027""]","[0387-1185, 2433-0027]",journal,,,4320801000.0,
153235,2764951900,Transactions of the Architectural Institute of...,0387-1185,"[""2433-0035""]",[2433-0035],journal,JP,[Transactions of the Architectural Institute o...,4320801000.0,


In [37]:
# List every ISSN that maps to ISSN-L 0387-1185.
issn_to_issnl[issn_to_issnl=='0387-1185']

issn
0387-1185    0387-1185
2433-0027    0387-1185
2433-0035    0387-1185
Name: issnl, dtype: object

In [35]:
# Look up the data-file record submitted for ISSN 2433-0035.
df_issnl_file[df_issnl_file['submitted_issn']=='2433-0035']

Unnamed: 0,index,submitted_issn,issn,issnl,category,medium,frequency,country,center,language,start,end,title,comment,resolved,cluster_title,num_sources_resolve
17688,141933,2433-0035,2433-0035,0387-1185,Register,Online,Annual,JAPAN,Japan,Multiple languages,1965,1967,"Nihon Kenchiku Gakkai ronbun hokokushu, gogai,...",,False,Nihon Kenchiku Gakkai ronbun hokokushu/Nihon K...,1.0


In [38]:
# Merge source 4386621727 into 2764951900 for ISSN-L 0387-1185.
try:
    manual_merge(4386621727, 2764951900, '0387-1185')
    # manual_merge only stages its changes on the session; commit them here,
    # otherwise closing the session below discards them.
    db.session.commit()
finally:
    db.session.close()