# update5 - TBD

In [1]:
import sys

In [2]:
sys.path.append('../..')

In [3]:
from datetime import datetime, timezone
import json
from tqdm import tqdm
from pathlib import Path
import gzip

In [4]:
import pandas as pd
import numpy as np

In [5]:
import rapidfuzz

In [6]:
from app import db
from models import Source, ISSNtoISSNL
from sqlalchemy import text
from sqlalchemy.orm import Load
from sqlalchemy.exc import MultipleResultsFound

140618705400192: loading merged_into_institutions_dict
140618705400192: loading valid concept IDs
140618705400192: loading valid topic IDs


In [38]:
from cleanup.util import make_request, paginate_openalex


In [8]:
%%time
df_issnl_file = pd.read_sql_query('select * from issn_audit_20240321.issn_ic_datafile_expanded_202402', db.engine)

CPU times: user 1.33 s, sys: 283 ms, total: 1.61 s
Wall time: 4.87 s


In [9]:
%%time
sq = """select journal_id, display_name, issn, issns, issns_text_array, type, country_code, alternate_titles, publisher_id
    from mid.journal
    where merge_into_id is null"""
df_midjournal = pd.read_sql_query(sq, db.engine)

CPU times: user 1.83 s, sys: 76.4 ms, total: 1.91 s
Wall time: 2.92 s


In [10]:
print(len(df_midjournal))
df_midjournal.dropna(subset='issn', inplace=True)
print(len(df_midjournal))

261662
148415


In [11]:
smap = {}
for source_id, issn_list in tqdm(df_midjournal.set_index('journal_id', verify_integrity=True)['issns_text_array'].items(), total=len(df_midjournal)):
    for issn in issn_list:
        if issn in smap:
            smap[issn].append(source_id)
        else:
            smap[issn] = [source_id]

  0%|          | 0/148415 [00:00<?, ?it/s]

100%|██████████| 148415/148415 [00:00<00:00, 412052.52it/s] 


In [12]:
d = []
for issn, source_list in smap.items():
    d.append({
        'issn': issn,
        'num_sources': len(source_list),
    })
_df = pd.DataFrame(d)
_df['num_sources'].value_counts()
df_issnl_file['num_sources_resolve'] = df_issnl_file['issns'].map(_df.set_index('issn', verify_integrity=True)['num_sources'])
df_issnl_file['num_sources_resolve'].fillna(value=0, inplace=True)

In [13]:
actual_works_count = []
subset = {issn: source_list for issn, source_list in smap.items() if len(source_list) > 1}
for issn, source_list in tqdm(subset.items()):
    if len(source_list) > 1:
        for source_id in source_list:
            url = f'https://api.openalex.org/works?filter=locations.source.id:S{source_id}'
            params = {'mailto': 'jportenoy@ourresearch.org',
                      'select': 'id',
                      'per-page': 1}
            r = make_request(url, params=params)
            this_c = r.json()['meta']['count']
            actual_works_count.append({
                'issn': issn,
                'source_id': source_id,
                'works_count': this_c,
            })
df_actual_works_count = pd.DataFrame(actual_works_count)

  0%|          | 0/8478 [00:00<?, ?it/s]

100%|██████████| 8478/8478 [53:05<00:00,  2.66it/s]  


In [14]:
df_actual_works_count.to_pickle('../data/issn_audit_20240301/df_actual_works_count.pickle')

In [18]:
df_actual_works_count[['source_id', 'works_count']].drop_duplicates(subset='source_id')

Unnamed: 0,source_id,works_count
0,4306509401,28717
1,4210198465,0
6,4210204044,0
7,4210226537,55
8,4210219813,1
...,...,...
17166,2765054898,24
17167,4210203785,478
17168,4210186117,0
17169,4386621981,7


In [19]:
x = df_actual_works_count[['source_id', 'works_count']].drop_duplicates()
x = x.set_index('source_id', verify_integrity=True)['works_count']
df_midjournal['works_count'] = df_midjournal['journal_id'].map(x)

In [22]:
df_midjournal.dropna(subset='works_count').sort_values('issn')

Unnamed: 0,journal_id,display_name,issn,issns,issns_text_array,type,country_code,alternate_titles,publisher_id,works_count
77457,4306504666,Boletín de la Academia Colombiana,0001-3773,"[""0001-3773""]",[0001-3773],journal,,[],,84.0
55224,4306504665,Boletín de la Academia colombiana,0001-3773,"[""0001-3773""]",[0001-3773],journal,,[],,326.0
52174,4306500404,Acta medica Philippina,0001-6071,"[""0001-6071"", ""2094-9278""]","[0001-6071, 2094-9278]",journal,,[],4.310320e+09,1369.0
40108,4210231482,Acta Medica Philippina,0001-6071,"[""2094-9278"",""0001-6071""]","[2094-9278, 0001-6071]",journal,PH,[The National Health Science journal],4.310320e+09,173.0
58242,4306500432,Acta Obstetrica et Gynaecologica Japonica,0001-6330,"[""0001-6330""]",[0001-6330],journal,,[],,3794.0
...,...,...,...,...,...,...,...,...,...,...
35607,4392785010,Journal of Physical Mathematics & its Applicat...,3033-3652,"[""3033-3652"", ""3033-3652""]","[3033-3652, 3033-3652]",journal,,,,5.0
7618,4393248715,Millatuna Jurnal Studi Islam,3046-4986,"[""3046-4986"", ""3046-4986""]","[3046-4986, 3046-4986]",journal,,,,6.0
55418,4393923809,THRIVE Health Science Journal,3046-9430,"[""3046-9430"", ""3046-9430""]","[3046-9430, 3046-9430]",journal,,,,0.0
35562,4210203606,The Year book of pulmonary disease,8756-3452,"[""8756-3452""]",[8756-3452],journal,US,"[Yearbook of pulmonary disease, Pulmonary dise...",4.310321e+09,0.0


In [25]:
from models.source import DELETED_SOURCE_ID

ImportError: cannot import name 'DELETED_SOURCE_ID' from 'models.source' (/home/hasone/code/ourresearch/openalex-guts/cleanup/notebooks/../../models/source.py)

In [26]:
DELETED_SOURCE_ID = 4317411217

In [33]:
%%time
invalid = df_issnl_file[df_issnl_file['valid_issns'].isna()]
try:
    num_updated = 0
    for issn in tqdm(invalid['issns'].values, total=len(invalid)):
        now = datetime.now(timezone.utc).isoformat()
        q_source = db.session.query(Source).filter_by(merge_into_id=None).filter_by(issn=issn)
        for source in q_source.all():
            source.merge_into_id = DELETED_SOURCE_ID
            source.merge_into_date = now
            source.updated_date = now

            db.session.add(source)

            note = f"invalid issn {issn}"
            sq = """INSERT INTO issn_audit_20240321.update5
                    (source_id, updated_date, merge_into_id, note)
                    VALUES(:source_id, :now, :merge_into_id, :note);"""
            db.session.execute(text(sq), {
                'source_id': source.id,
                'now': now,
                'merge_into_id': DELETED_SOURCE_ID,
                'note': note,
            })
            num_updated += 1
    db.session.commit()
    print(f"{num_updated} updated")
finally:
    db.session.close()

100%|██████████| 1518/1518 [06:04<00:00,  4.16it/s]


770 updated
CPU times: user 13.2 s, sys: 602 ms, total: 13.8 s
Wall time: 6min 18s


In [34]:
%%time
try:
    num_updated = 0
    for issn in tqdm(invalid['issns'].values, total=len(invalid)):
        sq = """update issn_audit_20240321.issn_ic_datafile_202402 set resolved = true
            where "submitted_1348-0278" = :issn"""
        db.session.execute(text(sq), {
            'issn': issn,
        })
    db.session.commit()
except:
    db.session.close()

100%|██████████| 1518/1518 [01:20<00:00, 18.85it/s]

CPU times: user 1.26 s, sys: 236 ms, total: 1.5 s
Wall time: 1min 20s





In [39]:
%%time
url = "https://api.openalex.org/sources"
params = {
    'mailto': 'jportenoy@ourresearch.org',
    'group_by': 'issn',
}
data = []
for r in paginate_openalex(url, params=params):
    data.extend(r.json()['group_by'])

CPU times: user 8.66 s, sys: 288 ms, total: 8.95 s
Wall time: 4min 32s


In [40]:
df_sources = pd.DataFrame(data)
_rename = {
    'key': 'issn',
    'count': 'num_sources_in_openalex',
}
df_openalex_issn_sources_count = df_sources.rename(columns=_rename).drop(columns=['key_display_name'])

In [41]:
df_openalex_issn_sources_count['num_sources_in_openalex'].value_counts()

num_sources_in_openalex
1    209737
2      8239
3       149
4         6
5         2
Name: count, dtype: int64

In [42]:
sq = """SELECT "index", "submitted_1348-0278", issn, issnl, category, medium, frequency, country, center, "language", "start", "end", title, "comment", resolved
FROM issn_audit_20240321.issn_ic_datafile_202402;"""
df_file_simple = pd.read_sql_query(sq, db.engine)

In [43]:
df_issn_to_issnl = pd.read_sql_query("""select * from mid.journal_issn_to_issnl""", db.engine)

In [45]:
issn_to_issnl = df_issn_to_issnl.set_index('issn', verify_integrity=True)['issnl']

In [47]:
df_mult = df_openalex_issn_sources_count[df_openalex_issn_sources_count['num_sources_in_openalex']>1]
len(df_mult)

8396

In [49]:
df_mult['issnl'] = df_mult['issn'].map(issn_to_issnl)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mult['issnl'] = df_mult['issn'].map(issn_to_issnl)


In [51]:
df_mult['is_issnl'] = df_mult['issn'] == df_mult['issnl']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mult['is_issnl'] = df_mult['issn'] == df_mult['issnl']


In [52]:
df_mult['is_issnl'].value_counts()

is_issnl
True     4220
False    4176
Name: count, dtype: int64

In [53]:
df_mult[df_mult['is_issnl']==False]

Unnamed: 0,issn,num_sources_in_openalex,issnl,is_issnl
2126,0012-0200,2,1864-0303,False
3201,0018-9855,2,0016-9420,False
3402,0020-2460,2,2973-7842,False
6496,0035-2047,2,3001-5898,False
7476,0040-7550,3,2212-0521,False
...,...,...,...,...
216413,3009-7770,2,1110-7774,False
216415,3009-7835,2,1110-8460,False
216417,3009-7886,2,3009-7487,False
216636,3024-8671,2,0854-9559,False


In [57]:
issn_to_issnl[issn_to_issnl=='0016-9420']

issn
0016-9420    0016-9420
0018-9855    0016-9420
0435-8600    0016-9420
2195-0237    0016-9420
Name: issnl, dtype: object

In [60]:
df_midjournal[df_midjournal['issn']=='0016-9420']

Unnamed: 0,journal_id,display_name,issn,issns,issns_text_array,type,country_code,alternate_titles,publisher_id,works_count
249631,1002663397,GRURRR. Gewerblicher Rechtsschutz und Urheberr...,0016-9420,"[""0016-9420"", ""0018-9855"", ""0435-8600"", ""2195-...","[0016-9420, 0018-9855, 0435-8600, 2195-0237]",journal,DE,[GRUR],,29.0


In [61]:
df_midjournal[df_midjournal['issn']=='0018-9855']

Unnamed: 0,journal_id,display_name,issn,issns,issns_text_array,type,country_code,alternate_titles,publisher_id,works_count
17428,4695008,IIC - International Review of Intellectual Pro...,0018-9855,"[""2195-0237"", ""0018-9855""]","[2195-0237, 0018-9855]",journal,DE,[International review of intellectual property...,4310320000.0,1420.0


In [66]:
def get_all_sources_db(issn, session):
    sources1 = session.query(Source).options(Load(Source).lazyload('*')).filter_by(merge_into_id=None).filter_by(issn=issn).all()
    sources2 = session.query(Source).options(Load(Source).lazyload('*')).filter_by(merge_into_id=None).filter(Source.issns.contains(issn)).all()
    return set(sources1 + sources2)


In [73]:
issn = '0018-9855'
sources = get_all_sources_db(issn, db.session)

In [74]:
sources

{<Source ( http://localhost:5007/S1002663397?apiurls ) 1002663397 GRURRR. Gewerblicher Rechtsschutz und Urheberrecht, Rechtsprechungs-Report/GRUR-DVD/GRUR-CD/IIC/Gewerblicher Rechtsschutz und Urheberrecht/Gewerblicher Rechtsschutz und Urheberrecht. Internationaler Teil>,
 <Source ( http://localhost:5007/S4695008?apiurls ) 4695008 IIC - International Review of Intellectual Property and Competition Law>}

In [71]:
df_mult['title'] = df_mult['issn'].map(df_issnl_file.set_index('issns', verify_integrity=True)['cluster_title'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mult['title'] = df_mult['issn'].map(df_issnl_file.set_index('issns', verify_integrity=True)['cluster_title'])


In [77]:

title = df_mult[df_mult['issn']==issn].iloc[0]['title']

In [79]:
for source in sources:
    fuzzratio = rapidfuzz.fuzz.ratio(source.display_name, title, processor=rapidfuzz.utils.default_process)
    print(fuzzratio)

100.0
29.92700729927007


In [86]:
try:
    dups_data = {}
    num_sources_verify = []
    works_count_map = df_midjournal.set_index('journal_id', verify_integrity=True)['works_count']
    for issn, row in tqdm(df_mult.set_index('issn', verify_integrity=True).iterrows(), total=len(df_mult)):
        this_issn_data = []
        sources = get_all_sources_db(issn, db.session)
        num_sources_verify.append({
            'issn': issn,
            'num_sources_in_openalex': row['num_sources_in_openalex'],
            'num_sources_db': len(sources),
        })
        for source in sources:
            fuzzratio = rapidfuzz.fuzz.ratio(source.display_name, row['title'], processor=rapidfuzz.utils.default_process)
            this_issn_data.append({
                'source_id': source.id,
                'fuzzratio': fuzzratio,
                'works_count': works_count_map[source.id]
            })
        dups_data[issn] = this_issn_data
finally:
    db.session.close()

100%|██████████| 8396/8396 [13:27<00:00, 10.40it/s]


In [99]:
dups_data

{'0001-3773': [{'source_id': 4306504666,
   'fuzzratio': 100.0,
   'works_count': 84.0},
  {'source_id': 4306504665, 'fuzzratio': 100.0, 'works_count': 326.0}],
 '0001-6071': [{'source_id': 4210231482,
   'fuzzratio': 65.67164179104478,
   'works_count': 173.0},
  {'source_id': 4306500404,
   'fuzzratio': 65.67164179104478,
   'works_count': 1369.0}],
 '0001-6330': [{'source_id': 4306500432,
   'fuzzratio': 100.0,
   'works_count': 3794.0},
  {'source_id': 2754219712, 'fuzzratio': 100.0, 'works_count': 75.0}],
 '0001-6365': [{'source_id': 4306500438,
   'fuzzratio': 100.0,
   'works_count': 833.0},
  {'source_id': 4306500439, 'fuzzratio': 100.0, 'works_count': 765.0}],
 '0001-7302': [{'source_id': 4306500610,
   'fuzzratio': 23.529411764705888,
   'works_count': 149.0},
  {'source_id': 4306500609,
   'fuzzratio': 23.529411764705888,
   'works_count': 2132.0}],
 '0001-7884': [{'source_id': 43570703,
   'fuzzratio': 100.0,
   'works_count': 258.0},
  {'source_id': 50311296,
   'fuzzratio

In [88]:
x = pd.DataFrame(num_sources_verify)
x[x['num_sources_db']!=x['num_sources_in_openalex']]

Unnamed: 0,issn,num_sources_in_openalex,num_sources_db
828,0794-5698,2,1
829,0794-9316,2,1
1190,1099-0046,2,1
1970,1443-458X,2,1
2063,1522-9580,2,1
2082,1529-1529,2,1
3013,1826-9850,2,3
3475,1936-9956,2,1
4341,2091-0762,2,1
6028,2356-9816,2,1


In [112]:
issnldups = df_midjournal.dropna(subset='issn')
issnldups = issnldups[issnldups['issn'].duplicated(keep=False)]
print(len(issnldups))
print(issnldups['issn'].nunique())
print(issnldups['journal_id'].nunique())

8430
4170
8430


In [113]:
#go through each of these above (groupby issn, which should be the issnl), and dedup. delete any which aren't issnls (maybe check them for data later).
#add missing sources from data file (any issnl that doesn't have a source_id identified by issnl)
#this should leave us with no duplicate issns
#then figure out how to resolve any issns that resolve to multiple

In [135]:
%%time
try:
    num_updated = 0
    n_issn = issnldups['issn'].nunique()
    no_dd_found = []
    for issn, gbdf in tqdm(issnldups.set_index('journal_id', verify_integrity=True).groupby('issn'), total=n_issn):
        now = datetime.now(timezone.utc).isoformat()
        try:
            dd = dups_data[issn]
        except KeyError:
            no_dd_found.append(issn)
            continue
        if len(dd) != 2:
            # come back to these
            continue
        dd.sort(key=lambda x: x['works_count'], reverse=True)
        conditions = [
            dd[0]['works_count'] >= dd[1]['works_count'],
            dd[0]['fuzzratio'] >= 95,
        ]
        if all(conditions):
            # all conditions met. merge second source into first
            id_to_merge = dd[1]['source_id']
            merge_into_id = dd[0]['source_id']
            source = db.session.query(Source).filter_by(journal_id=id_to_merge).one()
            source.merge_into_id = merge_into_id
            source.merge_into_date = now
            source.updated_date = now

            db.session.add(source)

            note = f"merged because lower works count."
            row = gbdf.loc[id_to_merge].fillna(value=0)
            if row['publisher_id'] or row['alternate_titles']:
                note += " may have additional info."

            sq = """INSERT INTO issn_audit_20240321.update5
                    (source_id, updated_date, merge_into_id, note)
                    VALUES(:source_id, :now, :merge_into_id, :note);"""
            db.session.execute(text(sq), {
                'source_id': source.id,
                'now': now,
                'merge_into_id': merge_into_id,
                'note': note,
            })

            sq = """update issn_audit_20240321.issn_ic_datafile_202402 set resolved = true
                where "submitted_1348-0278" = :issn"""
            db.session.execute(text(sq), {
                'issn': issn,
            })

            num_updated += 1
    db.session.commit()
    print(f"{num_updated} updated")
finally:
    db.session.close

100%|██████████| 4170/4170 [22:34<00:00,  3.08it/s] 


2337 updated
CPU times: user 44.7 s, sys: 1.62 s, total: 46.3 s
Wall time: 23min 20s


In [117]:
dd

[{'source_id': 4306504665, 'fuzzratio': 100.0, 'works_count': 326.0},
 {'source_id': 4306504666, 'fuzzratio': 100.0, 'works_count': 84.0}]

In [126]:
row

Unnamed: 0_level_0,display_name,issn,issns,issns_text_array,type,country_code,alternate_titles,publisher_id,works_count
journal_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4306504665,Boletín de la Academia colombiana,0001-3773,"[""0001-3773""]",[0001-3773],journal,,[],,326.0
4306504666,Boletín de la Academia Colombiana,0001-3773,"[""0001-3773""]",[0001-3773],journal,,[],,84.0


In [125]:
if row['publisher_id'].fillna(value=0).iloc[0]:
    print('d')

In [127]:
row.iloc[0]['alternate_titles']

[]

In [137]:
import pickle
outfp = Path('../data/issn_audit_20240301/dups_data.pickle')
outfp.write_bytes(pickle.dumps(dups_data))

893665