# update3 post-mortem

In [1]:
import sys

In [2]:
sys.path.append('../..')

In [3]:
from datetime import datetime, timezone
import json
from tqdm import tqdm
from pathlib import Path
import gzip

In [4]:
import pandas as pd
import numpy as np

In [5]:
from app import db
from models import Source, ISSNtoISSNL
from sqlalchemy import text
from sqlalchemy.orm import Load
from sqlalchemy.exc import MultipleResultsFound

140259440157056: loading merged_into_institutions_dict
140259440157056: loading valid concept IDs
140259440157056: loading valid topic IDs


In [12]:
%%time
df_issnl_file = pd.read_excel('../data/issn_audit_20240301/ISSN-Ls_openalex.xlsx', sheet_name='Sheet')

CPU times: user 38.9 s, sys: 157 ms, total: 39.1 s
Wall time: 39.1 s


In [13]:
df_issnl_file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215085 entries, 0 to 215084
Data columns (total 39 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   Original line number  215085 non-null  int64 
 1   ISSNs                 215085 non-null  object
 2   Valid ISSNs           213567 non-null  object
 3   ISSN-L                213567 non-null  object
 4   Diagnostics           145124 non-null  object
 5   Cluster Title         213487 non-null  object
 6   Print issn            176267 non-null  object
 7   Print category        176267 non-null  object
 8   Print medium          176267 non-null  object
 9   Print frequency       176243 non-null  object
 10  Print country         176267 non-null  object
 11  Print center          176267 non-null  object
 12  Print language        176110 non-null  object
 13  Print start           176267 non-null  object
 14  Print end             176267 non-null  object
 15  Print title      

In [15]:
col_rename = {
    colname: colname.lower().replace('-', '').replace(' ', '_')
    for colname in df_issnl_file.columns
}
df_issnl_file.rename(columns=col_rename, inplace=True)

In [17]:
df_issnl_file.set_index('issns', verify_integrity=True, inplace=True)

In [19]:
%%time
df_issnl_file.to_sql('issn_ic_datafile_expanded_202402',
                     db.engine,
                     schema='issn_audit_20240321',
                     index=True,
                     chunksize=10000,
                     method='multi')

CPU times: user 1min 33s, sys: 655 ms, total: 1min 34s
Wall time: 4min 27s


215085

In [26]:
%%time
df_issnl_file_fromdb = pd.read_sql_query('select * from issn_audit_20240321.issn_ic_datafile_expanded_202402', db.engine)

CPU times: user 1.29 s, sys: 241 ms, total: 1.53 s
Wall time: 3.38 s


In [27]:
issn_to_issnl_fromdb = df_issnl_file_fromdb[['valid_issns', 'issnl']]
print(len(issn_to_issnl_fromdb))
issn_to_issnl_fromdb.dropna(inplace=True)
print(len(issn_to_issnl_fromdb))
issn_to_issnl_fromdb.drop_duplicates(inplace=True)
print(len(issn_to_issnl_fromdb))

215085
213567
213383


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  issn_to_issnl_fromdb.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  issn_to_issnl_fromdb.drop_duplicates(inplace=True)


In [33]:
issn_to_issnl_fromdb.rename(columns={'valid_issns': 'issn'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  issn_to_issnl_fromdb.rename(columns={'valid_issns': 'issn'}, inplace=True)


In [30]:
df_issnl = pd.read_sql_query("""select * from mid.journal_issn_to_issnl""", db.engine)

In [34]:
m = issn_to_issnl_fromdb.merge(df_issnl, how='outer', on=['issn', 'issnl'], indicator=True)

In [36]:
m['_merge'].value_counts()

_merge
both          213381
right_only       108
left_only          2
Name: count, dtype: int64

In [40]:
try:
    for _, row in m[m['_merge']=='left_only'].iterrows():
        now = datetime.now(timezone.utc).isoformat()
        o = ISSNtoISSNL(issn=row['issn'], issnl=row['issn'], updated_date=now)
        db.session.add(o)
    db.session.commit()
finally:
    db.session.close()

In [6]:
%%time
sq = """select journal_id, display_name, issn, issns, type, country_code, alternate_titles, publisher_id
    from mid.journal
    where merge_into_id is null"""
df_midjournal = pd.read_sql_query(sq, db.engine)

CPU times: user 1.43 s, sys: 66 ms, total: 1.49 s
Wall time: 2.35 s


In [7]:
df_midjournal.dropna(subset='issn', inplace=True)

In [8]:
df_midjournal.info()

<class 'pandas.core.frame.DataFrame'>
Index: 148415 entries, 1 to 261660
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   journal_id        148415 non-null  int64  
 1   display_name      148415 non-null  object 
 2   issn              148415 non-null  object 
 3   issns             148415 non-null  object 
 4   type              148290 non-null  object 
 5   country_code      109641 non-null  object 
 6   alternate_titles  118625 non-null  object 
 7   publisher_id      67149 non-null   float64
dtypes: float64(1), int64(1), object(6)
memory usage: 10.2+ MB


In [9]:
df_midjournal['issn'].duplicated().sum()

4260

In [13]:
df_midjournal['no_issnl_found'] = ~(df_midjournal['issn'].isin(df_issnl['issn'].values))
df_midjournal['no_issnl_found'].value_counts()

no_issnl_found
False    144547
True       3868
Name: count, dtype: int64

In [14]:
df_midjournal[df_midjournal['no_issnl_found']]

Unnamed: 0,journal_id,display_name,issn,issns,type,country_code,alternate_titles,publisher_id,no_issnl_found
82,4391422086,Proceedings of the Association for Japanese Li...,1531-5533,"[""1531-5533""]",journal,,,,True
112,4391633437,Co-Catalyst Journal of Science Education Resea...,3026-1597,"[""3026-1597""]",journal,,,4.310315e+09,True
239,4391422089,Nihonkai Cetology,0918-3930,"[""0918-3930"", ""2435-3760""]",journal,,,,True
250,4391633469,Metrik Serial Teknologi dan Sains,2774-2989,"[""2774-2989""]",journal,,,,True
251,4391633474,MANTRA Jurnal Sastra Indonesia (Sastra Bahasa ...,2987-8969,"[""2987-8969""]",journal,,,,True
...,...,...,...,...,...,...,...,...,...
259612,4210178947,Proceedings of the Aristotelian Society (Hardb...,0066-7372,"[""0066-7372"",""0066-7373""]",journal,,[],4.310321e+09,True
260162,4386621720,Les cahiers du numérique,1469-3380,"[""1469-3380""]",journal,,,,True
260183,4306524739,Protokolle zur Bibel,1996-0042,"[""2412-2467""]",journal,,[],,True
260811,4387287589,Vjesnik dalmatinskih arhiva (Online),2757-0932,"[""2757-0932"", ""2806-8459""]",journal,,,,True


In [19]:
df_midjournal[df_midjournal['issn'].duplicated(keep=False)].sort_values('issn')

Unnamed: 0,journal_id,display_name,issn,issns,type,country_code,alternate_titles,publisher_id
54910,4306504665,Boletín de la Academia colombiana,0001-3773,"[""0001-3773""]",journal,,[],
136919,4306504666,Boletín de la Academia Colombiana,0001-3773,"[""0001-3773""]",journal,,[],
46084,4306500404,Acta medica Philippina,0001-6071,"[""0001-6071"", ""2094-9278""]",journal,,[],4.310320e+09
79744,4210231482,Acta Medica Philippina,0001-6071,"[""2094-9278"",""0001-6071""]",journal,PH,[The National Health Science journal],4.310320e+09
60458,4306500432,Acta Obstetrica et Gynaecologica Japonica,0001-6330,"[""0001-6330""]",journal,,[],
...,...,...,...,...,...,...,...,...
91498,4387277837,Parasites Hosts and Diseases,2982-5164,"[""2982-5164"", ""2982-6799""]",journal,,,4.310319e+09
102645,4387281090,International Journal of Psychiatric Trainees,3005-3870,"[""3005-3870"", ""2957-4080""]",journal,,,
78775,4387292761,International Journal of Psychiatric Trainees,3005-3870,"[""3005-3870""]",journal,,,
89862,50342286,Yearbook of Pulmonary Disease,8756-3452,"[""8756-3452""]",journal,,[],4.310321e+09


In [20]:
%%time
# Source IDs that have already been through the updates should be protected
sq = """select source_id from issn_audit_20240321.update1 where reverted is false;"""
df1 = pd.read_sql_query(sq, db.engine)

sq = """select source_id from issn_audit_20240321.update2 where reverted is false;"""
df2 = pd.read_sql_query(sq, db.engine)

df_protected = pd.concat([df1, df2])

CPU times: user 77.9 ms, sys: 0 ns, total: 77.9 ms
Wall time: 837 ms


In [22]:
df_protected

Unnamed: 0,source_id
0,4210234080
1,4387277910
2,4306499931
3,4210177666
4,4210223324
...,...
8854,4393920245
8855,4393920246
8856,4393920247
8857,4393920248


In [28]:
df_midjournal['is_protected'] = df_midjournal['journal_id'].isin(df_protected['source_id'].values)

In [30]:
df_midjournal['is_protected'].value_counts()

is_protected
True     134124
False     14291
Name: count, dtype: int64

In [6]:
%%time
sq = """select * from issn_audit_20240321.issn_ic_datafile_202402"""
df_datafile = pd.read_sql_query(sq, db.engine)

CPU times: user 599 ms, sys: 127 ms, total: 726 ms
Wall time: 1.98 s


In [7]:
# get dict of issnl to issn
x = df_datafile.dropna(subset=['issnl'])
x = x[x['category'].isin(['Register', 'Work', 'Free', 'Validation Request'])]
x = x[['issn', 'issnl']]
issnl_to_issn = {}
ignore = [
    '0000-0000',
]
x = x[~(x['issnl'].isin(ignore))]
for name, gbdf in x.groupby('issnl'):
    if name in ignore:
        continue
    issnl_to_issn[name] = gbdf['issn'].tolist()
len(issnl_to_issn)

140984

In [17]:
issn = '0001-5113'
try:
    source = db.session.query(Source).options(Load(Source).lazyload('*')).filter_by(issn=issn).all()
    if source is None:
        source = db.session.query(Source).options(Load(Source).lazyload('*')).filter(Source.issns.contains(issn)).all()
finally:
    db.session.close()


In [18]:
source

[<Source ( http://localhost:5007/S86406033?apiurls ) 86406033 Acta Adriatica>,
 <Source ( http://localhost:5007/S2764458511?apiurls ) 2764458511 Acta Adriatica: International Journal of Marine Sciences>]