# update4 post-mortem

* mid.journal.issns and mid.journal.issns_text_array should have identical information
* every ISSNL in mid.journal.issn should be the first item in the lists mid.journal.issns, and mid.journal.issns_text_array
* there should be no duplicate ISSNL (no duplicate mid.journal.issn)
* every ISSN should resolve to one Source (only one entry in mid.journal has the ISSN in issns/issns_text_array)

In [1]:
import sys

In [2]:
sys.path.append('../..')

In [3]:
from datetime import datetime, timezone
import json
from tqdm import tqdm
from pathlib import Path
import gzip

In [4]:
import pandas as pd
import numpy as np

In [21]:
import rapidfuzz

In [5]:
from app import db
from models import Source, ISSNtoISSNL
from sqlalchemy import text
from sqlalchemy.orm import Load
from sqlalchemy.exc import MultipleResultsFound

139712131965312: loading merged_into_institutions_dict
139712131965312: loading valid concept IDs
139712131965312: loading valid topic IDs


In [6]:
%%time
df_issnl_file = pd.read_sql_query('select * from issn_audit_20240321.issn_ic_datafile_expanded_202402', db.engine)

CPU times: user 1.34 s, sys: 185 ms, total: 1.53 s
Wall time: 2.61 s


In [7]:
%%time
sq = """select journal_id, display_name, issn, issns, issns_text_array, type, country_code, alternate_titles, publisher_id
    from mid.journal
    where merge_into_id is null"""
df_midjournal = pd.read_sql_query(sq, db.engine)

CPU times: user 1.95 s, sys: 33.5 ms, total: 1.99 s
Wall time: 2.44 s


In [8]:
print(len(df_midjournal))
df_midjournal.dropna(subset='issn', inplace=True)
print(len(df_midjournal))

261661
148415


In [9]:
%%time
sq = """select journal_id, paper_count, citation_count from mid.citation_journals_mv;"""
df_source_counts = pd.read_sql_query(sq, db.engine)

CPU times: user 300 ms, sys: 0 ns, total: 300 ms
Wall time: 563 ms


In [10]:
df_midjournal = df_midjournal.merge(df_source_counts, how='left', on='journal_id')

In [11]:
df_midjournal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148415 entries, 0 to 148414
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   journal_id        148415 non-null  int64  
 1   display_name      148415 non-null  object 
 2   issn              148415 non-null  object 
 3   issns             148415 non-null  object 
 4   issns_text_array  148415 non-null  object 
 5   type              148290 non-null  object 
 6   country_code      109641 non-null  object 
 7   alternate_titles  118625 non-null  object 
 8   publisher_id      67149 non-null   float64
 9   paper_count       116515 non-null  float64
 10  citation_count    116515 non-null  float64
dtypes: float64(3), int64(1), object(7)
memory usage: 12.5+ MB


### mid.journal.issns and mid.journal.issns_text_array should have identical information

In [12]:
s1 = df_midjournal['issns'].apply(json.loads)
s2 = df_midjournal['issns_text_array']
(s1 == s2).value_counts()

True    148415
Name: count, dtype: int64

We're good!

### every ISSNL in mid.journal.issn should appear in the lists mid.journal.issns, and mid.journal.issns_text_array

In [13]:
x = df_midjournal.apply(lambda row: row['issn'] == row['issns_text_array'][0], axis=1)
x.value_counts()

True     146815
False      1600
Name: count, dtype: int64

In [12]:
df_midjournal[x==False]

Unnamed: 0,journal_id,display_name,issn,issns,issns_text_array,type,country_code,alternate_titles,publisher_id
257,4210197708,Energy research letters,2652-6433,"[""2652-6514"", ""2652-6433""]","[2652-6514, 2652-6433]",journal,AU,[],
1016,4210231708,eBMJ,1468-5833,"[""0267-0623"", ""1468-5833""]","[0267-0623, 1468-5833]",journal,GB,[eBritish medical journal (London)],4.310320e+09
1136,4210201839,Intrinsically disordered proteins,2169-0707,"[""2169-0693"", ""2169-0707""]","[2169-0693, 2169-0707]",journal,US,"[IDP, Intrinsically Disord Proteins]",4.310321e+09
1206,4210191895,Polilingvialʹnostʹ i transkulʹturnye praktiki,2618-897X,"[""2618-8988"",""2618-897X""]","[2618-8988, 2618-897X]",journal,RU,[Polylinguality and transcultural practices],4.310313e+09
1449,155963401,Nations and nationalism,1354-5078,"[""1345-5078"", ""1469-8129"", ""1354-5078""]","[1345-5078, 1469-8129, 1354-5078]",journal,GB,[],4.310321e+09
...,...,...,...,...,...,...,...,...,...
260070,2764864708,Sudan journal of medical sciences,1858-5051,"[""1858-8530"", ""1858-5051""]","[1858-8530, 1858-5051]",journal,SD,[],4.310319e+09
260754,4210209763,"Revista respaldo : educación, tecnología y des...",1659-3464,"[""2215-4345"", ""1659-3464"", ""2215-4337""]","[2215-4345, 1659-3464, 2215-4337]",journal,CR,[],4.310319e+09
260982,4210192115,Ecología austral/Ecología austral,0327-5477,"[""1667-7838"", ""0327-5477"", ""1667-782X""]","[1667-7838, 0327-5477, 1667-782X]",journal,AR,[],
261107,989140491,Klinik psikofarmakoloji bülteni,1302-9657,"[""1017-7833"", ""1302-9657""]","[1017-7833, 1302-9657]",journal,TR,[Bulletin of clinical psychopharmacology],4.310321e+09


In [15]:
df_issnl_file['issns'].duplicated().sum()

0

In [19]:
invalid = df_issnl_file[df_issnl_file['valid_issns'].isna()]

In [20]:
invalid

Unnamed: 0,issns,original_line_number,valid_issns,issnl,diagnostics,cluster_title,print_issn,print_category,print_medium,print_frequency,...,disk_category,disk_medium,disk_frequency,disk_country,disk_center,disk_language,disk_start,disk_end,disk_title,disk_comment
646,0855-1448,636,,,0855-1448: ISSN category fre,,,,,,...,,,,,,,,,,
1486,2091-0916,1466,,,2091-0916: ISSN category fre,,,,,,...,,,,,,,,,,
2003,0794-4721,1978,,,0794-4721: ISSN category val,,,,,,...,,,,,,,,,,
2914,1936-9956,2878,,,1936-9956: ISSN category val,,,,,,...,,,,,,,,,,
2979,2091-1459,2943,,,2091-1459: ISSN category val,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215072,7519-1735,215073,,,7519-1735: Unknown or invalid ISSN code 7519-1735,,,,,,...,,,,,,,,,,
215081,8941-1991,215082,,,8941-1991: Unknown or invalid ISSN code 8941-1991,,,,,,...,,,,,,,,,,
215082,9999-9870,215083,,,9999-9870: Unknown or invalid ISSN code 9999-9870,,,,,,...,,,,,,,,,,
215083,9999-9997,215084,,,9999-9997: Unknown or invalid ISSN code 9999-9997,,,,,,...,,,,,,,,,,


In [11]:
df_midjournal['issn'].duplicated().sum()

4260

In [12]:
%%time
sq = """select journal_id, paper_count, citation_count from mid.citation_journals_mv;"""
df_source_counts = pd.read_sql_query(sq, db.engine)

CPU times: user 282 ms, sys: 17.7 ms, total: 300 ms
Wall time: 886 ms


In [14]:
df_midjournal = df_midjournal.merge(df_source_counts, how='left', on='journal_id')

In [18]:
df_midjournal['paper_count'].isna().sum()

31900

### all issn-ls must resolve to one source

In [19]:
df_issnl_dups = df_midjournal[df_midjournal['issn'].duplicated(keep=False)]
print(len(df_issnl_dups))

8430


In [20]:
df_issnl_dups

Unnamed: 0,journal_id,display_name,issn,issns,type,country_code,alternate_titles,publisher_id,paper_count,citation_count
432,2754219712,Acta obstetrica et gynaecologica Japonica,0001-6330,"[""0001-6330""]",journal,JP,[],4.323253e+09,3872.0,35.0
773,4210205244,Perspectivas en nutricion humana,0124-4108,"[""0124-4108"",""2248-454X""]",journal,CO,[],4.310319e+09,126.0,116.0
775,4210211489,Jurnal Akuntansi Kontemporer: kajian ilmu akun...,2085-1189,"[""2085-1189"",""2685-9971""]",journal,ID,[Kajian ilmu akuntansi dan terapannya],4.310315e+09,61.0,25.0
864,4210234779,Diabetologia Notes de lecture,2100-0719,"[""2100-0719"",""2102-6246""]",journal,FR,[],4.310320e+09,44.0,3.0
1147,4210191895,Polilingvialʹnostʹ i transkulʹturnye praktiki,2618-897X,"[""2618-8988"",""2618-897X""]",journal,RU,[Polylinguality and transcultural practices],4.310313e+09,284.0,75.0
...,...,...,...,...,...,...,...,...,...,...
143577,4210178777,IRE transactions on electron devices,0096-2430,"[""0096-2430"",""2379-8661""]",journal,US,"[Electron devices, Transactions on electron de...",4.310320e+09,1008.0,4373.0
144381,4210204203,Bulletin international des sociétés de secours...,1816-9678,"[""1816-9678"",""2059-9196""]",journal,GB,[],4.310312e+09,1255.0,81.0
145906,4210203785,Arts and social sciences journal,2151-6200,"[""2151-6200""]",journal,US,[],4.310320e+09,403.0,576.0
147179,3035235436,Journal of Modern Power Systems and Clean Energy,2196-5420,"[""2196-5420"", ""2196-5625""]",journal,DE,[],4.310320e+09,1069.0,21442.0


In [20]:
%%time
# Source IDs that have already been through the updates should be protected
sq = """select source_id from issn_audit_20240321.update1 where reverted is false;"""
df1 = pd.read_sql_query(sq, db.engine)

sq = """select source_id from issn_audit_20240321.update2 where reverted is false;"""
df2 = pd.read_sql_query(sq, db.engine)

df_protected = pd.concat([df1, df2])

CPU times: user 77.9 ms, sys: 0 ns, total: 77.9 ms
Wall time: 837 ms


In [22]:
df_protected

Unnamed: 0,source_id
0,4210234080
1,4387277910
2,4306499931
3,4210177666
4,4210223324
...,...
8854,4393920245
8855,4393920246
8856,4393920247
8857,4393920248


In [28]:
df_midjournal['is_protected'] = df_midjournal['journal_id'].isin(df_protected['source_id'].values)

In [30]:
df_midjournal['is_protected'].value_counts()

is_protected
True     134124
False     14291
Name: count, dtype: int64

In [31]:
smap = {}
for source_id, issn_list in tqdm(df_midjournal.set_index('journal_id', verify_integrity=True)['issns_text_array'].items(), total=len(df_midjournal)):
    for issn in issn_list:
        if issn in smap:
            smap[issn].append(source_id)
        else:
            smap[issn] = [source_id]

# for issn in df_issnl_file['issns'].values:
#     s = df_midjournal['issns_text_array'].apply(lambda x: issn in x)
#     break

100%|██████████| 148415/148415 [00:00<00:00, 362349.94it/s]


In [34]:
d = []
for issn, source_list in smap.items():
    d.append({
        'issn': issn,
        'num_sources': len(source_list),
    })
_df = pd.DataFrame(d)
_df['num_sources'].value_counts()

num_sources
1    209916
2      8272
3       196
4         8
5         2
Name: count, dtype: int64

In [37]:
df_issnl_file['num_sources_resolve'] = df_issnl_file['issns'].map(_df.set_index('issn', verify_integrity=True)['num_sources'])

In [40]:
df_issnl_file['num_sources_resolve'].fillna(value=0, inplace=True)

In [41]:
df_issnl_file['num_sources_resolve'].value_counts()

num_sources_resolve
1.0    206164
2.0      8217
0.0       498
3.0       196
4.0         8
5.0         2
Name: count, dtype: int64

In [42]:
df_issnl_file[df_issnl_file['num_sources_resolve']==2]

Unnamed: 0,issns,original_line_number,valid_issns,issnl,diagnostics,cluster_title,print_issn,print_category,print_medium,print_frequency,...,disk_medium,disk_frequency,disk_country,disk_center,disk_language,disk_start,disk_end,disk_title,disk_comment,num_sources_resolve
65,1110-2098,66,1110-2098,1110-2098,1110-2098: Duplicates another ISSN in the file.,Menoufia Medical Journal /Menoufia Medical Jou...,1110-2098,Register,Print,Semiannual,...,,,,,,,,,,2.0
74,0966-1905,75,0966-1905,0966-1905,,BMUS bulletin,0966-1905,Register,Print,Quarterly,...,,,,,,,,,,2.0
132,2411-2933,130,2411-2933,2411-3123,2411-2933: Duplicates another ISSN in the file.,International journal for innovation education...,2411-3123,Register,Print,Monthly,...,,,,,,,,,,2.0
228,0889-325X,222,0889-325X,0889-325X,0889-325X: Duplicates another ISSN in the file.,ACI materials journal,0889-325X,Register,Print,Bimonthly,...,,,,,,,,,,2.0
229,1783-2446,223,1783-2446,1783-2446,1783-2446: Duplicates another ISSN in the file.,Journal of the European Society of Women in Th...,1783-2454,Register,Print,Annual,...,,,,,,,,,,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214580,2734-0759,214579,2734-0759,1225-0759,2734-0759: Duplicates another ISSN in the file.,Jaemu gwalli yeongu/Jae'mu gwanri yeon'gu,1225-0759,Register,Print,Quarterly,...,,,,,,,,,,2.0
214581,2734-0767,214580,2734-0767,1226-6787,2734-0767: Duplicates another ISSN in the file.,Misul chiryo yeongu/Misul ci'ryo yeon'gu,1226-6787,Register,Print,Semiannual,...,,,,,,,,,,2.0
214679,2764-1546,214679,2764-1546,2525-7374,2764-1546: Duplicates another ISSN in the file.,Braspen Journal,2525-7374,Register,Print,Quarterly,...,,,,,,,,,,2.0
215023,2815-5890,215024,2815-5890,2615-9783,2815-5890: Duplicates another ISSN in the file.,Vietnam Journal of Earth sciences/Vietnam Jour...,2615-9783,Register,Print,Quarterly,...,,,,,,,,,,2.0


In [44]:
df_midjournal[df_midjournal['issns_text_array'].apply(lambda x: '0966-1905' in x)]

Unnamed: 0,journal_id,display_name,issn,issns,issns_text_array,type,country_code,alternate_titles,publisher_id,paper_count,citation_count
25890,2764892950,BMUS Bulletin,0966-1905,"[""0966-1905""]",[0966-1905],journal,,[],4310320000.0,1118.0,330.0
33326,4210217177,BMUS bulletin,0966-1905,"[""0966-1905""]",[0966-1905],journal,GB,[British Medical Ultrasound Society bulletin],4310320000.0,1118.0,330.0


In [46]:
from cleanup.util import make_request

In [49]:
actual_works_count = {}
n = len([source_list for source_list in smap.values() if len(source_list) > 1])
for issn, source_list in tqdm(smap.items(), total=n):
    if len(source_list) > 1:
        actual_works_count[issn] = []
        for source_id in source_list:
            url = f'https://api.openalex.org/works?filter=locations.source.id:S{source_id}'
            params = {'mailto': 'jportenoy@ourresearch.org'}
            r = make_request(url, params=params)
            actual_works_count[issn].append((source_id, r.json()['meta']['count']))

12984it [06:21, 34.04it/s]                          


KeyboardInterrupt: 