# update1 post-mortem

In [1]:
import sys

In [2]:
sys.path.append('../..')

In [52]:
from datetime import datetime, timezone
import json
from tqdm import tqdm
from pathlib import Path
import gzip

In [4]:
import pandas as pd
import numpy as np

In [5]:
from app import db
from models import Source
from sqlalchemy import text
from sqlalchemy.orm import Load
from sqlalchemy.exc import MultipleResultsFound

139992277139840: loading merged_into_institutions_dict
139992277139840: loading valid concept IDs
139992277139840: loading valid topic IDs


In [6]:
sq = """select * from issn_audit_20240321.update1"""
df_update1 = pd.read_sql_query(sq, db.engine)

In [7]:
df_update1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127588 entries, 0 to 127587
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   issnl             127588 non-null  object        
 1   old_issns         127588 non-null  object        
 2   new_issns         127588 non-null  object        
 3   old_issnl         127588 non-null  object        
 4   old_display_name  127588 non-null  object        
 5   new_display_name  127588 non-null  object        
 6   updated_date      127588 non-null  datetime64[ns]
 7   source_id         127588 non-null  int64         
 8   old_type          127583 non-null  object        
 9   new_type          127588 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(8)
memory usage: 9.7+ MB


## revert changes to source type; shouldn't have done that

In [8]:
def get_source_from_db_by_source_id(source_id: int, session) -> Source:
    source = session.query(Source).options(Load(Source).lazyload('*')).filter_by(journal_id=source_id).one_or_none()
    return source


In [10]:
df_subset_revert_type = df_update1[df_update1['old_type']!=df_update1['new_type']]
len(df_subset_revert_type)

7871

In [12]:
df_subset_revert_type['source_id'].duplicated().sum()

0

In [13]:
df_subset_revert_type = df_subset_revert_type.set_index('source_id')[['old_type', 'new_type']]

In [14]:
df_subset_revert_type

Unnamed: 0_level_0,old_type,new_type
source_id,Unnamed: 1_level_1,Unnamed: 2_level_1
4210189819,book series,journal
4210192866,book series,journal
4210230996,ebook platform,journal
49489055,book series,journal
188676343,book series,journal
...,...,...
4210203369,conference,journal
4210222037,conference,journal
4220651191,conference,journal
4210187508,book series,journal


In [15]:
# update db
try:
    for source_id, row in tqdm(df_subset_revert_type.iterrows(), total=len(df_subset_revert_type)):
        updated_date = datetime.now(timezone.utc).isoformat()

        # make updates
        source = get_source_from_db_by_source_id(source_id, db.session)
        old_type = source.type
        new_type = row['old_type']
        source.type = new_type
        source.updated_date = updated_date
        db.session.add(source)

            # insert into log table
        sq = """INSERT INTO issn_audit_20240321.update1_postmortem_revert_type
            (source_id, old_type, new_type, updated_date)
            VALUES(:source_id, :old_type, :new_type, :updated_date);
            """
        db.session.execute(text(sq), {
            'source_id': source.id,
            'old_type': old_type,
            'new_type': new_type,
            'updated_date': updated_date,
        }) 

        # commit for each source_id
        db.session.commit()
finally:
    db.session.close()

100%|██████████| 7871/7871 [20:02<00:00,  6.55it/s]


In [16]:
vc = df_update1['source_id'].value_counts()
(vc>1).value_counts()

count
False    125275
True       1150
Name: count, dtype: int64

In [17]:
df_update1[df_update1['source_id'].isin(vc[vc>1].index)].to_csv('../data/tmpupdate1dupsourceids.csv')

In [19]:
%%time
sq = """select * from issn_audit_20240321.issn_ic_datafile_202402"""
df_datafile = pd.read_sql_query(sq, db.engine)

CPU times: user 1.02 s, sys: 223 ms, total: 1.25 s
Wall time: 2.44 s


In [20]:
df_datafile

Unnamed: 0,index,submitted_1348-0278,issn,issnl,category,medium,frequency,country,center,language,start,end,title,comment
0,110805,0000-0205,0000-0205,0000-0205,Register,Print,Biennial,UNITED STATES,United States,English,1968,2014,Who's who in American politics,
1,193047,0000-1112,0000-1112,0000-1112,Register,Print,Annual,UNITED STATES,United States,English,1984,1988,Audio video market place,
2,114080,0000-992X,0000-992X,,Legacy,,No attempt to code,LEGACY_BOW,International,,||||,||||,,
3,116440,0001-0197,0001-0197,0001-0197,Register,Print,No determinable frequency,CANADA,Canada,English,1945,1998,A A R N newsletter,
4,29546,0001-026X,0001-026X,0001-026X,Register,Print,Quarterly,UNITED STATES,United States,English,1956,1978,AAUP bulletin,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214601,117732,8756-8926,8756-8926,8756-8926,Register,Print,Semiannual,UNITED STATES,United States,English,1983,1987,New York Law School human rights annual,
214602,55656,8756-9264,8756-9264,8756-9264,Register,Print,Bimonthly,UNITED STATES,United States,English,1985,1986,Clinical progress in electrophysiology and pacing,
214603,35716,8756-971X,8756-971X,8756-971X,Register,Print,Quarterly,UNITED STATES,United States,English,1985,9999,Journal of the American Mosquito Control Assoc...,
214604,150972,8756-9728,8756-9728,1938-9507,Register,Print,Quarterly,UNITED STATES,United Kingdom,English,1984,9999,Project management journal,


In [21]:
# get dict of issnl to issn
x = df_datafile.dropna(subset=['issnl'])
x = x[x['category'].isin(['Register', 'Work', 'Free', 'Validation Request'])]
x = x[['issn', 'issnl']]
issnl_to_issn = {}
ignore = [
    '0000-0000',
]
x = x[~(x['issnl'].isin(ignore))]
for name, gbdf in x.groupby('issnl'):
    if name in ignore:
        continue
    issnl_to_issn[name] = gbdf['issn'].tolist()
len(issnl_to_issn)

140984

In [22]:
num_hits = {
    issnl: sum(issn in issnl_to_issn.keys() for issn in v)
    for issnl, v in issnl_to_issn.items()
}

In [23]:
num_hits = pd.Series(num_hits)

In [24]:
num_hits[num_hits==0]

0012-7477    0
0047-1321    0
0069-8458    0
0074-4433    0
0074-4441    0
            ..
3009-612X    0
3009-7320    0
3009-7487    0
3009-7797    0
3025-6984    0
Length: 860, dtype: int64

In [28]:
issnl_to_issn['3009-7487']

['3009-7886']

In [53]:
# get sources from snapshot
datadir = Path('/mnt/d/openalex-snapshot-20240227/data/sources/')
jsonl_files = list(datadir.rglob('*.gz'))
sources = {}
for fp in tqdm(jsonl_files):
    with gzip.open(fp, 'r') as sources_jsonl:
        for source_json in sources_jsonl:
            if not source_json.strip():
                continue

            source = json.loads(source_json)
            source_id = int(source['id'].lower().replace('https://openalex.org/s', ''))
            sources[source_id] = source

100%|██████████| 8/8 [00:22<00:00,  2.77s/it]


In [25]:
dupsource = df_update1[df_update1['source_id'].isin(vc[vc>1].index)]

In [31]:
dupsource.sort_index().sort_values('source_id')

Unnamed: 0,issnl,old_issns,new_issns,old_issnl,old_display_name,new_display_name,updated_date,source_id,old_type,new_type
11343,0178-4919,"[""0022-3476"", ""1085-8695"", ""1097-6833""]","[""0178-4919""]",0022-3476,The Journal of pediatrics,Zeitschrift für Kinderheilkunde. Originalien,2024-03-30 20:23:04.615623,3774022,journal,journal
3333,0022-3476,"[""1085-8695"", ""0178-4919"", ""0022-3476"", ""1097-...","[""0022-3476"", ""1085-8695"", ""1097-6833""]",0022-3476,The Journal of Pediatrics,The Journal of pediatrics,2024-03-30 20:23:04.615623,3774022,journal,journal
17365,0375-9865,"[""0038-092X"", ""1471-1257""]","[""0375-9865""]",0038-092X,Solar energy (Print),Journal of Solar Energy Science and Engineering,2024-03-30 20:23:04.615623,5745031,journal,journal
5660,0038-092X,"[""0375-9865"", ""1471-1257"", ""0038-092X""]","[""0038-092X"", ""1471-1257""]",0038-092X,Solar Energy,Solar energy (Print),2024-03-30 20:23:04.615623,5745031,journal,journal
31853,1166-8253,"[""0761-8425"", ""1776-2588""]","[""1166-8253""]",0761-8425,Revue des maladies respiratoires,Revue des maladies respiratoires. Supplément,2024-03-30 20:23:04.615623,6672359,journal,journal
...,...,...,...,...,...,...,...,...,...,...
126486,2988-7062,"[""2252-3642""]","[""2988-7062""]",2252-3642,Jurnal Health Society,Jurnal Health Society (online),2024-03-30 20:23:04.615623,4390963349,journal,journal
127481,3031-0636,"[""3026-2550""]","[""3031-0636""]",3026-2550,Jurnal Konatif (Online),Jurnal Konatif,2024-03-30 20:23:04.615623,4390963368,journal,journal
127318,3026-2550,"[""3031-0636"", ""3026-2550""]","[""3026-2550""]",3031-0636,Jurnal Konatif Jurnal Ilmiah Pendidikan,Jurnal Konatif (Online),2024-03-30 20:23:04.615623,4390963368,journal,journal
124649,2964-0598,"[""2963-0150""]","[""2964-0598""]",2963-0150,Medical Journal of Nusantara,Medical Journal of Nusantara (Online),2024-03-30 20:23:04.615623,4390963369,journal,journal


In [43]:
counts = dupsource.groupby('source_id').size().sort_values()
counts

source_id
3774022       2
4387288362    2
4387288364    2
4387288500    2
4387288516    2
             ..
119771102     3
193215455     3
148074306     3
4210191320    4
4210222880    5
Length: 1150, dtype: int64

In [47]:
dupsource[dupsource['source_id']==4210191320]

Unnamed: 0,issnl,old_issns,new_issns,old_issnl,old_display_name,new_display_name,updated_date,source_id,old_type,new_type
26212,0999-9809,"[""2606-6645"",""2649-7611"",""0999-9809"",""2649-7395""]","[""0999-9809""]",0999-9809,"Journal de médecine légale, droit médical, vic...","Journal de médecine légale, droit médical, vic...",2024-03-30 20:23:04.615623,4210191320,journal,journal
99816,2606-6645,"[""0999-9809""]","[""2606-6645""]",0999-9809,"Journal de médecine légale, droit médical, vic...","Droit, santé et société",2024-03-30 20:23:04.615623,4210191320,journal,journal
104090,2649-7395,"[""2606-6645""]","[""2649-7395""]",2606-6645,"Droit, santé et société","Conflits, catastrophes, situations humanitaires",2024-03-30 20:23:04.615623,4210191320,journal,journal
104091,2649-7611,"[""2649-7395""]","[""2649-7611""]",2649-7395,"Conflits, catastrophes, situations humanitaires",Criminalistique,2024-03-30 20:23:04.615623,4210191320,journal,journal


In [56]:
try:
    for source_id, gbdf in tqdm(dupsource.groupby('source_id'), total=dupsource['source_id'].nunique()):
        updated_date = datetime.now(timezone.utc).isoformat()

        # revert_row = gbdf.sort_index().iloc[0]

        source_snapshot = sources[source_id]
        source = get_source_from_db_by_source_id(source_id, db.session)
        if source is None:
            raise KeyError(f"issnl {issnl} not found in db")
        old_issnl = source.issn
        old_display_name = source.display_name
        old_issns = source.issns
        new_issnl = source_snapshot['issn_l']
        new_issns = source_snapshot['issn']
        new_display_name = source_snapshot['display_name']

        # make updates
        source.display_name = new_display_name
        source.issn = new_issnl
        source.issns = json.dumps(new_issns)
        source.issns_text_array = new_issns
        source.updated_date = updated_date
        db.session.add(source)

        # insert into log table
        sq = """INSERT INTO issn_audit_20240321.update1_postmortem_revert_dups
            (source_id, old_issnl, new_issnl, old_issns, new_issns, old_display_name, new_display_name, updated_date)
            VALUES(:source_id, :old_issnl, :new_issnl, :old_issns, :new_issns, :old_display_name, :new_display_name, :updated_date);
            """
        db.session.execute(text(sq), {
            'source_id': source_id,
            'old_issnl': old_issnl,
            'new_issnl': new_issnl,
            'old_issns': old_issns,
            'new_issns': json.dumps(new_issns),
            'old_display_name': old_display_name,
            'new_display_name': new_display_name,
            'updated_date': updated_date,
        }) 

        # commit for each source_id
        db.session.commit()
finally:
    db.session.close()

100%|██████████| 1150/1150 [02:21<00:00,  8.11it/s]


In [74]:
%%time
sq = """select * from issn_audit_20240321.issn_ic_datafile_202402"""
df_datafile = pd.read_sql_query(sq, db.engine)

CPU times: user 444 ms, sys: 163 ms, total: 607 ms
Wall time: 2.37 s


In [75]:
df_unresolved = df_datafile[df_datafile['resolved']==False]
len(df_unresolved)

21053

categories of unresolved:

1. ISSN-L gives multiple sources in OpenAlex (`issn_audit_20240321.update1_issnl_multfound`)
2. ISSN-L gives no sources in OpenAlex (`issn_audit_20240321.update1_issnl_notfound`)
3. multiple ISSN-Ls give the same source in OpenAlex (`issn_audit_20240321.update1.reverted is true`)

In [76]:
%%time
sq1 = """select issnl from issn_audit_20240321.update1_issnl_multfound"""
df_cat1 = pd.read_sql_query(sq1, db.engine)
sq2 = """select issnl from issn_audit_20240321.update1_issnl_notfound"""
df_cat2 = pd.read_sql_query(sq2, db.engine)
sq3 = """select issnl from issn_audit_20240321.update1 where reverted is true"""
df_cat3 = pd.read_sql_query(sq3, db.engine)


CPU times: user 10.4 ms, sys: 9.9 ms, total: 20.3 ms
Wall time: 761 ms


In [77]:
print(df_cat1['issnl'].duplicated().sum())
print(df_cat2['issnl'].duplicated().sum())
print(df_cat3['issnl'].duplicated().sum())

0
0
0


In [78]:
df_unresolved['cat1'] = df_unresolved['issnl'].isin(df_cat1['issnl'].values)
df_unresolved['cat2'] = df_unresolved['issnl'].isin(df_cat2['issnl'].values)
df_unresolved['cat3'] = df_unresolved['issnl'].isin(df_cat3['issnl'].values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unresolved['cat1'] = df_unresolved['issnl'].isin(df_cat1['issnl'].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unresolved['cat2'] = df_unresolved['issnl'].isin(df_cat2['issnl'].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unresolved['cat3'] = df_unresolved['issnl'].isin

In [79]:
df_unresolved['num_cats'] = df_unresolved.apply(lambda x: sum([x['cat1'], x['cat2'], x['cat3']]), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unresolved['num_cats'] = df_unresolved.apply(lambda x: sum([x['cat1'], x['cat2'], x['cat3']]), axis=1)


In [80]:
df_unresolved['num_cats'].value_counts()

num_cats
1    19914
0     1139
Name: count, dtype: int64

In [81]:
for c in ['cat1', 'cat2', 'cat3']:
    print(sum(df_unresolved[c]))

7707
9666
2541
