In [270]:
import pandas as pd
from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
from tqdm import tqdm
from multiprocess import Process, Manager
from typing import List, Dict
import json
from joblib import delayed, Parallel
import itertools

In [271]:
df = pd.read_csv("aps-dataset-citations-2020.csv")

In [272]:
df.head(5)

Unnamed: 0,citing_doi,cited_doi
0,10.1103/PhysRevSeriesI.11.215,10.1103/PhysRevSeriesI.1.1
1,10.1103/PhysRevSeriesI.12.121,10.1103/PhysRevSeriesI.1.166
2,10.1103/PhysRevSeriesI.7.93,10.1103/PhysRevSeriesI.1.166
3,10.1103/PhysRevSeriesI.16.267,10.1103/PhysRevSeriesI.2.35
4,10.1103/PhysRevSeriesI.17.65,10.1103/PhysRevSeriesI.2.112


In [273]:
def get_metadata(dois: List[str]):
    doi_prefix = "https://doi.org/"
    dois_cat = [doi_prefix + doi for doi in dois]

    dois_queries = []
    max_size = 25
    while dois_cat: 
        chunk, dois_cat = dois_cat[:max_size], dois_cat[max_size:]
        query = "|".join(chunk)
        dois_queries.append(query)

    all_results = []
    for q in dois_queries:
        results = Works().filter(doi=q).get()
        filtered_results = []
        for r in results:
            found_first_author = False
            for fa_idx in range(len(r["authorships"])):
                if r["authorships"][fa_idx]['author_position'] == 'first':
                    found_first_author = True
                    break
                    
            sorted_keywords = [m["keyword"] for m in sorted(r["keywords"], key=lambda x: x['score'], reverse=True)]
            all_authors = [m["raw_author_name"] for m in r["authorships"]]
            sorted_authors_af = [m["raw_affiliation_string"] for m in r["authorships"]] 
            filtered_results.append({
                "doi": str(r["doi"]),
                "id": str(r["id"]),
                "abstract": str(r["abstract"]),
                "title": str(r["title"]),
                "publication_date": str(r["publication_date"]),
                "type": str(r["type"]),
                "type_crossref": str(r["type_crossref"]),
                "keywords": sorted_keywords,
                "first_author": str(r["authorships"][fa_idx]['raw_author_name']) if found_first_author else "",
                "host_organization_name": str(r["primary_location"]["source"]["host_organization_name"]),
                "all_authors": all_authors,
                "all_authors_af": sorted_authors_af,
                "language": r["language"],
            })
        all_results.extend(filtered_results)
    return all_results

In [274]:
#df_sample = df.sample(n=10_000, random_state=0)
unique_dois = df["citing_doi"].tolist() + df["cited_doi"].tolist()
unique_dois = list(set(unique_dois))

In [275]:
print(len(unique_dois))

668383


In [276]:
#res

In [277]:
N = len(unique_dois)
divs = 10
piece_size = int(N / divs) + 1

In [278]:
all_data = []
for div in range(divs):
    unique_dois_piece = unique_dois[div * piece_size:(div + 1) * piece_size]
    
    N_piece = len(unique_dois_piece)
    NUM_JOBS = 4
    job_piece_size = int(N_piece / NUM_JOBS) + 1
    chunks = [unique_dois_piece[job_piece_size * i:job_piece_size * (i + 1)] for i in range(NUM_JOBS)]
    
    metadata = Parallel(n_jobs=NUM_JOBS)(delayed(get_metadata)(d) for d in tqdm(chunks, total=len(chunks)))
    metadata = list(itertools.chain.from_iterable(metadata))
    print(len(metadata))
    all_data.extend(metadata)

100%|████████████████████████████████████████████| 4/4 [00:00<00:00, 875.27it/s]


66838


100%|████████████████████████████████████████████| 4/4 [00:00<00:00, 929.18it/s]


66831


100%|████████████████████████████████████████████| 4/4 [00:00<00:00, 533.24it/s]


66832


100%|███████████████████████████████████████████| 4/4 [00:00<00:00, 5427.76it/s]


66835


100%|███████████████████████████████████████████| 4/4 [00:00<00:00, 1140.76it/s]


66837


100%|███████████████████████████████████████████| 4/4 [00:00<00:00, 9692.21it/s]


66835


100%|███████████████████████████████████████████| 4/4 [00:00<00:00, 1340.25it/s]


66837


100%|████████████████████████████████████████████| 4/4 [00:00<00:00, 787.15it/s]


66838


100%|███████████████████████████████████████████| 4/4 [00:00<00:00, 1321.87it/s]


66835


100%|███████████████████████████████████████████| 4/4 [00:00<00:00, 4971.03it/s]


66827


In [279]:
df_metadata = pd.DataFrame(all_data)

In [280]:
len(df_metadata["doi"].unique()) #[df_metadata["abstract"] != ""]

668272

In [281]:
df_metadata.to_csv("metadata.csv")

In [287]:
df_metadata.to_pickle("metadata.pkl", compression={'method': 'gzip', 'compresslevel': 3, 'mtime': 1})

In [289]:
df_metadata["title"].value_counts()

title
Editorial: Promoting Inclusive and Respectful Communications                                                                                                                                                                                                                                                                                                                                                                                                                                                            16
Discussion                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             

In [285]:
df_metadata[df_metadata["abstract"].notna()]

Unnamed: 0,doi,id,abstract,title,publication_date,type,type_crossref,keywords,first_author,host_organization_name,all_authors,all_authors_af,language
0,https://doi.org/10.1103/revmodphys.60.701,https://openalex.org/W1969695378,Recent advances in the study of solid surfaces...,"Interaction of positron beams with surfaces, t...",1988-07-01,article,journal-article,"[positron beams, thin films, surfaces, interac...",Peter J. Schultz,American Physical Society,"[Peter J. Schultz, K. G. Lynn]","[Department of Physics, The University of West...",en
1,https://doi.org/10.1103/physrevc.20.820,https://openalex.org/W1976520037,A unified shell-model description of nuclear d...,Unified shell-model description of nuclear def...,1979-08-01,article,journal-article,"[nuclear deformation, shell-model]",P. Federman,American Institute of Physics,"[P. Federman, S. Pittel]","[Instituto de Fisica, Universidad Nacional Aut...",en
2,https://doi.org/10.1103/physrevlett.90.188103,https://openalex.org/W2032865683,Counterion atmospheres condensed onto charged ...,Counterion Distribution around DNA Probed by S...,2003-05-08,article,journal-article,"[dna probed, scattering, x-ray]",R. Das,American Physical Society,"[R. Das, T. T. Mills, L. W. Kwok, G. S. Maskel...","[Department of Biochemistry, Stanford Universi...",en
3,https://doi.org/10.1103/physrevd.72.025006,https://openalex.org/W1976726898,We present an extension of the twistor-motivat...,Twistor-inspired construction of electroweak v...,2005-07-13,article,journal-article,[twistor-inspired],Zvi Bern,American Physical Society,"[Zvi Bern, Darren Forde, David A. Kosower, Pie...","[Department of Physics and Astronomy, UCLA, Lo...",en
4,https://doi.org/10.1103/physrevstab.6.072001,https://openalex.org/W1974569415,We present measurements of dark currents and x...,"Dark current, breakdown, and magnetic field ef...",2003-07-30,article,journal-article,"[mhz cavity, magnetic field effects, magnetic ...",J. Norem,American Physical Society,"[J. Norem, V. Wu, A. Moretti, M. Popovic, Z. Q...","[HEP Division, Argonne National Laboratory, Ar...",en
...,...,...,...,...,...,...,...,...,...,...,...,...,...
668340,https://doi.org/10.1103/physrev.74.1315,https://openalex.org/W1988389384,Simultaneous alpha-particle bombardments of ${...,Radioactive Arsenic Isotopes,1948-11-15,article,journal-article,"[isotopes, arsenic]",D.A. McCown,American Institute of Physics,"[D.A. McCown, L.L. Woodward, M.L. Pool]","[THE OHIO STATE UNIVERSITY COLUMBUS, OHIO, THE...",en
668341,https://doi.org/10.1103/physrevd.91.064031,https://openalex.org/W2048986354,We prove that adiabatic regularization and DeW...,Equivalence of adiabatic and DeWitt-Schwinger ...,2015-03-11,article,journal-article,[dewitt-schwinger],Adrián del Río,American Physical Society,"[Adrián del Río, José Navarro-Salas]","[Departamento de Fisica Teorica, IFIC, Centro ...",en
668342,https://doi.org/10.1103/physrevlett.61.2179,https://openalex.org/W2089423254,A laboratory experiment searching for intermed...,Limit on the Strength of Intermediate-Range Fo...,1988-11-07,article,journal-article,"[isospin, forces, intermediate-range]",R. Cowsik,American Physical Society,"[R. Cowsik, N. Krishnan, S. N. Tandon, C. S. U...","[Tata Institute of Fundamental Research, Bomba...",en
668343,https://doi.org/10.1103/physrevb.86.184506,https://openalex.org/W2062382806,"Striped phases, in which spin, charge, and pai...",Determinant quantum Monte Carlo study of the e...,2012-11-07,article,journal-article,"[quantum, determinant]",Rubem Mondaini,American Physical Society,"[Rubem Mondaini, Tao Ying, Thereza Paiva, Rich...","[Instituto de Fisica, Universidade Federal do ...",en
