In [2]:
import pandas as pd
from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
from tqdm import tqdm
from multiprocess import Process, Manager
from typing import List, Dict
import json
from joblib import delayed, Parallel
import itertools

In [3]:
df = pd.read_csv("aps-dataset-citations-2020.csv")

In [4]:
df.head(5)

Unnamed: 0,citing_doi,cited_doi
0,10.1103/PhysRevSeriesI.11.215,10.1103/PhysRevSeriesI.1.1
1,10.1103/PhysRevSeriesI.12.121,10.1103/PhysRevSeriesI.1.166
2,10.1103/PhysRevSeriesI.7.93,10.1103/PhysRevSeriesI.1.166
3,10.1103/PhysRevSeriesI.16.267,10.1103/PhysRevSeriesI.2.35
4,10.1103/PhysRevSeriesI.17.65,10.1103/PhysRevSeriesI.2.112


In [6]:
len(df)

8850333

In [40]:
def get_metadata(dois: List[str], is_id=False):
    dois_queries = []
    max_size = 25
    
    if is_id == False:
        doi_prefix = "https://doi.org/"
        dois_cat = [doi_prefix + doi for doi in dois]
        while dois_cat: 
            chunk, dois_cat = dois_cat[:max_size], dois_cat[max_size:]
            query = "|".join(chunk)
            dois_queries.append(query)
    else:
        while dois: 
            chunk, dois = dois[:max_size], dois[max_size:]
            dois_queries.append(chunk)

    all_results = []
    for q in dois_queries:
        if is_id == False:
            results = Works().filter(doi=q).get()
        else:
            results = Works()[q]
        filtered_results = []
        for r in results:
            try:
                found_first_author = False
                for fa_idx in range(len(r["authorships"])):
                    if r["authorships"][fa_idx]['author_position'] == 'first':
                        found_first_author = True
                        break
                        
                sorted_keywords = [m["keyword"] for m in sorted(r["keywords"], key=lambda x: x['score'], reverse=True)]
                all_authors = [m["raw_author_name"] for m in r["authorships"]]
                sorted_authors_af = [m["raw_affiliation_string"] for m in r["authorships"]] 
                filtered_results.append({
                    "doi": str(r["doi"]),
                    "id": str(r["id"]),
                    "abstract": str(r["abstract"]),
                    "title": str(r["title"]),
                    "publication_date": str(r["publication_date"]),
                    "type": str(r["type"]),
                    "type_crossref": str(r["type_crossref"]),
                    "keywords": sorted_keywords,
                    "first_author": str(r["authorships"][fa_idx]['raw_author_name']) if found_first_author else "",
                    #"host_organization_name": str(r["primary_location"]["source"]["host_organization_name"]) if "source" in r["primary_location"] and ,
                    "all_authors": all_authors,
                    "all_authors_af": sorted_authors_af,
                    "language": r["language"],
                    "referenced_works": r["referenced_works"],
                    "referenced_works_count": r["referenced_works_count"],
                    "related_works": r["related_works"],
                    "concepts": [(c["display_name"], c["score"]) for c in r["concepts"]] if r["concepts"] else [],
                    "is_accepted": r["primary_location"]["is_accepted"],
                    "is_published": r["primary_location"]["is_published"],
                })
            except:
                pass
        all_results.extend(filtered_results)
    return all_results

In [26]:
#df_sample = df.sample(n=10_000, random_state=0)
unique_dois = df["citing_doi"].tolist() + df["cited_doi"].tolist()
unique_dois = list(set(unique_dois))

In [9]:
print(len(unique_dois))

668383


In [276]:
#res

In [39]:
N = len(unique_dois)
divs = 10
piece_size = int(N / divs) + 1

In [41]:
all_data = []
for div in range(divs):
    unique_dois_piece = unique_dois[div * piece_size:(div + 1) * piece_size]
    
    N_piece = len(unique_dois_piece)
    NUM_JOBS = 4
    job_piece_size = int(N_piece / NUM_JOBS) + 1
    chunks = [unique_dois_piece[job_piece_size * i:job_piece_size * (i + 1)] for i in range(NUM_JOBS)]
    
    metadata = Parallel(n_jobs=NUM_JOBS)(delayed(get_metadata)(d) for d in tqdm(chunks, total=len(chunks)))
    metadata = list(itertools.chain.from_iterable(metadata))
    print(len(metadata))
    all_data.extend(metadata)

100%|██████████████████████████████████| 4/4 [00:00<00:00, 133.42it/s]


66833


100%|█████████████████████████████████| 4/4 [00:00<00:00, 5845.72it/s]


66836


100%|█████████████████████████████████| 4/4 [00:00<00:00, 6150.01it/s]


66834


100%|█████████████████████████████████| 4/4 [00:00<00:00, 1136.05it/s]


66831


100%|█████████████████████████████████| 4/4 [00:00<00:00, 8069.85it/s]


66838


100%|██████████████████████████████████| 4/4 [00:00<00:00, 297.07it/s]


66834


100%|█████████████████████████████████| 4/4 [00:00<00:00, 8966.98it/s]


66833


100%|█████████████████████████████████| 4/4 [00:00<00:00, 6574.14it/s]


66833


100%|█████████████████████████████████| 4/4 [00:00<00:00, 6538.28it/s]


66837


100%|█████████████████████████████████| 4/4 [00:00<00:00, 8890.95it/s]


66828


In [42]:
df_metadata = pd.DataFrame(all_data)

In [47]:
len(df_metadata["doi"].unique()) #[df_metadata["abstract"] != ""]

668263

In [44]:
df_metadata.to_csv("metadata.csv")

In [45]:
df_metadata.to_pickle("metadata.pkl", compression={'method': 'gzip', 'compresslevel': 3, 'mtime': 1})

In [289]:
df_metadata["title"].value_counts()

title
Editorial: Promoting Inclusive and Respectful Communications                                                                                                                                                                                                                                                                                                                                                                                                                                                            16
Discussion                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             

In [46]:
df_metadata[df_metadata["abstract"].notna()]

Unnamed: 0,doi,id,abstract,title,publication_date,type,type_crossref,keywords,first_author,all_authors,all_authors_af,language,referenced_works,referenced_works_count,related_works,concepts,is_accepted,is_published
0,https://doi.org/10.1103/physrevlett.26.192,https://openalex.org/W2050313003,Tunneling measurements on junctions between ve...,Spin-Dependent Tunneling into Ferromagnetic Ni...,1971-01-25,article,journal-article,"[nickel, spin-dependent]",P. M. Tedrow,"[P. M. Tedrow, R. Meservey]","[Francis Bitter National Magnet Laboratory, Ma...",en,"[https://openalex.org/W1966010108, https://ope...",12,"[https://openalex.org/W1998765608, https://ope...","[(Condensed matter physics, 0.8544545), (Quant...",False,False
1,https://doi.org/10.1103/physreve.92.012819,https://openalex.org/W1758342214,Inspired by the fact that people have diverse ...,Competition and cooperation among different pu...,2015-07-27,article,journal-article,"[different punishing strategies, competition, ...",Xiaojie Chen,"[Xiaojie Chen, Attila Szolnoki, Matjaž Perc]","[School of Mathematical Sciences, University o...",en,"[https://openalex.org/W748497126, https://open...",43,"[https://openalex.org/W2996115615, https://ope...","[(Public goods game, 0.79590267), (Competition...",False,False
2,https://doi.org/10.1103/physrev.123.968,https://openalex.org/W2040287172,The scattering matrix for compound nucleus pro...,Theory of Average Neutron Reaction Cross Secti...,1961-08-01,article,journal-article,"[cross sections, resonance]",P.A. Moldauer,[P.A. Moldauer],"[Argonne National Laboratory, Argonne, Illinois]",en,"[https://openalex.org/W1965366397, https://ope...",19,"[https://openalex.org/W2063585241, https://ope...","[(Physics, 0.75732374), (Resonance (particle p...",False,False
3,https://doi.org/10.1103/physrevlett.86.4536,https://openalex.org/W1973501299,"The instability, dynamics, and morphological t...",Templating of Thin Films Induced by Dewetting ...,2001-05-14,article,journal-article,"[thin films, surfaces]",Kajari Kargupta,"[Kajari Kargupta, Ashutosh Sharma]","[Department of Chemical Engineering, Indian In...",en,"[https://openalex.org/W1514074674, https://ope...",23,"[https://openalex.org/W2236704256, https://ope...","[(Dewetting, 0.96603024), (Instability, 0.7001...",False,False
4,https://doi.org/10.1103/physrevlett.105.235502,https://openalex.org/W1535924615,The energy of arbitrary graphene edge is deriv...,Graphene Edge from Armchair to Zigzag: The Ori...,2010-12-03,article,journal-article,"[nanotube chirality, graphene, edge]",Yuanyue Liu,"[Yuanyue Liu, Alex Dobrinsky, Boris I. Yakobson]",[Department of Mechanical Engineering & Materi...,en,"[https://openalex.org/W1992192511, https://ope...",25,"[https://openalex.org/W2318938206, https://ope...","[(Zigzag, 0.89218235), (Graphene, 0.8799213), ...",False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
668332,https://doi.org/10.1103/physrevc.91.055802,https://openalex.org/W2157950109,In the standard Big-Bang nucleosynthesis (BBN)...,Revised thermonuclear rate of<mml:math xmlns:m...,2015-05-08,article,journal-article,"[thermonuclear rate, of<mmlmath xmlnsmml=http/...",S. Q. Hou,"[S. Q. Hou, J. J. He, S. Kubono, Y. S. Chen]",[Key Laboratory of High Precision Nuclear Spec...,en,"[https://openalex.org/W1548963646, https://ope...",43,"[https://openalex.org/W2935759653, https://ope...","[(Nucleosynthesis, 0.6656209), (Physics, 0.634...",False,False
668333,https://doi.org/10.1103/physrev.184.1052,https://openalex.org/W2332752927,Angular distributions have been measured for t...,"Structure of<mml:math xmlns:mml=""http://www.w3...",1969-08-20,article,journal-article,[of<mmlmath xmlnsmml=http//wwww3org/1998/math/...,G. W. Phillips,"[G. W. Phillips, W. W. Jacobs]","[Department of Physics, University of Washingt...",en,"[https://openalex.org/W1619847663, https://ope...",29,"[https://openalex.org/W2068137917, https://ope...","[(Physics, 0.6926385), (Excited state, 0.62857...",False,False
668334,https://doi.org/10.1103/physrevb.60.11321,https://openalex.org/W2266492108,Infrared reflectance spectroscopy measurements...,"Optical properties of<mml:math xmlns:mml=""http...",1999-10-15,article,journal-article,[optical properties],S. V. Dordevic,"[S. V. Dordevic, N. R. Dilley, E. D. Bauer, D....",[Department of Physics University of Californi...,en,"[https://openalex.org/W1543827524, https://ope...",24,"[https://openalex.org/W2356841320, https://ope...","[(Context (archaeology), 0.5357055), (Physics,...",False,False
668335,https://doi.org/10.1103/physreve.81.061123,https://openalex.org/W2080010785,Lipid monolayers have been shown to represent ...,Thermomechanic-electrical coupling in phosphol...,2010-06-15,article,journal-article,"[phospholipid monolayers, thermomechanic-elect...",D. Steppich,"[D. Steppich, J. Griesbauer, T. Frommelt, W. A...","[Experimental Physics I, University of Augsbur...",en,"[https://openalex.org/W1984778795, https://ope...",17,"[https://openalex.org/W1967878322, https://ope...","[(Heat capacity, 0.70857465), (Compressibility...",False,False
