In [2]:
import pandas as pd
from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
from tqdm import tqdm
from multiprocess import Process, Manager
from typing import List, Dict
import json
from joblib import delayed, Parallel
import itertools

In [3]:
df = pd.read_csv("aps-dataset-citations-2020.csv")

In [4]:
df.head(5)

Unnamed: 0,citing_doi,cited_doi
0,10.1103/PhysRevSeriesI.11.215,10.1103/PhysRevSeriesI.1.1
1,10.1103/PhysRevSeriesI.12.121,10.1103/PhysRevSeriesI.1.166
2,10.1103/PhysRevSeriesI.7.93,10.1103/PhysRevSeriesI.1.166
3,10.1103/PhysRevSeriesI.16.267,10.1103/PhysRevSeriesI.2.35
4,10.1103/PhysRevSeriesI.17.65,10.1103/PhysRevSeriesI.2.112


In [6]:
len(df)

8850333

In [40]:
def get_metadata(dois: List[str], is_id=False):
    dois_queries = []
    max_size = 25
    
    if is_id == False:
        doi_prefix = "https://doi.org/"
        dois_cat = [doi_prefix + doi for doi in dois]
        while dois_cat: 
            chunk, dois_cat = dois_cat[:max_size], dois_cat[max_size:]
            query = "|".join(chunk)
            dois_queries.append(query)
    else:
        while dois: 
            chunk, dois = dois[:max_size], dois[max_size:]
            dois_queries.append(chunk)

    all_results = []
    for q in dois_queries:
        if is_id == False:
            results = Works().filter(doi=q).get()
        else:
            results = Works()[q]
        filtered_results = []
        for r in results:
            try:
                found_first_author = False
                for fa_idx in range(len(r["authorships"])):
                    if r["authorships"][fa_idx]['author_position'] == 'first':
                        found_first_author = True
                        break
                        
                sorted_keywords = [m["keyword"] for m in sorted(r["keywords"], key=lambda x: x['score'], reverse=True)]
                all_authors = [m["raw_author_name"] for m in r["authorships"]]
                sorted_authors_af = [m["raw_affiliation_string"] for m in r["authorships"]] 
                filtered_results.append({
                    "doi": str(r["doi"]),
                    "id": str(r["id"]),
                    "abstract": str(r["abstract"]),
                    "title": str(r["title"]),
                    "publication_date": str(r["publication_date"]),
                    "type": str(r["type"]),
                    "type_crossref": str(r["type_crossref"]),
                    "keywords": sorted_keywords,
                    "first_author": str(r["authorships"][fa_idx]['raw_author_name']) if found_first_author else "",
                    #"host_organization_name": str(r["primary_location"]["source"]["host_organization_name"]) if "source" in r["primary_location"] and ,
                    "all_authors": all_authors,
                    "all_authors_af": sorted_authors_af,
                    "language": r["language"],
                    "referenced_works": r["referenced_works"],
                    "referenced_works_count": r["referenced_works_count"],
                    "related_works": r["related_works"],
                    "concepts": [(c["display_name"], c["score"]) for c in r["concepts"]] if r["concepts"] else [],
                    "is_accepted": r["primary_location"]["is_accepted"],
                    "is_published": r["primary_location"]["is_published"],
                })
            except:
                pass
        all_results.extend(filtered_results)
    return all_results

In [26]:
#df_sample = df.sample(n=10_000, random_state=0)
unique_dois = df["citing_doi"].tolist() + df["cited_doi"].tolist()
unique_dois = list(set(unique_dois))

In [9]:
print(len(unique_dois))

668383


In [1]:
def parallel_openalex_queries(query_list, divs, is_id=False):
    N = len(query_list)
    piece_size = int(N / divs) + 1
    NUM_JOBS = 4
    all_data = []
    for div in range(divs):
        query_list_piece = query_list[div * piece_size:(div + 1) * piece_size]
        
        N_piece = len(query_list_piece)
        
        job_piece_size = int(N_piece / NUM_JOBS) + 1
        chunks = [query_list_piece[job_piece_size * i:job_piece_size * (i + 1)] for i in range(NUM_JOBS)]
        
        metadata = Parallel(n_jobs=NUM_JOBS)(delayed(get_metadata)(d, is_id=is_id) for d in tqdm(chunks, total=len(chunks)))
        metadata = list(itertools.chain.from_iterable(metadata))
        print(len(metadata))
        all_data.extend(metadata)
    return all_data

In [41]:
all_data = parallel_openalex_queries(unique_dois, 10, is_id=False)

100%|██████████████████████████████████| 4/4 [00:00<00:00, 133.42it/s]


66833


100%|█████████████████████████████████| 4/4 [00:00<00:00, 5845.72it/s]


66836


100%|█████████████████████████████████| 4/4 [00:00<00:00, 6150.01it/s]


66834


100%|█████████████████████████████████| 4/4 [00:00<00:00, 1136.05it/s]


66831


100%|█████████████████████████████████| 4/4 [00:00<00:00, 8069.85it/s]


66838


100%|██████████████████████████████████| 4/4 [00:00<00:00, 297.07it/s]


66834


100%|█████████████████████████████████| 4/4 [00:00<00:00, 8966.98it/s]


66833


100%|█████████████████████████████████| 4/4 [00:00<00:00, 6574.14it/s]


66833


100%|█████████████████████████████████| 4/4 [00:00<00:00, 6538.28it/s]


66837


100%|█████████████████████████████████| 4/4 [00:00<00:00, 8890.95it/s]


66828


In [None]:
extra_refs = []
for d in all_data:
    try:
        extra_refs.extend(eval(d["referenced_works"]))
    except:
        pass
extra_refs = list(set(extra_refs) - set([x["id"] for x in all_data]))

In [None]:
extended_data = parallel_openalex_queries(extra_refs, 20, is_id=True)

In [42]:
df_metadata = pd.DataFrame(list(set(all_data + extended_data)))

In [47]:
len(df_metadata["doi"].unique()) #[df_metadata["abstract"] != ""]

668263

In [44]:
df_metadata.to_csv("metadata.csv")

In [45]:
df_metadata.to_pickle("metadata.pkl", compression={'method': 'gzip', 'compresslevel': 3, 'mtime': 1})