In [1]:
import tqdm
import re
import pandas as pd
import math
import time

# Collect focal papers

In [2]:
def get_abstract(words):
    n = max({idx  for name in words for idx in words[name]})+1
    temp_list = [' ']*n
    for word in words:
        idxs = words[word]
        for idx in idxs:
            temp_list[idx] = word
    abstract = ' '.join(temp_list)
    return abstract

def get_basic_infos(doc):
    list_concepts = [concept['id'] for concept in doc['concepts']]
    Source = None
    if doc['primary_location']:
        if 'source' in doc['primary_location']:
            if doc['primary_location']['source']:
                if 'id' in doc['primary_location']['source']:
                    Source = int(re.sub("https://openalex.org/S","",doc['primary_location']['source']['id']))
    infos = {'id':re.sub("https://openalex.org/","",doc['id']),
    'doi':doc['doi'],
    'year':doc['publication_year'],
    'language':doc['language'],
    'type':doc['type'],
    'source':Source,
    'nb_auth':len(doc['authorships']),
    'auth_ids':'; '.join([re.sub("https://openalex.org/A","",auth['author']['id']) for auth in doc['authorships'] if auth['author']['id']]),
    'countries' : '; '.join([auth['countries'][0] for auth in doc['authorships'] if auth['countries']]),
    'institutions' : '; '.join([auth['institutions'][0]['id'] for auth in doc['authorships'] if auth['institutions']]),
    'nb_citations': doc['cited_by_count'],
    'title': doc['title'],
    'abstract': get_abstract(doc['abstract_inverted_index']) if doc['abstract_inverted_index'] else None,
    'nb_ref':doc['referenced_works_count'] ,
    'concepts': '; '.join(list_concepts),
    'references': '; '.join(doc['referenced_works'])}
    
    return infos

def get_nb_pages(url, limit=50):
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        count = data['meta']['count']
        nb_pages = math.ceil(count/limit)
        print(f"Number of articles matched: {count}")
        print(f"Number of pages: {nb_pages}")
    return nb_pages

def get_articles(url,cursor = False,limit = 100):
    if cursor:
        url += f"&cursor={cursor}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()    
        articles = data['results']
        if cursor:
            next_cursor = data['meta']['next_cursor']
            return articles, next_cursor
        else:
            return articles
    else:
        print("Failed to retrieve data:", response.status_code)
        return None

import requests
import json

def get_url(search, time_windows, concept_id, limit=100):
    url = "https://api.openalex.org/works?"
    if search:
        search = search.replace(' ', '+')
        title_search = f"search={search}"
        url += title_search
   
    if time_windows or concept_id:
        if search:
            filter_ = '&filter='
        else:
            filter_ = 'filter='
        url += filter_
        
        if time_windows:
            filter_year = f"publication_year:{time_windows}"
            url += filter_year
        if concept_id:
            if time_windows:
                url += ','
            filter_concept = f"concepts.id:{concept_id}"
            url += filter_concept
    url += "&mailto=s.bianchini@unistra.fr"
    url += f"&per-page={limit}"
    return url

In [3]:
def get_concept_id(query):
    query = query.replace(' ', '+')
    url = f"https://api.openalex.org/concepts?search={query}&per-page=200"
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()
        print(f"Number of concepts with {query} in display_name: {data['meta']['count']}")
        concepts = data['results']
        return concepts
    else:
        print("Failed to retrieve data:", response.status_code)
        return None
    
    
concepts = get_concept_id('labour economics')
pd.DataFrame(concepts)

Number of concepts with labour+economics in display_name: 1


Unnamed: 0,id,wikidata,display_name,relevance_score,level,description,works_count,cited_by_count,summary_stats,ids,image_url,image_thumbnail_url,international,ancestors,related_concepts,counts_by_year,works_api_url,updated_date,created_date
0,https://openalex.org/C145236788,https://www.wikidata.org/wiki/Q28161,Labour economics,116484.914,1,functioning and dynamics of the markets for la...,569577,6498058,"{'2yr_mean_citedness': 1.1874455353511881, 'h_...",{'openalex': 'https://openalex.org/C145236788'...,,,"{'display_name': {'ar': 'اقتصاديات العمل', 'as...","[{'id': 'https://openalex.org/C162324750', 'wi...","[{'id': 'https://openalex.org/C162324750', 'wi...","[{'year': 2024, 'works_count': 8457, 'cited_by...",https://api.openalex.org/works?filter=concepts...,2024-09-01T12:17:38.151583,2016-06-24


In [None]:
concepts = [#"C140096630",
#"C3020597237",
#"C2780451532",
#"C96250715",
#"C15952604",
#"C187691185",
"C185798385",
"C2777526511",
"C132010649",
"C18483071",
"C170477896",
"C66283442",
"C46362747",
"C2779679103",
"C74172769",
"C94487597",
"C2779356329",
"C2779918689",
"C51399673",
"C87360688",
"C138171918",
"C143916079",
"C52121051",
"C2779134260",
"C121608353",
"C104317684",
"C204787440",
"C116675565",
"C118518473",
"C2777267654"
]

import os
time_windows = '2000-2023'
search = None
limit = 200
path_ = os.getcwd()
    
for concept_id in concepts:
    print(concept_id)
    os.chdir(path_)
    if not os.path.exists(concept_id):
        os.makedirs(concept_id)
    os.chdir(concept_id)
    url = get_url(search, time_windows, concept_id, limit)
    nb_pages = get_nb_pages(url, limit)
    all_infos = []
    next_cursor = '*'
    for page in tqdm.tqdm(range(1,nb_pages+1)):
        try:
            time.sleep(1)
            page_i, next_cursor = get_articles(url,next_cursor,limit)
            for article in page_i:
                infos = get_basic_infos(article)
                all_infos.append(infos)
        except Exception as e :
            print(e)
            time.sleep(30)
            page_i, next_cursor = get_articles(url,next_cursor,limit)
            for article in page_i:
                infos = get_basic_infos(article)
                all_infos.append(infos)
            continue
          
    df = pd.DataFrame(all_infos)
    df.to_csv('Data_concept_{}.csv'.format(concept_id))
    
    works_ids = set()
    for refs in df.references.to_list():
        refs = re.sub("https://openalex.org/","",refs)
        refs = refs.split("; ")
        works_ids.update(refs)
    
    path = "mongodb://"
    import pymongo
    client = pymongo.MongoClient(path)
    db = client['Stefano']
    col = db[concept_id]
    
    
    all_infos = []
    works_ids = list(works_ids)[1:]
    for i in tqdm.tqdm(range(0,len(works_ids),100)):
        to = i+100 if i+100<len(works_ids) else len(works_ids)
        work_ids = "|".join(works_ids[i:to])
        url = "https://api.openalex.org/works?filter=ids.openalex:{}&per-page=100".format(work_ids)
        try:
            time.sleep(1)
            page_i = get_articles(url)
            for article in page_i:
                infos = get_basic_infos(article)
                col.insert_one(infos)
        except Exception as e :
            print(e)
            time.sleep(1000)
            page_i = get_articles(url)
            for article in page_i:
                infos = get_basic_infos(article)            
                col.insert_one(infos)
            continue
    
    pd.DataFrame(all_infos).to_csv('Data_ref_concept_{}.csv'.format(concept_id))
    
    df = pd.read_csv('Data_concept_{}.csv'.format(concept_id))
    #df_ref = pd.read_csv('Data_ref_concept_{}.csv'.format(concept_id))
    df_ref = pd.DataFrame([doc for doc in col.find()])
    
    df["references"] = df['references'].str.split('; ')
    
    edgelist_ref = df[["id","year","references"]].explode('references')
    edgelist_ref["references"] = edgelist_ref["references"].apply(lambda x:  re.sub("https://openalex.org/","",x) if isinstance(x,str) else None)
    
    ref_infos = df_ref[["id","source"]]
    ref_infos = ref_infos.rename(columns={'id': 'references'})
    
    merge_infos = edgelist_ref.merge(ref_infos, on = 'references', how = "left")
    
    merge_infos = merge_infos.dropna()
    
    ref_structured = merge_infos.groupby(["id","year"]).apply(lambda x: [ref for ref in x["source"]]).reset_index()
    
    ref_structured = ref_structured.rename(columns={0: 'refs'})
    
    ref_structured['id'] = ref_structured['id'].apply(lambda x: int(re.sub('W','',x)))
    if not os.path.exists("Data/docs/refs/"):
        os.makedirs("Data/docs/refs/")
    for y in range(2000,2024):
        tmp = ref_structured[ref_structured.year == y]
        json.dump(tmp.to_dict('records'),open("Data/docs/refs/"+ "{}.json".format(y),'w'))

C185798385
Number of articles matched: 232151
Number of pages: 1161


100%|███████████████████████████████████████████████████████████████████████████▉| 1160/1161 [1:00:27<00:03,  3.07s/it]

'countries'


100%|████████████████████████████████████████████████████████████████████████████| 1161/1161 [1:01:01<00:00,  3.15s/it]
 49%|███████████████████████████████████▋                                     | 9285/19018 [7:40:54<7:34:09,  2.80s/it]

Failed to retrieve data: 503
'NoneType' object is not iterable


 49%|███████████████████████████████████▊                                     | 9315/19018 [7:59:02<7:24:30,  2.75s/it]

Failed to retrieve data: 503
'NoneType' object is not iterable


100%|█████████████████████████████████████████████████████████████████████████| 19018/19018 [15:43:26<00:00,  2.98s/it]


C2777526511
Number of articles matched: 77774
Number of pages: 389


100%|████████████████████████████████████████████████████████████████████████████████| 389/389 [18:14<00:00,  2.81s/it]
100%|████████████████████████████████████████████████████████████████████████████| 8297/8297 [6:18:21<00:00,  2.74s/it]


C132010649
Number of articles matched: 31017
Number of pages: 156


100%|████████████████████████████████████████████████████████████████████████████████| 156/156 [07:07<00:00,  2.74s/it]
100%|████████████████████████████████████████████████████████████████████████████| 3602/3602 [2:37:54<00:00,  2.63s/it]


C18483071
Number of articles matched: 11604
Number of pages: 59


100%|██████████████████████████████████████████████████████████████████████████████████| 59/59 [02:41<00:00,  2.74s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 1267/1267 [59:36<00:00,  2.82s/it]


C170477896
Number of articles matched: 7732
Number of pages: 39


100%|██████████████████████████████████████████████████████████████████████████████████| 39/39 [01:46<00:00,  2.74s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 897/897 [40:52<00:00,  2.73s/it]


C66283442
Number of articles matched: 27545
Number of pages: 138


100%|████████████████████████████████████████████████████████████████████████████████| 138/138 [06:29<00:00,  2.82s/it]
100%|████████████████████████████████████████████████████████████████████████████| 2143/2143 [1:36:27<00:00,  2.70s/it]


C46362747
Number of articles matched: 327068
Number of pages: 1636


100%|████████████████████████████████████████████████████████████████████████████| 1636/1636 [1:18:55<00:00,  2.89s/it]
100%|██████████████████████████████████████████████████████████████████████████| 11696/11696 [8:46:46<00:00,  2.70s/it]


C2779679103
Number of articles matched: 202844
Number of pages: 1015


100%|██████████████████████████████████████████████████████████████████████████████| 1015/1015 [49:55<00:00,  2.95s/it]
100%|█████████████████████████████████████████████████████████████████████████| 17355/17355 [14:04:07<00:00,  2.92s/it]


C74172769
Number of articles matched: 210215
Number of pages: 1052


100%|██████████████████████████████████████████████████████████████████████████████| 1052/1052 [50:57<00:00,  2.91s/it]
100%|████████████████████████████████████████████████████████████████████████████| 8143/8143 [6:08:53<00:00,  2.72s/it]


C94487597
Number of articles matched: 118204
Number of pages: 592


100%|████████████████████████████████████████████████████████████████████████████████| 592/592 [30:20<00:00,  3.07s/it]
100%|██████████████████████████████████████████████████████████████████████████| 12191/12191 [9:56:47<00:00,  2.94s/it]


C2779356329
Number of articles matched: 89043
Number of pages: 446


100%|████████████████████████████████████████████████████████████████████████████████| 446/446 [22:38<00:00,  3.05s/it]
100%|██████████████████████████████████████████████████████████████████████████| 10833/10833 [9:01:46<00:00,  3.00s/it]


C2779918689
Number of articles matched: 115982
Number of pages: 580


100%|████████████████████████████████████████████████████████████████████████████████| 580/580 [32:31<00:00,  3.36s/it]
100%|██████████████████████████████████████████████████████████████████████████| 10677/10677 [8:32:57<00:00,  2.88s/it]


C51399673
Number of articles matched: 73444
Number of pages: 368


100%|████████████████████████████████████████████████████████████████████████████████| 368/368 [18:09<00:00,  2.96s/it]
100%|████████████████████████████████████████████████████████████████████████████| 3531/3531 [2:46:34<00:00,  2.83s/it]


C87360688
Number of articles matched: 76455
Number of pages: 383


  5%|████▏                                                                            | 20/383 [01:05<19:27,  3.22s/it]

Failed to retrieve data: 500
cannot unpack non-iterable NoneType object


100%|████████████████████████████████████████████████████████████████████████████████| 383/383 [19:16<00:00,  3.02s/it]
100%|████████████████████████████████████████████████████████████████████████████| 2833/2833 [2:09:35<00:00,  2.74s/it]


C138171918
Number of articles matched: 112838
Number of pages: 565


100%|████████████████████████████████████████████████████████████████████████████████| 565/565 [27:29<00:00,  2.92s/it]
100%|████████████████████████████████████████████████████████████████████████████| 5115/5115 [3:59:28<00:00,  2.81s/it]


C143916079
Number of articles matched: 72141
Number of pages: 361


100%|████████████████████████████████████████████████████████████████████████████████| 361/361 [17:54<00:00,  2.98s/it]
100%|████████████████████████████████████████████████████████████████████████████| 4386/4386 [3:31:26<00:00,  2.89s/it]


C52121051
Number of articles matched: 68007
Number of pages: 341


100%|████████████████████████████████████████████████████████████████████████████████| 341/341 [15:56<00:00,  2.80s/it]
100%|████████████████████████████████████████████████████████████████████████████| 2663/2663 [2:03:51<00:00,  2.79s/it]


C2779134260
Number of articles matched: 3583465
Number of pages: 17918


 71%|██████████████████████████████████████████████████▍                    | 12744/17918 [11:35:28<3:51:20,  2.68s/it]