In [1]:
import json
import math
import re
import time
from urllib.parse import quote_plus

import pandas as pd
import requests
import tqdm

In [18]:
def get_abstract(words):
    """Rebuild an abstract's text from an inverted index.

    Parameters
    ----------
    words : dict or None
        Mapping of each word to the list of integer positions at which the
        word occurs in the abstract.

    Returns
    -------
    str
        The words placed at their positions and joined with single spaces.
        Positions that no word claims are filled with a single space, so a
        gap shows up as extra whitespace. An empty or missing index (or one
        whose position lists are all empty) yields ``''``.
    """
    if not words:
        return ''

    # Flatten every position so the size of the output can be determined;
    # max() on an empty sequence would raise, hence the explicit guard.
    positions = [idx for idxs in words.values() for idx in idxs]
    if not positions:
        return ''

    slots = [' '] * (max(positions) + 1)
    for word, idxs in words.items():
        for idx in idxs:
            slots[idx] = word
    return ' '.join(slots)

import re

def extract_source_id(primary_location):
    """Return the short source ID of a work's primary location.

    Parameters
    ----------
    primary_location : dict or None
        The ``primary_location`` object of a work. May be ``None`` when the
        API returns an explicit null for the field.

    Returns
    -------
    str or None
        The source ID with the ``https://openalex.org/`` prefix removed, or
        ``None`` when there is no location, no source, or no source ID.
    """
    # `or {}` covers an explicit null, which a .get(..., {}) default does not.
    source = (primary_location or {}).get('source')
    if not source:
        return None

    source_id = source.get('id')
    if not source_id:
        return None

    # Literal replacement: the prefix is a fixed string, not a pattern.
    return source_id.replace("https://openalex.org/", "")

def extract_authorship_details(authorships):
    """Summarise the authorships of a work as three '; '-joined strings.

    Parameters
    ----------
    authorships : list of dict or None
        Authorship objects. Each may carry an ``author`` dict, a
        ``countries`` list and an ``institutions`` list; any of them may be
        missing, empty or null.

    Returns
    -------
    tuple of (str, str, str)
        ``(author_ids, countries, institutions)``:

        * author IDs with the ``https://openalex.org/`` prefix removed;
        * the first country of each authorship that has one;
        * the ID of the first institution of each authorship that has one,
          kept exactly as received (the prefix is NOT removed).

        Authorships lacking a given piece of information are skipped for
        that piece, so the three strings may have different lengths.
    """
    author_ids = []
    countries = []
    institutions = []

    for auth in authorships or []:
        # The author object itself can be absent or null.
        author_id = (auth.get('author') or {}).get('id')
        if author_id:
            author_ids.append(author_id.replace("https://openalex.org/", ""))

        auth_countries = auth.get('countries')
        if auth_countries and auth_countries[0]:
            countries.append(auth_countries[0])

        auth_institutions = auth.get('institutions')
        if auth_institutions:
            # Skip institutions without an ID: a None would break the join.
            institution_id = (auth_institutions[0] or {}).get('id')
            if institution_id:
                institutions.append(institution_id)

    return '; '.join(author_ids), '; '.join(countries), '; '.join(institutions)

def get_basic_infos(doc):
    """Flatten one work record into a dict of basic bibliographic fields.

    Parameters
    ----------
    doc : dict
        A work record. ``doc['id']`` is required; every other field is
        optional and may be missing or null.

    Returns
    -------
    dict
        Keys: ``id``, ``doi``, ``year``, ``language``, ``type``, ``source``,
        ``nb_auth``, ``auth_ids``, ``countries``, ``institutions``,
        ``nb_citations``, ``title``, ``abstract``, ``nb_ref``, ``concepts``,
        ``references``. List-valued information is stored as '; '-joined
        strings. ``abstract`` is ``None`` when the record has no inverted
        index.
    """
    # `doc.get(key, default)` only applies the default when the key is
    # absent; an explicit null comes back as None, hence the `or` fallbacks.
    concepts = doc.get('concepts') or []
    authorships = doc.get('authorships') or []
    referenced_works = doc.get('referenced_works') or []
    inverted_index = doc.get('abstract_inverted_index')

    list_concepts = [concept['id'] for concept in concepts]
    source = extract_source_id(doc.get('primary_location') or {})
    auth_ids, countries, institutions = extract_authorship_details(authorships)

    infos = {
        'id': re.sub("https://openalex.org/", "", doc['id']),
        'doi': doc.get('doi'),
        'year': doc.get('publication_year'),
        'language': doc.get('language'),
        'type': doc.get('type'),
        'source': source,
        'nb_auth': len(authorships),
        'auth_ids': auth_ids,
        'countries': countries,
        'institutions': institutions,
        'nb_citations': doc.get('cited_by_count'),
        'title': doc.get('title'),
        'abstract': get_abstract(inverted_index) if inverted_index else None,
        'nb_ref': doc.get('referenced_works_count', 0),
        'concepts': '; '.join(list_concepts),
        'references': '; '.join(referenced_works)
    }

    return infos


def get_nb_pages(url, limit=50):
    """Return how many pages of `limit` results the query at `url` spans.

    Issues one GET request to `url`, reads the total number of matches from
    ``meta.count`` in the JSON response, prints that count and the derived
    page count, and returns the page count. When the response status is not
    200, prints the status code and returns ``None``.
    """
    response = requests.get(url)

    if response.status_code == 200:
        count = response.json()['meta']['count']
        nb_pages = math.ceil(count / limit)
        print(f"Number of articles matched: {count}")
        print(f"Number of pages: {nb_pages}")
        return nb_pages

    print(f"Failed to retrieve data: {response.status_code}")
    return None


def get_articles(url, cursor=None, limit=100):
    """Fetch one page of works from `url`.

    Parameters
    ----------
    url : str
        Query URL. If it does not already contain a ``per-page=`` parameter,
        one is appended from `limit`.
    cursor : str, optional
        Paging cursor. When given, it is appended to the URL as ``cursor=``.
    limit : int, optional
        Number of results requested per page. Ignored when falsy or when
        `url` already specifies ``per-page=``.

    Returns
    -------
    list, tuple or None
        * ``None`` when the response status is not 200 (the status code is
          printed);
        * ``(articles, next_cursor)`` when `cursor` was given;
        * ``articles`` alone otherwise.
    """
    # Actually request the page size the caller asked for; without this the
    # server falls back to its own default and page counts computed from
    # `limit` no longer match what is returned.
    if limit and "per-page=" not in url:
        if url.endswith(('?', '&')):
            separator = ''
        else:
            separator = '&' if '?' in url else '?'
        url += f"{separator}per-page={limit}"

    if cursor:
        url += f"&cursor={cursor}"

    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve data: {response.status_code}")
        return None

    data = response.json()
    articles = data['results']
    next_cursor = data['meta'].get('next_cursor')
    # The return shape depends on whether paging was requested.
    return (articles, next_cursor) if cursor else articles


def build_url(search=None, time_windows=None, concept_id=None, limit=100):
    """Build an OpenAlex works-query URL.

    Parameters
    ----------
    search : str, optional
        Free-text search terms. Spaces become ``+`` and other reserved
        characters are percent-encoded; a ``+`` already present is kept.
    time_windows : str, optional
        Value for the ``publication_year`` filter, e.g. ``'2000-2023'``.
    concept_id : str, optional
        Value for the ``concepts.id`` filter. Inserted as given.
    limit : int, optional
        Accepted for call compatibility but currently unused: the returned
        URL carries no ``per-page`` parameter.

    Returns
    -------
    str
        ``https://api.openalex.org/works?`` followed by the optional
        ``search=`` parameter and the optional comma-joined ``filter=``
        parameter.
    """
    base_url = "https://api.openalex.org/works?"
    params = []

    if search:
        # Encode the search text so characters such as '&' or '#' cannot
        # break out of the `search` parameter.
        params.append(f"search={quote_plus(search, safe='+')}")

    filters = []
    if time_windows:
        filters.append(f"publication_year:{time_windows}")
    if concept_id:
        filters.append(f"concepts.id:{concept_id}")
    if filters:
        params.append("filter=" + ','.join(filters))

    return base_url + '&'.join(params)

In [14]:
# Build a works query filtered on a single concept (no search text, no year window).
url_kernel = build_url(concept_id="https://openalex.org/C140417398")
url_kernel  # display the generated URL

'https://api.openalex.org/works?filter=concepts.id:https://openalex.org/C140417398'

In [15]:
# Fetch the first page of works for the concept query. No cursor is passed,
# so get_articles returns the list of articles alone (or None on a non-200 response).
lst_articles_kernel = get_articles(url_kernel, limit=50)
lst_articles_kernel[0]  # inspect the raw record of the first work

{'id': 'https://openalex.org/W1560724230',
 'doi': 'https://doi.org/10.7551/mitpress/4175.001.0001',
 'title': 'Learning with Kernels',
 'display_name': 'Learning with Kernels',
 'publication_year': 2018,
 'publication_date': '2018-01-01',
 'ids': {'openalex': 'https://openalex.org/W1560724230',
  'doi': 'https://doi.org/10.7551/mitpress/4175.001.0001',
  'mag': '1560724230'},
 'language': 'en',
 'primary_location': {'is_oa': False,
  'landing_page_url': 'https://doi.org/10.7551/mitpress/4175.001.0001',
  'pdf_url': None,
  'source': {'id': 'https://openalex.org/S4306463997',
   'display_name': 'The MIT Press eBooks',
   'issn_l': None,
   'issn': None,
   'is_oa': False,
   'is_in_doaj': False,
   'is_core': False,
   'host_organization': 'https://openalex.org/P4310315718',
   'host_organization_name': 'The MIT Press',
   'host_organization_lineage': ['https://openalex.org/P4310315718'],
   'host_organization_lineage_names': ['The MIT Press'],
   'type': 'ebook platform'},
  'license'

In [16]:
# Page count for the concept query, using the function's default of 50 results per page.
get_nb_pages(url_kernel)

Number of articles matched: 1826
Number of pages: 37


37

In [17]:
# Flatten the first raw record into the basic-information dict to check the extraction.
get_basic_infos(lst_articles_kernel[0])

{'id': 'W1560724230',
 'doi': 'https://doi.org/10.7551/mitpress/4175.001.0001',
 'year': 2018,
 'language': 'en',
 'type': 'book',
 'source': 'S4306463997',
 'nb_auth': 2,
 'auth_ids': '5044005697; 5000245150',
 'countries': 'DE',
 'institutions': 'https://openalex.org/I149899117',
 'nb_citations': 7922,
 'title': 'Learning with Kernels',
 'abstract': 'A comprehensive introduction to Support Vector Machines and related kernel methods.In the 1990s, a new type of learning algorithm was developed, based on results from statistical learning theory: the Support Vector Machine (SVM). This gave rise to a new class of theoretically elegant learning machines that use a central concept of SVMs—-kernels—for a number of learning tasks. Kernel machines provide a modular framework that can be adapted to different tasks and domains by the choice of the kernel function and the base algorithm. They are replacing neural networks in a variety of fields, including engineering, information retrieval, and b

In [7]:
def get_concept_id(query):
    """Search the OpenAlex concepts endpoint and return the matching concepts.

    Spaces in `query` are replaced by ``+`` before the request is sent (up to
    200 results are requested). On success the number of matches reported by
    the API is printed and the list under ``results`` is returned. When the
    response status is not 200, the status code is printed and ``None`` is
    returned.
    """
    query = query.replace(' ', '+')
    response = requests.get(f"https://api.openalex.org/concepts?search={query}&per-page=200")

    if response.status_code == 200:
        payload = response.json()
        print(f"Number of concepts with {query} in display_name: {payload['meta']['count']}")
        return payload['results']

    print("Failed to retrieve data:", response.status_code)
    return None

    
    
# Look up the concepts matching the phrase; the result list drives the harvesting loop below.
concepts = get_concept_id('History of childhood')
pd.DataFrame(concepts)  # tabular view of the returned concept records

Number of concepts with History+of+childhood in display_name: 1


Unnamed: 0,id,wikidata,display_name,relevance_score,level,description,works_count,cited_by_count,summary_stats,ids,image_url,image_thumbnail_url,international,ancestors,related_concepts,counts_by_year,works_api_url,updated_date,created_date
0,https://openalex.org/C2777583440,https://www.wikidata.org/wiki/Q4204823,History of childhood,4146.808,5,aspect of history,416,9447,"{'2yr_mean_citedness': 0.8666666666666667, 'h_...",{'openalex': 'https://openalex.org/C2777583440...,https://upload.wikimedia.org/wikipedia/commons...,https://upload.wikimedia.org/wikipedia/commons...,"{'display_name': {'ar': 'تاريخ الطفولة', 'ast'...","[{'id': 'https://openalex.org/C2993946119', 'w...","[{'id': 'https://openalex.org/C2777464741', 'w...","[{'year': 2024, 'works_count': 6, 'cited_by_co...",https://api.openalex.org/works?filter=concepts...,2024-09-08T11:45:37.971149,2018-01-05


In [8]:
# Raw view of the concept records returned by get_concept_id (a list of dicts).
concepts

[{'id': 'https://openalex.org/C2777583440',
  'wikidata': 'https://www.wikidata.org/wiki/Q4204823',
  'display_name': 'History of childhood',
  'relevance_score': 4146.808,
  'level': 5,
  'description': 'aspect of history',
  'works_count': 416,
  'cited_by_count': 9447,
  'summary_stats': {'2yr_mean_citedness': 0.8666666666666667,
   'h_index': 49,
   'i10_index': 140},
  'ids': {'openalex': 'https://openalex.org/C2777583440',
   'wikidata': 'https://www.wikidata.org/wiki/Q4204823',
   'mag': '2777583440',
   'wikipedia': 'https://en.wikipedia.org/wiki/History%20of%20childhood'},
  'image_url': 'https://upload.wikimedia.org/wikipedia/commons/c/c0/Su_Han_Ch%27en_001.jpg',
  'image_thumbnail_url': 'https://upload.wikimedia.org/wikipedia/commons/thumb/c/c0/Su_Han_Ch%27en_001.jpg/55px-Su_Han_Ch%27en_001.jpg',
  'international': {'display_name': {'ar': 'تاريخ الطفولة',
    'ast': 'Historia de la infancia',
    'be-tarask': 'гісторыя дзяцінства',
    'ca': 'història de la infància',
    'e

In [9]:
import logging
import os
import time
import tqdm
import pandas as pd
""" 6 levels
    NOTSET=0.
    DEBUG=10.
    INFO=20.
    WARN=30.
    ERROR=40.
    CRITICAL=50.
"""

# Set up logging: messages go to app.log with a timestamp and level prefix.
logging.basicConfig(filename='app.log', 
                    filemode='w',  # 'w' truncates app.log each time this cell configures logging
                    format='%(asctime)s - %(levelname)s - %(message)s', 
                    level=logging.DEBUG)  # record everything from DEBUG upward (includes the INFO page logs)
logger = logging.getLogger(__name__)

In [10]:
time_windows = '2000-2023'
search = None
# One page size shared by the URL, the page count and the fetch loop, so the
# number of pages requested matches the number of pages that exist.
limit = 200

for concept in concepts:
    logger.info(f"Concept: {concept}")
    concept_id = concept['id']
    url = build_url(search, time_windows, concept_id=concept_id, limit=limit)
    # Make the page size explicit in the query itself; otherwise the server
    # default applies and the page count computed below is too small.
    if "per-page=" not in url:
        url += f"&per-page={limit}"

    nb_pages = get_nb_pages(url, limit)
    if nb_pages is None:
        # get_nb_pages already printed the HTTP status; nothing to iterate on.
        logger.error(f"Could not determine the number of pages for {concept_id}")
        continue

    all_infos = []
    next_cursor = '*'

    for page in tqdm.tqdm(range(1, nb_pages+1)):
        if not next_cursor:
            # No further cursor: the result set is exhausted.
            break
        logger.info(f"Page: {page}\n")
        try:
            time.sleep(1)  # pause between requests
            result = get_articles(url, next_cursor, limit)
        except Exception as e:
            logger.error(f"Error on page {page} with cursor {next_cursor}: {str(e)}")
            continue

        if result is None:
            # Non-200 response: keep the same cursor so the next iteration retries it.
            logger.error(f"Error on page {page} with cursor {next_cursor}: no data returned")
            continue

        page_i, next_cursor = result

        for article in page_i:
            if not article:
                # Log relevant info to a file
                logger.warning(f"NoneType article found on page {page} with cursor {next_cursor}")
                logger.info(f"NoneType from page_i\n{page_i}")
                continue  # Skip this article

            try:
                all_infos.append(get_basic_infos(article))
            except Exception as e:
                # One malformed record must not discard the rest of the page.
                logger.error(f"Error on page {page} with cursor {next_cursor}: {str(e)}")

    df = pd.DataFrame(all_infos)


Number of articles matched: 348
Number of pages: 2


100%|██████████| 2/2 [00:03<00:00,  1.58s/it]


In [11]:
# Display the frame built by the harvesting loop (one row per work).
df

Unnamed: 0,id,doi,year,language,type,source,nb_auth,auth_ids,countries,institutions,nb_citations,title,abstract,nb_ref,concepts,references
0,W2558098334,https://doi.org/10.1192/bjp.bp.115.180752,2017,en,review,127936300.0,4,5012607366; 5061647665; 5036458860; 5006413851,DE; DE; DE; DE,https://openalex.org/I22465464; https://openal...,536,Childhood maltreatment and characteristics of ...,Background Childhood maltreatment has been dis...,40,https://openalex.org/C2776867660; https://open...,https://openalex.org/W1581410893; https://open...
1,W1575185410,https://doi.org/10.2307/3595208,2002,en,article,941564300.0,2,5040198173; 5065249263,,,456,A History of Childhood. Children and Childhood...,Acknowledgements. Introduction. Part I: Changi...,0,https://openalex.org/C2777583440; https://open...,
2,W1996099626,https://doi.org/10.1023/b:ccfp.0000030287.1316...,2004,en,review,10577760.0,2,5035323246; 5032478677,US; US,https://openalex.org/I192396691; https://opena...,284,Clarifying Parent–Child Reciprocities During E...,,138,https://openalex.org/C15744967; https://openal...,https://openalex.org/W1011193398; https://open...
3,W1637306181,https://doi.org/10.1016/s0145-2134(01)00261-7,2001,en,article,53055220.0,2,5077075548; 5012116335,GB,https://openalex.org/I36234482,268,Child maltreatment in the “Children of the nin...,To identify and validate factors within the pa...,47,https://openalex.org/C2992354236; https://open...,https://openalex.org/W109204797; https://opena...
4,W2794814746,https://doi.org/10.1086/ahr.110.5.1521,2005,en,article,197437600.0,1,5069139572,US,https://openalex.org/I226560621,261,:Huck's Raft: A History of American Childhood,No subject makes adults today more anxious tha...,0,https://openalex.org/C199033989; https://opena...,
5,W1994025460,https://doi.org/10.4088/jcp.v65n0217,2004,en,article,17992710.0,4,5033561394; 5031117541; 5010388772; 5035775992,US,https://openalex.org/I32971472,249,Childhood Maltreatment as a Risk Factor for Ad...,Traumatic experiences in childhood are linked ...,23,https://openalex.org/C2776867660; https://open...,https://openalex.org/W104224639; https://opena...
6,W627204269,,2006,en,book,,1,5063506392,,,155,The Invention of Childhood,The Invention of Childhood will paint a vivid ...,0,https://openalex.org/C2777583440; https://open...,
7,W2019629008,https://doi.org/10.1080/10538712.2012.659803,2012,en,article,168051000.0,2,5069480344; 5012350155,AU; NZ,https://openalex.org/I2799535966,147,Shame and Guilt in Men Exposed to Childhood Se...,This study examined the experiences of shame a...,50,https://openalex.org/C2781283010; https://open...,https://openalex.org/W114909792; https://opena...
8,W2069215381,https://doi.org/10.1016/j.schres.2011.03.017,2011,en,article,144105600.0,11,5073634667; 5014286457; 5004402420; 5045508438...,GB; GB; GB; GB; GB; GB; GB; GB; GB; GB; GB,https://openalex.org/I183935753; https://opena...,108,Childhood trauma and cognitive function in fir...,A history of childhood trauma is reportedly mo...,62,https://openalex.org/C2779727114; https://open...,https://openalex.org/W1664254939; https://open...
9,W2155682005,https://doi.org/10.1017/s0033291712001729,2012,en,article,71144980.0,6,5073962723; 5068944517; 5080186393; 5102154734...,US; US; US; US; US; US,https://openalex.org/I78577930; https://openal...,107,"Combined role of childhood maltreatment, famil...",Background Studies of the relationship between...,68,https://openalex.org/C2776289891; https://open...,https://openalex.org/W1519854000; https://open...


In [None]:
 
    # --- Collect the distinct works referenced by the articles in `df` ---
    works_ids = set()
    for refs in df.references.to_list():
        if not isinstance(refs, str):
            continue  # missing value: nothing to split
        refs = re.sub("https://openalex.org/", "", refs)
        works_ids.update(refs.split("; "))
    # Works without references contribute an empty string; remove exactly
    # that entry rather than an arbitrary element of the set.
    works_ids.discard("")
    works_ids = sorted(works_ids)

    # Short concept key for file names: the full concept URL contains '/'
    # and ':' and cannot be used as part of a file name.
    concept_key = re.sub("https://openalex.org/", "", concept_id)

    # NOTE(review): "mongodb://" has no host and looks like a scrubbed
    # placeholder; set MONGODB_URI to the real connection string.
    path = os.environ.get("MONGODB_URI", "mongodb://")
    import pymongo
    client = pymongo.MongoClient(path)
    db = client['Stefano']
    col = db[concept_id]

    # --- Fetch the referenced works in batches of 100 IDs ---
    all_infos = []
    retry_wait = 1000  # seconds to wait before the single retry of a failed batch
    for i in tqdm.tqdm(range(0, len(works_ids), 100)):
        work_ids = "|".join(works_ids[i:i + 100])
        url = "https://api.openalex.org/works?filter=ids.openalex:{}&per-page=100".format(work_ids)

        page_i = None
        for attempt in range(2):
            time.sleep(1 if attempt == 0 else retry_wait)
            try:
                page_i = get_articles(url)
            except Exception as e:
                print(e)
                page_i = None
            if page_i is not None:
                break
        if page_i is None:
            print("Skipping batch starting at index {}: no data after retry".format(i))
            continue

        for article in page_i:
            if not article:
                continue
            try:
                infos = get_basic_infos(article)
            except Exception as e:
                print(e)
                continue
            # Keep a copy for the CSV: insert_one adds an `_id` key to the
            # dict it is given.
            all_infos.append(dict(infos))
            col.insert_one(infos)

    pd.DataFrame(all_infos).to_csv('Data_ref_concept_{}.csv'.format(concept_key))

    # NOTE(review): nothing visible in this notebook writes this file;
    # confirm where 'Data_concept_<id>.csv' is produced.
    df = pd.read_csv('Data_concept_{}.csv'.format(concept_key))
    df_ref = pd.DataFrame(list(col.find()))

    # --- Build the (citing work, referenced work) edge list ---
    df["references"] = df['references'].str.split('; ')

    edgelist_ref = df[["id","year","references"]].explode('references')
    edgelist_ref["references"] = edgelist_ref["references"].apply(lambda x:  re.sub("https://openalex.org/","",x) if isinstance(x,str) else None)

    # Attach the source of each referenced work to its edge.
    ref_infos = df_ref[["id","source"]]
    ref_infos = ref_infos.rename(columns={'id': 'references'})

    merge_infos = edgelist_ref.merge(ref_infos, on = 'references', how = "left")

    # Drop edges whose referenced work or source is unknown.
    merge_infos = merge_infos.dropna()

    # One row per citing work, holding the list of sources it references.
    ref_structured = merge_infos.groupby(["id","year"]).apply(lambda x: list(x["source"])).reset_index()

    ref_structured = ref_structured.rename(columns={0: 'refs'})

    # Numeric work ID: strip the 'W' prefix.
    ref_structured['id'] = ref_structured['id'].apply(lambda x: int(re.sub('W','',x)))

    # --- Write one JSON file per publication year ---
    os.makedirs("Data/docs/refs/", exist_ok=True)
    for y in range(2000,2024):
        tmp = ref_structured[ref_structured.year == y]
        # Context manager so each file is flushed and closed.
        with open("Data/docs/refs/"+ "{}.json".format(y),'w') as fh:
            json.dump(tmp.to_dict('records'), fh)