In [73]:
import requests
import json
import pandas as pd
from datetime import datetime

# Polite API usage: every request made with these headers identifies this
# client and carries a contact address (the mailto in the User-Agent).
headers = {
    'User-Agent': 'LitScapeExperiments/1.0 (mailto:10133433@mackenzista.com.br)'
}

In [210]:
def fetch_article_count(issn):
    """Return the number of works Crossref lists for a journal ISSN.

    Args:
        issn: ISSN string placed in the ``/journals/{issn}/works`` route.

    Returns:
        The ``total-results`` value of the response message, or 0 when the
        response does not carry that field.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.RequestException: on connection failure or timeout.
    """
    url = f"https://api.crossref.org/journals/{issn}/works"
    # Only the total is needed, so ask for zero items instead of a full page.
    # The timeout keeps a stalled connection from hanging the notebook.
    response = requests.get(url, headers=headers, params={'rows': 0}, timeout=30)
    # Fail with a clear HTTP error rather than a JSON decode error on error pages.
    response.raise_for_status()
    data = response.json()
    return data.get('message', {}).get('total-results', 0)

def list_journals(query):
    """Search Crossref journals for ``query`` and keep the well-covered ones.

    A journal is kept only if it has at least one ISSN, at least 100 current
    DOIs, and an ``abstracts-current`` coverage of at least 0.5. For each kept
    journal the total article count is fetched via ``fetch_article_count``
    using the first ISSN.

    When at least one journal qualifies, the list is written to
    ``journal_queries/{query}.json`` in the same ranked order as the returned
    frame, so that reloading the cache yields the same first row.

    Args:
        query: Free-text search string; also used as the cache file name.

    Returns:
        A DataFrame with columns ``title``, ``ISSN``, ``article_count``,
        ``current_dois`` and ``abstract_fill_rate``, sorted by
        ``abstract_fill_rate`` then ``current_dois`` (both descending), or an
        empty DataFrame when no journal qualifies.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.RequestException: on connection failure or timeout.
    """
    import os  # local import: only needed to create the cache directory

    # Pass the query through `params` so spaces and reserved characters are
    # URL-encoded instead of being spliced raw into the URL.
    response = requests.get(
        "https://api.crossref.org/journals",
        headers=headers,
        params={'query': query},
        timeout=30,
    )
    response.raise_for_status()
    data = response.json()

    journals = []
    for item in data.get('message', {}).get('items', []):
        issn_list = item.get('ISSN') or []
        current_dois = (item.get('counts') or {}).get('current-dois', 0)

        # Skip if no current DOIs or the ISSN list is empty
        if not current_dois or not issn_list:
            continue

        # `or 0.0` also covers an explicit null so the comparison cannot fail
        abstract_fill_rate = (item.get('coverage') or {}).get('abstracts-current') or 0.0
        if abstract_fill_rate < 0.5 or current_dois < 100:
            continue

        journals.append({
            'title': item.get('title', 'No title available'),
            'ISSN': issn_list,
            'article_count': fetch_article_count(issn_list[0]),
            'current_dois': current_dois,
            'abstract_fill_rate': abstract_fill_rate
        })

    if not journals:
        return pd.DataFrame()

    df_journals = pd.DataFrame(journals).sort_values(
        ['abstract_fill_rate', 'current_dois'], ascending=False
    )

    # Persist the records in ranked order (the frame's index still holds the
    # original list positions) so cached and fresh results agree.
    ranked_journals = [journals[position] for position in df_journals.index]
    os.makedirs('journal_queries', exist_ok=True)
    with open(f'journal_queries/{query}.json', 'w') as f:
        json.dump(ranked_journals, f, indent=4)

    return df_journals

def fetch_journal_articles(issn, journal_name, rows=150):
    """Fetch works of a journal from Crossref and save the usable ones as JSON.

    Only works that have both a title and a non-empty abstract are kept. The
    kept records are written to
    ``article_metadata/{journal_name}_ISSN{issn}_sample{N}.json`` where ``N``
    is the number of kept records; the file is written even when ``N`` is 0.

    Args:
        issn: ISSN string placed in the ``/journals/{issn}/works`` route.
        journal_name: File-name-safe journal label used in the output path.
        rows: Number of works to request from the API.

    Returns:
        A DataFrame with columns ``title``, ``doi``, ``year``, ``abstract``
        and ``is_referenced_by_count`` (empty when nothing qualifies).

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.RequestException: on connection failure or timeout.
    """
    import os  # local import: only needed to create the output directory

    url = f"https://api.crossref.org/journals/{issn}/works"
    response = requests.get(url, headers=headers, params={'rows': rows}, timeout=60)
    response.raise_for_status()
    data = response.json()

    articles = []
    for item in data.get('message', {}).get('items', []):
        # Each field may be missing, null or an empty list; fall back to
        # placeholders of the right shape so indexing cannot raise.
        titles = item.get('title') or [None]
        date_parts = (item.get('created') or {}).get('date-parts') or [[None]]
        first_date = date_parts[0] or [None]

        article = {
            'title': titles[0],
            'doi': item.get('DOI', None),
            'year': first_date[0],
            'abstract': (item.get('abstract') or '').strip(),
            'is_referenced_by_count': item.get('is-referenced-by-count', None)
        }

        # only append if it has a title and abstract
        if article['title'] is not None and article['abstract'] != '':
            articles.append(article)

    valid_rows = len(articles)

    # Save the articles to a JSON file, creating the target directory if needed
    os.makedirs('article_metadata', exist_ok=True)
    with open(f"article_metadata/{journal_name}_ISSN{issn}_sample{valid_rows}.json", 'w') as f:
        json.dump(articles, f, indent=4)  # Pretty print the JSON for readability

    return pd.DataFrame(articles)

In [211]:
# Search journals matching 'complexity'; list_journals also writes the
# qualifying journals to journal_queries/complexity.json when any are found.
df_journals = list_journals('complexity')

In [212]:
# For each topic: load (or fetch) the ranked journal list, take the top
# journal, and download its articles, falling back to the journal's second
# ISSN when the first yields fewer than 100 usable articles.
for query in ['quantum', 'complexity', 'biology', 'psychology', 'chemistry', 'medical physics', 'machine learning']:

    # check if it has already been fetched
    try:
        with open(f'journal_queries/{query}.json', 'r') as f:
            journals = json.load(f)
        df_journals = pd.DataFrame(journals)
        print(f"Found existing data for {query}.")

    except FileNotFoundError:
        print("Fetching data for query:", query)
        df_journals = list_journals(query)

    # Nothing to select from: move on instead of failing on .iloc[0] below
    if df_journals.empty:
        print(f"No journals found for {query}.")
        continue

    # A cached file may hold the journals in fetch order; rank them the same
    # way list_journals ranks a fresh result so both paths pick the same row.
    df_journals = df_journals.sort_values(['abstract_fill_rate', 'current_dois'], ascending=False)

    # Select the top-ranked journal, its ISSNs and its name
    top_journal = df_journals.iloc[0]
    issn_candidates = top_journal['ISSN']
    primary_issn = issn_candidates[0]
    primary_journal = top_journal['title']
    print(f"Selected journal: {primary_journal} (ISSN: {primary_issn})")

    # Normalize the journal name for file naming
    normalized_name = primary_journal.replace(' ', '_').replace('/', '_').lower()

    # Attempt to fetch articles from the primary ISSN
    df_articles = fetch_journal_articles(primary_issn, journal_name=normalized_name, rows=200)
    if len(df_articles) >= 100:
        print(f"Successfully fetched {len(df_articles)} articles from {primary_journal} (ISSN: {primary_issn}).")
        continue

    # If the primary ISSN fails, try the secondary ISSN if available
    if len(issn_candidates) > 1:
        secondary_issn = issn_candidates[1]
        print("Trying secondary ISSN:", secondary_issn)

        # Both ISSNs belong to the same journal row, so the name is unchanged
        df_articles = fetch_journal_articles(secondary_issn, journal_name=normalized_name, rows=200)
        if len(df_articles) >= 100:
            print(f"Successfully fetched {len(df_articles)} articles from {primary_journal} (ISSN: {secondary_issn}).")
        else:
            print(f"Failed to fetch enough articles from {primary_journal}.")
    else:
        print(f"Failed to fetch articles from {primary_journal} and no secondary ISSN available.")


Found existing data for quantum.
Selected journal: Quantum (ISSN: 2521-327X)
Successfully fetched 200 articles from Quantum (ISSN: 2521-327X).
Found existing data for complexity.
Selected journal: Complexity (ISSN: 1076-2787)
Successfully fetched 170 articles from Complexity (ISSN: 1076-2787).
Found existing data for biology.
Selected journal: Biology (ISSN: 2079-7737)
Successfully fetched 184 articles from Biology (ISSN: 2079-7737).
Found existing data for psychology.
Selected journal: Applied Psychology (ISSN: 0269-994X)
Trying secondary ISSN: 1464-0597
Failed to fetch enough articles from Applied Psychology.
Found existing data for chemistry.
Selected journal: ChemistryOpen (ISSN: 2191-1363)
Successfully fetched 136 articles from ChemistryOpen (ISSN: 2191-1363).
Found existing data for medical physics.
Selected journal: Medical Physics (ISSN: 0094-2405)
Successfully fetched 107 articles from Medical Physics (ISSN: 0094-2405).
Found existing data for machine learning.
Selected journa

In [91]:
# Query the journal list for a single topic so a journal can be picked by ISSN below
query = "psychology"
df_journals = list_journals(query)

In [93]:
# Look up the title of the journal that carries the chosen ISSN and turn it
# into a lowercase, file-name-safe label (spaces and slashes become "_").
selected_issn = '0012-1649'
journal_name = (
    df_journals[df_journals['ISSN'].apply(lambda issns: selected_issn in issns)]['title']
    .values[0]
    .lower()
    .replace('/', '_')
    .replace(' ', '_')
)

In [94]:
# Request 200 works for the selected ISSN; fetch_journal_articles keeps only
# those with a title and an abstract and saves them under article_metadata/.
df_articles = fetch_journal_articles(selected_issn, journal_name=journal_name, rows=200)

In [69]:
# Display the fetched articles table
df_articles

Unnamed: 0,title,doi,year,abstract,is_referenced_by_count
0,Interval Type-2 Fuzzy Multiattribute Group Dec...,10.1155/2019/6727259,2019,<jats:p>Logistics service (LS) has key impacts...,0
1,3D Reconstruction of Pedestrian Trajectory wit...,10.1155/2018/8735846,2018,<jats:p>An inertial measurement unit-based ped...,3
2,Integrated Estimation/Guidance Law against Exo...,10.1155/2018/7470823,2018,<jats:p>An integrated guidance integrated esti...,1
3,An Empirical Study on the Agglomeration Charac...,10.1155/2021/5539047,2021,<jats:p>The spatiotemporal agglomeration of in...,0
4,How Price-Based Frequency Regulation Impacts S...,10.1155/2020/6297134,2020,<jats:p>With the deregulation of modern power ...,4
...,...,...,...,...,...
123,Effect of Physically Realistic Potential Energ...,10.1155/2023/8852349,2023,<jats:p>Collective motion models most often us...,0
124,The Orbital Stability of Solitary Wave Solutio...,10.1155/2019/4209275,2019,"<jats:p>In this paper, the orbital stability o...",3
125,"Symmetry Groups, Similarity Reductions, and Co...",10.1155/2020/4830684,2020,"<jats:p>In this paper, the time-fractional Fuj...",6
126,Forecasting Natural Gas Consumption of China U...,10.1155/2020/3257328,2020,"<jats:p>As is known, natural gas consumption h...",7
