In [1]:
import json
import os
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import requests
import seaborn as sns
from bs4 import BeautifulSoup

# polite API usage
headers = {
    'User-Agent': 'LitScapeExperiments/1.0 (mailto:10133433@mackenzista.com.br)'
}

In [3]:
def fetch_article_count(issn, timeout=30):
    """Return the number of works Crossref reports for a journal ISSN.

    Args:
        issn: ISSN string interpolated into the ``/journals/{issn}/works`` path.
        timeout: Seconds to wait for the HTTP response before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The ``total-results`` value found under ``message`` in the JSON
        response, or 0 when that field is absent.

    Raises:
        The HTTP library's status error when the API answers with an error
        status (via ``raise_for_status``).
    """
    url = f"https://api.crossref.org/journals/{issn}/works"
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    data = response.json()

    # Only trust 'total-results' when 'message' really is a mapping.
    message = data.get('message') if isinstance(data, dict) else None
    if isinstance(message, dict):
        return message.get('total-results', 0)
    return 0

def list_journals(query, timeout=30):
    """Search Crossref journals for ``query`` and keep the well-covered ones.

    A journal is kept only when it has at least one ISSN, at least 100
    current DOIs and an ``abstracts-current`` coverage of at least 0.5.
    For every kept journal the total article count is looked up through
    ``fetch_article_count`` using its first ISSN.

    When at least one journal is kept, the list is written to
    ``journal_queries/{query}.json`` (the directory is created if needed) in
    the same order as the returned frame, so that reading the cache later
    yields the same first row as a fresh call.

    Args:
        query: Free-text search string; sent as the ``query`` URL parameter so
            spaces and special characters are encoded properly.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A DataFrame with columns ``title``, ``ISSN``, ``article_count``,
        ``current_dois`` and ``abstract_fill_rate``, ordered by
        ``abstract_fill_rate`` then ``current_dois`` (both descending), or an
        empty DataFrame when nothing passes the filters.

    Raises:
        The HTTP library's status error when the API answers with an error
        status (via ``raise_for_status``).
    """
    response = requests.get(
        "https://api.crossref.org/journals",
        params={'query': query},
        headers=headers,
        timeout=timeout,
    )
    response.raise_for_status()
    data = response.json()

    journals = []
    for item in data.get('message', {}).get('items', []):
        issn_list = item.get('ISSN') or []
        # `or` also covers keys that are present but null, which would
        # otherwise break the numeric comparisons below.
        current_dois = item.get('counts', {}).get('current-dois') or 0
        abstract_fill_rate = item.get('coverage', {}).get('abstracts-current') or 0.0

        if not issn_list or current_dois < 100 or abstract_fill_rate < 0.5:
            continue

        journals.append({
            'title': item.get('title', 'No title available'),
            'ISSN': issn_list,
            'article_count': fetch_article_count(issn_list[0]),
            'current_dois': current_dois,
            'abstract_fill_rate': abstract_fill_rate,
        })

    if not journals:
        return pd.DataFrame()

    # Sort before saving so the cached file and the returned frame agree on
    # which journal comes first.
    journals.sort(
        key=lambda journal: (journal['abstract_fill_rate'], journal['current_dois']),
        reverse=True,
    )

    os.makedirs('journal_queries', exist_ok=True)
    with open(f'journal_queries/{query}.json', 'w') as f:
        json.dump(journals, f, indent=4)

    return pd.DataFrame(journals)

def fetch_journal_articles(issn, journal_name, rows=150, timeout=30):
    """Download up to ``rows`` works of a journal and keep those with an abstract.

    Each kept article is reduced to ``title``, ``doi``, ``year``, ``abstract``
    and ``is_referenced_by_count``. Works lacking a title or a non-empty
    abstract are dropped. The kept articles are written to
    ``article_metadata/{journal_name}_ISSN{issn}_sample{N}.json`` (``N`` being
    the number of kept articles; the directory is created if needed).

    Args:
        issn: ISSN string interpolated into the request path and file name.
        journal_name: File-name-safe journal label used in the output file name.
        rows: Number of works requested from the API.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A DataFrame with one row per kept article (empty when none qualify).

    Raises:
        The HTTP library's status error when the API answers with an error
        status (via ``raise_for_status``).
    """
    url = f"https://api.crossref.org/journals/{issn}/works?rows={rows}"
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    articles = []
    for item in data.get('message', {}).get('items', []):
        # Metadata fields may be missing, null or empty lists; fall back to
        # None instead of raising IndexError/TypeError on such records.
        titles = item.get('title') or [None]
        date_parts = item.get('created', {}).get('date-parts') or [[None]]
        first_date = date_parts[0] or [None]

        article = {
            'title': titles[0],
            'doi': item.get('DOI', None),
            'year': first_date[0],
            'abstract': (item.get('abstract') or '').strip(),
            'is_referenced_by_count': item.get('is-referenced-by-count', None)
        }

        # only keep works that have both a title and an abstract
        if article['title'] is not None and article['abstract'] != '':
            articles.append(article)

    valid_rows = len(articles)

    # Save the articles to a JSON file (pretty-printed for readability)
    os.makedirs('article_metadata', exist_ok=True)
    with open(f"article_metadata/{journal_name}_ISSN{issn}_sample{valid_rows}.json", 'w') as f:
        json.dump(articles, f, indent=4)

    return pd.DataFrame(articles)

In [1]:
queries = ['quantum', 'complexity', 'biology', 'psychology', 'chemistry', 'medical physics', 'machine learning']

# For each query: take the first listed journal and try to sample at least
# 100 usable articles from it, trying its second ISSN when the first falls short.
for query in queries:

    # Reuse the cached journal list if this query has already been fetched.
    try:
        with open(f'journal_queries/{query}.json', 'r') as f:
            journals = json.load(f)
        df_journals = pd.DataFrame(journals)
        print(f"Found existing data for {query}.")

    except FileNotFoundError:
        print("Fetching data for query:", query)
        df_journals = list_journals(query)

    # Without any journal there is no row 0 to select from; move on to the
    # next query instead of raising IndexError on iloc[0] below.
    if df_journals.empty:
        print(f"No journals found for {query}.")
        continue

    # The first row is the selected journal. Its ISSN list holds the primary
    # ISSN and, optionally, a secondary one; both belong to the same title.
    selected_journal = df_journals.iloc[0]
    primary_issn = selected_journal['ISSN'][0]
    primary_journal = selected_journal['title']
    print(f"Selected journal: {primary_journal} (ISSN: {primary_issn})")

    # Normalize the journal name for file naming
    normalized_name = primary_journal.replace(' ', '_').replace('/', '_').lower()

    # Attempt to fetch articles from the primary ISSN
    df_articles = fetch_journal_articles(primary_issn, journal_name=normalized_name, rows=200)
    if len(df_articles) >= 100:
        print(f"Successfully fetched {len(df_articles)} articles from {primary_journal} (ISSN: {primary_issn}).")
        continue

    # If the primary ISSN fails, try the secondary ISSN if available
    if len(selected_journal['ISSN']) > 1:
        secondary_issn = selected_journal['ISSN'][1]
        print("Trying secondary ISSN:", secondary_issn)
        secondary_journal = selected_journal['title']
        normalized_name = secondary_journal.replace(' ', '_').replace('/', '_').lower()

        df_articles = fetch_journal_articles(secondary_issn, journal_name=normalized_name, rows=200)
        if len(df_articles) >= 100:
            print(f"Successfully fetched {len(df_articles)} articles from {secondary_journal} (ISSN: {secondary_issn}).")
        else:
            print(f"Failed to fetch enough articles from {secondary_journal}.")
    else:
        print(f"Failed to fetch articles from {primary_journal} and no secondary ISSN available.")

## Data Cleaning and Preprocessing

In [45]:
# Every .json file in article_metadata/ holds the sampled articles of one journal.
article_metadata = [
    file for file in os.listdir('article_metadata') if file.endswith('.json')
]

# Load each file into its own frame and tag its rows with the journal name,
# i.e. the part of the file name that precedes "_ISSN".
dfs = []
for file in article_metadata:
    with open(f'article_metadata/{file}', 'r') as f:
        data = json.load(f)
    df = pd.DataFrame(data).assign(journal=file.split('_ISSN')[0])
    dfs.append(df)

# One combined frame with a fresh 0..n-1 index, also written to disk as CSV.
df_all_articles_raw = pd.concat(dfs, ignore_index=True)
df_all_articles_raw.to_csv('all_articles_raw.csv', index=False)

In [46]:
def remove_tags(text):
    """Parse ``text`` with BeautifulSoup's "html.parser" and return its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

# Work on a copy so the raw frame stays untouched.
df_all_articles = df_all_articles_raw.copy()

# Strip markup from every abstract.
df_all_articles['abstract'] = df_all_articles['abstract'].apply(remove_tags)

# Remove a leading "Abstract" heading together with an optional colon and the
# surrounding whitespace. A single pattern is used because stripping the bare
# word first would leave ": " behind and a later "Abstract: " pattern could
# never match.
df_all_articles['abstract'] = df_all_articles['abstract'].str.replace(
    r'^\s*Abstract:?\s*', '', regex=True
)

In [47]:
# Character count of each abstract, stored in a new 'abstract_length' column.
df_all_articles['abstract_length'] = df_all_articles['abstract'].str.len()

In [48]:
# Number of articles per journal as a two-column frame: 'journal', 'count'.
journal_counts = (
    df_all_articles['journal']
    .value_counts()
    .rename_axis('journal')
    .reset_index(name='count')
)

# Pie chart of each journal's share, with percentage and label drawn inside
# the slices and a titled legend.
fig = (
    px.pie(
        journal_counts,
        names='journal',
        values='count',
        title='Distribuição de Artigos por Periódico',
        color_discrete_sequence=px.colors.sequential.RdBu,
    )
    .update_traces(textposition='inside', textinfo='percent+label')
    .update_layout(legend_title_text='Periódico')
)

fig.show()

In [50]:
# Only abstracts shorter than 3000 characters go into the histogram.
filtered_data = df_all_articles.loc[lambda frame: frame['abstract_length'] < 3000]

# 50-bin histogram of abstract lengths with a rug plot in the margin;
# every column of the filtered frame is exposed as hover data.
fig = px.histogram(
    filtered_data,
    x='abstract_length',
    nbins=50,
    marginal='rug',
    hover_data=filtered_data.columns,
    labels={'abstract_length': 'Tamanho do Abstract'},
    title='Distribuição do Tamanho dos Abstracts',
)

# Axis titles, spacing between bars and the figure size in pixels.
fig.update_layout(
    dict(
        xaxis_title='Tamanho do Abstract',
        yaxis_title='Número de Artigos',
        bargap=0.2,
        width=1600,
        height=600,
    )
)

fig.show()

In [51]:
# Keep only abstracts longer than 100 characters (rows with length <= 100 are dropped)
df_all_articles = df_all_articles[df_all_articles['abstract_length'] > 100]

In [54]:
# Write the current articles frame to 'all_articles.csv', omitting the index column
df_all_articles.to_csv('all_articles.csv', index=False)

## Embeddings

In [2]:
# Load the articles back from 'all_articles.csv' into df_all_articles
df_all_articles = pd.read_csv('all_articles.csv')

In [5]:
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm
import torch

def encode_abstracts(abstracts, model, tokenizer):
    """Embed every abstract as the mean of the model's last hidden layer.

    Abstracts are processed one at a time (with a tqdm progress bar): each is
    tokenized with truncation at 512 tokens, moved to the model's device and
    run without gradient tracking. The final entry of ``hidden_states`` is
    averaged over dim 1 and moved back to the CPU.

    Args:
        abstracts: Iterable of abstract texts.
        model: Callable model exposing ``.device`` and returning an object
            with ``hidden_states`` when called with ``output_hidden_states=True``.
        tokenizer: Callable returning a mapping of tensors for a text.

    Returns:
        A tensor stacking the per-abstract embeddings along a new first axis.
    """
    def _embed(text):
        # Tokenize a single abstract and place its tensors on the model's device.
        encoded = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
        encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

        with torch.no_grad():
            result = model(**encoded, output_hidden_states=True)

        # Mean over dim 1 of the last layer; .cpu() brings it back from the GPU if one is used.
        return result.hidden_states[-1].mean(dim=1).cpu()

    return torch.stack(
        [_embed(text) for text in tqdm(abstracts, desc='Encoding abstracts')]
    )

Encoding abstracts:   0%|          | 0/1831 [00:00<?, ?it/s]

In [None]:
# Load the SciBERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased', output_hidden_states=True)

# Run on the GPU when one is available; otherwise fall back to the CPU so the
# cell still works on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Encode the 'abstract' column of df_all_articles and persist the resulting
# tensor so later cells can reuse it.
abstract_embeddings = encode_abstracts(df_all_articles['abstract'].tolist(), model, tokenizer)
torch.save(abstract_embeddings, 'abstract_embeddings.pt')

In [10]:
# NOTE(review): the only visible assignment of abstract_embeddings_np is in a
# later cell, so this display raises NameError on a fresh top-to-bottom run —
# confirm the intended execution order.
abstract_embeddings_np

array([[[ 0.22431481, -0.25318727,  0.11014143, ..., -0.28336376,
          0.21446137, -0.5627571 ]],

       [[ 0.39153066, -0.4899138 ,  0.14547488, ..., -0.35935095,
          0.25185725, -0.48777324]],

       [[ 0.24591634, -0.42300424,  0.1559297 , ..., -0.30734563,
          0.13219088, -0.87109137]],

       ...,

       [[ 0.47076297, -0.3863434 ,  0.22683762, ..., -0.0600824 ,
          0.37944347, -0.586188  ]],

       [[ 0.32036796, -0.35674062,  0.2171505 , ..., -0.10391582,
          0.48318368, -0.9345723 ]],

       [[ 0.26914433, -0.6028914 ,  0.08458277, ..., -0.16385165,
          0.2666776 , -0.6494315 ]]], dtype=float32)

In [14]:
# Print the stacked embeddings' shape, average over dim 1, then print the new shape.
# In the recorded output dim 1 has size 1, so the mean simply drops that axis.
print(abstract_embeddings.shape)
abstract_embeddings_mean = abstract_embeddings.mean(dim=1)
print(abstract_embeddings_mean.shape)

torch.Size([1831, 1, 768])
torch.Size([1831, 768])


In [15]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px

# t-SNE works on NumPy arrays, so detach the embeddings and move them to the CPU.
abstract_embeddings_np = abstract_embeddings_mean.detach().cpu().numpy()

# Fixed random_state for both projections.
tsne2d = TSNE(n_components=2, random_state=42)
tsne3d = TSNE(n_components=3, random_state=42)

embeddings_2d = tsne2d.fit_transform(abstract_embeddings_np)
embeddings_3d = tsne3d.fit_transform(abstract_embeddings_np)

# Store the coordinates next to the article metadata; the 2D and 3D
# projections get distinct column suffixes.
df_all_articles['tsne_x_2d'] = embeddings_2d[:, 0]
df_all_articles['tsne_y_2d'] = embeddings_2d[:, 1]

df_all_articles['tsne_x_3d'] = embeddings_3d[:, 0]
df_all_articles['tsne_y_3d'] = embeddings_3d[:, 1]
df_all_articles['tsne_z_3d'] = embeddings_3d[:, 2]

# Plot the 2D projection using the '_2d' columns created above
# (plain 'tsne_x' / 'tsne_y' columns do not exist in the frame).
fig = px.scatter(df_all_articles, x='tsne_x_2d', y='tsne_y_2d', color='journal',
                 title='t-SNE of Abstract Embeddings')
fig.show()


In [None]:
# 3D scatter of the three-component t-SNE projection, coloured by journal;
# all three axes use the '_3d' coordinate columns.
fig = px.scatter_3d(df_all_articles, x='tsne_x_3d', y='tsne_y_3d', z='tsne_z_3d', color='journal',
                    title='t-SNE of Abstract Embeddings')
fig.show()