<a href="https://colab.research.google.com/github/masadeghi/journal_finder/blob/main/scimagojr_scrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import dependencies

Note: The output datasets of this notebook are available in the scraped_from_scimago directory of the GitHub repo.

In [3]:
!git clone https://github.com/masadeghi/journal_finder

Cloning into 'journal_finder'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 12 (delta 2), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (12/12), done.


In [69]:
from bs4 import BeautifulSoup
import requests

# Start from the default request headers of `requests` and swap in a
# browser-like User-Agent, to reduce the chance of the server blocking our IP.
# NOTE(review): these headers only take effect when handed to the request call
# (e.g. requests.get(url, headers=headers)).
headers = requests.utils.default_headers()
headers['User-Agent'] = 'Mozilla/5.0 (Linux; Android 12; SM-T875) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'

# Extract Rank, Sourceid, SJR, and Quartile information from scimagojr.com datasets

Each dataset belongs to a specific subject area on the scimago website. The datasets included here are:
* Biochemistry, Genetics and Molecular Biology
* Immunology and Microbiology
* Medicine
* Neuroscience
* Pharmacology, Toxicology, and Pharmaceutics


In [50]:
import csv
import pandas as pd
import numpy as np


def journal_data_from_dataset(file_path):
  """
  Given the path of a .csv file downloaded from scimagojr.com, extract Rank,
  Sourceid, Title, SJR, and Quartile and present them in a pandas DataFrame.

  Args:
    file_path (str): The path to your file.

  Returns:
    journals_table (DataFrame): a pandas DataFrame with the following columns:
    ['Rank', 'Sourceid', 'Title', 'SJR', 'SJR Best Quartile'], one row per
    journal, indexed 0..n-1. 'SJR' is converted to float (NaN when it cannot
    be parsed); the other columns keep the raw text from the file, so titles
    keep their surrounding double quotes.

  Raises:
    ValueError: if the file contains no header row.
    KeyError: if one of the required columns is missing from the header.
  """
  wanted_columns = ["Rank", "Sourceid", "Title", "SJR", "SJR Best Quartile"]

  # Every line is split on each semicolon with quote handling switched off.
  # Note: The 'Categories' column in the scimago datasets has a variable number
  # of semicolon-separated categories per journal, so rows have a variable
  # number of fields. Rows are therefore collected as plain lists first and the
  # DataFrame is built once at the end, which works for any number of fields
  # and avoids growing a DataFrame row by row.
  # 'utf-8-sig' reads UTF-8 regardless of the machine's locale and drops a
  # leading byte-order mark if there is one.
  with open(file_path, newline = "", encoding = "utf-8-sig") as file:
    reader_obj = csv.reader(file, delimiter = ";", quoting = csv.QUOTE_NONE)
    rows = [row for row in reader_obj if row] # Blank lines come back as [] and are skipped

  if not rows:
    raise ValueError(f"No header row found in {file_path!r}")

  header, records = rows[0], rows[1:]

  missing_columns = [name for name in wanted_columns if name not in header]
  if missing_columns:
    raise KeyError(f"Columns {missing_columns} not found in {file_path!r}")

  # Fields are picked by the position of their name in the header row.
  # Extra trailing fields are ignored; a row that ends before a wanted column
  # gets NaN there.
  # NOTE(review): assumes the wanted columns come before 'Categories' in the
  # file, so the extra semicolons do not shift them - TODO confirm.
  positions = [header.index(name) for name in wanted_columns]
  selected = [
    [row[pos] if pos < len(row) else np.nan for pos in positions]
    for row in records
  ]

  # dtype = object keeps every field as raw text (so the .str accessor below
  # always works, even if the SJR column ends up all-NaN).
  journals_table = pd.DataFrame(selected, columns = wanted_columns, dtype = object)

  # SJRs use a decimal comma in the file; convert them to float numbers
  journals_table["SJR"] = pd.to_numeric(
    journals_table["SJR"].str.replace(",", ".", regex = False),
    errors = 'coerce')

  return journals_table

In [60]:
# Extract information from each of the datasets.
# One DataFrame is built per subject area with journal_data_from_dataset; the
# paths point into the journal_finder repository cloned under /content.
biochem_molbio_dataset = journal_data_from_dataset("/content/journal_finder/scimago_datasets/scimagojr 2021  Subject Area - Biochemistry, Genetics and Molecular Biology.csv")
immuno_micro_dataset = journal_data_from_dataset("/content/journal_finder/scimago_datasets/scimagojr 2021  Subject Area - Immunology and Microbiology.csv")
med_dataset = journal_data_from_dataset("/content/journal_finder/scimago_datasets/scimagojr 2021  Subject Area - Medicine.csv")
neuro_dataset = journal_data_from_dataset("/content/journal_finder/scimago_datasets/scimagojr 2021  Subject Area - Neuroscience.csv")
pharma_toxico_dataset = journal_data_from_dataset("/content/journal_finder/scimago_datasets/scimagojr 2021  Subject Area - Pharmacology, Toxicology and Pharmaceutics.csv")

In [61]:
pharma_toxico_dataset.head()

Unnamed: 0,Rank,Sourceid,Title,SJR,SJR Best Quartile
0,1,20425,"""Nature Reviews Drug Discovery""",11.296,Q1
1,2,21191,"""Pharmacological Reviews""",5.54,Q1
2,3,19479,"""Annual Review of Pharmacology and Toxicology""",4.002,Q1
3,4,4700152457,"""Nano Today""",3.89,Q1
4,5,12611,"""Drug Resistance Updates""",3.845,Q1


# Make a list of journal URLs using each journal's Sourceid

In [62]:
# Make URL-list using the Sourceid for each journal
def dataset_to_url(dataset):
  """
  Given a dataset with a column "Sourceid", add a "URL" column to the dataset
  containing the scimagojr.com journal-page URL for each journal.

  The column is added to the passed-in DataFrame itself (no copy is made) and
  that same object is returned.

  Args:
    dataset (DataFrame): A DataFrame containing a "Sourceid" column. The ids
    may be strings or integers.

  Returns:
    dataset (DataFrame): The input dataset with an added column "URL" containing
    the URLs for each journal
  """
  url_prefix = 'https://www.scimagojr.com/journalsearch.php?q='
  url_suffix = '&tip=sid&clean=0'

  # str() makes integer ids work too (e.g. when the dataset was re-read from a
  # saved CSV and pandas parsed the "Sourceid" column as numbers).
  dataset["URL"] = [url_prefix + str(source_id) + url_suffix for source_id in dataset["Sourceid"]]

  return dataset

In [63]:
# Add the "URL" column to every subject-area dataset; the scraping step reads
# the journal pages from this column.
biochem_molbio_dataset = dataset_to_url(biochem_molbio_dataset)
immuno_micro_dataset = dataset_to_url(immuno_micro_dataset)
med_dataset = dataset_to_url(med_dataset)
neuro_dataset = dataset_to_url(neuro_dataset)
pharma_toxico_dataset = dataset_to_url(pharma_toxico_dataset)

In [64]:
pharma_toxico_dataset.head()

Unnamed: 0,Rank,Sourceid,Title,SJR,SJR Best Quartile,URL
0,1,20425,"""Nature Reviews Drug Discovery""",11.296,Q1,https://www.scimagojr.com/journalsearch.php?q=...
1,2,21191,"""Pharmacological Reviews""",5.54,Q1,https://www.scimagojr.com/journalsearch.php?q=...
2,3,19479,"""Annual Review of Pharmacology and Toxicology""",4.002,Q1,https://www.scimagojr.com/journalsearch.php?q=...
3,4,4700152457,"""Nano Today""",3.89,Q1,https://www.scimagojr.com/journalsearch.php?q=...
4,5,12611,"""Drug Resistance Updates""",3.845,Q1,https://www.scimagojr.com/journalsearch.php?q=...


# Define function for extracting relevant information from each journal page

In [67]:
def extract_journal_info(journal_url, request_headers = None, timeout = 30):
  """
  Given a journal url from the scimago website, extract journal name and scope.

  Args:
    journal_url (str): The URL for the journal page on scimagojr.com.
    request_headers (dict, optional): HTTP headers to send with the request.
      When omitted, the notebook-level `headers` (custom User-Agent) is used
      if it has been defined; otherwise the defaults of `requests` apply.
    timeout (float, optional): Seconds to wait for the server before giving
      up, so a stalled connection cannot block the notebook forever.

  Returns:
    a dictionary containing {name : 'journal_name', scope: 'journal_scope'}.
    Both values are lower-cased. 'name' is None when the page has no <title>,
    'scope' is None when no 'Scope' section is found, and both are None when
    the server answers with an error status.

  Raises:
    requests.exceptions.RequestException: on connection problems or timeout.
  """

  # Use the User-Agent headers prepared at the top of the notebook unless the
  # caller supplied their own (headers = None means "requests defaults").
  if request_headers is None:
    request_headers = globals().get("headers")

  # Make a request to journal_url
  # Store the result in 'page' variable
  page = requests.get(journal_url, headers = request_headers, timeout = timeout)

  # An error page (4xx/5xx) holds no journal information; report "nothing found"
  # instead of parsing the error page's title as a journal name.
  if not page.ok:
    return {'name' : None,
            'scope' : None}

  #parse page text
  soup = BeautifulSoup(page.content, 'html.parser')

  # Extract journal name (page title)
  journal_name = soup.title.text.lower() if soup.title is not None else None

  # Create a list of h2 tags from the page ('Scope' is an h2 tag)
  all_h2_tags = [element.text for element in soup.select('h2')]

  # Extracting the scope: the text between the 'Scope' heading and the next
  # link. It is cut out of the raw HTML, so the heading has to appear there
  # literally as "Scope</h2>"; if it does not, the scope stays None.
  # NOTE(review): because raw HTML is used, HTML entities or inline tags inside
  # the scope text are kept as-is.
  journal_scope = None
  if 'Scope' in all_h2_tags:
    _, heading, after_heading = page.text.partition("Scope</h2>")
    if heading:
      journal_scope = after_heading.split("<a", 1)[0].strip().lower()

  journal_info = {'name' : journal_name,
                  'scope' : journal_scope}

  return journal_info
  

In [70]:
extract_journal_info('https://www.scimagojr.com/journalsearch.php?q=25939&tip=sid&clean=0')

{'name': 'macromolecular rapid communications',
 'scope': 'macromolecular rapid communications publishes original research in polymer science, ranging from chemistry and physics of polymers to polymers in materials science and life sciences.'}

# Scrape each journal's webpage using its URL to extract its scope; add the scopes to each category's dataset

In [84]:
from time import sleep
from random import randint

def scimago_scraper(dataset, min_delay = 1, max_delay = 3, progress_every = 500):
  """
  Given a dataset containing a "URL" column, scrape each URL and extract
  the Scope of each journal from its webpage.

  Args:
    dataset (DataFrame): A dataset containing a "URL" column
    min_delay (int, optional): Shortest pause, in seconds, between requests.
    max_delay (int, optional): Longest pause, in seconds, between requests.
    progress_every (int, optional): Print a progress line after this many
      pages; 0 or None turns the progress output off.

  Returns:
    complete_dataset (DataFrame): A copy of the input dataset with an added
    "Scope" column which contains the scope of each journal (NaN where no
    scope was found). The input dataset itself is left untouched.
  """
  complete_dataset = dataset.copy()

  URL_list = complete_dataset["URL"].tolist()

  # Scopes are collected in row order and attached as one column at the end.
  # Assigning a list is positional, so this is correct whatever the index
  # labels of the dataset are.
  scopes = []

  for pages_done, url in enumerate(URL_list, start = 1):
    journal_info = extract_journal_info(url)
    scope = journal_info["scope"]
    scopes.append(np.nan if scope is None else scope) # Missing scope is stored as NaN

    if progress_every and pages_done % progress_every == 0:
      print(f'Successfully went through {pages_done} pages')

    # Random time gaps between requests to prevent IP blocking by the server
    # (no need to wait after the final page)
    if pages_done < len(URL_list):
      sleep(randint(min_delay, max_delay))

  complete_dataset["Scope"] = scopes

  return complete_dataset

In [None]:
biochem_molbio_dataset = scimago_scraper(biochem_molbio_dataset)

In [96]:
immuno_micro_dataset = scimago_scraper(immuno_micro_dataset)

Successfuly went through 500 pages


In [None]:
med_dataset = scimago_scraper(med_dataset)

In [None]:
pharma_toxico_dataset = scimago_scraper(pharma_toxico_dataset)

In [None]:
neuro_dataset = scimago_scraper(neuro_dataset)

In [90]:
pharma_toxico_dataset.head()

Unnamed: 0,Rank,Sourceid,Title,SJR,SJR Best Quartile,URL,Scope
0,1,20425,"""Nature Reviews Drug Discovery""",11.296,Q1,https://www.scimagojr.com/journalsearch.php?q=...,nature reviews drug discovery is a monthly jou...
1,2,21191,"""Pharmacological Reviews""",5.54,Q1,https://www.scimagojr.com/journalsearch.php?q=...,pharmacological reviews presents important rev...
2,3,19479,"""Annual Review of Pharmacology and Toxicology""",4.002,Q1,https://www.scimagojr.com/journalsearch.php?q=...,the annual review of pharmacology and toxicolo...
3,4,4700152457,"""Nano Today""",3.89,Q1,https://www.scimagojr.com/journalsearch.php?q=...,nano today publishes original articles on all ...
4,5,12611,"""Drug Resistance Updates""",3.845,Q1,https://www.scimagojr.com/journalsearch.php?q=...,drug resistance updates is a bimonthly publica...


# Save the final datasets

In [95]:
biochem_molbio_dataset.to_csv('biochem_molbio_journals.csv', index = False)

In [97]:
immuno_micro_dataset.to_csv('immuno_micro_journals.csv', index = False)

In [None]:
med_dataset.to_csv('med_journals.csv', index = False)

In [93]:
pharma_toxico_dataset.to_csv('pharma_toxico_journals.csv', index = False)

In [92]:
neuro_dataset.to_csv('neuro_journals.csv', index = False)