In [1]:
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
import requests
import os

In [16]:
# Scrape the MDPI search page once and build two lookup tables from it:
#   subject_df : subjectID -> subject label (from the subject filter boxes)
#   journal_df : journalID -> journal name  (from the <select name="journal"> options)
url = "https://www.mdpi.com/search?sort=pubdate&page_count=10&year_from=2024"

# A timeout keeps the cell from hanging forever; raise_for_status turns an HTTP
# error page into an explicit error instead of a confusing parse failure below.
with requests.get(url, timeout=30) as response:
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

subject_div = soup.find('div', class_='filter-container-subjects')
if subject_div is None:
    raise RuntimeError(f"Subject filter container not found at {url}")
subject_boxes = subject_div.find_all('div', class_='remove-filter-container')

# The subject label is the box's <label> text; the filter id is the last
# '_'-separated piece of the link's data-filterid attribute.
subjects = [box.find('label').text.strip() for box in subject_boxes]
filters = [box.find('a')['data-filterid'].split('_')[-1] for box in subject_boxes]

subject_df = pd.DataFrame({'subjectID': filters, 'subject': subjects})

journal_select = soup.find(attrs={'name': 'journal'})
if journal_select is None:
    raise RuntimeError(f"Journal <select> element not found at {url}")

# The first <option> is dropped, as in the original pop(0); presumably it is the
# "all journals" placeholder rather than a real journal -- TODO confirm.
journal_options = journal_select.find_all('option')
journal_filters = [option['value'] for option in journal_options[1:]]
journals = [option.text.strip() for option in journal_options[1:]]

journal_df = pd.DataFrame({'journalID': journal_filters, 'journal': journals})

In [17]:
# Display the scraped subject lookup table (columns: subjectID, subject).
subject_df

Unnamed: 0,subjectID,subject
0,bio-life,Biology & Life Sciences
1,chem-materials,Chemistry & Materials Science
2,engineering,Engineering
3,environment,Environmental & Earth Sciences
4,med-pharma,Medicine & Pharmacology
5,health,Public Health & Healthcare
6,physics-astronomy,Physical Sciences
7,computer-math,Computer Science & Mathematics
8,arts-humanity,"Social Sciences, Arts and Humanities"
9,business-econ,Business & Economics


In [19]:
# Persist the journal lookup table. The target folder is created first so the
# write also succeeds on a fresh checkout where 'output/' does not exist yet.
os.makedirs('output', exist_ok=True)
journal_df.to_csv('output/journal.csv', index=False)
journal_df

Unnamed: 0,journalID,journal
0,acoustics,Acoustics
1,amh,Acta Microbiologica Hellenica (AMH)
2,actuators,Actuators
3,admsci,Administrative Sciences
4,adolescents,Adolescents
...,...,...
435,women,Women
436,world,World
437,wevj,World Electric Vehicle Journal (WEVJ)
438,youth,Youth


In [5]:
def total_pages(url, headers=None, timeout=30):
    """Return the number of result pages reported for an MDPI search URL.

    The page is fetched and the pagination block (the <div> whose class is
    'columns large-6 medium-6 small-12') is read. The last whitespace-separated
    token of its text, with any trailing '.' removed, is parsed as the count.

    Args:
        url: Full search URL to query.
        headers: Optional HTTP headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The total page count as an ``int``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the pagination block is missing or empty, or its last
            token is not an integer.
    """
    with requests.get(url, headers=headers, timeout=timeout) as response:
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

    pagination_info = soup.find('div', class_='columns large-6 medium-6 small-12')
    if pagination_info is None:
        raise ValueError(f"No pagination block found at {url}")

    tokens = pagination_info.get_text(strip=True).split()
    if not tokens:
        raise ValueError(f"Pagination block is empty at {url}")

    # Local name differs from the function name so the function is not shadowed.
    page_count = int(tokens[-1].rstrip('.'))
    return page_count

In [6]:
# Request headers with a browser-like User-Agent; the crawl cells pass this
# dict to requests.get.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    ),
}

# Current calendar year; used as the `year_to` bound in the search URLs.
year = datetime.today().year

In [7]:
# Look up how many result pages each subject has and store the count in a new
# "Last Page" column of subject_df.
base_url = f"https://www.mdpi.com/search?sort=pubdate&page_count=10&year_from=2024&year_to={year}"

# subject_df is built with the columns 'subjectID' and 'subject', so the filter
# ids live in 'subjectID'. The comprehension also avoids leaving a global named
# `filter` behind, which would shadow the builtin.
pages = [
    total_pages(f"{base_url}&subjects={subject_id}")
    for subject_id in subject_df['subjectID']
]

subject_df["Last Page"] = pages

In [8]:
# Order subjects from fewest to most result pages. Rebinding the name instead
# of using inplace=True follows the pandas recommendation and keeps the cell
# safe to re-run.
subject_df = subject_df.sort_values(by='Last Page', ascending=True)
subject_df

Unnamed: 0,Filter,Subject,Last Page
9,business-econ,Business & Economics,591
8,arts-humanity,"Social Sciences, Arts and Humanities",710
7,computer-math,Computer Science & Mathematics,1351
6,physics-astronomy,Physical Sciences,1562
5,health,Public Health & Healthcare,1706
4,med-pharma,Medicine & Pharmacology,2601
3,environment,Environmental & Earth Sciences,2704
2,engineering,Engineering,3075
1,chem-materials,Chemistry & Materials Science,3302
0,bio-life,Biology & Life Sciences,3446


In [9]:
def mdpi_crawler(headers, subject, last_pagination):
    """Collect article metadata from MDPI search result pages for one subject.

    Pages 1..last_pagination of the search (10 results per page, sorted by
    publication date, from 2024 up to the module-level ``year``) are fetched
    and every 'generic-item article-item' box on them is read.

    Args:
        headers: HTTP headers passed to ``requests.get``.
        subject: Value for the ``subjects`` query parameter.
        last_pagination: Number of the last page to fetch (inclusive). With a
            value below 1 no request is made and five empty lists are returned.

    Returns:
        A tuple of five parallel lists: (titles, authors, abstracts, journals,
        pubdates), one entry per article found.

    Raises:
        requests.HTTPError: If a page answers with an error status.
    """
    titles, authors, abstracts, journals, pubdates = [], [], [], [], []
    for page in range(1, last_pagination + 1):
        url = f"https://www.mdpi.com/search?sort=pubdate&page_no={page}&page_count=10&year_from=2024&year_to={year}&subjects={subject}"
        # A timeout stops one stalled request from freezing a crawl of
        # thousands of pages. raise_for_status prevents an error page from
        # silently contributing zero articles.
        with requests.get(url, headers=headers, timeout=30) as response:
            response.raise_for_status()
            page_mdpi = BeautifulSoup(response.content, 'html.parser')

        for article_box in page_mdpi.find_all('div', class_='generic-item article-item'):
            # Title and journal are attributes of the same PDF link, so it is
            # looked up once.
            pdf_link = article_box.find('a', class_='UD_Listings_ArticlePDF')
            titles.append(pdf_link['data-name'])
            journals.append(pdf_link['data-journal'])

            authors.append(article_box.find('div', class_='authors').text.strip())
            abstracts.append(article_box.find('div', class_='abstract-full').text.strip())
            pubdates.append(article_box.find('div', class_='color-grey-dark').text.strip())
    return titles, authors, abstracts, journals, pubdates


In [None]:
# Crawl every subject and write one CSV per subject to output/papers/.
# The output folder does not depend on the subject, so it is created once.
output_folder = "output/papers"
os.makedirs(output_folder, exist_ok=True)

# subject_df is built with the columns 'subjectID' and 'subject'; "Last Page"
# is added by the page-count cell.
for filter_id, subject, last_pagination in zip(
    subject_df['subjectID'], subject_df['subject'], subject_df['Last Page']
):
    # mdpi_crawler is reused instead of repeating its body here. It queries
    # year_to={year}, the same range the "Last Page" counts were computed for;
    # the hard-coded year_to=2024 it replaces did not match those counts.
    titles, authors, abstracts, journals, pubdates = mdpi_crawler(
        headers, filter_id, int(last_pagination)
    )

    data = {
        'title': titles,
        'author': authors,
        'subject': [subject] * len(titles),
        'abstract': abstracts,
        'journal': journals,
        'pubdate': pubdates
    }
    papers_df = pd.DataFrame(data)

    subname = filter_id.replace('-', '_')
    # NOTE(review): the frame is still published as a dynamically named global
    # (e.g. `bio_life`) in case later cells rely on it; a dict keyed by
    # `subname` would be the cleaner long-term choice.
    globals()[subname] = papers_df

    csv_filename = os.path.join(output_folder, f"{subname}.csv")
    papers_df.to_csv(csv_filename, index=False)