In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import os

In [2]:
# Build the journal lookup table: scrape the journal listing table from the MDPI
# "about/journals" page, then attach the journal names stored in journalD.csv.
base_url = "https://www.mdpi.com/about/journals"

# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops us from parsing an HTTP error page as if it were the listing.
with requests.get(base_url, timeout=30) as response:
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

journal_table = soup.find('table')
if journal_table is None:
    raise ValueError(f"No <table> element found at {base_url}; the page layout may have changed.")
headers = [th.text for th in journal_table.find_all('th')]

# Extract table rows (slice from 1 to skip the header row)
data = [
    [td.text for td in row.find_all('td')]
    for row in journal_table.find_all('tr')[1:]
]

# Create a DataFrame; the last line of each 'Journal Name' cell is kept and
# the column is renamed to 'Journal ID' so it can be joined on below.
journal = pd.DataFrame(data, columns=headers)
journal['Journal Name'] = journal['Journal Name'].str.strip().str.split('\n').str[-1]
journal = journal.rename(columns={'Journal Name': 'Journal ID'})
journal_df = pd.read_csv("D:/trang/doan/DoAn1/mdpi_crawler/output/journalD.csv")

# Left join keeps every scraped journal even when journalD.csv has no match.
# The merge result already carries a fresh 0..n-1 index, so no reset is needed.
journal = (
    journal
    .merge(journal_df, left_on="Journal ID", right_on="journalID", how="left")
    .drop(columns=['journalID'])
    .rename(columns={'journal': 'Journal'})
)

# Drop 'Journal' from the column list first, so the position of 'Journal ID'
# is measured in the list that is actually sliced below.
cols = [col for col in journal.columns if col != 'Journal']

# Locate the 'Journal ID' column
journal_id_index = cols.index('Journal ID')

# Build the new column order with 'Journal' placed right after 'Journal ID'
new_order = cols[:journal_id_index + 1] + ['Journal'] + cols[journal_id_index + 1:]

# Reorder the DataFrame columns accordingly
journal = journal[new_order]
journal

Unnamed: 0,#,Journal ID,Journal,ISSN,Launched,IF,CiteScore,Current Issue,Upcoming Articles,Total Articles,RSS
0,1.,acoustics,Acoustics,\n 2624-599X\n,\n 2019\n,\n\n 1.3\n ...,\n\n 3.7\n ...,"\n\n v.6(2), Jun 2024\n ...",\n 0\n ...,\n306\n,\n\nrss_feed\n\n
1,2.,amh,Acta Microbiologica Hellenica (AMH),\n 2813-9054\n,\n 1956\n,\n -\n ...,\n\n 0.2\n ...,"\n\n v.69(2), Jun 2024\n ...",\n 0\n ...,\n12\n,\n\nrss_feed\n\n
2,3.,actuators,Actuators,\n 2076-0825\n,\n 2012\n,\n\n 2.2\n ...,\n\n 3.9\n ...,"\n\n v.13(6), Jun 2024\n ...",\n 3\n ...,"\n1,841\n",\n\nrss_feed\n\n
3,4.,admsci,Administrative Sciences,\n 2076-3387\n,\n 2011\n,\n\n 3.0\n ...,\n\n 4.8\n ...,"\n\n v.14(6), Jun 2024\n ...",\n 1\n ...,"\n1,154\n",\n\nrss_feed\n\n
4,5.,adolescents,Adolescents,\n 2673-7051\n,\n 2021\n,\n -\n ...,\n\n 1.3\n ...,"\n\n v.4(2), Jun 2024\n ...",\n 0\n ...,\n148\n,\n\nrss_feed\n\n
...,...,...,...,...,...,...,...,...,...,...,...
439,440.,women,Women,\n 2673-4184\n,\n 2021\n,\n -\n ...,\n -\n ...,"\n\n v.4(2), Jun 2024\n ...",\n 0\n ...,\n116\n,\n\nrss_feed\n\n
440,441.,world,World,\n 2673-4060\n,\n 2020\n,\n\n 2.0\n ...,\n -\n ...,"\n\n v.5(2), Jun 2024\n ...",\n 0\n ...,\n190\n,\n\nrss_feed\n\n
441,442.,wevj,World Electric Vehicle Journal (WEVJ),\n 2032-6653\n,\n 2007\n,\n\n 2.6\n ...,\n\n 4.5\n ...,"\n\n v.15(6), Jun 2024\n ...",\n 4\n ...,"\n2,114\n",\n\nrss_feed\n\n
442,443.,youth,Youth,\n 2673-995X\n,\n 2021\n,\n -\n ...,\n -\n ...,"\n\n v.4(2), Jun 2024\n ...",\n 1\n ...,\n210\n,\n\nrss_feed\n\n


In [3]:
# Save the journal table as journal.csv in the staging1 folder,
# creating the folder first if it does not exist yet.
output_folder = "D:/trang/doan/DoAn1/mdpi_crawler/data/staging1"
csv_filename = os.path.join(output_folder, "journal.csv")

os.makedirs(output_folder, exist_ok=True)
journal.to_csv(path_or_buf=csv_filename, index=False)

In [4]:
def _load_subject_papers(stem):
    """Read one per-subject paper CSV from the crawler's papers output folder.

    `stem` is the CSV file name without the ".csv" extension.
    Returns the file contents as a DataFrame.
    """
    return pd.read_csv(f"D:/trang/doan/DoAn1/mdpi_crawler/output/papers/{stem}.csv")


# One DataFrame per subject file, loaded in the same order as before.
arts_humanity = _load_subject_papers("arts_humanity")
bio_life = _load_subject_papers("bio_life")
business_econ = _load_subject_papers("business_econ")
chem_materials = _load_subject_papers("chem_materials")
computer_math = _load_subject_papers("computer_math")
engineering = _load_subject_papers("engineering")
environment = _load_subject_papers("environment")
health = _load_subject_papers("health")
med_pharma = _load_subject_papers("med_pharma")
physics_astronomy = _load_subject_papers("physics_astronomy")

In [5]:
# Stack the ten per-subject frames into a single table with a fresh 0..n-1
# index, write it to the staging1 folder, and show it as the cell output.
subject_frames = [
    arts_humanity,
    bio_life,
    business_econ,
    chem_materials,
    computer_math,
    engineering,
    environment,
    health,
    med_pharma,
    physics_astronomy,
]
paper = pd.concat(subject_frames, ignore_index=True)
paper.to_csv('D:/trang/doan/DoAn1/mdpi_crawler/data/staging1/paper.csv', index=False)
paper

Unnamed: 0,title,author,subject,abstract,journal,pubdate
0,A Feature Alignment Approach to Plural Realiza...,by\nStuart Davis and Matthew Pollock,"Social Sciences, Arts and Humanities","Using an optimality theoretic analysis, this s...",languages,"Languages 2024, 9(5), 166; https://doi.org/10...."
1,University Students’ Perception of the Dehesa ...,"by\nRebeca Guillén-Peñafiel, Ana María Hernánd...","Social Sciences, Arts and Humanities",The dehesas are one of the most emblematic lan...,sustainability,"Sustainability 2024, 16(9), 3843; https://doi...."
2,The Impact of Artificial Intelligence Replacin...,"by\nFei Cai, Jiashu Zhang and Lei Zhang","Social Sciences, Arts and Humanities",A growing number of organizations have used ar...,sustainability,"Sustainability 2024, 16(9), 3840; https://doi...."
3,Assessment for the Sustainable Development of ...,"by\nKuat Saparov, Miroslava Omirzakova, Aigul ...","Social Sciences, Arts and Humanities",The assessment of sustainable tourism developm...,sustainability,"Sustainability 2024, 16(9), 3838; https://doi...."
4,Drivers of Spontaneous Plant Communities in Ur...,"by\nWenjie Xu, Wenjing Dai, Yanfen Ding, Shans...","Social Sciences, Arts and Humanities",Urban plant diversity is one of the key elemen...,sustainability,"Sustainability 2024, 16(9), 3841; https://doi...."
...,...,...,...,...,...,...
209650,Electrochromic Polymers: From Electrodepositio...,by\nHadarou Sare and Dongmei Dong,Physical Sciences,This paper reports on the linear colorimetric ...,energies,"Energies 2024, 17(1), 232; https://doi.org/10...."
209651,Chirality in Atomically Thin CdSe Nanoplatelet...,"by\nDaria A. Kurtina, Vladimir B. Zaytsev and ...",Physical Sciences,Chiral semiconductor nanostructures and nanopa...,materials,"Materials 2024, 17(1), 237; https://doi.org/10..."
209652,The Spatial Pattern and Influencing Factors of...,"by\nGuodong Yan, Lin Zou and Yunan Liu",Physical Sciences,The nighttime economy (NTE) is one of the prim...,applsci,"Appl. Sci. 2024, 14(1), 400; https://doi.org/1..."
209653,Demagnetization Fault Diagnosis of a PMSM for ...,"by\nQingxue Zhang, Junguo Cui, Wensheng Xiao, ...",Physical Sciences,Permanent magnets (PMs) provide high efficienc...,electronics,"Electronics 2024, 13(1), 189; https://doi.org/..."


In [6]:
# Build the subject lookup table from the subject filter boxes on the MDPI
# search page and save it as subject.csv.
base_url = "https://www.mdpi.com/search?sort=pubdate&page_count=10&year_from=2024&year_to=2024"

# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops us from parsing an HTTP error page as if it were the search page.
with requests.get(base_url, timeout=30) as response:
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

subject_div = soup.find('div', class_='filter-container-subjects')
if subject_div is None:
    raise ValueError(
        f"No 'filter-container-subjects' element found at {base_url}; the page layout may have changed."
    )
subject_boxes = subject_div.find_all('div', class_='remove-filter-container')

filters = []
subjects = []
for div in subject_boxes:
    # The visible label is the subject name.
    subject = div.find('label').text.strip()
    # Keep only the part of the data-filterid attribute after the last underscore.
    filter_id = div.find('a')['data-filterid'].split('_')[-1]

    filters.append(filter_id)
    subjects.append(subject)

# A frame built from lists already has a 0..n-1 index, so no reset is needed.
subject_df = pd.DataFrame({'subjectID': filters, 'subject': subjects})
subject_df.to_csv("D:/trang/doan/DoAn1/mdpi_crawler/output/subject.csv", index=False)

In [7]:
# Display the subject lookup table (columns: subjectID, subject).
subject_df

Unnamed: 0,subjectID,subject
0,bio-life,Biology & Life Sciences
1,chem-materials,Chemistry & Materials Science
2,engineering,Engineering
3,environment,Environmental & Earth Sciences
4,med-pharma,Medicine & Pharmacology
5,health,Public Health & Healthcare
6,physics-astronomy,Physical Sciences
7,computer-math,Computer Science & Mathematics
8,arts-humanity,"Social Sciences, Arts and Humanities"
9,business-econ,Business & Economics
