<a href="https://colab.research.google.com/github/nerudxlf/scival-parser/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
from pandas import DataFrame
from bs4 import BeautifulSoup, Tag
%pip install lxml



In [7]:
class DataDto:
  """Plain record holding the six values read from one table row.

  Every argument is stored unchanged under an attribute of the same name,
  except ``scholarly_output``: its commas are removed before storing
  (``"1,234"`` becomes ``"1234"``), so it must provide ``str.replace``.
  """

  def __init__(self, cluster_name, cluster_id, scholarly_output, publication_share, fwci, prominence_percentile):
    # Presumably the commas are thousands separators; only "," is removed.
    output_without_commas = scholarly_output.replace(",", "")

    self.cluster_name, self.cluster_id = cluster_name, cluster_id
    self.scholarly_output = output_without_commas
    self.publication_share, self.fwci = publication_share, fwci
    self.prominence_percentile = prominence_percentile


class Scraper:
  """Turn the table rows of an HTML document into a pandas DataFrame.

  The document is parsed once, in ``__init__``, with BeautifulSoup's
  ``lxml`` parser. ``start`` is the entry point: it locates every table row,
  reads six cell texts from each one via the ``_get_*`` helpers, and returns
  the result as a DataFrame.

  The ``_get_*`` helpers assume the expected markup is present. A missing
  element surfaces as ``AttributeError`` (``find`` returned ``None``) and a
  row with too few ``"tableCell number"`` cells as ``IndexError``.
  """

  # Headers of the resulting DataFrame, in output order.
  _COLUMNS = (
      "Topic Cluster",
      "Topic Cluster Number",
      "Scholarly Output",
      "Publication share (%)",
      "Publication Share growth (%)",
      "Field-Weighted Citation Impact",
      "Prominence percentile",
  )

  # Class-level placeholder; every instance replaces it in __init__.
  _soup: "BeautifulSoup" = None

  def __init__(self, text):
    """Parse ``text`` (the HTML source) and keep both the text and the tree."""
    self.html_text = text
    self._soup = BeautifulSoup(self.html_text, "lxml")

  def get_list_table_row(self):
    """Return every ``div`` whose class attribute marks it as a table row.

    NOTE(review): the multi-word class value is matched as one exact string,
    so rows whose classes appear in another order are presumably skipped --
    confirm against the Beautiful Soup documentation.
    """
    return self._soup.find_all("div", {"class": "tableRow panelRow ui-draggable ui-draggable-handle"})

  @staticmethod
  def _get_topic_cluster_name(div: "Tag"):
    """Return the text of the row's ``keywords nowrap`` div."""
    return div.find("div", {"class": "keywords nowrap"}).text

  @staticmethod
  def _get_topic_cluster_id(div: "Tag"):
    """Return the text of the row's ``topicId`` div."""
    return div.find("div", {"class": "topicId"}).text

  @staticmethod
  def _get_scholarly_output(div: "Tag"):
    """Return the text of the row's ``showPublications`` button."""
    return div.find("button", {"class": "link primary-link showPublications"}).text

  @staticmethod
  def _get_publication_share(div: "Tag"):
    """Return the text of the row's second ``tableCell number`` div."""
    return div.find_all("div", {"class": "tableCell number"})[1].text

  @staticmethod
  def _get_fwci(div: "Tag"):
    """Return the text of the row's third ``tableCell number`` div."""
    return div.find_all("div", {"class": "tableCell number"})[2].text

  @staticmethod
  def _get_prominence_percentile(div: "Tag"):
    """Return the ``percentileVal`` button text inside the prominence cell."""
    prominence_cell = div.find("div", {"class": "tableCell prominence"})
    return prominence_cell.find("button", {"class": "link primary-link percentileVal"}).text

  @staticmethod
  def _split_publication_share(cell_text):
    """Split a publication-share cell into ``(share, growth)`` strings.

    The first whitespace-separated token is the share; whatever follows is
    re-joined with single spaces as the growth. A cell with one token yields
    an empty growth and an empty cell yields two empty strings, so a row
    without growth information no longer aborts the whole export.
    """
    tokens = cell_text.split()
    if not tokens:
      return "", ""
    return tokens[0], " ".join(tokens[1:])

  @staticmethod
  def to_data_frame(data_list) -> DataFrame:
    """Build the output DataFrame from an iterable of row records.

    Args:
      data_list: objects exposing ``cluster_name``, ``cluster_id``,
        ``scholarly_output``, ``publication_share``, ``fwci`` and
        ``prominence_percentile`` attributes.

    Returns:
      A DataFrame with one row per record and the seven ``_COLUMNS``
      headers; the publication-share text is split into its share and
      growth columns. An empty ``data_list`` gives an empty DataFrame that
      still carries all seven columns.
    """
    records = []
    for data in data_list:
      share, growth = Scraper._split_publication_share(data.publication_share)
      records.append((
          data.cluster_name,
          data.cluster_id,
          data.scholarly_output,
          share,
          growth,
          data.fwci,
          data.prominence_percentile,
      ))
    return pd.DataFrame(records, columns=list(Scraper._COLUMNS))

  def start(self) -> DataFrame:
    """Scrape every table row of the parsed document into a DataFrame."""
    rows = [
        DataDto(
            self._get_topic_cluster_name(row),
            self._get_topic_cluster_id(row),
            self._get_scholarly_output(row),
            self._get_publication_share(row),
            self._get_fwci(row),
            self._get_prominence_percentile(row),
        )
        for row in self.get_list_table_row()
    ]
    return self.to_data_frame(rows)


In [8]:
# Base name shared by the input "<name>.html" and the output "<name>.xlsx".
name = "Япония"

# Only the read needs the file handle, so parsing and the Excel export run
# after the `with` block has closed it.
with open(f"{name}.html", "r", encoding="utf-8") as f:
  text = f.read()

df = Scraper(text).start()
df.to_excel(f"{name}.xlsx", index=False)