<a href="https://colab.research.google.com/github/nerudxlf/publication-data-by-departments/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas==1.1.0
import pandas as pd
from pandas import DataFrame



In [12]:

def merge_two_dicts(x: dict, y: dict) -> dict:
  """
  Combine two dictionaries into a new one without modifying either input.

  :param x: the base dictionary
  :param y: the dictionary whose entries are laid over ``x``; on a key
            clash the value from ``y`` wins
  :return: a new dictionary with the entries of ``x`` followed by those of ``y``
  """
  return {**x, **y}


class Article:
  """
  One publication record.

  The object only stores the six values it is constructed with. The
  ``get_author*`` helpers split the raw ``authors`` string with the
  separator used by the corresponding export ("; " or ", ").
  """

  def __init__(
          self,
          title: str,
          authors: str,
          source: str,
          document_type: str,
          publication_year: str,
          addresses: str,
  ):
    """
    :param title: article title
    :param authors: all authors as one delimited string
    :param source: title of the source the article appeared in
    :param document_type: document type
    :param publication_year: publication year
    :param addresses: addresses / organisations string
    """
    self.title = title
    self.authors = authors
    self.source = source
    self.document_type = document_type
    self.publication_year = publication_year
    self.addresses = addresses

  def _split_authors(self, separator: str) -> list:
    """
    Split ``authors`` on ``separator``.

    :param separator: delimiter placed between two authors
    :return: list of author strings; empty when ``authors`` is not text
    """
    # A missing value (e.g. the NaN float pandas yields for an empty
    # spreadsheet cell, or None) has no ``split`` method; treat it as
    # "no authors" instead of raising AttributeError.
    if not isinstance(self.authors, str):
      return []
    return self.authors.split(separator)

  def get_authors_wos(self, ) -> list:
    """
    :return: authors split on "; "
    """
    return self._split_authors("; ")

  def get_author_scopus(self) -> list:
    """
    :return: authors split on ", "
    """
    return self._split_authors(", ")

  def __str__(self):
    """
    :return: multi-line, human-readable dump of all six fields
    """
    return f"Title {self.title}\n" \
          f"Authors {self.authors}\n" \
          f"Source {self.source}\n" \
          f"Document Type {self.document_type}\n" \
          f"Addresses {self.addresses}\n" \
          f"Publication Year {self.publication_year}"

  # The representation is intentionally the same text as ``str()``;
  # sharing the function keeps the two from drifting apart.
  __repr__ = __str__

  def __iter__(self):
    """
    Yield the stored field values in the order they were assigned in
    ``__init__`` (title, authors, source, document type, year, addresses).
    """
    # Snapshot the values first so iteration is unaffected by attribute
    # changes made while the generator is being consumed.
    yield from tuple(vars(self).values())


class DataDistribution:
  """
  Shared logic for assigning publications to departments.

  Holds an employee -> name-spellings lookup, the employees table and the
  publication table. Subclasses provide ``get_articles_list`` and
  ``count_data`` for a concrete export layout.
  """

  def __init__(self, dictionary_df: DataFrame, employees_df: DataFrame, data_df: DataFrame):
    """
    :param dictionary_df: table with the columns "Сотрудник" and "names"
    :param employees_df: table with the columns "Подразделение" and "ФИО"
                         (read by ``get_employees_dict``)
    :param data_df: publication table; its columns are read by subclasses
    """
    # NOTE: despite its name this attribute is a plain dict
    # {value of "Сотрудник": value of "names"}, not a DataFrame.
    self.dictionary_df = dict(zip(dictionary_df["Сотрудник"].to_list(), dictionary_df["names"].to_list()))
    self.employees = employees_df
    self.data_df = data_df

  def find_in_dictionary(self, names: str):
    """
    Find the employee whose "names" text contains the given author string.

    :param names: author string as it appears in an article
    :return: the first matching key of the lookup, or None when nothing
             matches or ``names`` is empty / not text
    """
    # An empty needle is a substring of every string, so it would wrongly
    # match the first employee; non-text input cannot be matched at all.
    if not isinstance(names, str) or not names.strip():
      return None
    # Only the needle is lower-cased; this presumably relies on the
    # "names" column already being stored in lower case - TODO confirm.
    needle = names.lower()
    for employee, spellings in self.dictionary_df.items():
      # Skip non-text cells (e.g. NaN from an empty spreadsheet cell).
      if isinstance(spellings, str) and needle in spellings:
        return employee
    return None

  def get_employees_dict(self) -> dict:
    """
    Group employee names by department.

    Only rows whose "Подразделение" text contains "Кафедра" are kept.

    :return: {department: list of its distinct "ФИО" values in first-seen order}
    """
    departments = self.employees["Подразделение"].to_list()
    full_names = self.employees["ФИО"].to_list()
    grouped = {}
    for department, full_name in zip(departments, full_names):
      # Non-text departments (empty cells) cannot contain the marker.
      if not isinstance(department, str) or "Кафедра" not in department:
        continue
      grouped.setdefault(department, []).append(full_name)
    # dict.fromkeys removes duplicates while keeping a deterministic order.
    return {department: list(dict.fromkeys(members)) for department, members in grouped.items()}

  def get_articles_list(self):
    """
    Placeholder for the method the subclasses implement; returns None here.
    """
    return None

  def get_article_list(self):
    """
    Backward-compatible alias for ``get_articles_list``.

    The subclasses implement ``get_articles_list``; this older spelling
    forwards to it so both names give the same result.
    """
    return self.get_articles_list()

  def count_data(self):
    """
    Placeholder overridden by the subclasses; returns None here.
    """
    pass


class DataDistributionWos(DataDistribution):
  """
  Distribution of publications for a table with the WoS column layout
  ("Article Title", "Authors", "Source Title", "Document Type",
  "Publication Year", "Addresses").
  """

  def get_articles_list(self):
    """
    Convert every row of ``data_df`` into an ``Article``.

    :return: list of ``Article`` objects in row order
    """
    columns = (
      self.data_df["Article Title"].to_list(),
      self.data_df["Authors"].to_list(),
      self.data_df["Source Title"].to_list(),
      self.data_df["Document Type"].to_list(),
      self.data_df["Publication Year"].to_list(),
      self.data_df["Addresses"].to_list(),
    )
    return [
      Article(
        title=title,
        authors=authors,
        source=source,
        document_type=document_type,
        publication_year=publication_year,
        addresses=addresses,
      )
      for title, authors, source, document_type, publication_year, addresses in zip(*columns)
    ]

  def count_data(self):
    """
    Build one table of matched publications per department.

    Each author of each article (split on "; ") is looked up with
    ``find_in_dictionary``; every hit that belongs to a department adds
    one row to that department's table, so an article appears once per
    matching author.

    :return: {department: DataFrame with the columns "Автор", "Title",
             "Authors", "Source Title", "Document Type",
             "Publication Year", "Addresses"}, in the order of
             ``get_employees_dict``
    """
    employees_dict = self.get_employees_dict()

    # Resolving an author does not depend on the department, so do it
    # once per author here instead of once per department.
    matched = []
    for article in self.get_articles_list():
      # A missing authors cell is not text and cannot be split.
      if not isinstance(article.authors, str):
        continue
      for author in article.get_authors_wos():
        employee = self.find_in_dictionary(author)
        if employee:
          matched.append((employee, article))

    result_dict_df = {}
    for department, members in employees_dict.items():
      member_set = frozenset(members)
      rows = [(employee, article) for employee, article in matched if employee in member_set]
      result_dict_df[department] = pd.DataFrame({
        "Автор": [employee for employee, _ in rows],
        "Title": [article.title for _, article in rows],
        "Authors": [article.authors for _, article in rows],
        "Source Title": [article.source for _, article in rows],
        "Document Type": [article.document_type for _, article in rows],
        "Publication Year": [article.publication_year for _, article in rows],
        "Addresses": [article.addresses for _, article in rows],
      })
    return result_dict_df


class DataDistributionScopus(DataDistribution):
  """
  Distribution of publications for a table with the Scopus column layout
  ("Название", "Авторы", "Название источника", "Тип документа", "Год",
  "Организации").
  """

  def get_articles_list(self):
    """
    Convert every row of ``data_df`` into an ``Article``.

    :return: list of ``Article`` objects in row order
    """
    columns = (
      self.data_df["Название"].to_list(),
      self.data_df["Авторы"].to_list(),
      self.data_df["Название источника"].to_list(),
      self.data_df["Тип документа"].to_list(),
      self.data_df["Год"].to_list(),
      self.data_df["Организации"].to_list(),
    )
    return [
      Article(
        title=title,
        authors=authors,
        source=source,
        document_type=document_type,
        publication_year=publication_year,
        addresses=addresses,
      )
      for title, authors, source, document_type, publication_year, addresses in zip(*columns)
    ]

  def count_data(self):
    """
    Build one table of matched publications per department.

    Each author of each article (split on ", ") is looked up with
    ``find_in_dictionary``; every hit that belongs to a department adds
    one row to that department's table, so an article appears once per
    matching author.

    :return: {department: DataFrame with the columns "Автор", "Title",
             "Authors", "Source Title", "Document Type",
             "Publication Year", "Addresses"}, in the order of
             ``get_employees_dict``
    """
    employees_dict = self.get_employees_dict()

    # Resolving an author does not depend on the department, so do it
    # once per author here instead of once per department.
    matched = []
    for article in self.get_articles_list():
      # A missing authors cell is not text and cannot be split.
      if not isinstance(article.authors, str):
        continue
      for author in article.get_author_scopus():
        employee = self.find_in_dictionary(author)
        if employee:
          matched.append((employee, article))

    result_dict_df = {}
    for department, members in employees_dict.items():
      member_set = frozenset(members)
      rows = [(employee, article) for employee, article in matched if employee in member_set]
      result_dict_df[department] = pd.DataFrame({
        "Автор": [employee for employee, _ in rows],
        "Title": [article.title for _, article in rows],
        "Authors": [article.authors for _, article in rows],
        "Source Title": [article.source for _, article in rows],
        "Document Type": [article.document_type for _, article in rows],
        "Publication Year": [article.publication_year for _, article in rows],
        "Addresses": [article.addresses for _, article in rows],
      })
    return result_dict_df

In [13]:
# Load the two publication exports, the author-name dictionary and the
# employees table from the working directory.
data_df_wos = pd.read_excel("wos2021.xls")
data_df_scopus = pd.read_excel("Scopus2021.xlsx")
dictionary_df = pd.read_excel("dictionary.xlsx")
employees_df = pd.read_excel("Сотрудники.xls")

# Build the per-department tables for each export.
wos = DataDistributionWos(dictionary_df, employees_df, data_df_wos)
wos_dict = wos.count_data()
scopus = DataDistributionScopus(dictionary_df, employees_df, data_df_scopus)
scopus_dict = scopus.count_data()

# Write one workbook per department; double quotes are removed from the
# department name before it is used in the file name.
for department, frame in wos_dict.items():
  file_stem = department.replace('"', '')
  frame.to_excel(f"WoS {file_stem}.xlsx", index=False)

for department, frame in scopus_dict.items():
  file_stem = department.replace('"', '')
  frame.to_excel(f"Scopus {file_stem}.xlsx", index=False)