<a href="https://colab.research.google.com/github/nerudxlf/translit_authors_omstu/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install transliterate

In [4]:
import pandas as pd
from google.colab import files
from pandas import DataFrame
from transliterate.decorators import transliterate_function

files.upload()

{}

In [7]:
class UpdateDictionary:
    def __init__(self, dictionary: str):
        self.__dictionary = pd.read_excel(dictionary)

    @staticmethod
    def __spelling_options(fio: str) -> str:
        dictionary_letter = {
            "ja": "ya", "ya": "ja", "w": "v", "v": "w", "ts": "tz", "tz": "ts", "h": "kh", "zh": "j", "ij": "ii",
            "ju": "yu", "sch": "shch",
        }
        dictionary_symbol = {
            ".": "., ", '’': "'",
        }
        return_string = ""
        for key, item in dictionary_letter.items():
            get_fio_list = fio.split()
            if get_fio_list[0].find(key) != -1:
                return_string += f"{get_fio_list[0].replace(key, item)} {get_fio_list[1]}"
        for key, item in dictionary_symbol.items():
            if fio.find(key) != -1:
                return_string += f"{fio.replace(key, item, 1)}"
        return fio + return_string

    @staticmethod
    @transliterate_function(language_code='ru', reversed=True)
    def __translit_scopus(text):
        surname, name, *patronymic = text.split()
        try:
            patronymic = f"{patronymic[0][0]}."
        except IndexError:
            patronymic = ""
        return f"{surname} {name[0]}.{patronymic};".lower()

    @staticmethod
    @transliterate_function(language_code='ru', reversed=True)
    def __translit_wos(text):
        surname, name, *patronymic = text.split()
        try:
            patronymic = patronymic[0][0]
        except IndexError:
            patronymic = ""
        return f"{surname}, {name[0]}{patronymic};".lower()

    def add_translit_names(self) -> DataFrame:
        names_list = self.__dictionary["Сотрудник"].to_list()
        keys_list = self.__dictionary["names"].to_list()
        for i in range(len(names_list)):
            translit_name_scopus = self.__translit_scopus(names_list[i])
            translit_name_wos = self.__translit_wos(names_list[i])
            if isinstance(keys_list[i], float):
                keys_list[i] = self.__spelling_options(translit_name_scopus) + self.__spelling_options(
                    translit_name_wos)
                continue
            if keys_list[i].find(translit_name_scopus) == -1:
                keys_list[i] += self.__spelling_options(translit_name_scopus)
            if keys_list[i].find(translit_name_wos) == -1:
                keys_list[i] += self.__spelling_options(translit_name_wos)
        self.__dictionary["names"] = keys_list
        return self.__dictionary

    def to_excel(self, path: str):
        self.__dictionary.to_excel(path, index=False)


class ErrorValue:
    _total_authors = 0
    __error_list = []

    def __init__(self, path_to_table: str, path_to_dictionary: str):
        self.table_df = pd.read_excel(path_to_table)
        self.dictionary_names = pd.read_excel(path_to_dictionary)["names"].to_list()

    def get_authors(self) -> list:
        pass

    def get_error_list(self) -> list:
        self.__error_list = []
        authors_list = self.get_authors()
        dictionary_string = "".join(self.dictionary_names)
        for author in authors_list:
            if dictionary_string.find(author) == -1:
                self.__error_list.append(author)
        return self.__error_list

    def get_error_value(self) -> float:
        return round(len(self.__error_list) / self._total_authors * 100, 2)


class ErrorWos(ErrorValue):
    def get_authors(self) -> list:
        self._total_authors = 0
        result_list = []
        authors_list = self.table_df["Authors"].to_list()
        authors_with_affiliation = self.table_df["Addresses"].to_list()
        for i in range(len(authors_with_affiliation)):
            authors_list_split = authors_list[i].split("; ")
            authors_with_affiliation_split = authors_with_affiliation[i].split("; [")
            for j in authors_with_affiliation_split:
                if j.find("Omsk State Tech Univ") != -1:
                    authors_omstu = j.split("]")[0]
                    authors_omstu_split = authors_omstu.split("; ")
                    for elem in range(len(authors_omstu_split)):
                        self._total_authors += 1
                        result_list.append(authors_list_split[elem].lower()+";")
        return list(set(result_list))


class ErrorScopus(ErrorValue):
    def get_authors(self) -> list:
        self._total_authors = 0
        result_list = []
        authors_list = self.table_df["Authors with affiliations"].to_list()
        for authors in authors_list:
            for author in authors.split("; "):
                if author.find("Omsk State Technical University") != -1:
                    self._total_authors += 1
                    item_split = author.split(", ")
                    need_author = item_split[0] + " " + item_split[1] + ";"
                    result_list.append(need_author.lower())
        return list(set(result_list))

In [8]:
dictionary_update = UpdateDictionary("dictionary.xlsx")
dictionary_df = dictionary_update.add_translit_names()
dictionary_df.to_excel("dictionary_new.xlsx", index=False)
error_scopus = ErrorScopus("scopus_2020_2021.xlsx", "dictionary_new.xlsx")
error_wos = ErrorWos("wos_2020_2021.xls", "dictionary_new.xlsx")
error_scopus_list = error_scopus.get_error_list()
error_wos_list = error_wos.get_error_list()
error_wos_value = error_wos.get_error_value()
error_scopus_value = error_scopus.get_error_value()
error_wos_df = pd.DataFrame({"wos": error_wos_list})
error_scopus_df = pd.DataFrame({"scopus": error_scopus_list})
print(f"Wos Error: {error_wos_value}%")
print(f"Scopus Error: {error_scopus_value}%")
error_wos_df.to_excel("ErrorWos.xlsx")
error_scopus_df.to_excel("ErrorScopus.xlsx")

Wos Error: 30.38%
Scopus Error: 13.11%
