<a href="https://colab.research.google.com/github/nerudxlf/scival-cluster-analysis/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [7]:
class Counter:
  """Summary statistics for one group of topic clusters.

  Holds parallel lists describing the clusters of a group and exposes
  per-cluster averages. Every average is divided by the number of cluster
  names (not by the length of the list being summed), so a single-element
  ``scholarly_output`` override is still averaged over all clusters.

  NOTE: this class is unrelated to ``collections.Counter``; the name is kept
  for backward compatibility with existing callers.
  """

  def __init__(
      self,
      cluster_names,
      cluster_fwci,
      prominence_percentile,
      cluster_proportion,
      scholarly_output,
  ):
    """Store the per-cluster lists.

    Args:
      cluster_names: names of the clusters in the group.
      cluster_fwci: FWCI value of each cluster.
      prominence_percentile: prominence percentile of each cluster.
      cluster_proportion: share of each cluster (stored, not aggregated here).
      scholarly_output: output count of each cluster, or a one-element list
        holding an overriding total.
    """
    self.cluster_names = cluster_names
    self.cluster_fwci = cluster_fwci
    self.prominence_percentile = prominence_percentile
    self.cluster_proportion = cluster_proportion
    self.scholarly_output = scholarly_output

  def _average_per_cluster(self, values):
    """Return ``sum(values)`` divided by the cluster count, or 0.0 if empty."""
    cluster_count = len(self.cluster_names)
    if cluster_count == 0:
      # An empty group has no meaningful average; report 0.0 instead of
      # raising ZeroDivisionError so the summary can still be printed.
      return 0.0
    return sum(values) / cluster_count

  def get_average_value_fwci(self):
    """Return the mean FWCI per cluster (0.0 for an empty group)."""
    return self._average_per_cluster(self.cluster_fwci)

  def get_value_clusters(self):
    """Return the number of clusters in the group."""
    return len(self.cluster_names)

  def get_average_prominence_percentile(self):
    """Return the mean prominence percentile per cluster (0.0 if empty)."""
    return self._average_per_cluster(self.prominence_percentile)

  def get_average_scholarly_output(self):
    """Return the scholarly output per cluster (0.0 for an empty group)."""
    return self._average_per_cluster(self.scholarly_output)

  def get_cluster_names(self):
    """Return the cluster names joined by newlines.

    Names are converted with ``str`` so that non-string entries do not make
    ``str.join`` raise ``TypeError``.
    """
    return "\n".join(map(str, self.cluster_names))

  def __str__(self):
    """Return the multi-line text summary of the group."""
    return f"Average FWCI: {self.get_average_value_fwci()}\n" \
           f"Average Scholarly Output: {self.get_average_scholarly_output()}\n" \
           f"Total: {self.get_value_clusters()}\n" \
           f"Prominence percentile: {self.get_average_prominence_percentile()}\n"


class ClusterAnalysis:
  """Split the topic clusters of a data frame into four quadrants.

  Each cluster is classified by two strict comparisons: its FWCI against a
  caller-supplied average, and its share of the total scholarly output
  against a minimum proportion:

    * D: FWCI above the average, share above the cut-off
    * E: FWCI below the average, share above the cut-off
    * G: FWCI above the average, share below the cut-off
    * F: FWCI below the average, share below the cut-off

  Because both comparisons are strict, a cluster whose FWCI equals the
  average or whose share equals the cut-off is not placed in any quadrant.
  """

  # Default share cut-off separating "large" from "small" clusters.
  DEFAULT_MIN_PROPORTION = 0.005

  def __init__(self, scopus_df, fwci):
    """Read the required columns of ``scopus_df`` into plain lists.

    Args:
      scopus_df: data frame providing the columns 'Topic Cluster',
        'Topic Cluster Number', 'Scholarly Output', 'Publication share (%)',
        'Publication Share growth (%)', 'Field-Weighted Citation Impact' and
        'Prominence percentile'.
      fwci: reference FWCI value; stored as ``fwci_university_value`` and
        not used by the methods of this class.
    """
    self.topic_cluster = scopus_df['Topic Cluster'].to_list()
    self.topic_cluster_number = scopus_df['Topic Cluster Number'].to_list()
    self.scholarly_output = list(map(int, scopus_df['Scholarly Output'].to_list()))
    self.publication_share = scopus_df['Publication share (%)'].to_list()
    self.publication_share_growth = scopus_df['Publication Share growth (%)'].to_list()
    self.fwci = list(map(float, scopus_df['Field-Weighted Citation Impact'].to_list()))
    self.prominence_percentile = list(map(float, scopus_df['Prominence percentile'].to_list()))
    self.fwci_university_value = fwci

  def get_average_fwci(self):
    """Return the FWCI averaged over clusters, weighted by scholarly output."""
    return sum(np.array(self.fwci) * np.array(self.scholarly_output)) / sum(self.scholarly_output)

  def get_proportion(self):
    """Return each cluster's share of the total scholarly output as a list."""
    return list(np.array(self.scholarly_output) / sum(self.scholarly_output))

  def _build_quadrant(self, fwci_average, proportion_list, value_so,
                      min_proportion, high_fwci, high_share) -> "Counter":
    """Collect the clusters of one quadrant into a ``Counter``.

    Args:
      fwci_average: FWCI cut-off.
      proportion_list: share of each cluster, aligned with ``self.fwci``.
      value_so: if non-zero, replaces the collected scholarly outputs with
        the single-element list ``[value_so]``.
      min_proportion: share cut-off.
      high_fwci: select FWCI strictly above (True) or below (False) the cut-off.
      high_share: select share strictly above (True) or below (False) the cut-off.
    """
    def belongs(fwci, share):
      fwci_ok = fwci > fwci_average if high_fwci else fwci < fwci_average
      share_ok = share > min_proportion if high_share else share < min_proportion
      return fwci_ok and share_ok

    # Indices of the clusters that fall inside the requested quadrant.
    picked = [
        index
        for index, (fwci, share) in enumerate(zip(self.fwci, proportion_list))
        if belongs(fwci, share)
    ]
    cluster_names = [self.topic_cluster[i] for i in picked]
    cluster_fwci = [self.fwci[i] for i in picked]
    prominence_percentile = [self.prominence_percentile[i] for i in picked]
    cluster_proportion = [proportion_list[i] for i in picked]
    scholarly_output = [self.scholarly_output[i] for i in picked]
    if value_so != 0:
      # A caller-supplied total overrides the per-cluster outputs.
      scholarly_output = [value_so]
    return Counter(cluster_names, cluster_fwci, prominence_percentile, cluster_proportion, scholarly_output)

  def get_d(self, fwci_average, proportion_list, value_so=0,
            min_proportion=DEFAULT_MIN_PROPORTION) -> "Counter":
    """Return quadrant D: FWCI above average and share above the cut-off."""
    return self._build_quadrant(fwci_average, proportion_list, value_so,
                                min_proportion, high_fwci=True, high_share=True)

  def get_e(self, fwci_average, proportion_list, value_so=0,
            min_proportion=DEFAULT_MIN_PROPORTION) -> "Counter":
    """Return quadrant E: FWCI below average and share above the cut-off."""
    return self._build_quadrant(fwci_average, proportion_list, value_so,
                                min_proportion, high_fwci=False, high_share=True)

  def get_g(self, fwci_average, proportion_list, value_so=0,
            min_proportion=DEFAULT_MIN_PROPORTION) -> "Counter":
    """Return quadrant G: FWCI above average and share below the cut-off."""
    return self._build_quadrant(fwci_average, proportion_list, value_so,
                                min_proportion, high_fwci=True, high_share=False)

  def get_f(self, fwci_average, proportion_list, value_so=0,
            min_proportion=DEFAULT_MIN_PROPORTION) -> "Counter":
    """Return quadrant F: FWCI below average and share below the cut-off."""
    return self._build_quadrant(fwci_average, proportion_list, value_so,
                                min_proportion, high_fwci=False, high_share=False)

In [None]:
# Load the export and set the run parameters.
df = pd.read_excel("scopus.xlsx")
fwci: float = 1
value_publication: int = 0  # 0 keeps the per-cluster scholarly output

# Cut-offs shared by all four quadrants.
cl = ClusterAnalysis(df, fwci)
avr_fwci = cl.get_average_fwci()
proportion_list = cl.get_proportion()

# Build one Counter per quadrant.
counter_d: Counter = cl.get_d(avr_fwci, proportion_list, value_publication)
counter_e: Counter = cl.get_e(avr_fwci, proportion_list, value_publication)
counter_g: Counter = cl.get_g(avr_fwci, proportion_list, value_publication)
counter_f: Counter = cl.get_f(avr_fwci, proportion_list, value_publication)

quadrants = (
    ("D", counter_d),
    ("E", counter_e),
    ("G", counter_g),
    ("F", counter_f),
)

# First the labelled summaries of every quadrant ...
for label, quadrant in quadrants:
  print(f"{label}\n{quadrant}")

# ... then the cluster names of every quadrant, in the same order.
for _, quadrant in quadrants:
  print(quadrant.get_cluster_names(), "\n\n\n")