# World Top CS Universities [2021]

## Import Modules

In [62]:
import csv
from collections import defaultdict
from dataclasses import dataclass
from itertools import islice
from statistics import mean

## Define Model

In [63]:
@dataclass
class Uni:
    """Represent a university together with its rank.

    ``rank`` is numeric rather than strictly integral: banded ranks are stored
    as the midpoint of the band (e.g. 625.5) and combined ranks are averages.
    """
    name: str  # University name as spelled in the source ranking
    rank: float  # Rank; may be fractional

## Load Rankings

In [64]:
def format(rank: str) -> float:
    """Convert a textual rank into a number.

    Supported forms:
    - plain number: ``"7"`` -> 7
    - tie marked with a leading "=": ``"=4"`` -> 4
    - open-ended band with a trailing "+": ``"801+"`` -> 801
    - band written with an en dash or a hyphen: ``"601–650"`` -> 625.5
      (the midpoint of the band)

    A leading "=" may be combined with the other forms (``"=601-650"``).

    Raises:
        ValueError: If what remains is not an integer or a band of integers.
    """
    # NOTE: the name shadows the built-in ``format``; kept so callers keep working.
    text = rank.strip().removeprefix("=").removesuffix("+")
    # Normalise the en dash so both band spellings share one code path.
    bounds = text.replace("–", "-").split("-")
    if len(bounds) > 1:
        return mean(int(val) for val in bounds)
    return int(text)

def load_ranking(path: str, delimiter: str = None, limit: int = None) -> list[Uni]:
    """Load a ranking from a delimited text file.

    The first row is treated as a header and skipped. Every following row must
    have exactly two columns: the university name, then rank text understood
    by ``format``.

    Args:
        path: Location of the ranking file.
        delimiter: Single-character column separator. ``None`` falls back to a
            comma, because ``csv.reader`` rejects ``None``.
        limit: Maximum number of data rows to read; ``None`` reads all rows.

    Returns:
        Universities in file order.
    """
    if delimiter is None:
        delimiter = ","
    # newline="" is what the csv module expects for correct line-ending handling;
    # the explicit encoding keeps non-ASCII names independent of the platform default.
    with open(path, newline="", encoding="utf-8") as file:
        reader = csv.reader(file, delimiter=delimiter)
        next(reader, None)  # Skip header row (an empty file yields no rows)
        return [Uni(name, format(rank)) for name, rank in islice(reader, limit)]

# Load the three source rankings; each file is comma-separated
ranking_the = load_ranking("ranking_scraper/the_cs_2022.csv", delimiter=",")
ranking_qs = load_ranking("ranking_scraper/qs_cs_2021.csv", delimiter=",")
ranking_arwu = load_ranking("ranking_scraper/arwu_cs_2021.csv", delimiter=",")

## Preview Rankings

In [65]:
def preview_ranking(ranking: list[Uni], limit: int = 10) -> None:
    """Print the first and last ``limit`` universities of a ranking.

    Each row is printed as ``name<TAB>rank``. The two groups are separated by
    a ``...`` line and followed by a footer showing the ranking's length.
    When the ranking has fewer than ``2 * limit`` entries, every entry is
    printed exactly once instead of raising ``IndexError`` or repeating rows.
    """
    limit = max(limit, 0)
    head = ranking[:limit]
    # Start the tail after the head so a short ranking never repeats a row.
    tail = ranking[max(limit, len(ranking) - limit):]

    # Print first universities in ranking
    for uni in head:
        print(f"{uni.name}\t{uni.rank}")
    print("...")
    # Print last universities in ranking
    for uni in tail:
        print(f"{uni.name}\t{uni.rank}")
    print(
        f"-----------------------------\n"
        f"Length: {len(ranking)}"
    )

In [66]:
# Show the first and last entries of the THE ranking
preview_ranking(ranking_the)

University of Oxford	1
Stanford University	2
Massachusetts Institute of Technology	3
University of Cambridge	4
ETH Zurich	4
Carnegie Mellon University	6
Harvard University	7
University of California, Berkeley	8
National University of Singapore	8
Princeton University	10
...
Tokyo University of Agriculture and Technology	801
Tokyo Denki University	801
Tokyo University of Science	801
Toyohashi University of Technology (TUT)	801
Tshwane University of Technology	801
Vilnius University	801
Wuhan University of Technology	801
Yamaguchi University	801
Yanshan University	801
Yuan Ze University	801
-----------------------------
Length: 892


In [67]:
# Show the first and last entries of the QS ranking
preview_ranking(ranking_qs)

Massachusetts Institute of Technology (MIT)	1
Stanford University	2
Carnegie Mellon University	3
National University of Singapore (NUS)	4
University of California, Berkeley (UCB)	5
University of Oxford	6
Harvard University	7
University of Cambridge	8
EPFL	9
ETH Zurich - Swiss Federal Institute of Technology	10
...
University of Jyväskylä	625.5
Université de Liège	625.5
University of Maryland, Baltimore County	625.5
University of Massachusetts Boston	625.5
University of Minho	625.5
University of New Mexico	625.5
University of Reading	625.5
University of Saskatchewan	625.5
University of Zagreb	625.5
Vietnam National University, Hanoi	625.5
-----------------------------
Length: 651


In [68]:
# Show the first and last entries of the ARWU ranking
preview_ranking(ranking_arwu)

Massachusetts Institute of Technology (MIT)	1
Stanford University	2
University of California, Berkeley	3
Carnegie Mellon University	4
ETH Zurich	5
Tsinghua University	6
Harvard University	7
Nanyang Technological University	8
University of Oxford	9
University of California, Los Angeles	10
...
University of Tennessee - Knoxville	450.5
University of the Basque Country	450.5
University of Tuebingen	450.5
University of Windsor	450.5
University of Wuerzburg	450.5
Vrije Universiteit Brussel (VUB)	450.5
Wayne State University	450.5
Zhejiang Gongshang University	450.5
Zhejiang Normal University	450.5
Zhengzhou University	450.5
-----------------------------
Length: 497


## Clean Rankings

Sometimes universities have different names in different rankings. For example, one ranking might use universities' full names while another might prefer acronyms. This might cause duplicates down the line.

To avoid this issue, let's save all university names sorted alphabetically and try to spot duplicates. They typically appear next to each other, since they differ only slightly. Once we spot a duplicate, we pick the preferred name, update our CSV ranking files, and repeat this process until no duplicates are left.

In [None]:
# Distinct university names used by each ranking
unis_qs = {uni.name for uni in ranking_qs}
unis_the = {uni.name for uni in ranking_the}
unis_arwu = {uni.name for uni in ranking_arwu}

unis_all = unis_qs | unis_the | unis_arwu

# Write the names in sorted order so near-identical spellings end up adjacent.
# Explicit UTF-8: the names contain non-ASCII characters, so the result must
# not depend on the platform's default encoding.
with open("data/cs-2021/universities_sorted.tsv", "w", encoding="utf-8") as file:
    for uni in sorted(unis_all):
        file.write(f"{uni}\n")
    file.write(f"Length: {len(unis_all)}")

## Combine Rankings

In [None]:
rankings = [ranking_qs, ranking_the, ranking_arwu]

# Group every rank a university received under its name,
# in the order the rankings are listed above.
uni_ranks = {}
for ranking in rankings:
    for uni in ranking:
        uni_ranks.setdefault(uni.name, []).append(uni.rank)

In [None]:
# Fill missing ranks so every university has one rank per ranking
# NOTE(review): 51 assumes each ranking was cut to its top 50, but the rankings
# are loaded without a limit — confirm this penalty is still intended.
MISSING_RANK = 51  # Best possible rank when not in top-50
for ranks in uni_ranks.values():
    # A non-positive difference extends by nothing, matching the old loop.
    ranks.extend([MISSING_RANK] * (len(rankings) - len(ranks)))

In [None]:
# Collapse each university's list of ranks into a single mean rank
ranking_all = [
    Uni(name=uni_name, rank=mean(uni_scores))
    for uni_name, uni_scores in uni_ranks.items()
]

## Save Final Ranking

In [None]:
def save_ranking(ranking: list[Uni], limit: int = None) -> None:
    """Save ranking to file and preview it.

    Sorts ``ranking`` by ascending rank and writes one
    ``position<TAB>name<TAB>rank`` row per university (rank with two decimals)
    to ``data/cs-2021/ranking_global_cs_2021.tsv``, printing each row as well.

    Args:
        ranking: Universities to save; the list itself is not modified.
        limit: Stop after this many rows; ``None`` writes every row.
    """
    # Sort the argument, not the module-level ``ranking_all``.
    ranking_sorted = sorted(ranking, key=lambda uni: uni.rank)
    # Explicit UTF-8 so non-ASCII names do not depend on the platform default.
    with open("data/cs-2021/ranking_global_cs_2021.tsv", "w", encoding="utf-8") as file:
        for i, uni in enumerate(ranking_sorted, start=1):
            row = f"{i}\t{uni.name}\t{uni.rank:.2f}"
            file.write(f"{row}\n")
            print(row)  # Preview
            if i == limit:
                break

# Save and preview the first 60 universities of the combined ranking
save_ranking(ranking_all, 60)