# World Top CS Universities [2021]

## Import Modules

In [14]:
import csv
from collections import defaultdict
from dataclasses import dataclass
from itertools import islice
from statistics import mean

from normalize import alt2main, main2id

## Define Model

In [15]:
@dataclass
class Uni:
    """Represent a university together with its position in a ranking.

    ``name`` is the university name as used throughout the notebook.
    ``rank`` is the numeric rank. It is annotated as ``float`` because band
    ranks are stored as their midpoint and the combined ranking stores the
    mean of several ranks, so the value is not always a whole number.
    """
    name: str
    rank: float

## Load Rankings

In [16]:
def format(rank: str) -> float:
    """Convert a textual rank from a ranking table into a number.

    Supported notations (they may also be combined, e.g. ``"=51-60"``):

    * ``"7"``       -> ``7``      plain rank
    * ``"=4"``      -> ``4``      tied rank
    * ``"601+"``    -> ``601``    open-ended band, lower bound is used
    * ``"101–150"`` -> ``125.5``  band written with an en dash or a hyphen,
      the mean of the bounds is used

    NOTE: the name shadows the builtin ``format``; it is kept because other
    cells call this function by that name.

    Args:
        rank: Rank exactly as it appears in the source file.

    Returns:
        An ``int`` for single ranks, or the mean of the bounds for a band
        (which may be a ``float``).

    Raises:
        ValueError: If a part of the rank is not an integer.
    """
    # Drop the decorations first so they can be combined with a band.
    text = rank.strip().removeprefix("=").removesuffix("+")
    # Treat the en dash and the ASCII hyphen as the same band separator.
    bounds = text.replace("–", "-").split("-")
    if len(bounds) > 1:
        return mean([int(bound) for bound in bounds])
    return int(text)

def normalize(name: str) -> str:
    """Return the preferred spelling of a university name.

    A leading ``"The "`` is removed, then the result is looked up in
    ``alt2main``; when it has an entry there the mapped name is returned,
    otherwise the stripped name itself.
    """
    stripped = name.removeprefix("The ")
    return alt2main[stripped] if stripped in alt2main else stripped

def load_ranking(path: str, delimiter: str = None, limit: int = None) -> list[Uni]:
    """Load a ranking from a two-column (name, rank) delimited text file.

    The first row is treated as a header and skipped. Every following row is
    turned into a ``Uni`` whose name went through ``normalize`` and whose
    rank went through ``format``.

    Args:
        path: Location of the file to read.
        delimiter: Single-character column separator. ``None`` (the default)
            means a comma; ``csv.reader`` itself rejects ``None``.
        limit: Maximum number of data rows to load; ``None`` loads all rows.

    Returns:
        The universities in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a row does not have exactly two columns or its rank
            cannot be parsed.
    """
    if delimiter is None:
        delimiter = ","
    # newline="" is what the csv module expects; UTF-8 keeps names such as
    # "Université" readable regardless of the platform default encoding.
    with open(path, encoding="utf-8", newline="") as file:
        reader = csv.reader(file, delimiter=delimiter)
        next(reader, None)  # Skip header row (an empty file yields no rows)
        return [Uni(normalize(name), format(rank)) for name, rank in islice(reader, limit)]

# Load the first 100 rows of each comma-separated ranking file
ranking_the = load_ranking("ranking_scraper/the_cs_2022.csv", delimiter=",", limit=100)
ranking_qs = load_ranking("ranking_scraper/qs_cs_2021.csv", delimiter=",", limit=100)
ranking_arwu = load_ranking("ranking_scraper/arwu_cs_2021.csv", delimiter=",", limit=100)

## Preview Rankings

In [17]:
def preview_ranking(ranking: list[Uni], limit: int = 10) -> None:
    """Print the head and tail of a ranking, followed by its length.

    Each row is printed as ``name<TAB>rank``. When the ranking has more than
    ``2 * limit`` entries, the first ``limit`` and the last ``limit`` rows
    are printed with a ``...`` line between them. Shorter rankings are
    printed in full, once, without the ``...`` line, instead of failing with
    ``IndexError`` or repeating rows.

    Args:
        ranking: Universities to preview, in the order they should appear.
        limit: Number of rows to show at each end; negative values are
            treated as 0.
    """
    limit = max(limit, 0)
    total = len(ranking)
    if total <= 2 * limit:
        # Head and tail would overlap (or run out of rows): show everything.
        for uni in ranking:
            print(f"{uni.name}\t{uni.rank}")
    else:
        # Print first universities in ranking
        for uni in ranking[:limit]:
            print(f"{uni.name}\t{uni.rank}")
        print("...")
        # Print last universities in ranking
        # (total - limit rather than -limit, so that limit == 0 shows none)
        for uni in ranking[total - limit:]:
            print(f"{uni.name}\t{uni.rank}")
    print(
        f"-----------------------------\n"
        f"Length: {len(ranking)}"
    )

In [18]:
# Preview the ranking loaded from the_cs_2022.csv (default: 10 rows per end)
preview_ranking(ranking_the)

University of Oxford	1
Stanford University	2
Massachusetts Institute of Technology (MIT)	3
University of Cambridge	4
ETH Zurich	4
Carnegie Mellon University	6
Harvard University	7
University of California, Berkeley	8
National University of Singapore (NUS)	8
Princeton University	10
...
Tokyo Institute of Technology (Tokyo Tech)	88
University of Southampton	91
University of Birmingham	92
University of Luxembourg	93
Monash University	93
University of Zurich	93
Vrije Universiteit Amsterdam	96
Technion Israel Institute of Technology	97
TU Wien	97
Queen Mary University of London	99
-----------------------------
Length: 100


In [19]:
# Preview the ranking loaded from qs_cs_2021.csv (default: 10 rows per end)
preview_ranking(ranking_qs)

Massachusetts Institute of Technology (MIT)	1
Stanford University	2
Carnegie Mellon University	3
National University of Singapore (NUS)	4
University of California, Berkeley	5
University of Oxford	6
Harvard University	7
University of Cambridge	8
Swiss Federal Institute of Technology Lausanne (EPFL)	9
ETH Zurich	10
...
University of Wisconsin–Madison	89
Boston University	92
Karlsruhe Institute of Technology (KIT)	92
Universidad Nacional Autónoma de México (UNAM)	94
University of Massachusetts (UMass)	95
Aalto University	96
University of Warwick	96
Université catholique de Louvain (UCL)	96
Aarhus University	99
University of Adelaide	99
-----------------------------
Length: 100


In [20]:
# Preview the ranking loaded from arwu_cs_2021.csv (default: 10 rows per end)
preview_ranking(ranking_arwu)

Massachusetts Institute of Technology (MIT)	1
Stanford University	2
University of California, Berkeley	3
Carnegie Mellon University	4
ETH Zurich	5
Tsinghua University	6
Harvard University	7
Nanyang Technological University (NTU)	8
University of Oxford	9
University of California, Los Angeles (UCLA)	10
...
Sichuan University	88
Swinburne University of Technology	88
Technical University of Denmark	88
University of Hong Kong	88
University of Melbourne	88
Tongji University	88
Université Grenoble Alpes	88
University of California, Merced	88
University of Macau	88
University of Pennsylvania	88
-----------------------------
Length: 100


## Clean Rankings

Sometimes universities have different names in different rankings. For example, one ranking might use universities' full names while another might prefer acronyms. This might cause duplicates down the line.

To avoid this issue, let's save all university names sorted alphabetically and look for duplicates. They typically appear next to each other, since their names differ only slightly. Once we spot a duplicate, we pick the preferred name and add it, together with its alternative names, to `normalize.py`; we repeat this process until no duplicates are left.

In [21]:
# Distinct (already normalized) university names per ranking
unis_qs = {uni.name for uni in ranking_qs}
unis_the = {uni.name for uni in ranking_the}
unis_arwu = {uni.name for uni in ranking_arwu}

# Every name that occurs in at least one ranking
unis_all = unis_qs | unis_the | unis_arwu

# Write the names in alphabetical order, one per line, followed by a count
# line. UTF-8 is requested explicitly because the names contain non-ASCII
# characters and the platform default encoding may not be able to encode them.
with open("data/cs-2022/universities_sorted.tsv", "w", encoding="utf-8") as file:
    for uni in sorted(unis_all):
        file.write(f"{uni}\n")
    file.write(f"Length: {len(unis_all)}")

## Combine Rankings

In [22]:
rankings = [ranking_qs, ranking_the, ranking_arwu]

# Map each university name to the list of ranks it received, in the order
# of `rankings`; a university absent from a ranking contributes nothing here.
uni_ranks = {}
for ranking in rankings:
    for uni in ranking:
        uni_ranks.setdefault(uni.name, []).append(uni.rank)

In [23]:
# Fill missing ranks: pad every list to one entry per ranking, so the padding
# stays correct if a ranking is added to or removed from `rankings`.
for ranks in uni_ranks.values():
    missing = len(rankings) - len(ranks)
    ranks.extend([101] * missing)  # Best possible rank when not in top-100

In [24]:
# Combined ranking: one Uni per name, ranked by the mean of its collected ranks
ranking_all = [
    Uni(uni_name, mean(rank_values))
    for uni_name, rank_values in uni_ranks.items()
]

## Save Final Ranking

In [25]:
def save_ranking(ranking: list[Uni], limit: int = None) -> None:
    """Save ranking to file and preview it.

    The universities are sorted by ascending ``rank`` and written to
    ``data/cs-2022/ranking_global_cs_2022.tsv`` as
    ``position<TAB>name<TAB>rank`` with the rank shown to two decimals.
    Every written row is also printed.

    Args:
        ranking: Universities to save. This argument is what gets sorted and
            written (earlier the global ``ranking_all`` was used regardless
            of what was passed in).
        limit: Number of top rows to write; ``None`` writes every row. It is
            applied as a slice, the same way ``get_ids`` applies its limit.

    Raises:
        OSError: If the output file cannot be opened for writing.
    """
    ranking_sorted = sorted(ranking, key=lambda uni: uni.rank)
    # UTF-8 is requested explicitly because names contain non-ASCII characters.
    with open("data/cs-2022/ranking_global_cs_2022.tsv", "w", encoding="utf-8") as file:
        for i, uni in enumerate(ranking_sorted[:limit], start=1):
            row = f"{i}\t{uni.name}\t{uni.rank:.2f}"
            file.write(f"{row}\n")
            print(row)  # Preview

# Write and print the 60 best rows of the combined ranking
save_ranking(ranking_all, 60)

1	Massachusetts Institute of Technology (MIT)	1.67
2	Stanford University	2.00
3	Carnegie Mellon University	4.33
4	University of California, Berkeley	5.33
5	University of Oxford	5.33
6	ETH Zurich	6.33
7	Harvard University	7.00
8	National University of Singapore (NUS)	9.00
9	Tsinghua University	10.33
10	Nanyang Technological University (NTU)	11.67
11	Princeton University	12.67
12	University of California, Los Angeles (UCLA)	12.67
13	University of Toronto	16.00
14	Cornell University	18.00
15	Peking University	19.67
16	University of Cambridge	20.67
17	Imperial College London	22.00
18	University of Washington	22.33
19	Shanghai Jiao Tong University	25.00
20	University College London (UCL)	26.00
21	University of Edinburgh	26.00
22	Columbia University	28.00
23	New York University (NYU)	28.33
24	Georgia Institute of Technology (Georgia Tech)	29.67
25	Swiss Federal Institute of Technology Lausanne (EPFL)	30.33
26	University of Texas at Austin	30.67
27	Chinese University of Hong Kong (CUHK)	31.33

## Get IDs (Helper)

In [26]:
def get_ids(ranking: list[Uni], limit: int = None) -> list[int]:
    """Return the ``main2id`` ids of the best-ranked universities.

    The universities are sorted by ascending ``rank`` and the first ``limit``
    of them are looked up in ``main2id``. Universities without an entry are
    left out of the result and reported on stdout, so the result can be
    shorter than ``limit``.

    Args:
        ranking: Universities to look up. This argument is what gets sorted
            (earlier the global ``ranking_all`` was used regardless of what
            was passed in).
        limit: Number of top universities to consider; ``None`` considers
            all of them.

    Returns:
        The ids, in ranking order, of the considered universities that have
        an entry in ``main2id``.
    """
    ranking_sorted = sorted(ranking, key=lambda uni: uni.rank)
    ranking_sorted_ids = []
    unis_without_id = []
    for uni in ranking_sorted[:limit]:
        if uni.name in main2id:
            ranking_sorted_ids.append(main2id[uni.name])
        else:
            unis_without_id.append(uni.name)
    print("Unis without id:", unis_without_id)
    return ranking_sorted_ids

# Look up ids for the 60 best-ranked universities, then keep the first 50 ids.
# NOTE(review): the 10 extra candidates presumably compensate for universities
# that get_ids skips because they have no entry in main2id — confirm.
top_50_cs = get_ids(ranking_all, 60)[:50]
print("$top50cs = ", top_50_cs)

Unis without id: ['University of Southern California']
$top50cs =  [39, 1, 40, 3, 1092, 288, 38, 72, 168, 170, 6, 692, 8, 225, 159, 978, 1043, 15, 238, 384, 14, 33, 483, 12, 16, 86, 48, 2, 31, 2027, 74, 17, 2444, 11, 4, 1458, 49, 5, 813, 399, 930, 977, 99, 22, 69, 888, 292, 405, 56, 70]
