# World Top Math Universities [2022]

## Import Modules

In [28]:
import csv
from collections import defaultdict
from dataclasses import dataclass
from itertools import islice
from statistics import mean

from normalize import alt2main, main2id

## Define Model

In [29]:
@dataclass
class Uni:
    """Represent a university together with its rank.

    The rank is numeric but not necessarily integral: a rank band is stored
    as the mean of its bounds, and the combined ranking stores the mean of a
    university's ranks across rankings.
    """
    # University name as used for matching across rankings.
    name: str
    # Rank value; whole number for exact positions, fractional for means.
    rank: float

## Load Rankings

In [30]:
def format(rank: str) -> float:
    """Convert a textual rank into a number.

    Supported forms (surrounding whitespace is ignored):

    * plain position, e.g. ``"7"`` -> ``7``
    * tied position, e.g. ``"=7"`` -> ``7``
    * open-ended position, e.g. ``"100+"`` -> ``100``
    * band written with a hyphen or an en dash, e.g. ``"101-150"`` or
      ``"101–150"`` -> ``125.5`` (mean of the bounds)

    The markers may be combined, e.g. ``"=201-300"``.

    Args:
        rank: Rank exactly as it appears in the ranking file.

    Returns:
        The rank as a number; bands yield the mean of their bounds, which
        may be fractional.

    Raises:
        ValueError: If a bound cannot be parsed as an integer.
    """
    # NOTE: the name shadows the builtin ``format``; it is kept because
    # other code in this file calls it under this name.
    text = rank.strip()
    if text.startswith("="):
        text = text[1:]
    if text.endswith("+"):
        text = text[:-1]
    # Rankings write bands with either "-" or "–"; treat both the same.
    bounds = text.replace("–", "-").split("-")
    if len(bounds) > 1:
        return mean([int(val) for val in bounds])
    return int(text)

def normalize(name: str) -> str:
    """Return the preferred spelling of a university name.

    A leading "The " is dropped first. If the remaining name is a key of
    ``alt2main`` its mapped value is returned; otherwise the (possibly
    shortened) name is returned as is.
    """
    article = "The "
    candidate = name[len(article):] if name.startswith(article) else name
    return alt2main[candidate] if candidate in alt2main else candidate

def load_ranking(path: str, delimiter: str = None, limit: int = None) -> list[Uni]:
    """Load a ranking from a delimited text file.

    The first row is treated as a header and skipped. Every following row
    must consist of exactly two columns: university name and rank.

    Args:
        path: Location of the ranking file.
        delimiter: One-character column separator. ``None`` selects the
            comma, the csv module's own default.
        limit: Maximum number of data rows to read. ``None`` reads all rows.

    Returns:
        One ``Uni`` per data row, in file order, with the name normalized
        and the rank converted to a number.

    Raises:
        ValueError: If a row does not have exactly two columns or a rank
            cannot be parsed.
    """
    # csv.reader rejects delimiter=None, so map the default to a comma.
    if delimiter is None:
        delimiter = ","
    # newline="" is what the csv module expects for files it reads.
    # The explicit encoding avoids platform-dependent decoding of non-ASCII
    # names -- assumes the scraped files are UTF-8; TODO confirm.
    with open(path, newline="", encoding="utf-8") as file:
        reader = csv.reader(file, delimiter=delimiter)
        next(reader, None)  # Skip header row (tolerates an empty file)
        return [Uni(normalize(name), format(rank)) for name, rank in islice(reader, limit)]

# Load the first 100 rows of each comma-separated source ranking
ranking_the, ranking_qs, ranking_arwu = (
    load_ranking(source, ",", 100)
    for source in (
        "ranking_scraper/the_math_2022.csv",
        "ranking_scraper/qs_math_2021.csv",
        "ranking_scraper/arwu_math_2021.csv",
    )
)

## Preview Rankings

In [31]:
def preview_ranking(ranking: list[Uni], limit: int = 10) -> None:
    """Print the head and tail of a ranking followed by its length.

    Each entry is printed as ``name<TAB>rank``. When the ranking holds at
    least ``2 * limit`` entries, the first ``limit`` and the last ``limit``
    entries are printed with a ``...`` line between them. Shorter rankings
    are printed in full, so no entry is shown twice and no index goes out
    of range.

    Args:
        ranking: Entries to preview, in the order they should be shown.
        limit: Number of entries to show at each end; negative values are
            treated as 0.
    """
    def print_rows(unis) -> None:
        for uni in unis:
            print(f"{uni.name}\t{uni.rank}")

    limit = max(limit, 0)
    total = len(ranking)
    if total >= 2 * limit:
        # Print first universities in ranking
        print_rows(ranking[:limit])
        print("...")
        # Print last universities in ranking; an explicit start index is
        # used because ranking[-0:] would select the whole list.
        print_rows(ranking[total - limit:])
    else:
        # Head and tail would overlap, so show everything exactly once.
        print_rows(ranking)
    print(
        f"-----------------------------\n"
        f"Length: {total}"
    )

In [32]:
preview_ranking(ranking_the)

University of California, Berkeley	1
California Institute of Technology (Caltech)	2
Princeton University	2
Massachusetts Institute of Technology (MIT)	4
Stanford University	4
Harvard University	6
University of Cambridge	7
University of Oxford	8
ETH Zurich	9
Yale University	10
...
University of Zurich	90
Sapienza University of Rome	92
Free University of Berlin	93
King's College London	94
University of Bern	95
Lomonosov Moscow State University	95
Université Grenoble Alpes	97
University of Copenhagen	98
Durham University	99
KU Leuven	100
-----------------------------
Length: 100


In [33]:
preview_ranking(ranking_qs)

Massachusetts Institute of Technology (MIT)	1
Stanford University	2
Harvard University	3
University of Cambridge	4
University of Oxford	5
University of California, Berkeley	6
Princeton University	7
ETH Zurich	8
National University of Singapore (NUS)	9
University of California, Los Angeles (UCLA)	10
...
Pennsylvania State University	90
Moscow Institute of Physics and Technology (MIPT)	92
RWTH Aachen University	92
University of Queensland	92
HSE University	95
University of Bonn	96
Universidade de São Paulo	96
Ohio State University	98
University of Bath	99
University of Copenhagen	100
-----------------------------
Length: 100


In [34]:
preview_ranking(ranking_arwu)

Paris-Saclay University	1
Princeton University	2
Sorbonne University	3
University of Cambridge	4
University of Oxford	5
Stanford University	6
Massachusetts Institute of Technology (MIT)	7
New York University (NYU)	8
ETH Zurich	9
University of Texas at Austin	10
...
University of Florida	88
University of Granada	88
University of Pavia	88
University of Rennes 1	88
University of Roma - Tor Vergata	88
University of Virginia	88
University of Warsaw	88
University Paris Est Creteil	88
Wuhan University	88
Autonomous University of Madrid	125.5
-----------------------------
Length: 100


## Clean Rankings

Sometimes universities have different names in different rankings. For example, one ranking might use universities' full names while another might prefer acronyms. This might cause duplicates down the line.

To avoid this issue, let's save all university names sorted alphabetically and try to spot duplicates. They typically appear next to each other, since they differ only slightly. Once we spot a duplicate, we pick the preferred name and add it, together with its alternative names, to `normalize.py`. We then re-run the notebook and repeat this process until no duplicates are left.

In [35]:
# Distinct university names per ranking
unis_qs = {uni.name for uni in ranking_qs}
unis_the = {uni.name for uni in ranking_the}
unis_arwu = {uni.name for uni in ranking_arwu}

# Every name that occurs in at least one ranking
unis_all = unis_qs | unis_the | unis_arwu

# Dump the names alphabetically, one per line, followed by their count.
# The encoding is explicit because names may contain non-ASCII characters
# and the platform default encoding is not guaranteed to be UTF-8.
with open("data/math-2022/universities_sorted.tsv", "w", encoding="utf-8") as file:
    for uni in sorted(unis_all):
        file.write(f"{uni}\n")
    file.write(f"Length: {len(unis_all)}")

## Combine Rankings

In [36]:
# Source rankings to be combined
rankings = [ranking_qs, ranking_the, ranking_arwu]

# Map each university name to the list of ranks it received
uni_ranks = {}
for ranking in rankings:
    for uni in ranking:
        uni_ranks.setdefault(uni.name, []).append(uni.rank)

In [37]:
# Fill missing ranks: every university needs one rank per source ranking.
# A university absent from a ranking is given 101, the best possible rank
# when not in the top 100.
for ranks in uni_ranks.values():
    # Pad up to the number of rankings instead of a hard-coded count, so
    # adding or removing a source ranking needs no change here.
    ranks.extend([101] * (len(rankings) - len(ranks)))

In [38]:
# Combined ranking: each university's rank is the mean of its collected ranks
ranking_all = [
    Uni(name=uni_name, rank=mean(collected))
    for uni_name, collected in uni_ranks.items()
]

## Save Final Ranking

In [39]:
def save_ranking(
    ranking: list[Uni],
    limit: int = None,
    path: str = "data/math-2022/ranking_global_math_2022.tsv",
) -> None:
    """Save a ranking to a tab-separated file and print the same rows.

    Entries are sorted by ascending rank and written as
    ``position<TAB>name<TAB>rank``, with the rank shown to two decimals.

    Args:
        ranking: Entries to save; the list itself is not modified.
        limit: Stop after this many rows. ``None`` writes every entry.
        path: Output file; it is overwritten if it exists.
    """
    # Sort the argument rather than the module-level ``ranking_all`` so the
    # function saves whatever ranking the caller passes in.
    ranking_sorted = sorted(ranking, key=lambda uni: uni.rank)
    # Explicit encoding: names may contain non-ASCII characters.
    with open(path, "w", encoding="utf-8") as file:
        for i, uni in enumerate(ranking_sorted, start=1):
            row = f"{i}\t{uni.name}\t{uni.rank:.2f}"
            file.write(f"{row}\n")
            print(row)  # Preview
            if i == limit:
                break

# Write and preview the ten best-ranked universities of the combined ranking
save_ranking(ranking_all, limit=10)

1	Princeton University	3.67
2	Massachusetts Institute of Technology (MIT)	4.00
3	Stanford University	4.00
4	University of Cambridge	5.00
5	University of Oxford	6.00
6	University of California, Berkeley	6.33
7	Harvard University	8.33
8	ETH Zurich	8.67
9	University of California, Los Angeles (UCLA)	11.67
10	Imperial College London	13.67


## Get IDs (Helper)

In [40]:
def get_ids(ranking: list[Uni], limit: int = None) -> list[int]:
    """Return the ids of the best-ranked universities.

    The ranking is sorted by ascending rank and the first ``limit`` entries
    are looked up in ``main2id``. Names without an entry are skipped and
    reported on stdout, so the result may be shorter than ``limit``.

    Args:
        ranking: Entries to look up; the list itself is not modified.
        limit: Number of top entries to consider. ``None`` considers all.

    Returns:
        The ``main2id`` values of the considered universities, best rank
        first.
    """
    # Sort the argument rather than the module-level ``ranking_all`` so the
    # function works on whatever ranking the caller passes in.
    ranking_sorted = sorted(ranking, key=lambda uni: uni.rank)
    ranking_sorted_ids = []
    unis_without_id = []
    for uni in ranking_sorted[:limit]:
        if uni.name in main2id:
            ranking_sorted_ids.append(main2id[uni.name])
        else:
            unis_without_id.append(uni.name)
    # Report the gaps so the missing ids can be added to main2id.
    print("Unis without id:", unis_without_id)
    return ranking_sorted_ids

# Look up ids for the 60 best-ranked universities and print them
top_60_math = get_ids(ranking_all, limit=60)
print("$top60math = ", top_60_math)

Unis without id: ['University of Bonn', 'Humboldt University of Berlin']
$top60math =  [6, 39, 1, 978, 1092, 3, 38, 288, 692, 1043, 888, 119, 33, 11, 8, 159, 2, 86, 483, 355, 72, 16, 99, 12, 168, 56, 4, 17, 14, 69, 322, 15, 794, 384, 266, 225, 49, 62, 187, 74, 29, 22, 1248, 170, 5, 40, 976, 930, 268, 292, 31, 405, 7, 9, 24, 227, 238, 48]
