# World Top CS Universities [2021]

## Import Modules

In [41]:
import csv
import os

from collections import defaultdict
from dataclasses import dataclass
from itertools import islice
from statistics import mean

from normalize import alt2main, main2id
from ranking_scraper.ranking_scraper.settings import ranking as r

## Define Model

In [42]:
@dataclass
class Uni:
    """Represent a university together with its rank in one ranking.

    The rank is usually a whole number, but it can be fractional: rank
    ranges are replaced by the mean of their bounds when loaded, and the
    combined ranking stores the mean of a university's ranks.
    """
    # University name (normalized before the instance is created when loading)
    name: str
    # Position in the ranking; lower is better
    rank: float

## Load Rankings

In [43]:
def format(rank: str) -> float:
    """Convert a raw rank string into a number.

    Supported notations (may be combined, e.g. ``"=101-150"``):

    * ``"=12"``     -- tied rank; the leading ``=`` is dropped.
    * ``"100+"``    -- open-ended rank; the trailing ``+`` is dropped.
    * ``"101–150"`` -- range written with an en dash, em dash or hyphen;
      replaced by the mean of its bounds (may be fractional).
    * ``"7"``       -- plain rank.

    Surrounding whitespace is ignored.

    NOTE: this function shadows the builtin ``format``; the name is kept
    because other cells call it.

    Raises:
        ValueError: if the remaining text is not made of integers.
    """
    text = rank.strip()
    # Strip the markers first so that they can be combined with a range.
    if text.startswith("="):
        text = text[1:]
    if text.endswith("+"):
        text = text[:-1]
    for dash in ("–", "—", "-"):
        if dash in text:
            return mean([int(bound) for bound in text.split(dash)])
    return int(text)

def normalize(name: str) -> str:
    """Return the preferred spelling of a university name.

    A leading "The " is removed first. If the remaining name is a key of
    ``alt2main``, the value stored there is returned; otherwise the
    remaining name is returned unchanged.
    """
    stripped = name.removeprefix("The ")
    return alt2main[stripped] if stripped in alt2main else stripped

def load_ranking(path: str, delimiter: str = None, limit: int = None) -> list[Uni]:
    """Load a ranking from a delimited text file.

    The first row is treated as a header and skipped. Every following row
    must consist of exactly two columns: university name and rank. Names are
    passed through ``normalize`` and ranks through ``format``.

    Args:
        path: Path of the file to read.
        delimiter: One-character column separator. ``None`` (the default)
            means a comma; ``csv.reader`` itself does not accept ``None``.
        limit: Maximum number of data rows to read; ``None`` reads all rows.

    Returns:
        The universities in file order. Empty if the file has no data rows.
    """
    if delimiter is None:
        delimiter = ","
    # newline="" is what the csv module expects; the explicit encoding keeps
    # non-ASCII university names independent of the platform default.
    with open(path, newline="", encoding="utf-8") as file:
        reader = csv.reader(file, delimiter=delimiter)
        next(reader, None)  # Skip header row (tolerates an empty file)
        return [Uni(normalize(name), format(rank)) for name, rank in islice(reader, limit)]

# Load the first 100 universities of each source ranking (THE, QS, ARWU)
base = "ranking_scraper/ranking_scraper"
ranking_the, ranking_qs, ranking_arwu = (
    load_ranking(f"{base}/data/input/{r['subject']}_{r['year']}/{source}.csv", ",", 100)
    for source in ("the", "qs", "arwu")
)

## Preview Rankings

In [44]:
def preview_ranking(ranking: list[Uni], limit: int = 10) -> None:
    """Print the first and last entries of a ranking, followed by its length.

    Each entry is printed as ``name<TAB>rank``. When the ranking has more
    than ``2 * limit`` entries, the first ``limit`` and last ``limit`` entries
    are shown with a ``...`` line in between. Shorter rankings are printed in
    full without the ``...`` line, so no entry is repeated and no index
    falls outside the list.

    Args:
        ranking: Universities to preview.
        limit: Number of entries to show at each end; negative values are
            treated as 0.
    """
    limit = max(limit, 0)
    total = len(ranking)
    if total <= 2 * limit:
        # Head and tail would touch or overlap: show everything once.
        for uni in ranking:
            print(f"{uni.name}\t{uni.rank}")
    else:
        # Print first universities in ranking
        for uni in ranking[:limit]:
            print(f"{uni.name}\t{uni.rank}")
        print("...")
        # Print last universities in ranking
        for uni in ranking[total - limit:]:
            print(f"{uni.name}\t{uni.rank}")
    print(
        f"-----------------------------\n"
        f"Length: {total}"
    )

In [45]:
# Show the head and tail of the ranking loaded from the.csv
preview_ranking(ranking_the)

Massachusetts Institute of Technology (MIT)	1
Stanford University	1
University of Cambridge	3
University of Oxford	4
Harvard University	5
University of California, Berkeley	6
University of Chicago	7
Yale University	8
London School of Economics and Political Science (LSE)	9
Duke University	10
...
University of Twente	92
Vrije Universiteit Amsterdam	92
City University of Hong Kong	94
Sant’Anna School of Advanced Studies – Pisa	95
University of Illinois at Urbana-Champaign	96
Georgia Institute of Technology (Georgia Tech)	97
University of Maryland, College Park	98
University of Nottingham	99
University of Glasgow	100
University of Alberta	113
-----------------------------
Length: 100


In [46]:
# Show the head and tail of the ranking loaded from qs.csv
preview_ranking(ranking_qs)

Harvard University	1
INSEAD	2
London Business School	3
Massachusetts Institute of Technology (MIT)	4
Stanford University	5
University of Pennsylvania	6
Bocconi University	7
University of Cambridge	8
HEC Paris School of Management	9
University of Oxford	10
...
University of Southern California	90
National Taiwan University (NTU)	92
Paris Sciences et Lettres (PSL University)	92
University of Auckland	94
University College London (UCL)	94
University of Strathclyde	96
Zhejiang University	96
Universidad de Chile	98
Aston University	99
Technical University of Munich	99
-----------------------------
Length: 100


In [47]:
# Show the head and tail of the ranking loaded from arwu.csv
preview_ranking(ranking_arwu)

Erasmus University Rotterdam	1
Texas A&M University	2
University of Pennsylvania	3
Harvard University	4
University of Michigan-Ann Arbor	5
Duke University	6
Georgia State University	7
Indiana University	8
INSEAD	9
University of North Carolina at Chapel Hill	10
...
University of Texas at San Antonio	88
Universitat Ramon Llull	88
University of California, Berkeley	88
University of California, Los Angeles (UCLA)	88
University of Houston	88
University of Kentucky	88
University of Tennessee - Knoxville	88
University of Virginia	88
Vrije Universiteit Amsterdam	88
Washington University in St. Louis	88
-----------------------------
Length: 100


## Clean Rankings

Sometimes universities have different names in different rankings. For example, one ranking might use universities' full names while another might prefer acronyms. This might cause duplicates down the line.

To avoid this issue, let's save all university names sorted alphabetically and look for duplicates. They typically appear next to each other, since they differ only slightly. Once we spot a duplicate, we pick the preferred name and add it, together with its alternative names, to `normalize.py`. We then reload the rankings and repeat this process until no duplicates are left.

In [48]:
# Collect the distinct university names used by each ranking
unis_qs = {uni.name for uni in ranking_qs}
unis_the = {uni.name for uni in ranking_the}
unis_arwu = {uni.name for uni in ranking_arwu}

unis_all = unis_qs | unis_the | unis_arwu

# Write the names in alphabetical order, one per line, so that near-identical
# spellings end up next to each other; the last line holds the total count.
sorted_names_path = f"{base}/data/output/universities_sorted.tsv"
# Create the output directory if needed, as save_ranking does.
os.makedirs(os.path.dirname(sorted_names_path), exist_ok=True)
# Explicit encoding: university names contain non-ASCII characters.
with open(sorted_names_path, "w", encoding="utf-8") as file:
    for uni in sorted(unis_all):
        file.write(f"{uni}\n")
    file.write(f"Length: {len(unis_all)}")

## Combine Rankings

In [49]:
rankings = [ranking_qs, ranking_the, ranking_arwu]

# Map each university name to the list of ranks it received, one entry per
# appearance, in the order the rankings are listed above.
uni_ranks = {}
for source_ranking in rankings:
    for entry in source_ranking:
        uni_ranks.setdefault(entry.name, []).append(entry.rank)

In [50]:
# Fill missing ranks: pad every university's list up to one rank per ranking
for ranks in uni_ranks.values():
    missing = len(rankings) - len(ranks)
    # Best possible rank when not in top-100; a non-positive count adds nothing
    ranks.extend([101] * missing)

In [51]:
# Average ranks: one Uni per name, ranked by the mean of its collected ranks
ranking_all = [
    Uni(name=uni_name, rank=mean(uni_ranks[uni_name]))
    for uni_name in uni_ranks
]

## Save Final Ranking

In [52]:
def save_ranking(ranking: list[Uni], limit: int = None) -> None:
    """Save a ranking to file, best rank first, and preview it.

    The universities in ``ranking`` are sorted by ascending rank and written
    as ``position<TAB>name<TAB>rank`` rows (rank with two decimals) to
    ``{base}/data/output/{subject}_{year}_combined.tsv``; the output
    directory is created if it does not exist. Every written row is also
    printed.

    Args:
        ranking: Universities to save.
        limit: Stop after this many rows; ``None`` writes all rows.
    """
    # Sort the ranking that was passed in, not the module-level ranking_all.
    ranking_sorted = sorted(ranking, key=lambda uni: uni.rank)
    filepath = f"{base}/data/output/{r['subject']}_{r['year']}_combined.tsv"
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    # Explicit encoding: university names contain non-ASCII characters.
    with open(filepath, "w", encoding="utf-8") as file:
        for i, uni in enumerate(ranking_sorted, start=1):
            row = f"{i}\t{uni.name}\t{uni.rank:.2f}"
            file.write(f"{row}\n")
            print(row)  # Preview
            if i == limit:
                break

# Write and preview the first 60 rows of the combined ranking
save_ranking(ranking_all, 60)

1	Harvard University	3.33
2	University of Pennsylvania	7.00
3	Stanford University	13.67
4	Erasmus University Rotterdam	14.33
5	Columbia University	14.67
6	University of Michigan-Ann Arbor	16.67
7	Northwestern University	18.00
8	Duke University	21.67
9	New York University (NYU)	22.00
10	University of Cambridge	24.67
11	University of Oxford	25.67
12	National University of Singapore (NUS)	30.00
13	University of Chicago	31.00
14	Hong Kong University of Science and Technology	33.67
15	Massachusetts Institute of Technology (MIT)	35.33
16	University of Toronto	35.33
17	University of California, Berkeley	35.67
18	Cornell University	36.33
19	University of British Columbia	36.67
20	University of Warwick	37.00
21	INSEAD	37.33
22	Tilburg University	39.00
23	London School of Economics and Political Science (LSE)	40.33
24	Hong Kong Polytechnic University	42.00
25	Yale University	42.33
26	University of Texas at Austin	42.33
27	Copenhagen Business School	44.00
28	Nanyang Technological University (NTU)

## Get IDs (Helper)

In [54]:
def get_ids(ranking: list[Uni], limit: int = None) -> list[int]:
    """Return the ids of the best-ranked universities, best rank first.

    The universities in ``ranking`` are sorted by ascending rank and the
    first ``limit`` of them are looked up in ``main2id``. Universities whose
    name is not in ``main2id`` are left out of the result and their names
    are printed instead.

    Args:
        ranking: Universities to look up.
        limit: Number of top universities to consider; ``None`` considers
            all of them.

    Returns:
        The ``main2id`` values of the considered universities that have one.
        This can be shorter than ``limit``.
    """
    # Sort the ranking that was passed in, not the module-level ranking_all.
    ranking_sorted = sorted(ranking, key=lambda uni: uni.rank)
    ranking_sorted_ids = []
    unis_without_id = []
    for uni in ranking_sorted[:limit]:
        if uni.name in main2id:
            ranking_sorted_ids.append(main2id[uni.name])
        else:
            unis_without_id.append(uni.name)
    print("Unis without id:", unis_without_id)
    return ranking_sorted_ids

# Look up ids for the 60 best-ranked universities and keep at most the first 50.
# NOTE(review): presumably 60 are requested to make up for universities
# without an id -- confirm.
top_unis_ids = get_ids(ranking_all, 60)[:50]
print(len(top_unis_ids))
# Print the id list prefixed with a "$top_unis_<subject>_<year> = " label
print(f"$top_unis_{r['subject']}_{r['year']} = ", top_unis_ids)

Unis without id: ['Tilburg University', 'London School of Economics and Political Science (LSE)', 'University of St. Gallen', 'University of Southern California', 'City, University of London', 'Singapore Management University']
50
$top_unis_business_2022 =  [38, 5, 1, 849, 33, 2, 62, 7, 483, 978, 1092, 72, 119, 31, 39, 8, 3, 225, 4, 187, 1468, 663, 99, 86, 237, 170, 24, 109, 229, 692, 405, 168, 159, 227, 2108, 50, 164, 350, 15, 717, 179, 55, 48, 100, 70, 348, 1555, 238, 233, 2239]
