# World Top Universities 2023

In this Jupyter Notebook, we're going to combine the rankings we scraped into one.

## Define Settings

In [1]:
year = 2023
subject = "math"

## Import Modules

In [2]:
import json

from collections import defaultdict
from dataclasses import dataclass
from itertools import islice
from normalize import alt2main, uni2id
from statistics import mean

## Define Data Model

In [3]:
@dataclass
class Uni:
    """Represent a university."""
    name: str
    rank: int

## Load Scraped Rankings

In [4]:
def load_ranking(path: str, limit: int = None) -> None:
    """Load ranking."""
    with open(path) as file:
        return [Uni(**json.loads(line)) for line in islice(file, limit)]

base = "./scraper/data_scraped"
qs = load_ranking(f"{base}/qs_{subject}_{year}.jsonl", 100)
the = load_ranking(f"{base}/the_{subject}_{year}.jsonl", 100)
arwu = load_ranking(f"{base}/arwu_{subject}_{year}.jsonl", 100)

## Preview Scraped Rankings

In [5]:
def preview_ranking(ranking: list[Uni], limit: int = None) -> None:
    """Preview ranking."""
    for uni in islice(ranking, limit):
        print(f"{uni.name}\t{uni.rank}")
    print("...")
    for uni in islice(ranking, len(ranking) - (limit if limit else 0), len(ranking)):
        print(f"{uni.name}\t{uni.rank}")

In [6]:
preview_ranking(qs, 10)

Massachusetts Institute of Technology (MIT)	1
University of Cambridge	2
Stanford University	3
University of Oxford	4
Harvard University	5
University of California, Berkeley (UCB)	6
Princeton University	7
ETH Zurich	8
New York University (NYU)	9
Imperial College London	10
...
Pohang University of Science And Technology (POSTECH)	91
Indian Institute of Technology Bombay (IITB)	92
The London School of Economics and Political Science (LSE)	93
University of Amsterdam	94
KU Leuven	95
Universidad Nacional Autónoma de México  (UNAM)	96
Universitat Autònoma de Barcelona	97
Indian Institute of Technology Madras (IITM)	98
Politecnico di Torino	99
The Ohio State University	100


In [7]:
preview_ranking(the, 10)

Princeton University	1
University of California, Berkeley	2
Massachusetts Institute of Technology	2
California Institute of Technology	4
Stanford University	5
Harvard University	6
University of Cambridge	7
University of Oxford	8
ETH Zurich	9
Columbia University	10
...
King’s College London	91
KU Leuven	91
RWTH Aachen University	91
University of Adelaide	94
University of Bern	95
Lund University	96
University of Leeds	97
University of California, Santa Cruz	98
UNSW Sydney	98
Karlsruhe Institute of Technology	100


In [8]:
preview_ranking(arwu)

Paris-Saclay University	1
Princeton University	2
Sorbonne University	3
University of Cambridge	4
University of Oxford	5
Massachusetts Institute of Technology (MIT)	6
Stanford University	7
ETH Zurich	8
New York University	9
The University of Texas at Austin	10
University of California, Los Angeles	11
PSL University	12
University of Wisconsin - Madison	13
University of California, Berkeley	14
University of Bonn	15
Stony Brook University	16
The Hebrew University of Jerusalem	17
University of Chicago	18
Rutgers, The State University of New Jersey - New Brunswick	19
University of Warwick	20
Harvard University	21
University of Toronto	22
Imperial College London	23
University of Paris	24
Kyoto University	25
Columbia University	26
The University of Edinburgh	27
University of Illinois at Urbana-Champaign	28
University of Vienna	29
University of Michigan-Ann Arbor	30
University of California, San Diego	31
KTH Royal Institute of Technology	32
Georgia Institute of Technology	33
Ecole Normale Super

## Clean Rankings

In [9]:
def normalize(ranking: list[Uni], normalizer: dict[str, str]):
    """Normalize university names."""
    for uni in ranking:
        uni.name = normalizer.get(uni.name, uni.name)

normalize(qs, alt2main)        
normalize(the, alt2main)
normalize(arwu, alt2main)

{u.name for u in qs} | {u.name for u in the} | {u.name for u in arwu}

{'Aix Marseille University',
 'Australian National University',
 'Bielefeld University',
 'Brown University',
 'California Institute of Technology (Caltech)',
 'Carnegie Mellon University (CMU)',
 'China Medical University (Taichung)',
 'Chinese University of Hong Kong (CUHK)',
 'City University of Hong Kong',
 'Claude Bernard University Lyon 1',
 'Columbia University',
 'Cornell University',
 'Delft University of Technology',
 'Duke University',
 'ETH Zurich',
 'Ecole Centrale Lyon',
 'Eindhoven University of Technology',
 'Free University of Berlin',
 'Fudan University',
 'Georgia Institute of Technology (Georgia Tech)',
 'HSE University',
 'Harvard University',
 'Hebrew University of Jerusalem',
 'Hong Kong University of Science and Technology',
 'Humboldt University of Berlin',
 'INSA Lyon',
 'Imperial College London',
 'Indian Institute of Technology Bombay (IITB)',
 'Indian Institute of Technology Madras (IITM)',
 'Indiana University',
 'Institut Polytechnique de Paris',
 'Intern

## Combine Rankings

In [10]:
uni2ranks = defaultdict(list)
for ranking in [qs, the, arwu]:
    for uni in ranking:
        uni2ranks[uni.name].append(uni.rank)

uni2ranks = dict(uni2ranks)

In [11]:
for _, ranks in uni2ranks.items():
    while len(ranks) < 3:
        ranks.append(101)  # Best possible rank when not in top 100

# Check a few examples against the rankings
uni2ranks["Massachusetts Institute of Technology (MIT)"]
uni2ranks["Georgia Institute of Technology (Georgia Tech)"]
uni2ranks["University of Melbourne"]

[44, 68, 101]

In [12]:
combined_ranking = [Uni(uni, mean(ranks)) for uni, ranks in uni2ranks.items()]
combined_ranking.sort(key=lambda uni: uni.rank)  # Smaller to larger rank

## Save Combined Ranking

In [13]:
def save_ranking(ranking: list[Uni], limit: int = None) -> None:
    """Save ranking to file."""
    with open(f"combined_ranking_{subject}_{year}.tsv", "w") as file:
        for i, uni in enumerate(ranking, start=1):
            row = f"{i}\t{uni.name}\t{uni.rank:.2f}\n"
            print(row)  # Preview
            file.write(row)            
            if i == limit:
                break

save_ranking(combined_ranking, 60)

1	Massachusetts Institute of Technology (MIT)	3.00

2	Princeton University	3.33

3	University of Cambridge	4.33

4	Stanford University	5.00

5	University of Oxford	5.67

6	University of California, Berkeley	7.33

7	ETH Zurich	8.33

8	Harvard University	10.67

9	University of California, Los Angeles (UCLA)	12.67

10	Imperial College London	15.33

11	Paris Sciences et Lettres (PSL University)	15.67

12	University of Chicago	16.67

13	Paris-Saclay University	19.33

14	Columbia University	19.33

15	New York University (NYU)	20.33

16	Sorbonne University	21.00

17	California Institute of Technology (Caltech)	22.33

18	University of Toronto	22.33

19	National University of Singapore (NUS)	26.00

20	University of Michigan-Ann Arbor	26.00

21	University of Texas at Austin	26.00

22	Institut Polytechnique de Paris	26.33

23	Yale University	26.67

24	Swiss Federal Institute of Technology Lausanne (EPFL)	27.67

25	Peking University	28.33

26	University of Bonn	31.33

27	Tsinghua University	31.67


## Get IDs

In [14]:
def get_ids(unis: list[str], uni2id: dict[str, str], limit: int = None) -> list[int]:
    """Return IDs corresponding to universities."""
    ids = []
    no_id = []
    for uni in unis:
        if uni in uni2id:
            ids.append(uni2id[uni])
        else:
            no_id.append(uni)
        if len(ids) == limit:
            return ids, no_id

ids, no_id = get_ids([uni.name for uni in combined_ranking], uni2id=uni2id, limit=60)

print(f"Unis without id: {no_id}")
print()
print(f"$top_unis_{subject}_{year} = {ids}")

Unis without id: ['University of Bonn', 'Humboldt University of Berlin', 'Université Paris Cité']

$top_unis_math_2023 = [39, 6, 978, 1, 1092, 3, 288, 38, 692, 1043, 888, 119, 794, 33, 483, 355, 11, 8, 72, 2, 86, 2244, 99, 16, 159, 168, 14, 225, 12, 69, 74, 4, 17, 49, 15, 322, 918, 384, 29, 187, 56, 266, 170, 62, 5, 22, 40, 976, 268, 238, 1555, 9, 930, 292, 227, 841, 7, 24, 23, 557]
