# World Top Universities 2023

In this Jupyter notebook, we combine the scraped QS, THE, and ARWU rankings into a single ranking by averaging each university's rank across the three lists.

## Define Settings

In [1]:
# Settings for this run; both values are interpolated into the name of the
# combined-ranking output file.
year: int = 2023
# Subject code of the rankings being combined
# ("cs" is presumably computer science; confirm against the scraper).
subject: str = "cs"

## Import Modules

In [2]:
import json

from collections import defaultdict
from dataclasses import dataclass
from itertools import islice
from statistics import mean
from typing import Optional

from normalize import alt2main, uni2id

## Define Data Model

In [3]:
@dataclass
class Uni:
    """Represent a university together with its rank in one ranking.

    Attributes:
        name: University name as given by the ranking it was read from.
        rank: Position in the ranking. Annotated as ``float`` because the
            combined ranking stores the mean of several ranks, which is
            generally not an integer; integer ranks are still accepted.
    """
    name: str
    rank: float

## Load Scraped Rankings

In [4]:
def load_ranking(path: str, limit: Optional[int] = None) -> list[Uni]:
    """Load a ranking from a JSON Lines file.

    Each non-blank line must be a JSON object whose keys match the fields of
    ``Uni``.

    Args:
        path: Path to the ``.jsonl`` file.
        limit: Maximum number of records to load; ``None`` loads all of them.

    Returns:
        The loaded universities, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        ValueError: If ``limit`` is negative (raised by ``islice``).
    """
    # Read as UTF-8 explicitly: university names contain non-ASCII characters
    # and the platform default encoding is not UTF-8 everywhere.
    with open(path, encoding="utf-8") as file:
        # Skip blank lines (e.g. a trailing empty line) so they neither crash
        # json.loads nor count towards the limit.
        records = (line for line in file if line.strip())
        return [Uni(**json.loads(line)) for line in islice(records, limit)]

base = "./scraper/data_scraped"

# Load at most 100 entries from each scraped file, in the same order as the
# names on the left-hand side.
qs, the, arwu = (
    load_ranking(path, 100)
    for path in (f"{base}/qs.jsonl", f"{base}/the.jsonl", f"{base}/arwu.jsonl")
)

## Preview Scraped Rankings

In [5]:
def preview_ranking(ranking: list[Uni], limit: Optional[int] = None) -> None:
    """Print the head and tail of a ranking as ``name<TAB>rank`` lines.

    Args:
        ranking: Universities to preview, in ranking order.
        limit: Number of entries to print from each end, separated by a
            ``...`` line. ``None`` prints the whole ranking.

    Raises:
        ValueError: If ``limit`` is negative.
    """
    if limit is not None and limit < 0:
        raise ValueError("limit must be None or a non-negative integer")

    total = len(ranking)

    # When head and tail would meet or overlap nothing is elided, so print
    # every entry exactly once and omit the "..." marker.
    if limit is None or 2 * limit >= total:
        for uni in ranking:
            print(f"{uni.name}\t{uni.rank}")
        return

    for uni in islice(ranking, limit):
        print(f"{uni.name}\t{uni.rank}")
    print("...")
    for uni in islice(ranking, total - limit, total):
        print(f"{uni.name}\t{uni.rank}")

In [6]:
# Show the first and last 10 entries of the QS ranking.
preview_ranking(qs, limit=10)

Massachusetts Institute of Technology (MIT)	1
Carnegie Mellon University	2
Stanford University	3
University of California, Berkeley (UCB)	4
University of Oxford	5
National University of Singapore (NUS)	6
University of Cambridge	7
Harvard University	8
ETH Zurich	9
EPFL	10
...
Ludwig-Maximilians-Universität München	91
Trinity College Dublin, The University of Dublin	92
University of Wisconsin-Madison	93
Indian Institute of Technology Kharagpur (IIT-KGP)	94
Huazhong University of Science and Technology	95
Indian Institute of Technology Kanpur (IITK)	96
University of California, Irvine	97
Boston University	98
IMT Atlantique	99
Xi’an Jiaotong University	100


In [7]:
# Show the first and last 10 entries of the THE ranking.
preview_ranking(the, limit=10)

University of Oxford	1
Massachusetts Institute of Technology	2
Stanford University	3
ETH Zurich	4
Carnegie Mellon University	5
University of Cambridge	6
National University of Singapore	7
University of California, Berkeley	8
Harvard University	9
Technical University of Munich	10
...
University of California, Davis	91
Purdue University West Lafayette	91
TU Wien	93
University of North Carolina at Chapel Hill	94
Qatar University	95
Virginia Polytechnic Institute and State University	95
University of Warsaw	95
Northwestern University	98
University of Bonn	99
Lomonosov Moscow State University	100


In [8]:
# No limit is passed here, so every loaded ARWU entry is printed.
preview_ranking(arwu)

Massachusetts Institute of Technology (MIT)	1
Stanford University	2
University of California, Berkeley	3
Carnegie Mellon University	4
Tsinghua University	5
Harvard University	6
ETH Zurich	7
Nanyang Technological University	8
University of Oxford	9
Princeton University	10
Columbia University	11
University of Electronic Science and Technology of China	12
National University of Singapore	13
University of California, Los Angeles	14
University of Toronto	15
Shanghai Jiao Tong University	16
University of Technology Sydney	17
Zhejiang University	18
Peking University	19
The Chinese University of Hong Kong	20
Huazhong University of Science and Technology	21
Cornell University	22
Beihang University	23
Harbin Institute of Technology	24
University of Southern California	25
Georgia Institute of Technology	26
Xidian University	26
New York University	28
University of Sydney	29
University of Oslo	30
The University of Edinburgh	31
University of Science and Technology of China	32
University of Montreal	

## Clean Rankings

In [9]:
def normalize(ranking: list[Uni], normalizer: dict[str, str]) -> None:
    """Rename universities in place according to ``normalizer``.

    Args:
        ranking: Universities whose ``name`` attribute may be rewritten.
        normalizer: Mapping from a name as it appears in a ranking to the
            name that should replace it. Names without an entry are kept.
    """
    for entry in ranking:
        if entry.name in normalizer:
            entry.name = normalizer[entry.name]

# Apply the alt2main mapping to each ranking in place; names without an entry
# in the mapping are left unchanged.
normalize(qs, alt2main)
normalize(the, alt2main)
normalize(arwu, alt2main)

# Display every distinct name across the three rankings so that remaining
# spelling variants of the same university can be spotted by eye.
{uni.name for source in (qs, the, arwu) for uni in source}

{'Aalto University',
 'Arizona State University',
 'Australian National University',
 'Beihang University',
 'Beijing Institute of Technology',
 'Beijing University of Posts and Telecommunications',
 'Boston University',
 'Brown University',
 'California Institute of Technology (Caltech)',
 'Carnegie Mellon University (CMU)',
 'Central South University',
 'Chinese University of Hong Kong (CUHK)',
 'City University of Hong Kong',
 'Columbia University',
 'Cornell University',
 'Dalian University of Technology',
 'Deakin University',
 'Delft University of Technology',
 'Duke University',
 'ETH Zurich',
 'Eindhoven University of Technology',
 'Fudan University',
 'Georgia Institute of Technology (Georgia Tech)',
 'Griffith University',
 'Guangdong University of Technology',
 'Harbin Institute of Technology',
 'Harvard University',
 'Hong Kong Polytechnic University',
 'Huazhong University of Science and Technology',
 'IMT Atlantique',
 'ITMO University',
 'Imperial College London',
 'Indi

## Combine Rankings

In [10]:
# Map each university name to the list of ranks it received, one entry per
# ranking in which it appears.
uni2ranks = {}
for ranking in (qs, the, arwu):
    for uni in ranking:
        uni2ranks.setdefault(uni.name, []).append(uni.rank)

In [11]:
NUM_RANKINGS = 3     # qs, the, arwu
UNRANKED_RANK = 101  # Best possible rank when not in top 100

# Pad every university to one rank per ranking so that the mean computed later
# penalises absence from a list. Iterate over the values only: binding the
# throwaway name `_` in a notebook cell shadows IPython's last-output variable.
for ranks in uni2ranks.values():
    ranks.extend([UNRANKED_RANK] * (NUM_RANKINGS - len(ranks)))

# Spot check: a university that appears in only one of the three rankings.
uni2ranks["University of Alberta"]

[51, 101, 101]

In [12]:
# One entry per university whose rank is the mean of its (padded) ranks,
# ordered from smaller to larger mean rank. sorted() is stable, so ties keep
# their insertion order from uni2ranks.
combined_ranking = sorted(
    (Uni(name, mean(ranks)) for name, ranks in uni2ranks.items()),
    key=lambda entry: entry.rank,
)

## Save Combined Ranking

In [15]:
def save_ranking(ranking: list[Uni], limit: Optional[int] = None) -> None:
    """Write a ranking to a TSV file and echo the written rows.

    The file is created in the current working directory and named
    ``combined_ranking_{subject}_{year}.tsv`` using the notebook-level
    ``subject`` and ``year`` settings. Each row holds the 1-based position,
    the university name and the rank formatted with two decimals.

    Args:
        ranking: Universities to save, already in the desired order.
        limit: Maximum number of rows to write; ``None`` writes all of them.

    Raises:
        OSError: If the file cannot be opened for writing.
        ValueError: If ``limit`` is negative (raised by ``islice``).
    """
    path = f"combined_ranking_{subject}_{year}.tsv"
    # Write as UTF-8 explicitly: university names contain non-ASCII characters
    # and the platform default encoding is not UTF-8 everywhere.
    with open(path, "w", encoding="utf-8") as file:
        for i, uni in enumerate(islice(ranking, limit), start=1):
            row = f"{i}\t{uni.name}\t{uni.rank:.2f}\n"
            # The row already ends with a newline, so do not let print add a
            # second one to the preview.
            print(row, end="")
            file.write(row)

# Save (and preview) the first 10 rows of the combined ranking.
save_ranking(combined_ranking, limit=10)

1	Massachusetts Institute of Technology (MIT)	1.33

2	Stanford University	2.67

3	Carnegie Mellon University (CMU)	3.67

4	University of California, Berkeley	5.00

5	University of Oxford	5.00

6	ETH Zurich	6.67

7	Harvard University	7.67

8	National University of Singapore (NUS)	8.67

9	Nanyang Technological University (NTU)	11.00

10	Princeton University	11.33



## Get IDs

In [14]:
def get_ids(
    unis: list[str], uni2id: dict[str, int], limit: Optional[int] = None
) -> tuple[list[int], list[str]]:
    """Look up the IDs of universities, in order, until ``limit`` are found.

    Args:
        unis: University names to look up, in priority order.
        uni2id: Mapping from university name to ID.
        limit: Stop once this many IDs have been collected; ``None`` looks up
            every name.

    Returns:
        A pair ``(ids, no_id)``: the IDs found, in input order, and the names
        encountered before stopping that have no entry in ``uni2id``.
    """
    ids = []
    no_id = []
    for uni in unis:
        if limit is not None and len(ids) >= limit:
            break
        if uni in uni2id:
            ids.append(uni2id[uni])
        else:
            no_id.append(uni)
    # Always return the pair, including when the input runs out before `limit`
    # IDs were found; callers unpack the result.
    return ids, no_id

# Collect the IDs of the first 60 universities in the combined ranking that
# have an entry in uni2id.
ids, no_id = get_ids([uni.name for uni in combined_ranking], uni2id=uni2id, limit=60)

print(f"Unis without id: {no_id}")
print()
# Build the label from the notebook settings so it stays in step with the
# output file name when `subject` or `year` changes.
print(f"$top_unis_{subject}_{year} ={ids}")

Unis without id: ['University of Southern California']

$top_unis_cs_2023 =[39, 1, 40, 3, 1092, 288, 38, 72, 170, 6, 168, 8, 225, 33, 692, 159, 978, 1043, 15, 238, 12, 14, 483, 16, 384, 74, 17, 2027, 1458, 2444, 31, 4, 49, 2, 399, 977, 9, 5, 22, 11, 918, 794, 813, 24, 86, 99, 2244, 233, 327, 888, 930, 268, 48, 119, 69, 632, 405, 117, 98, 350]
