In [1]:
import requests
import pandas as pd
import os
import numpy as np
import re

from dotenv import load_dotenv
import os

from json import JSONDecodeError

In [2]:
# Load environment variables from .env file
load_dotenv()

# Get API key from environment variables
API_KEY = os.getenv('API_KEY')

Reference: https://support.qs.com/hc/en-gb/articles/4410488025106-QS-World-University-Rankings-by-Subject
### The rankings columns are:
* Academic Reputation (30% weight)
-- The Academic Reputation (AR) indicator measures the reputation of institutions and their programmes by asking academic experts to nominate universities based on their subject area of expertise. Pioneered by QS in 2004, it asks the question: which universities are demonstrating academic excellence? To answer this we collect and distil the collective intelligence of academics from around the world via our Academic Survey, evaluating nominations for approximately 7000 institutions each year.The indicator not only illuminates the quality of an institution's research, but also their approach to academic partnerships, their strategic impact, their educational innovativeness and the impact they have made on education and society at large.
The indicator is the centrepiece of almost all of the rankings across the QS portfolio. 

* Employer Reputation (15% weight)
-- The Employer Reputation (ER) indicator measures the reputation of institutions and their programmes among employers. We remain the only major ranking to focus on this vital aspect of a student's educational journey.

* Citations per Paper
-- The Citations per Paper (CPP) indicator measures the impact and quality of the scientific work done by institutions, on average per publication.

* H-Index
-- The h-index is an index that attempts to measure both the productivity and impact of the published work of a scientist or scholar. The index is based on the set of the scientist’s most cited papers and the number of citations that they have received in other publications. It can also be applied to the productivity and impact of a group of scientists, such as a department, or an institution (as in the case of our indicator), or a country, as well as a scholarly journal. The index is defined as the maximum value of h such that the given entity (author, journal, department, institution, etc.) has published at least h papers that have each been cited at least h times (https://doi.org/10.1073/pnas.0507655102). We use institution-level H Index.

* International Research Network
-- International Research Network (IRN) is a measure of an institution's success in creating and sustaining research partnerships with institutions in other locations. The indicator measures how diverse and rich an institution's research network is by looking at the number of different countries represented, and whether these relationships are renewed and repeated. We only consider sustained partnerships, defined as those which result in three or more joint papers published in a five-year period.


In [12]:
def get_top_100_us_institutions_for_life_sciences():
    file_path = '2025_QS_rankings.xlsx'
    # Reload with correct settings: skip to row 10 (0-based), treat row 10 as header
    df_qs = pd.read_excel(file_path, sheet_name="Life Sciences & Medicine", skiprows=10, header=0)

    # Drop rows with missing Institution (bottom padding, if any)
    df_qs = df_qs.dropna(subset=["Institution"])

    # Select top 100 programs
    df_top100 = df_qs.head(100)

    # Select relevant columns
    df_top100 = df_top100[[
        "2025", "Institution", "Country / Territory", "Score", "Academic", "Employer", "Citations", "H", "IRN"
    ]]

    # Rename 2025 column to "Rank" for clarity
    df_top100 = df_top100.rename(columns={"2025": "Rank"})
    # Clean up the rank column that has = sign prefixed and make it an integer
    df_top100['Rank'] = df_top100['Rank'].str.replace('=', '')
    df_top100['Rank'] = df_top100['Rank'].astype(int)

    # consider only US instituions and rank by H-index (higher the H-index, better the institution) and drop the country column
    df_top100_us = df_top100[df_top100['Country / Territory'] == 'United States of America']
    df_top100_us = df_top100_us.drop(columns=['Country / Territory'])

    # clean up the institution names
    list_of_institutions = df_top100_us['Institution'].values.tolist()
    cleaned_institutions = []
    for institution in list_of_institutions:
        cleaned = re.sub(r'\([^)]*\)', '', institution)
        cleaned = cleaned.replace(',', '-')
        cleaned = cleaned.strip()
        cleaned_institutions.append(cleaned)

    # no space before and after '-' within the string if there are space
    cleaned_institutions = [re.sub(r'\s*-\s*', '-', institution) for institution in cleaned_institutions]

    # remove any duplicates
    cleaned_institutions = list(set(cleaned_institutions))

    # remove any empty strings
    cleaned_institutions = [institution for institution in cleaned_institutions if institution]


    return cleaned_institutions, df_top100_us

In [None]:
def test_top_100_us_institutions_for_life_sciences():
    top_institutions, df_top100_us = get_top_100_us_institutions_for_life_sciences()
    print(top_institutions)

test_top_100_us_institutions_for_life_sciences()

In [14]:
def get_school_admission_data(school_name):
    print(f"Getting admission data for {school_name}")
    base_url = "https://api.data.gov/ed/collegescorecard/v1/schools"
    params = {
        "api_key": API_KEY,
        "school.name": school_name,
        "fields": "school.name,latest.admissions.admission_rate.overall,latest.student.demographics.race_ethnicity.asian,latest.student.demographics.race_ethnicity.white,latest.student.demographics.race_ethnicity.hispanic,latest.student.demographics.race_ethnicity.black",
        "per_page": 1
    }

    response = requests.get(base_url, params=params)
    try:
        data = response.json()
        # Convert to pandas dataframe
        df = pd.json_normalize(data['results'])
    except JSONDecodeError as e:
        print(f"Error getting admission data for {school_name}: {e}")
        return pd.DataFrame()

    # if df is all NAN, return empty dataframe
    if df.isna().all().all():
        print(f"No admission data found for {school_name}")
        return pd.DataFrame()
    
    # check if everything is NaN except the school name column and if so, return empty dataframe 
    # get all columns except the school name column
    columns_to_check = df.columns.tolist()
    columns_to_check.remove('school.name')
    if df[columns_to_check].isna().all().all():
        print(f"No admission data found for {school_name}")
        return pd.DataFrame()
    
    return df

In [15]:
top_institutions, df_top100_us = get_top_100_us_institutions_for_life_sciences()

aggregated_df = pd.DataFrame()
for institution in top_institutions:
    df_admission_data = get_school_admission_data(institution)
    if not df_admission_data.empty:
        # Initialize aggregated_df with the first non-empty dataframe
        if aggregated_df.empty:
            aggregated_df = df_admission_data
        else:
            if not df_admission_data.isna().all().all():
                aggregated_df = pd.concat([aggregated_df, df_admission_data], ignore_index=True)

aggregated_df

Getting admission data for Cornell University
No admission data found for Cornell University
Getting admission data for University of Wisconsin-Madison
Getting admission data for Stanford University
Getting admission data for University of Florida
Getting admission data for University of Pittsburgh
Getting admission data for The Ohio State University
No admission data found for The Ohio State University
Getting admission data for University of California-Davis
Getting admission data for Duke University
Getting admission data for University of California-San Diego
Getting admission data for Baylor College of Medicine
No admission data found for Baylor College of Medicine
Getting admission data for Washington University in St. Louis
Getting admission data for University of Michigan-Ann Arbor
Getting admission data for New York University
Getting admission data for The University of Texas M. D. Anderson Cancer Center
No admission data found for The University of Texas M. D. Anderson Cance

Unnamed: 0,latest.admissions.admission_rate.overall,latest.student.demographics.race_ethnicity.asian,latest.student.demographics.race_ethnicity.white,latest.student.demographics.race_ethnicity.hispanic,latest.student.demographics.race_ethnicity.black,school.name
0,0.4335,0.1051,0.6035,0.0831,0.025,University of Wisconsin-Madison
1,0.0391,0.2747,0.2416,0.1766,0.0791,Stanford University
2,0.2538,0.0373,0.5821,0.2318,0.0739,Florida State University
3,0.497,0.1489,0.6104,0.0675,0.0546,University of Pittsburgh-Pittsburgh Campus
4,0.4163,0.3134,0.2054,0.2457,0.0174,University of California-Davis
5,0.0678,0.2317,0.3611,0.1077,0.0849,Duke University
6,0.2452,0.3432,0.1818,0.2526,0.0179,University of California-San Diego
7,0.1196,0.2115,0.4147,0.1256,0.0891,Washington University in St Louis
8,0.1794,0.1863,0.4884,0.1003,0.0454,University of Michigan-Ann Arbor
9,0.587,0.0443,0.586,0.23,0.0734,State University of New York at New Paltz
