In [1]:
import requests
import pandas as pd
import os
import numpy as np
import re

from dotenv import load_dotenv
import os

from json import JSONDecodeError

# Now create an interactive Plotly bar chart
import plotly.graph_objects as go

In [2]:
# Load environment variables from .env file
load_dotenv()

# Get API key from environment variables
API_KEY = os.getenv('API_KEY')

Reference: https://support.qs.com/hc/en-gb/articles/4410488025106-QS-World-University-Rankings-by-Subject
### The rankings columns are:
* Academic Reputation (30% weight)
-- The Academic Reputation (AR) indicator measures the reputation of institutions and their programmes by asking academic experts to nominate universities based on their subject area of expertise. Pioneered by QS in 2004, it asks the question: which universities are demonstrating academic excellence? To answer this we collect and distil the collective intelligence of academics from around the world via our Academic Survey, evaluating nominations for approximately 7000 institutions each year.The indicator not only illuminates the quality of an institution's research, but also their approach to academic partnerships, their strategic impact, their educational innovativeness and the impact they have made on education and society at large.
The indicator is the centrepiece of almost all of the rankings across the QS portfolio. 

* Employer Reputation (15% weight)
-- The Employer Reputation (ER) indicator measures the reputation of institutions and their programmes among employers. We remain the only major ranking to focus on this vital aspect of a student's educational journey.

* Citations per Paper
-- The Citations per Paper (CPP) indicator measures the impact and quality of the scientific work done by institutions, on average per publication.

* H-Index
-- The h-index is an index that attempts to measure both the productivity and impact of the published work of a scientist or scholar. The index is based on the set of the scientist’s most cited papers and the number of citations that they have received in other publications. It can also be applied to the productivity and impact of a group of scientists, such as a department, or an institution (as in the case of our indicator), or a country, as well as a scholarly journal. The index is defined as the maximum value of h such that the given entity (author, journal, department, institution, etc.) has published at least h papers that have each been cited at least h times (https://doi.org/10.1073/pnas.0507655102). We use institution-level H Index.

* International Research Network
-- International Research Network (IRN) is a measure of an institution's success in creating and sustaining research partnerships with institutions in other locations. The indicator measures how diverse and rich an institution's research network is by looking at the number of different countries represented, and whether these relationships are renewed and repeated. We only consider sustained partnerships, defined as those which result in three or more joint papers published in a five-year period.


In [3]:
def get_top_100_us_institutions_for_life_sciences():
    file_path = '2025_QS_rankings.xlsx'
    # Reload with correct settings: skip to row 10 (0-based), treat row 10 as header
    df_qs = pd.read_excel(file_path, sheet_name="Life Sciences & Medicine", skiprows=10, header=0)

    # Drop rows with missing Institution (bottom padding, if any)
    df_qs = df_qs.dropna(subset=["Institution"])

    # Select top 100 programs
    df_top100 = df_qs.head(100)

    # Select relevant columns
    df_top100 = df_top100[[
        "2025", "Institution", "Country / Territory", "Score", "Academic", "Employer", "Citations", "H", "IRN"
    ]]

    # Rename 2025 column to "Rank" for clarity
    df_top100 = df_top100.rename(columns={"2025": "Rank"})
    # Clean up the rank column that has = sign prefixed and make it an integer
    df_top100['Rank'] = df_top100['Rank'].str.replace('=', '')
    df_top100['Rank'] = df_top100['Rank'].astype(int)

    # consider only US instituions and rank by H-index (higher the H-index, better the institution) and drop the country column
    df_top100_us = df_top100[df_top100['Country / Territory'] == 'United States of America']
    df_top100_us = df_top100_us.drop(columns=['Country / Territory'])

    # clean up the institution names
    list_of_institutions = df_top100_us['Institution'].values.tolist()
    cleaned_institutions = []
    for institution in list_of_institutions:
        cleaned = re.sub(r'\([^)]*\)', '', institution)
        cleaned = cleaned.replace(',', '-')
        cleaned = cleaned.strip()
        cleaned_institutions.append(cleaned)

    # no space before and after '-' within the string if there are space
    cleaned_institutions = [re.sub(r'\s*-\s*', '-', institution) for institution in cleaned_institutions]

    # remove any duplicates
    cleaned_institutions = list(set(cleaned_institutions))

    # remove any empty strings
    cleaned_institutions = [institution for institution in cleaned_institutions if institution]


    return cleaned_institutions, df_top100_us

In [4]:
def test_top_100_us_institutions_for_life_sciences():
    top_institutions, df_top100_us = get_top_100_us_institutions_for_life_sciences()
    print(top_institutions)

test_top_100_us_institutions_for_life_sciences()

['Stanford University', 'Washington University in St. Louis', 'University of Wisconsin-Madison', 'University of Michigan-Ann Arbor', 'University of California-San Francisco', 'University of North Carolina-Chapel Hill', 'Massachusetts Institute of Technology', 'Johns Hopkins University', 'University of Pittsburgh', 'University of California-Los Angeles', 'Boston University', 'University of California-San Diego', 'Yale University', 'Cornell University', 'Columbia University', 'University of Minnesota Twin Cities', 'University of California-Davis', 'New York University', 'University of Washington', 'Duke University', 'Emory University', 'University of California-Berkeley', 'The University of Texas M. D. Anderson Cancer Center', 'The Ohio State University', 'University of Chicago', 'Northwestern University', 'Vanderbilt University', 'Baylor College of Medicine', 'Harvard University', 'University of Pennsylvania', 'University of Florida']


In [5]:
def get_school_admission_data(school_name):
    print(f"Getting admission data for {school_name}")
    base_url = "https://api.data.gov/ed/collegescorecard/v1/schools"
    params = {
        "api_key": API_KEY,
        "school.name": school_name,
        "fields": "school.name,latest.admissions.admission_rate.overall,latest.student.demographics.race_ethnicity.asian,latest.student.demographics.race_ethnicity.white,latest.student.demographics.race_ethnicity.hispanic,latest.student.demographics.race_ethnicity.black",
        "per_page": 1
    }

    response = requests.get(base_url, params=params)
    try:
        data = response.json()
        # Convert to pandas dataframe
        df = pd.json_normalize(data['results'])
    except JSONDecodeError as e:
        print(f"Error getting admission data for {school_name}: {e}")
        return pd.DataFrame()

    # if df is all NAN, return empty dataframe
    if df.isna().all().all():
        print(f"No admission data found for {school_name}")
        return pd.DataFrame()
    
    # check if everything is NaN except the school name column and if so, return empty dataframe 
    # get all columns except the school name column
    columns_to_check = df.columns.tolist()
    columns_to_check.remove('school.name')
    if df[columns_to_check].isna().all().all():
        print(f"No admission data found for {school_name}")
        return pd.DataFrame()
    
    return df

In [6]:
top_institutions, df_top100_us = get_top_100_us_institutions_for_life_sciences()

aggregated_df = pd.DataFrame()
for institution in top_institutions:
    df_admission_data = get_school_admission_data(institution)
    if not df_admission_data.empty:
        # Initialize aggregated_df with the first non-empty dataframe
        if aggregated_df.empty:
            aggregated_df = df_admission_data
        else:
            if not df_admission_data.isna().all().all():
                aggregated_df = pd.concat([aggregated_df, df_admission_data], ignore_index=True)

aggregated_df

Getting admission data for Stanford University
Getting admission data for Washington University in St. Louis
Getting admission data for University of Wisconsin-Madison
Getting admission data for University of Michigan-Ann Arbor
Getting admission data for University of California-San Francisco
No admission data found for University of California-San Francisco
Getting admission data for University of North Carolina-Chapel Hill
Getting admission data for Massachusetts Institute of Technology
Getting admission data for Johns Hopkins University
Getting admission data for University of Pittsburgh
Getting admission data for University of California-Los Angeles
Getting admission data for Boston University
Getting admission data for University of California-San Diego
Getting admission data for Yale University
Getting admission data for Cornell University
No admission data found for Cornell University
Getting admission data for Columbia University
Getting admission data for University of Minneso

Unnamed: 0,latest.admissions.admission_rate.overall,latest.student.demographics.race_ethnicity.asian,latest.student.demographics.race_ethnicity.white,latest.student.demographics.race_ethnicity.hispanic,latest.student.demographics.race_ethnicity.black,school.name
0,0.0391,0.2747,0.2416,0.1766,0.0791,Stanford University
1,0.1196,0.2115,0.4147,0.1256,0.0891,Washington University in St Louis
2,0.4335,0.1051,0.6035,0.0831,0.025,University of Wisconsin-Madison
3,0.1794,0.1863,0.4884,0.1003,0.0454,University of Michigan-Ann Arbor
4,0.1874,0.1483,0.5407,0.0979,0.0818,University of North Carolina at Chapel Hill
5,0.0474,0.3461,0.2102,0.1453,0.0866,Massachusetts Institute of Technology
6,0.0756,0.2663,0.2001,0.2071,0.0949,Johns Hopkins University
7,0.497,0.1489,0.6104,0.0675,0.0546,University of Pittsburgh-Pittsburgh Campus
8,0.9193,0.1014,0.0406,0.7716,0.0397,California State University-Los Angeles
9,0.1085,0.1988,0.3285,0.1131,0.0596,Boston University


In [7]:
df_proxy_data = pd.read_csv("proxy_data.csv")

# print the first 5 rows
print(df_proxy_data.head())

# print the shape of the dataframe
print(df_proxy_data.shape)

# print the columns of the dataframe

   unitid                                institution  year  tuition_fees  \
0  100663        University of Alabama at Birmingham  2023        8832.0   
1  104151  Arizona State University Campus Immersion  2023       12051.0   
2  104179                      University of Arizona  2023       13626.0   
3  110662       University of California-Los Angeles  2023       13747.0   
4  110680         University of California-San Diego  2023       15265.0   

   full_time_ug_enrollment  pct_asian  pct_black  pct_hispanic  pct_native  \
0                     9841          8         22             6           0   
1                    59707          8          4            23           0   
2                    34237          5          4            25           0   
3                    32472         27          4            21           0   
4                    32852         31          2            22           0   

   pct_white  ...  foreign_ug_num  foreign_ug_pct  pct_admitted  yield  \


In [8]:
#merge df_proxy_data on 'institution' and aggregate_df on 'school.name'
df_merged = pd.merge(df_proxy_data, aggregated_df, left_on='institution', right_on='school.name', how='left')

In [9]:
df_merged

Unnamed: 0,unitid,institution,year,tuition_fees,full_time_ug_enrollment,pct_asian,pct_black,pct_hispanic,pct_native,pct_white,...,name,state,par_mean,par_median,latest.admissions.admission_rate.overall,latest.student.demographics.race_ethnicity.asian,latest.student.demographics.race_ethnicity.white,latest.student.demographics.race_ethnicity.hispanic,latest.student.demographics.race_ethnicity.black,school.name
0,100663,University of Alabama at Birmingham,2023,8832.0,9841,8,22,6,0,49,...,University Of Alabama At Birmingham,AL,92744.637908,74600,,,,,,
1,104151,Arizona State University Campus Immersion,2023,12051.0,59707,8,4,23,0,40,...,"California State University, Fullerton",CA,103739.636215,83300,,,,,,
2,104179,University of Arizona,2023,13626.0,34237,5,4,25,0,45,...,University Of Akron,OH,86014.022917,74300,,,,,,
3,110662,University of California-Los Angeles,2023,13747.0,32472,27,4,21,0,26,...,"University Of California, Los Angeles",CA,171784.801104,105500,,,,,,
4,110680,University of California-San Diego,2023,15265.0,32852,31,2,22,0,19,...,"University Of California, San Diego",CA,176468.657437,111300,0.2452,0.3432,0.1818,0.2526,0.0179,University of California-San Diego
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,236948,University of Washington-Seattle Campus,2023,12643.0,29863,20,3,8,0,31,...,University Of Washington System,WA,155928.359373,108100,,,,,,
67,239169,Medical College of Wisconsin,2023,,0,14,6,10,0,56,...,Medaille College,NY,82225.447939,72100,,,,,,
68,240444,University of Wisconsin-Madison,2023,11205.0,34278,9,3,8,0,57,...,University Of Wisconsin System,WI,125883.243735,96100,0.4335,0.1051,0.6035,0.0831,0.0250,University of Wisconsin-Madison
69,243744,Stanford University,2023,62484.0,7841,22,6,13,0,25,...,Stanford University,CA,472210.529162,172600,0.0391,0.2747,0.2416,0.1766,0.0791,Stanford University


In [10]:
df_merged.to_csv("college_analysis.csv", index=False)

In [11]:
df_merged.columns

Index(['unitid', 'institution', 'year', 'tuition_fees',
       'full_time_ug_enrollment', 'pct_asian', 'pct_black', 'pct_hispanic',
       'pct_native', 'pct_white', 'in_state_ug_num', 'in_state_ug_pct',
       'out_of_state_ug_num', 'out_of_state_ug_pct', 'foreign_ug_num',
       'foreign_ug_pct', 'pct_admitted', 'yield', 'pct_pell', 'xpgrnt_p',
       'name', 'state', 'par_mean', 'par_median',
       'latest.admissions.admission_rate.overall',
       'latest.student.demographics.race_ethnicity.asian',
       'latest.student.demographics.race_ethnicity.white',
       'latest.student.demographics.race_ethnicity.hispanic',
       'latest.student.demographics.race_ethnicity.black', 'school.name'],
      dtype='object')

In [12]:
df = df_merged

In [13]:
# Define the logic for estimating public vs. private school %
# Heuristic logic based on socioeconomic proxies:
# - Higher Pell Grant % → likely more public school
# - Higher in-state % → likely more public school
# - Higher parent income → likely more private school
# We'll combine these heuristics linearly.

# Normalize relevant columns
df['pct_pell_norm'] = df['pct_pell'] / 100
df['in_state_ug_pct_norm'] = df['in_state_ug_pct'] / 100
df['par_mean_norm'] = (df['par_mean'] - df['par_mean'].min()) / (df['par_mean'].max() - df['par_mean'].min())

# Define weights for the heuristic (you can tweak this based on literature / assumptions)
w_pell = 0.4
w_in_state = 0.4
w_income = 0.2

# Estimate "Public school % score"
df['public_school_score'] = (
    w_pell * df['pct_pell_norm'] +
    w_in_state * df['in_state_ug_pct_norm'] +
    w_income * (1 - df['par_mean_norm'])  # higher income → less public
)

# Clip scores between 0 and 1
df['public_school_pct'] = (df['public_school_score']).clip(0, 1)

# Private school % is complementary
df['private_school_pct'] = 1 - df['public_school_pct']

# Prepare data for plot
plot_df = df[['institution', 'public_school_pct', 'private_school_pct']].dropna().sort_values(by='public_school_pct', ascending=False)



fig = go.Figure(data=[
    go.Bar(name='Public School %', x=plot_df['institution'], y=plot_df['public_school_pct'] * 100),
    go.Bar(name='Private School %', x=plot_df['institution'], y=plot_df['private_school_pct'] * 100)
])

fig.update_layout(
    barmode='stack',
    title='Estimated % of Students from Public vs Private Schools',
    xaxis_title='Institution',
    yaxis_title='Percentage of Students',
    xaxis_tickangle=-45,
    height=600,
    width=1200
)

fig.show()
