In [1]:
import requests
import pandas as pd
import os
import numpy as np
import re

from dotenv import load_dotenv
import os

from json import JSONDecodeError

# Plotly is used for the interactive bar chart at the end of the notebook
import plotly.graph_objects as go

In [2]:
# Load environment variables from a local .env file (if present) so the API
# key does not have to be hard-coded in the notebook.
load_dotenv()

# API key sent as the `api_key` query parameter by get_school_admission_data().
API_KEY = os.getenv('API_KEY')
if not API_KEY:
    # Surface a missing key here rather than as one failed request per school later on.
    print("Warning: API_KEY is not set; add it to your .env file before requesting admission data.")

Reference: https://support.qs.com/hc/en-gb/articles/4410488025106-QS-World-University-Rankings-by-Subject
### The rankings columns are:
* Academic Reputation (30% weight)
-- The Academic Reputation (AR) indicator measures the reputation of institutions and their programmes by asking academic experts to nominate universities based on their subject area of expertise. Pioneered by QS in 2004, it asks the question: which universities are demonstrating academic excellence? To answer this, we collect and distil the collective intelligence of academics from around the world via our Academic Survey, evaluating nominations for approximately 7000 institutions each year. The indicator not only illuminates the quality of an institution's research, but also their approach to academic partnerships, their strategic impact, their educational innovativeness and the impact they have made on education and society at large.
The indicator is the centrepiece of almost all of the rankings across the QS portfolio. 

* Employer Reputation (15% weight)
-- The Employer Reputation (ER) indicator measures the reputation of institutions and their programmes among employers. We remain the only major ranking to focus on this vital aspect of a student's educational journey.

* Citations per Paper
-- The Citations per Paper (CPP) indicator measures the impact and quality of the scientific work done by institutions, on average per publication.

* H-Index
-- The h-index is an index that attempts to measure both the productivity and impact of the published work of a scientist or scholar. The index is based on the set of the scientist’s most cited papers and the number of citations that they have received in other publications. It can also be applied to the productivity and impact of a group of scientists, such as a department, or an institution (as in the case of our indicator), or a country, as well as a scholarly journal. The index is defined as the maximum value of h such that the given entity (author, journal, department, institution, etc.) has published at least h papers that have each been cited at least h times (https://doi.org/10.1073/pnas.0507655102). We use institution-level H Index.

* International Research Network
-- International Research Network (IRN) is a measure of an institution's success in creating and sustaining research partnerships with institutions in other locations. The indicator measures how diverse and rich an institution's research network is by looking at the number of different countries represented, and whether these relationships are renewed and repeated. We only consider sustained partnerships, defined as those which result in three or more joint papers published in a five-year period.


In [3]:
def get_top_us_institutions_for_life_sciences(num_institutions=200):
    """Return the US institutions among the top rows of the QS 2025 Life Sciences & Medicine sheet.

    Reads the "Life Sciences & Medicine" sheet of ``2025_QS_rankings.xlsx``
    (path relative to the working directory), keeps the first
    ``num_institutions`` rows that have an institution name, and filters them
    down to rows whose country is "United States of America". The first 10
    rows of the filtered frame are printed as a preview.

    Args:
        num_institutions: Number of leading rows (all countries) to consider
            before the US filter is applied.

    Returns:
        tuple: ``(names, df_top_us)`` where ``names`` is the list of cleaned,
        de-duplicated, non-empty US institution names in sorted order, and
        ``df_top_us`` is the matching DataFrame with columns Rank,
        Institution (original spelling), Score, Academic, Employer,
        Citations, H and IRN.

    Raises:
        ValueError: If a rank cell cannot be parsed as a number once the
            '=' tie marker has been removed.
    """
    file_path = '2025_QS_rankings.xlsx'
    # The first 10 spreadsheet rows are skipped; the next row holds the column headers.
    df_qs = pd.read_excel(file_path, sheet_name="Life Sciences & Medicine", skiprows=10, header=0)

    # Drop rows with missing Institution (bottom padding, if any)
    df_qs = df_qs.dropna(subset=["Institution"])

    # Keep the leading `num_institutions` rows and only the columns used downstream.
    # .copy() makes this an independent frame so the Rank assignment below does not
    # write into a slice of df_qs.
    df_top = df_qs.head(num_institutions)[[
        "2025", "Institution", "Country / Territory", "Score", "Academic", "Employer", "Citations", "H", "IRN"
    ]].rename(columns={"2025": "Rank"}).copy()

    # Tied ranks carry an '=' prefix (e.g. "=12"). Cast to str first so cells that
    # were read as plain numbers survive the .str accessor, then convert to int.
    rank_text = df_top['Rank'].astype(str).str.replace('=', '', regex=False).str.strip()
    df_top['Rank'] = pd.to_numeric(rank_text).astype(int)

    # Order by overall rank; break ties with the H-index, highest first
    # (higher the H-index, better the institution).
    df_top = df_top.sort_values(by=['Rank', 'H'], ascending=[True, False])

    # consider only US institutions and drop the now-constant country column
    df_top_us = df_top[df_top['Country / Territory'] == 'United States of America']
    df_top_us = df_top_us.drop(columns=['Country / Territory'])
    print(df_top_us.head(10))

    # clean up the institution names
    cleaned_institutions = []
    for institution in df_top_us['Institution'].tolist():
        # drop parenthetical suffixes such as "(MIT)"
        cleaned = re.sub(r'\([^)]*\)', '', institution)
        # commas become hyphens with no surrounding spaces
        # (presumably to match "University of California-Los Angeles" style names — confirm)
        cleaned = cleaned.replace(',', '-')
        cleaned = re.sub(r'\s*-\s*', '-', cleaned)
        # removing a parenthetical in the middle of a name leaves a double space; collapse it
        cleaned = re.sub(r'\s+', ' ', cleaned).strip()
        cleaned_institutions.append(cleaned)

    # remove any duplicates while preserving order
    cleaned_institutions = list(dict.fromkeys(cleaned_institutions))

    # remove any empty strings
    cleaned_institutions = [institution for institution in cleaned_institutions if institution]

    return cleaned_institutions, df_top_us

In [None]:
def test_top_100_us_institutions_for_life_sciences():
    """Smoke-test get_top_us_institutions_for_life_sciences() on the top 100 ranked rows.

    Prints the cleaned names and checks the invariants the function promises:
    at most 100 rows survive the US filter, and the cleaned name list contains
    no empty strings and no duplicates.
    """
    top_institutions, df_top100_us = get_top_us_institutions_for_life_sciences(100)
    print(top_institutions)

    assert len(df_top100_us) <= 100, "US subset cannot exceed the number of rows requested"
    assert len(top_institutions) <= len(df_top100_us), "cleaning must not invent institutions"
    assert all(top_institutions), "cleaned institution names must not be empty"
    assert len(top_institutions) == len(set(top_institutions)), "cleaned institution names must be unique"

test_top_100_us_institutions_for_life_sciences()

In [5]:
def get_school_admission_data(school_name):
    """Fetch the admission rate and race/ethnicity shares for one school.

    Sends a single GET request to the College Scorecard ``schools`` endpoint,
    filtering on ``school.name`` and asking for only the first match
    (``per_page=1``). NOTE(review): the returned school is whatever the API
    ranks first for this name, which may not be the intended institution —
    compare the returned ``school.name`` with the query when it matters.

    Args:
        school_name: Institution name to search for.

    Returns:
        pandas.DataFrame: One row with ``school.name`` plus the requested
        ``latest.*`` columns. An empty DataFrame is returned (and a message
        printed) when the request fails, the response is not valid JSON, the
        payload has no results, or every requested value is missing.
    """
    print(f"Getting admission data for {school_name}")
    base_url = "https://api.data.gov/ed/collegescorecard/v1/schools"
    params = {
        "api_key": API_KEY,
        "school.name": school_name,
        "fields": "school.name,latest.admissions.admission_rate.overall,latest.student.demographics.race_ethnicity.asian,latest.student.demographics.race_ethnicity.white,latest.student.demographics.race_ethnicity.hispanic,latest.student.demographics.race_ethnicity.black",
        "per_page": 1
    }

    try:
        # A timeout keeps one stalled request from hanging the whole notebook run.
        response = requests.get(base_url, params=params, timeout=30)
        # Error statuses (bad key, rate limit, ...) carry no 'results'; treat them as failures.
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decode errors regardless of which JSON backend requests uses.
        print(f"Error getting admission data for {school_name}: {e}")
        return pd.DataFrame()

    results = data.get('results') if isinstance(data, dict) else None
    if not results:
        print(f"No admission data found for {school_name}")
        return pd.DataFrame()

    # Convert to pandas dataframe
    df = pd.json_normalize(results)

    # A row is only useful if at least one column other than the school name has a value.
    # (This also covers the case where every column, including the name, is NaN.)
    columns_to_check = [column for column in df.columns if column != 'school.name']
    if df[columns_to_check].isna().all().all():
        print(f"No admission data found for {school_name}")
        return pd.DataFrame()

    return df

In [None]:
# Look up admission data for every ranked US institution and stack the results.
top_institutions, df_top_us = get_top_us_institutions_for_life_sciences()

# Collect the per-school frames and concatenate once at the end; calling
# pd.concat inside the loop re-copies the accumulated frame on every iteration.
admission_frames = []
for institution in top_institutions:
    df_admission_data = get_school_admission_data(institution)
    # get_school_admission_data returns an empty frame when nothing usable came back
    if not df_admission_data.empty:
        admission_frames.append(df_admission_data)

# pd.concat raises on an empty list, so fall back to an empty frame
if admission_frames:
    aggregated_df = pd.concat(admission_frames, ignore_index=True)
else:
    aggregated_df = pd.DataFrame()

# write aggregated_df to csv
aggregated_df.to_csv("top_us_institutions.csv", index=False)

# Preview the result; this has to be the last expression of the cell to be displayed
aggregated_df.head()

In [None]:
# Load the per-institution proxy table from the working directory.
# Later cells join it on its 'institution' column.
df_proxy_data = pd.read_csv("proxy_data.csv")

# print the first 5 rows
print(df_proxy_data.head())

# print the shape of the dataframe
print(df_proxy_data.shape)

# print the columns of the dataframe
print(df_proxy_data.columns.tolist())

In [8]:
# Left-join the proxy data with the admission data: 'institution' (proxy table)
# is matched exactly against 'school.name' (aggregated_df), keeping every proxy row.
# Different search names can resolve to the same school, so keep one row per
# school.name; otherwise the join would duplicate the matching proxy rows.
df_merged = pd.merge(
    df_proxy_data,
    aggregated_df.drop_duplicates(subset='school.name'),
    left_on='institution',
    right_on='school.name',
    how='left',
)

In [None]:
# Display the merged table: every proxy_data row, with the admission columns filled in where the name matched.
df_merged

In [10]:
# Optional: uncomment to save the merged table to disk.
#df_merged.to_csv("college_analysis.csv", index=False)

In [None]:
# List the column names available after the merge (proxy columns plus admission columns).
df_merged.columns

In [12]:
# Work on an independent copy: the next cell adds columns to `df`, and a plain
# `df = df_merged` alias would silently add them to df_merged as well.
df = df_merged.copy()

In [None]:
# Define the logic for estimating public vs. private school %
# Heuristic logic based on socioeconomic proxies:
# - Higher Pell Grant % → likely more public school
# - Higher in-state % → likely more public school
# - Higher parent income → likely more private school
# We'll combine these heuristics linearly.
# NOTE: this is a rough heuristic score, not a measured share of students.

# Normalize relevant columns
# assumes pct_pell and in_state_ug_pct are on a 0-100 scale — TODO confirm against proxy_data.csv
df['pct_pell_norm'] = df['pct_pell'] / 100
df['in_state_ug_pct_norm'] = df['in_state_ug_pct'] / 100

# Min-max scale parent income to [0, 1]
par_mean_min = df['par_mean'].min()
par_mean_range = df['par_mean'].max() - par_mean_min
if par_mean_range > 0:
    df['par_mean_norm'] = (df['par_mean'] - par_mean_min) / par_mean_range
else:
    # No spread (single row or identical incomes): dividing by the zero range would
    # turn every score into NaN and empty the chart. Income carries no signal here,
    # so use the neutral midpoint and keep missing values missing.
    df['par_mean_norm'] = df['par_mean'].where(df['par_mean'].isna(), 0.5)

# Define weights for the heuristic (you can tweak this based on literature / assumptions)
w_pell = 0.4
w_in_state = 0.4
w_income = 0.2

# Estimate "Public school % score"
df['public_school_score'] = (
    w_pell * df['pct_pell_norm'] +
    w_in_state * df['in_state_ug_pct_norm'] +
    w_income * (1 - df['par_mean_norm'])  # higher income → less public
)

# Clip scores between 0 and 1
df['public_school_pct'] = (df['public_school_score']).clip(0, 1)

# Private school % is complementary
df['private_school_pct'] = 1 - df['public_school_pct']

# Prepare data for plot: drop institutions with a missing estimate, highest public share first
plot_df = df[['institution', 'public_school_pct', 'private_school_pct']].dropna().sort_values(by='public_school_pct', ascending=False)

# Stacked bars: the two shares are complementary, so every bar totals 100
fig = go.Figure(data=[
    go.Bar(name='Public School %', x=plot_df['institution'], y=plot_df['public_school_pct'] * 100),
    go.Bar(name='Private School %', x=plot_df['institution'], y=plot_df['private_school_pct'] * 100)
])

fig.update_layout(
    barmode='stack',
    title='Estimated % of Students from Public vs Private Schools',
    xaxis_title='Institution',
    yaxis_title='Percentage of Students',
    xaxis_tickangle=-45,
    height=600,
    width=1200
)

fig.show()
