In [2]:
import requests
import pandas as pd
import os
import numpy as np
import re

from dotenv import load_dotenv
import os

from json import JSONDecodeError

In [3]:
# Load environment variables from a local .env file, if one is present
load_dotenv()

# API key sent with every College Scorecard request; read from the environment so it
# never has to be written into the notebook itself
API_KEY = os.getenv('API_KEY')
if not API_KEY:
    # Report the missing key once, here, instead of as one failed request per school later
    print("Warning: API_KEY is not set; add it to your .env file before requesting admission data.")

Reference: https://support.qs.com/hc/en-gb/articles/4410488025106-QS-World-University-Rankings-by-Subject
### The rankings columns are:
* Academic Reputation (30% weight)
-- The Academic Reputation (AR) indicator measures the reputation of institutions and their programmes by asking academic experts to nominate universities based on their subject area of expertise. Pioneered by QS in 2004, it asks the question: which universities are demonstrating academic excellence? To answer this we collect and distil the collective intelligence of academics from around the world via our Academic Survey, evaluating nominations for approximately 7000 institutions each year. The indicator not only illuminates the quality of an institution's research, but also their approach to academic partnerships, their strategic impact, their educational innovativeness and the impact they have made on education and society at large.
The indicator is the centrepiece of almost all of the rankings across the QS portfolio. 

* Employer Reputation (15% weight)
-- The Employer Reputation (ER) indicator measures the reputation of institutions and their programmes among employers. We remain the only major ranking to focus on this vital aspect of a student's educational journey.

* Citations per Paper
-- The Citations per Paper (CPP) indicator measures the impact and quality of the scientific work done by institutions, on average per publication.

* H-Index
-- The h-index is an index that attempts to measure both the productivity and impact of the published work of a scientist or scholar. The index is based on the set of the scientist’s most cited papers and the number of citations that they have received in other publications. It can also be applied to the productivity and impact of a group of scientists, such as a department, or an institution (as in the case of our indicator), or a country, as well as a scholarly journal. The index is defined as the maximum value of h such that the given entity (author, journal, department, institution, etc.) has published at least h papers that have each been cited at least h times (https://doi.org/10.1073/pnas.0507655102). We use institution-level H Index.

* International Research Network
-- International Research Network (IRN) is a measure of an institution's success in creating and sustaining research partnerships with institutions in other locations. The indicator measures how diverse and rich an institution's research network is by looking at the number of different countries represented, and whether these relationships are renewed and repeated. We only consider sustained partnerships, defined as those which result in three or more joint papers published in a five-year period.


In [12]:
def _clean_institution_name(name):
    """Turn one QS institution name into the hyphenated 'Name-Campus' form."""
    # Remove parenthesised abbreviations such as "(MIT)"
    cleaned = re.sub(r'\([^)]*\)', '', str(name))
    # QS separates the campus with a comma; IPEDS-style names
    # (e.g. "University of California-Los Angeles") use a hyphen
    cleaned = cleaned.replace(',', '-')
    # No spaces around hyphens, and no doubled spaces where a parenthetical was removed
    cleaned = re.sub(r'\s*-\s*', '-', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.strip()


def get_top_100_us_institutions_for_life_sciences(
    file_path='2025_QS_rankings.xlsx',
    sheet_name="Life Sciences & Medicine",
    top_n=100,
    country='United States of America',
):
    """Read the QS subject ranking sheet and return the US institutions in its top rows.

    Args:
        file_path: Path of the QS rankings workbook.
        sheet_name: Worksheet to read.
        top_n: Number of leading rows (after dropping rows without an institution) to keep.
        country: Value of the "Country / Territory" column to filter on.

    Returns:
        A tuple ``(cleaned_institutions, df_top100_us)``:
        * ``cleaned_institutions`` - de-duplicated, cleaned institution names in the
          order they appear in the sheet.
        * ``df_top100_us`` - the matching rows with an integer ``Rank`` column and
          without the country column. Rows keep the sheet order; they are not
          re-sorted by H-index or any other indicator.

    Raises:
        KeyError: If an expected column is missing from the sheet.
        ValueError: If a rank cannot be parsed as a number after removing "=".
    """
    # Skip the 10 preamble rows; the row after them holds the column headers
    df_qs = pd.read_excel(file_path, sheet_name=sheet_name, skiprows=10, header=0)

    # Drop rows with missing Institution (bottom padding, if any)
    df_qs = df_qs.dropna(subset=["Institution"])

    relevant_columns = [
        "2025", "Institution", "Country / Territory", "Score", "Academic", "Employer", "Citations", "H", "IRN"
    ]
    df_top = df_qs.head(top_n)[relevant_columns].rename(columns={"2025": "Rank"})

    # Tied ranks are written as "=12" while untied ones may be read as plain numbers.
    # Casting to text first keeps the numeric cells from turning into NaN under .str.
    rank_text = df_top["Rank"].astype(str).str.replace("=", "", regex=False).str.strip()
    df_top = df_top.assign(Rank=pd.to_numeric(rank_text).astype(int))

    # Consider only institutions from the requested country, then drop the country column
    df_top_us = df_top[df_top['Country / Territory'] == country]
    df_top_us = df_top_us.drop(columns=['Country / Territory'])

    # Clean the names, drop empty results, and de-duplicate while keeping sheet order
    # (dict.fromkeys preserves first-seen order, unlike set)
    cleaned_names = (_clean_institution_name(name) for name in df_top_us['Institution'])
    cleaned_institutions = list(dict.fromkeys(name for name in cleaned_names if name))

    return cleaned_institutions, df_top_us

In [None]:
def test_top_100_us_institutions_for_life_sciences():
    """Smoke check: run the QS extraction and print the cleaned institution names."""
    institution_names, _ = get_top_100_us_institutions_for_life_sciences()
    print(institution_names)

test_top_100_us_institutions_for_life_sciences()

In [14]:
def get_school_admission_data(school_name):
    """Fetch admission rate and race/ethnicity shares for one school from College Scorecard.

    Args:
        school_name: Institution name sent as the ``school.name`` filter.

    Returns:
        A DataFrame built from the ``results`` entry of the response (at most one
        school, because ``per_page`` is 1). An empty DataFrame is returned when the
        request fails, the server answers with an error status, the body is not
        usable JSON, no school matches, or every value other than the school name
        is missing.
    """
    print(f"Getting admission data for {school_name}")
    base_url = "https://api.data.gov/ed/collegescorecard/v1/schools"
    fields = [
        "school.name",
        "latest.admissions.admission_rate.overall",
        "latest.student.demographics.race_ethnicity.asian",
        "latest.student.demographics.race_ethnicity.white",
        "latest.student.demographics.race_ethnicity.hispanic",
        "latest.student.demographics.race_ethnicity.black",
    ]
    params = {
        "api_key": API_KEY,
        "school.name": school_name,
        "fields": ",".join(fields),
        "per_page": 1
    }

    try:
        # A timeout keeps one stalled connection from hanging the whole notebook run
        response = requests.get(base_url, params=params, timeout=30)
    except requests.RequestException as e:
        # Print only the exception type: its message can include the request URL,
        # and the URL carries the API key as a query parameter
        print(f"Error getting admission data for {school_name}: {type(e).__name__}")
        return pd.DataFrame()

    if not response.ok:
        print(f"Error getting admission data for {school_name}: HTTP {response.status_code}")
        return pd.DataFrame()

    try:
        data = response.json()
    except ValueError as e:  # JSONDecodeError is a ValueError subclass
        print(f"Error getting admission data for {school_name}: {e}")
        return pd.DataFrame()

    # A payload without a non-empty 'results' list means no school matched
    results = data.get('results') if isinstance(data, dict) else None
    if not results:
        print(f"No admission data found for {school_name}")
        return pd.DataFrame()

    # Convert to pandas dataframe
    df = pd.json_normalize(results)

    # A row that carries nothing but the school name is treated as "no data"
    value_columns = [column for column in df.columns if column != 'school.name']
    if not value_columns or df[value_columns].isna().all().all():
        print(f"No admission data found for {school_name}")
        return pd.DataFrame()

    return df

In [None]:
top_institutions, df_top100_us = get_top_100_us_institutions_for_life_sciences()

# Collect the per-school frames and concatenate once at the end; growing a DataFrame
# with pd.concat inside the loop copies all earlier rows on every iteration.
admission_frames = []
for institution in top_institutions:
    df_admission_data = get_school_admission_data(institution)
    # get_school_admission_data returns an empty frame when it has nothing usable
    if not df_admission_data.empty:
        admission_frames.append(df_admission_data)

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was found
if admission_frames:
    aggregated_df = pd.concat(admission_frames, ignore_index=True)
else:
    aggregated_df = pd.DataFrame()

aggregated_df

In [None]:
# match_institutions_unitid.ipynb

# -------------------------------
# 1️⃣ Imports
# -------------------------------
import pandas as pd
from difflib import get_close_matches

# -------------------------------
# 2️⃣ Load Institution List
# -------------------------------
# Your list of institutions
df_my_institutions = pd.read_csv("ipeds_institution_list.csv")
# Blank cells are read as NaN and would crash the later inst.strip() call in the
# matching loop, so drop them (and names that are empty after stripping) up front.
institution_names = df_my_institutions["Institution"].dropna().astype(str).str.strip()
my_institution_list = institution_names[institution_names != ""].tolist()

print(f"✅ Loaded {len(my_institution_list)} institutions from ipeds_institution_list.csv")

# -------------------------------
# 3️⃣ Load IPEDS HD2023.csv
# -------------------------------
# Download from: https://nces.ed.gov/ipeds/datacenter/DataFiles.aspx → Institutional Characteristics → Header (HD2023.csv)

# Read the IPEDS header file and add a stripped, lower-cased copy of INSTNM to match against
df_hd = pd.read_csv("hd2023.csv", low_memory=False, encoding='latin1').assign(
    INSTNM_clean=lambda hd: hd["INSTNM"].str.strip().str.lower()
)

# Sanity check kept disabled. Note that `in` on a Series tests the index, so it needs .values:
# assert 'arizona state university' in df_hd["INSTNM_clean"].values

print(f"✅ Loaded HD2023.csv with {len(df_hd)} institutions")

# -------------------------------
# 4️⃣ Institution Matching
# -------------------------------
# One dict per input institution is appended here by the matching loop below
matched_rows = []

def find_best_match(inst_clean: str, df_hd: pd.DataFrame) -> tuple[str, float]:
    """Find the best ``INSTNM_clean`` match for a cleaned institution name.

    A prefix search is tried first; if nothing starts with ``inst_clean``, up to three
    fuzzy candidates from ``difflib.get_close_matches`` are ranked by word overlap.

    Args:
        inst_clean: Institution name, already stripped and lower-cased by the caller.
        df_hd: Frame with an ``INSTNM_clean`` column of cleaned institution names.

    Returns:
        ``(matched_name, score)``. A prefix hit scores 1.0. A fuzzy hit scores the
        number of shared words divided by the larger of the two word counts. When
        nothing matches, ``("NO MATCH FOUND", 0.0)`` is returned.
    """
    no_match = ("NO MATCH FOUND", 0.0)

    inst_clean = inst_clean.strip() if isinstance(inst_clean, str) else ""
    if not inst_clean:
        # An empty query is a prefix of every name, so it must never reach the prefix search
        return no_match

    # Missing names (NaN) cannot be compared as strings; leave them out of both searches
    candidates = df_hd["INSTNM_clean"].dropna().astype(str)

    # Try prefix matching first
    prefix_matches = candidates[candidates.str.startswith(inst_clean)].tolist()
    if prefix_matches:
        # Every hit is at least as long as the query, so the shortest one is the exact
        # name when it exists and otherwise the name with the least extra text
        return min(prefix_matches, key=len), 1.0

    # If no prefix match, try fuzzy matching
    matches = get_close_matches(inst_clean, candidates.tolist(), n=3, cutoff=0.6)

    # Rank the fuzzy candidates by word overlap with the query
    inst_words = set(inst_clean.split())
    best_match, best_score = no_match
    for match in matches:
        match_words = set(match.split())
        score = len(inst_words & match_words) / max(len(inst_words), len(match_words))
        if score > best_score:
            best_match, best_score = match, score

    return best_match, best_score

# Look up every institution. Anything without a confident match (score below 0.5)
# is still recorded, flagged as "NO MATCH FOUND" with no UNITID.
for inst in my_institution_list:
    candidate, score = find_best_match(inst.strip().lower(), df_hd)
    accepted = candidate != "NO MATCH FOUND" and score >= 0.5

    if not accepted:
        matched_rows.append({
            "Institution": inst,
            "Matched_Name": "NO MATCH FOUND",
            "UNITID": None,
            "Match_Score": 0.0
        })
        continue

    # First UNITID among the HD rows whose cleaned name equals the accepted candidate
    unitid = df_hd.loc[df_hd["INSTNM_clean"] == candidate, "UNITID"].values[0]
    matched_rows.append({
        "Institution": inst,
        "Matched_Name": candidate,
        "UNITID": unitid,
        "Match_Score": score
    })

# -------------------------------
# 5️⃣ Create DataFrame & Save
# -------------------------------
# Naming the columns explicitly keeps the frame well-formed even when matched_rows is empty
df_matches = pd.DataFrame(
    matched_rows, columns=["Institution", "Matched_Name", "UNITID", "Match_Score"]
)

# Count the inputs before the unmatched rows are removed, so the summary below is meaningful
total_institutions = len(df_matches)

# Unmatched rows carry no UNITID; drop them, then store the remaining ids as integers
df_matches = df_matches.dropna(subset=["UNITID"])
df_matches = df_matches.assign(UNITID=df_matches["UNITID"].astype(int))

# output the list for subsequent use
unitid_list = df_matches["UNITID"].tolist()
print(unitid_list)

# Save result
df_matches[["Institution", "UNITID"]].to_csv("institution_unitid_matched.csv", index=False)

# Display summary
print("✅ Matching complete!")
print(f"Total institutions: {total_institutions}")
print(f"Matches found: {len(df_matches)}")

# Show first 10 matches as preview
df_matches.head(10)


In [None]:
import os
import pandas as pd
from functools import reduce

# Get all CSV files from the ipeds-all directory. os.listdir returns names in arbitrary
# order, so sort them to make the merge order (and the resulting column order) reproducible.
ipeds_dir = "ipeds-all"
csv_files = sorted(f for f in os.listdir(ipeds_dir) if f.endswith('.csv'))
if not csv_files:
    # Without this check, reduce() below fails with an unrelated-looking TypeError
    raise FileNotFoundError(f"No CSV files found in {ipeds_dir!r}")

# Read every file and normalise its headers so all frames can be joined on 'unitid'
dfs = []
for file in csv_files:
    file_path = os.path.join(ipeds_dir, file)
    df = pd.read_csv(file_path)

    # Standardize column names to lowercase
    df.columns = df.columns.str.lower()

    # Drop duplicate columns before adding to dfs list
    df = df.loc[:, ~df.columns.duplicated()]

    if 'unitid' not in df.columns:
        raise KeyError(f"{file_path} has no 'unitid' column to merge on")
    dfs.append(df)

# Outer-merge all frames on unitid, left to right.
# NOTE(review): len(dfs) is constant here, so every merge step uses the same right-hand
# suffix. A column shared by three or more files therefore gets the same suffixed name
# more than once. Newer pandas versions may reject that with a MergeError - confirm.
# The suffix is left as is because a later cell drops columns by these exact names.
merged_df = reduce(
    lambda left, right: pd.merge(
        left,
        right,
        on='unitid',
        how='outer',
        suffixes=('', f'_{len(dfs)}')  # Use empty suffix for left, numbered suffix for right
    ),
    dfs
)

# Save merged dataframe
merged_df.to_csv('iped_data.csv', index=False)

print(f"✅ Successfully merged {len(csv_files)} files into iped_data.csv")
print(f"Final shape: {merged_df.shape}")


In [None]:
# Reload the merged IPEDS table from iped_data.csv and preview it.
import pandas as pd

df = pd.read_csv("iped_data.csv")

# Two printouts: the first five rows, then the (rows, columns) shape.
print(df.head(5))
print(df.shape)

In [None]:
# Column labels of df (loaded from iped_data.csv in the previous cell, assuming cells run in order).
df.columns

In [26]:
# Drop the suffixed duplicates of 'institution name' and 'year' that the merge leaves
# behind (e.g. 'institution name_4', 'year_4.1'). Selecting them by pattern instead of a
# fixed list, and assigning a new frame instead of dropping in place, lets this cell be
# re-run safely and keeps it working when the merge suffix changes.
duplicate_columns = df.filter(regex=r'^(institution name|year)_\d+(\.\d+)?$', axis="columns").columns
df = df.drop(columns=duplicate_columns)

# drop any rows where the unitid is -1
df = df[df['unitid'] != -1]

# save the dataframe to a csv file
df.to_csv('iped_data_cleaned.csv', index=False)

In [None]:
# Reload the cleaned IPEDS table from iped_data_cleaned.csv and preview it.
import pandas as pd

df = pd.read_csv("iped_data_cleaned.csv")

# Two printouts: the first five rows, then the (rows, columns) shape.
print(df.head(5))
print(df.shape)

In [None]:
# Column labels of df (loaded from iped_data_cleaned.csv in the previous cell, assuming cells run in order).
df.columns

In [None]:
# Load the salary table from mrc_salary_table.csv and preview it.
# Note: this rebinds df, which held the IPEDS data in the cells above.
import pandas as pd

df = pd.read_csv("mrc_salary_table.csv")

# Two printouts: the first five rows, then the (rows, columns) shape.
print(df.head(5))
print(df.shape)

In [None]:
# Column labels of df (loaded from mrc_salary_table.csv in the previous cell, assuming cells run in order).
df.columns

In [None]:
# Keep only the name, state, par_mean and par_median columns; the bare name on the
# last line makes the notebook display the result.
salary_columns = ['name', 'state', 'par_mean', 'par_median']
df2 = df.loc[:, salary_columns]
df2

In [35]:
# Write df2 to its own CSV; index=False keeps the row index out of the file.
df2.to_csv("mrc_salary_table_cleaned.csv", index=False)

In [12]:
# Reload the cleaned salary table as df_s; the bare name on the last line makes the notebook display it.
df_s = pd.read_csv("mrc_salary_table_cleaned.csv")
df_s

Unnamed: 0,name,state,par_mean,par_median
0,ASA Institute Of Business & Computer Technology,NY,35390.396804,29000
1,Abilene Christian University,TX,138760.969806,101000
2,Abraham Baldwin Agricultural College,GA,80366.661268,66000
3,Academy Of Art University,CA,166594.969612,92300
4,Adams State University,CO,76121.816340,67200
...,...,...,...,...
2197,Yuba Community College District,CA,61468.370708,48700
2198,Zane State College,OH,65763.334635,53800
2199,Late College Goers,,55858.262836,43300
2200,Never Attended College (up to year 2013),,48010.288464,35200


In [14]:
import pandas as pd

# Reload the cleaned IPEDS table; this is the left-hand side of the salary merge below.
df = pd.read_csv("iped_data_cleaned.csv")

# Two printouts: the first five rows, then the (rows, columns) shape.
print(df.head(5))
print(df.shape)

   unitid                                institution  year  tuition_fees  \
0  100663        University of Alabama at Birmingham  2023        8832.0   
1  104151  Arizona State University Campus Immersion  2023       12051.0   
2  104179                      University of Arizona  2023       13626.0   
3  110662       University of California-Los Angeles  2023       13747.0   
4  110680         University of California-San Diego  2023       15265.0   

   full_time_ug_enrollment  pct_asian  pct_black  pct_hispanic  pct_native  \
0                     9841          8         22             6           0   
1                    59707          8          4            23           0   
2                    34237          5          4            25           0   
3                    32472         27          4            21           0   
4                    32852         31          2            22           0   

   pct_white  in_state_ug_num  in_state_ug_pct  out_of_state_ug_num  \
0  

In [15]:
# merge df on institution and df_s on name using maximum prefix match and if it fails on get_closest fuzzy match
from difflib import SequenceMatcher
import numpy as np

def get_closest(name, names_list):
    """Return the entry of ``names_list`` most similar to ``name``, ignoring letter case.

    Similarity is ``difflib.SequenceMatcher(...).ratio()`` on case-folded text, so
    spellings that differ only in capitalisation (e.g. "of" vs "Of") are not penalised.

    Args:
        name: Name to look up.
        names_list: Candidate names. Entries that are not strings (such as NaN) are skipped.

    Returns:
        The best candidate exactly as it appears in ``names_list``, or ``None`` when
        ``name`` is not a string or there is no string candidate to compare against.
    """
    if not isinstance(name, str):
        return None

    candidates = [candidate for candidate in names_list if isinstance(candidate, str)]
    if not candidates:
        return None

    target = name.casefold()
    return max(
        candidates,
        key=lambda candidate: SequenceMatcher(None, target, candidate.casefold()).ratio(),
    )

def get_prefix_match(name, names_list):
    """Return the candidate that prefix-matches ``name`` most closely, ignoring letter case.

    A candidate qualifies when it is a prefix of ``name`` or ``name`` is a prefix of it.
    Among the qualifying candidates, the one whose length is closest to ``name`` wins,
    so an exact match always beats a longer or shorter variant.

    Args:
        name: Name to look up.
        names_list: Candidate names. Non-string entries (such as NaN) and blank
            strings are skipped.

    Returns:
        The winning candidate exactly as it appears in ``names_list``, or ``None``
        when ``name`` is blank or not a string, or when nothing qualifies.
    """
    if not isinstance(name, str) or not name.strip():
        return None
    target = name.strip().casefold()

    best_candidate = None
    best_gap = None
    for candidate in names_list:
        if not isinstance(candidate, str):
            continue
        key = candidate.strip().casefold()
        # A blank candidate would be a prefix of every name, so it never qualifies
        if not key or not (target.startswith(key) or key.startswith(target)):
            continue
        gap = abs(len(key) - len(target))
        if best_gap is None or gap < best_gap:
            best_candidate, best_gap = candidate, gap

    return best_candidate

# Pick a salary-table name for every IPEDS institution: the prefix match when there is
# one, otherwise the closest fuzzy match. The candidate list is built once and reused.
salary_names = df_s['name'].tolist()
df['matched_name'] = [
    get_prefix_match(institution, salary_names) or get_closest(institution, salary_names)
    for institution in df['institution']
]

# Left-join the salary columns onto the IPEDS rows via the matched name, then discard
# the helper column from the merged result
merged_df = (
    df.merge(df_s, left_on='matched_name', right_on='name', how='left')
    .drop(columns='matched_name')
)

# Report the size of the result and preview it
print("Merged dataframe shape:", merged_df.shape)
print("\nFirst few rows of merged dataframe:")
print(merged_df.head())


Merged dataframe shape: (71, 24)

First few rows of merged dataframe:
   unitid                                institution  year  tuition_fees  \
0  100663        University of Alabama at Birmingham  2023        8832.0   
1  104151  Arizona State University Campus Immersion  2023       12051.0   
2  104179                      University of Arizona  2023       13626.0   
3  110662       University of California-Los Angeles  2023       13747.0   
4  110680         University of California-San Diego  2023       15265.0   

   full_time_ug_enrollment  pct_asian  pct_black  pct_hispanic  pct_native  \
0                     9841          8         22             6           0   
1                    59707          8          4            23           0   
2                    34237          5          4            25           0   
3                    32472         27          4            21           0   
4                    32852         31          2            22           0   

   p

In [16]:
merged_df.to_csv("proxy_data.csv", index=False)

# The row count goes last: a notebook only displays the value of a cell's final expression
len(merged_df)

In [17]:
# Reload the merged IPEDS + salary table from proxy_data.csv and preview it.
df = pd.read_csv("proxy_data.csv")

# Two printouts: the first five rows, then the (rows, columns) shape.
print(df.head(5))
print(df.shape)

   unitid                                institution  year  tuition_fees  \
0  100663        University of Alabama at Birmingham  2023        8832.0   
1  104151  Arizona State University Campus Immersion  2023       12051.0   
2  104179                      University of Arizona  2023       13626.0   
3  110662       University of California-Los Angeles  2023       13747.0   
4  110680         University of California-San Diego  2023       15265.0   

   full_time_ug_enrollment  pct_asian  pct_black  pct_hispanic  pct_native  \
0                     9841          8         22             6           0   
1                    59707          8          4            23           0   
2                    34237          5          4            25           0   
3                    32472         27          4            21           0   
4                    32852         31          2            22           0   

   pct_white  ...  foreign_ug_num  foreign_ug_pct  pct_admitted  yield  \


In [18]:
# Column labels of df (loaded from proxy_data.csv in the previous cell, assuming cells run in order).
df.columns

Index(['unitid', 'institution', 'year', 'tuition_fees',
       'full_time_ug_enrollment', 'pct_asian', 'pct_black', 'pct_hispanic',
       'pct_native', 'pct_white', 'in_state_ug_num', 'in_state_ug_pct',
       'out_of_state_ug_num', 'out_of_state_ug_pct', 'foreign_ug_num',
       'foreign_ug_pct', 'pct_admitted', 'yield', 'pct_pell', 'xpgrnt_p',
       'name', 'state', 'par_mean', 'par_median'],
      dtype='object')