# School Common Data Set Degrees Conferred Data Cleaner

The datasets are separated by college, with each college having multiple `.pdf` files covering different academic years. This notebook aims to clean the data from 2014-2024 by scraping each PDF for the Degrees Conferred section and generating a dataset for each college and year, for each year across all colleges, and for all years overall. 

In [4]:
# import libraries

import pdfplumber
import pandas as pd
import numpy as np
import os

In [None]:
# Pull degrees conferred tables from CDS PDFs

def extract_table_from_pdf(pdf_path, pdf_name, full_processed_csvs_folder):
    """Extract the "degrees conferred" tables from one CDS PDF and save them as a CSV.

    The first page whose text contains "degrees conferred" (case-insensitive)
    is used: every table on that page is stacked into one DataFrame. For the
    hand-listed PDFs below, the tables on the following page are appended as
    well. Nothing is written when no table rows were extracted.

    Args:
        pdf_path: Path of the PDF file to open.
        pdf_name: File name without extension. It must contain at least one
            "_"; the second "_"-separated part names the output sub-folder
            (the callers pass names shaped like "<college>_<year>").
        full_processed_csvs_folder: Root folder under which
            "<year>/<pdf_name>_degrees_conferred.csv" is written.

    Raises:
        IndexError: If pdf_name contains no "_".
        ValueError: If a header-less continuation table has a different number
            of columns than the table extracted from the first page.
    """
    # Table is split across two pages and the second page has no header row.
    special_cases = ["ucsd_2023", "wsu_2023"]
    # Table is split across two pages and the column names are on the second page.
    extra_special_cases = ["caltech_2021", "caltech_2022", "caltech_2023", "asu_2021", "asu_2022", "asu_2023"]

    df_results = pd.DataFrame()
    section_found = False
    print(f"Processing file {pdf_name}")
    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages
        for page_num, page in enumerate(pages, start=1):
            # extract_text() may yield nothing for a page; treat that as empty text.
            text = page.extract_text() or ""
            if "degrees conferred" not in text.lower():
                continue

            section_found = True
            print(f"Found 'degrees conferred' on page {page_num}. Extracting table...")
            for table in page.extract_tables():
                df = pd.DataFrame(table[1:], columns=table[0])
                df.columns = df.columns.str.replace('\n', ' ', regex=True)
                df_results = pd.concat([df_results, df], ignore_index=True)

            # page_num is 1-based, so used as a 0-based index it addresses the
            # page after the current one; it only exists if we are not on the last page.
            has_next_page = page_num < len(pages)
            if pdf_name in special_cases and has_next_page:
                for table in pages[page_num].extract_tables():
                    # The first row of the continuation is data, not a header,
                    # so keep every row and reuse the columns found so far.
                    df = pd.DataFrame(table, columns=df_results.columns)
                    df_results = pd.concat([df_results, df], ignore_index=True)
            if pdf_name in extra_special_cases and has_next_page:
                for table in pages[page_num].extract_tables():
                    df = pd.DataFrame(table[1:], columns=table[0])
                    df_results = pd.concat([df_results, df], ignore_index=True)
            # Only the first matching page (plus its continuation) is used.
            break

    if df_results.empty:
        if section_found:
            print(f"Found 'degrees conferred' in {pdf_name} but extracted no table rows.")
        else:
            print(f"'degrees conferred' not found in any page of {pdf_name}.")
        return

    df_results = df_results.reset_index(drop=True)
    year_directory = os.path.join(full_processed_csvs_folder, pdf_name.split("_")[1])
    os.makedirs(year_directory, exist_ok=True)
    file_name = os.path.join(year_directory, f"{pdf_name}_degrees_conferred.csv")
    df_results.to_csv(file_name, index=False)
    print(f"Table saved to {file_name}.")

def extract_all_tables(cds_data_folder, full_processed_csvs_folder):
    """Run extract_table_from_pdf on every PDF in every college folder.

    Args:
        cds_data_folder: Folder holding one sub-folder per college. Entries
            named "full_processed_csvs", "useful_csvs", "manual" and
            ".DS_Store" are skipped, as is anything that is not a directory.
        full_processed_csvs_folder: Output root passed through to
            extract_table_from_pdf.
    """
    skipped_entries = {"full_processed_csvs", "useful_csvs", ".DS_Store", "manual"}
    for college_folder in os.listdir(cds_data_folder):
        if college_folder in skipped_entries:
            continue
        college_folder_path = os.path.join(cds_data_folder, college_folder)
        # A stray regular file next to the college folders would make the
        # os.listdir call below raise NotADirectoryError.
        if not os.path.isdir(college_folder_path):
            continue
        for pdf_file in os.listdir(college_folder_path):
            if pdf_file.endswith('.pdf'):
                pdf_file_path = os.path.join(college_folder_path, pdf_file)
                # Everything before the first "." is used as the PDF's name.
                extract_table_from_pdf(pdf_file_path, pdf_file.split(".")[0], full_processed_csvs_folder)

# Process every college folder under cds_data and write the per-year CSVs.
extract_all_tables(
    cds_data_folder="cds_data",
    full_processed_csvs_folder="cds_data/full_processed_csvs",
)

In [None]:
# Clean datasets by merging columns with similar names and combining data from different years

def merge_column_names(file_path):
    """Read one extracted CSV and return it with its header names normalised.

    Header variants are mapped onto the canonical names "Diploma/Certificates",
    "CIP Code", "Category" and "Bachelor's". Within each group only the first
    variant present in the file (in the listed order) is renamed. The
    "Bachelor’s degrees (First majors)" column is removed when present.

    Args:
        file_path: Path of the CSV to read.

    Returns:
        The DataFrame with renamed/dropped columns; the file is not modified.
    """
    # (canonical name, variants in priority order)
    alias_groups = (
        ("Diploma/Certificates", (
            "Diploma/ Certificates",
            "Diplomas / Certificates",
            "\"Diploma/\nCertificates\"",
        )),
        ("CIP Code", (
            "CIP 2010 Categories to Include",
            "CIP Code Number",
            "CIP 2020 Categories to Include",
            "CIP 2021 Categories to Include",
            "CIP202 Categories to Include",
            "\"CIP202\nCategories\nto\nInclude\"",
        )),
        ("Category", (
            "Category (UM-Ann Arbor grants Bachelor's degrees; no undergraduate Diploma/Certificates or Associate degrees)",
        )),
        ("Bachelor's", (
            "Unnamed: 2",
        )),
    )

    frame = pd.read_csv(file_path)
    for canonical, variants in alias_groups:
        # Stop at the first variant found so later variants stay untouched.
        present = next((name for name in variants if name in frame.columns), None)
        if present is not None:
            frame = frame.rename(columns={present: canonical})

    return frame.drop(columns=["Bachelor’s degrees (First majors)"], errors="ignore")
    
def merge_all_column_names(full_processed_csvs_folder):
    """Normalise the header names of every per-college CSV, in place.

    Each CSV inside each year folder is passed through merge_column_names and
    written back to the same path.

    Args:
        full_processed_csvs_folder: Folder holding one sub-folder per year.
            Entries named ".DS_Store" or "combined" are skipped, as is
            anything that is not a directory.
    """
    for year_folder in os.listdir(full_processed_csvs_folder):
        if year_folder in (".DS_Store", "combined"):
            continue
        year_folder_path = os.path.join(full_processed_csvs_folder, year_folder)
        # A stray regular file here would make os.listdir raise NotADirectoryError.
        if not os.path.isdir(year_folder_path):
            continue
        for college_file in os.listdir(year_folder_path):
            if college_file.endswith('.csv'):
                college_file_path = os.path.join(year_folder_path, college_file)
                df = merge_column_names(college_file_path)
                df.to_csv(college_file_path, index=False)

def combine_year_data(full_processed_csvs_folder, combined_folder="cds_data/useful_csvs"):
    """Stack each year's per-college CSVs into one "<year>_combined.csv".

    Every row gets a "college" column (the part of the file name before the
    first "_") and a "year" column (the year folder's name).

    Args:
        full_processed_csvs_folder: Folder holding one sub-folder per year.
            Entries named ".DS_Store" or "combined" are skipped, as is
            anything that is not a directory.
        combined_folder: Folder the combined CSVs are written to; created if
            it does not exist.
    """
    for year_folder in sorted(os.listdir(full_processed_csvs_folder)):
        if year_folder in (".DS_Store", "combined"):
            continue
        year_folder_path = os.path.join(full_processed_csvs_folder, year_folder)
        # A stray regular file here would make os.listdir raise NotADirectoryError.
        if not os.path.isdir(year_folder_path):
            continue

        college_frames = []
        # Sorted so the row order and the column order of the output do not
        # depend on the file system's listing order.
        for college_file in sorted(os.listdir(year_folder_path)):
            if college_file.endswith('.csv'):
                college_file_path = os.path.join(year_folder_path, college_file)
                df = pd.read_csv(college_file_path)
                df["college"] = college_file.split("_")[0]
                df["year"] = year_folder
                college_frames.append(df)

        # Concatenate once; pd.concat rejects an empty list, so a year folder
        # without CSVs still produces an empty output file as before.
        if college_frames:
            combined_df = pd.concat(college_frames, ignore_index=True)
        else:
            combined_df = pd.DataFrame()

        os.makedirs(combined_folder, exist_ok=True)
        combined_df.to_csv(os.path.join(combined_folder, f"{year_folder}_combined.csv"), index=False)

# Normalise the headers first so the per-year files share column names,
# then build one combined CSV per year.
merge_all_column_names(full_processed_csvs_folder="cds_data/full_processed_csvs")
combine_year_data(full_processed_csvs_folder="cds_data/full_processed_csvs")

In [28]:
# Fix UMich data: fold the straight-apostrophe "Bachelor's" column into the
# curly-apostrophe "Bachelor’s" column, and blank out "--" placeholders.
useful_csvs_folder = "cds_data/useful_csvs"
for combined_csv in os.listdir(useful_csvs_folder):
    # Only CSV files can be read here; skip stray entries such as .DS_Store.
    if not combined_csv.endswith(".csv"):
        continue
    combined_csv_path = os.path.join(useful_csvs_folder, combined_csv)
    df = pd.read_csv(combined_csv_path)
    df.replace("--", "", inplace=True)
    if "Bachelor's" in df.columns:
        if 'Bachelor’s' in df.columns:
            # Keep existing curly-apostrophe values; fill only the missing ones.
            df['Bachelor’s'] = df['Bachelor’s'].combine_first(df["Bachelor's"])
            df = df.drop(columns=["Bachelor's"])
        else:
            # No curly-apostrophe column to merge into, so a rename is enough
            # (indexing it would raise KeyError).
            df = df.rename(columns={"Bachelor's": 'Bachelor’s'})
    df.to_csv(combined_csv_path, index=False)

In [29]:
# Combine data from different years into a single dataset

def combine_years(useful_csv_path):
    """Merge every per-year CSV in a folder into a single "all_years.csv".

    All ".csv" files in the folder except "all_years.csv" itself are stacked,
    reduced to a fixed set of columns, sorted by year and then college, and
    written to "all_years.csv" inside the same folder.

    Args:
        useful_csv_path: Folder holding the per-year combined CSVs; also the
            folder the output file is written to.

    Raises:
        KeyError: If the stacked data lacks one of the expected columns.
    """
    output_name = 'all_years.csv'
    yearly_frames = [
        pd.read_csv(os.path.join(useful_csv_path, file))
        for file in sorted(os.listdir(useful_csv_path))
        if file.endswith('.csv') and file != output_name
    ]
    if not yearly_frames:
        print(f"No yearly CSV files found in {useful_csv_path}; nothing to combine.")
        return

    combined_df = pd.concat(yearly_frames, ignore_index=True)
    # Selecting this fixed column list already discards every other column,
    # so no separate pass to drop empty columns is needed.
    column_order = ['year', 'college', 'CIP Code', 'Category', 'Bachelor’s', 'Associate', 'Diploma/Certificates']
    combined_df = combined_df[column_order]
    # mergesort is stable, so rows keep their original order within a college.
    combined_df = combined_df.sort_values(by=['year', 'college'], ascending=[True, True], kind='mergesort')
    # Write next to the inputs; the exclusion above keeps the output from
    # being read back in on a re-run.
    combined_df.to_csv(os.path.join(useful_csv_path, output_name), index=False)

# Build the single all-years dataset from the per-year combined CSVs.
combine_years(useful_csv_path='cds_data/useful_csvs')