# School Common Data Set Degrees Conferred Data Cleaner

The datasets are seperated by college, with each college having multiple `.pdfs` files for each academic year. This notebook aims to clean the data from 2014-2024 by scraping the pdf for the Degrees Conferred section, and generating datasets for each year the colleges, and overall. 

In [6]:
import pdfplumber
import pandas as pd
import numpy as np
import os

def extract_table_from_pdf(pdf_path, pdf_name, full_processed_csvs_folder):
    df_results = pd.DataFrame()
    print(f"Processing file {pdf_name}")
    with pdfplumber.open(pdf_path) as pdf:
        # Iterate over the pages in the PDF
        for page_num, page in enumerate(pdf.pages, start=1):
            # Extract text content from the page
            text = page.extract_text()
            # Check if "degrees conferred" exists in the page text (case insensitive)
            if "degrees conferred" in text.lower():
                print(f"Found 'degrees conferred' on page {page_num}. Extracting table...")
                tables = page.extract_tables()
                for table in tables:
                    df = pd.DataFrame(table[1:], columns=table[0])
                    df.columns = df.columns.str.replace('\n', ' ', regex=True)
                    df_results = pd.concat([df_results, df], ignore_index=True)
                special_cases = ["ucsd_2023", "wsu_2023"]
                extra_special_cases = ["caltech_2021", "caltech_2022", "caltech_2023", "asu_2021", "asu_2022", "asu_2023"]
                if pdf_name in special_cases : # handle special case where table is split across two pages on a case-by-case basis
                    next_page = pdf.pages[page_num]
                    next_tables = next_page.extract_tables()
                    if next_tables:
                        for table in next_tables:
                            df = pd.DataFrame(table[1:], columns=table[0])
                            top_row = df.columns
                            df.columns = df_results.columns
                            df.loc[-1] = top_row 
                            df.index = df.index + 1
                            df = df.sort_index()
                            df_results = pd.concat([df_results, df], ignore_index=True)
                if pdf_name in extra_special_cases : # handle special case where table is split across two pages and the column names are on the second page
                    next_page = pdf.pages[page_num]
                    next_tables = next_page.extract_tables()
                    if next_tables:
                        for table in next_tables:
                            df = pd.DataFrame(table[1:], columns=table[0])
                            df_results = pd.concat([df_results, df], ignore_index=True)
                break
        if df_results.empty:
            print(f"'degrees conferred' not found in page {page_num}.")

    if not df_results.empty:
        df_results = df_results.reset_index(drop=True)
        year_directory = os.path.join(full_processed_csvs_folder, pdf_name.split("_")[1])
        if not os.path.exists(year_directory):
            os.makedirs(year_directory)
        file_name = os.path.join(year_directory, f"{pdf_name}_degrees_conferred.csv")
        df_results.to_csv(file_name, index=False)
        print(f"Table saved to {file_name}.")

def extract_all_tables(cds_data_folder, full_processed_csvs_folder):
    for college_folder in os.listdir(cds_data_folder):
        if college_folder == "full_processed_csvs" or college_folder == "useful_csvs" or college_folder == ".DS_Store" or college_folder == "manual":
            continue
        college_folder_path = os.path.join(cds_data_folder, college_folder)
        for pdf_file in os.listdir(college_folder_path):
            if pdf_file.endswith('.pdf'):
                pdf_file_path = os.path.join(college_folder_path, pdf_file)
                extract_table_from_pdf(pdf_file_path, pdf_file.split(".")[0], full_processed_csvs_folder)

extract_all_tables("cds_data", "cds_data/full_processed_csvs")

Processing file ucsd_2016
Found 'degrees conferred' on page 28. Extracting table...
Table saved to cds_data/full_processed_csvs/2016/ucsd_2016_degrees_conferred.csv.
Processing file ucsd_2017
Found 'degrees conferred' on page 30. Extracting table...
Table saved to cds_data/full_processed_csvs/2017/ucsd_2017_degrees_conferred.csv.
Processing file ucsd_2015
Found 'degrees conferred' on page 28. Extracting table...
Table saved to cds_data/full_processed_csvs/2015/ucsd_2015_degrees_conferred.csv.
Processing file ucsd_2014
Found 'degrees conferred' on page 27. Extracting table...
Table saved to cds_data/full_processed_csvs/2014/ucsd_2014_degrees_conferred.csv.
Processing file ucsd_2023
Found 'degrees conferred' on page 55. Extracting table...
Table saved to cds_data/full_processed_csvs/2023/ucsd_2023_degrees_conferred.csv.
Processing file ucsd_2022
Found 'degrees conferred' on page 27. Extracting table...
Table saved to cds_data/full_processed_csvs/2022/ucsd_2022_degrees_conferred.csv.
Proc