## Graduation data

In [9]:
import pandas as pd
from pathlib import Path

# Configuration for the "Graduation data" section: source workbook and CSV output folder.
# NOTE(review): EXCEL_FILE is an absolute, machine-specific path; it only resolves
# on a machine with this exact directory layout. Consider a project-relative path.
EXCEL_FILE = '/Users/uyennguyen/Projects/DA/Data-Viz-IPEDS/dbt-preprocessing/raw/gr2023dict.xlsx'

# OUT_DIR is relative, so it is created under the kernel's current working directory.
OUT_DIR = Path("seeds")
OUT_DIR.mkdir(exist_ok=True)

print(
    f"Processing {EXCEL_FILE}",
    f"Output directory: {OUT_DIR}",
    sep="\n",
)

Processing /Users/uyennguyen/Projects/DA/Data-Viz-IPEDS/dbt-preprocessing/raw/gr2023dict.xlsx
Output directory: seeds


In [6]:
def export_varlist_columns(excel_file, out_dir, output_name="varlist_columns.csv"):
    """Export variable names and titles for documentation.

    Reads the "varlist" sheet of ``excel_file``, keeps only the ``varname``
    and ``varTitle`` columns, and writes them as CSV to ``out_dir / output_name``.

    Parameters
    ----------
    excel_file : str or path-like
        Workbook handed to ``pd.read_excel``.
    out_dir : str or pathlib.Path
        Destination directory. It is created (including parents) if missing.
    output_name : str, optional
        File name of the CSV inside ``out_dir``. The default keeps the
        historical name; pass a distinct name per workbook when several
        workbooks share one ``out_dir``, otherwise each call overwrites the
        previous file.

    Returns
    -------
    pandas.DataFrame
        The two-column frame that was written.

    Raises
    ------
    KeyError
        If the sheet has no ``varname`` or ``varTitle`` column.
    """
    varlist = pd.read_excel(excel_file, sheet_name="varlist")
    varlist = varlist[["varname", "varTitle"]]

    # Accept plain strings as well as Path objects, and do not require the
    # caller to have created the directory beforehand.
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    output_file = out_dir / output_name
    varlist.to_csv(output_file, index=False)
    print(f"Exported {output_file.name} with {len(varlist)} rows")
    return varlist

def export_frequency_codes(excel_file, out_dir):
    """Export code mappings for each variable from the "Frequencies" sheet.

    Reads the "Frequencies" sheet of ``excel_file`` and, for every distinct
    ``varname``, writes ``<varname lower-cased>_code.csv`` into ``out_dir``
    with the columns ``code`` (from ``codevalue``) and ``label`` (from
    ``valuelabel``).

    Parameters
    ----------
    excel_file : str or path-like
        Workbook handed to ``pd.read_excel``.
    out_dir : str or pathlib.Path
        Destination directory. It is created (including parents) if missing.

    Returns
    -------
    int
        Number of CSV files written (one per ``varname`` group).

    Raises
    ------
    KeyError
        If the sheet lacks ``varname``, ``codevalue`` or ``valuelabel``.

    Notes
    -----
    ``DataFrame.groupby`` drops rows whose ``varname`` is missing by default,
    so such rows are not exported.
    """
    freq = pd.read_excel(excel_file, sheet_name="Frequencies")
    codes = freq[["varname", "codevalue", "valuelabel"]].rename(
        columns={"codevalue": "code", "valuelabel": "label"}
    )

    # Accept plain strings as well as Path objects, and do not require the
    # caller to have created the directory beforehand.
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    exported_count = 0
    for var, rows in codes.groupby("varname"):
        # str() keeps non-string group keys (e.g. numeric names) from raising
        # AttributeError on .lower(); string keys are unaffected.
        output_file = out_dir / f"{str(var).lower()}_code.csv"
        rows[["code", "label"]].to_csv(output_file, index=False)
        print(f"Exported: {output_file.name} with {len(rows)} rows")
        exported_count += 1

    return exported_count

In [8]:
# Execute the data processing for the graduation workbook configured above.
varlist = export_varlist_columns(EXCEL_FILE, OUT_DIR)
exported_count = export_frequency_codes(EXCEL_FILE, OUT_DIR)

# Summary: report both results so exported_count is not computed and then dropped
# (matches the summary printed by the enrollment section).
print("\nProcessing complete!")
print(f"- Exported variable list with {len(varlist)} variables")
print(f"- Exported {exported_count} code mapping files")

Exported varlist_columns.csv with 36 rows
Exported: chrtstat_code.csv with 16 rows
Exported: cohort_code.csv with 4 rows
Exported: grtype_code.csv with 42 rows
Exported: line_code.csv with 17 rows
Exported: section_code.csv with 4 rows

Processing complete!
- Exported variable list with 36 variables


## Enrollment data

In [7]:
import pandas as pd
from pathlib import Path

# Configuration for the "Enrollment data" section: source workbook and CSV output folder.
# NOTE(review): EXCEL_FILE is an absolute, machine-specific path; it only resolves
# on a machine with this exact directory layout. Consider a project-relative path.
EXCEL_FILE = '/Users/uyennguyen/Projects/DA/Data-Viz-IPEDS/dbt-preprocessing/raw/effy2023dict.xlsx'

# OUT_DIR is relative, so it is created under the kernel's current working directory.
OUT_DIR = Path("seeds")
OUT_DIR.mkdir(exist_ok=True)

print(
    f"Processing {EXCEL_FILE}",
    f"Output directory: {OUT_DIR}",
    sep="\n",
)

Processing /Users/uyennguyen/Projects/DA/Data-Viz-IPEDS/dbt-preprocessing/raw/effy2023dict.xlsx
Output directory: seeds


In [10]:
# Run both exports against the enrollment workbook configured above.
# NOTE: export_varlist_columns is called here without a custom file name, so it
# writes varlist_columns.csv; with OUT_DIR set to the same "seeds" folder as the
# graduation section, this run replaces the file written there.
varlist = export_varlist_columns(EXCEL_FILE, OUT_DIR)
exported_count = export_frequency_codes(EXCEL_FILE, OUT_DIR)

# One print call, one line per summary item (leading blank line included).
print(
    "\nProcessing complete!",
    f"- Exported variable list with {len(varlist)} variables",
    f"- Exported {exported_count} code mapping files",
    sep="\n",
)

Exported varlist_columns.csv with 38 rows
Exported: effyalev_code.csv with 27 rows
Exported: effylev_code.csv with 4 rows
Exported: lstudy_code.csv with 3 rows

Processing complete!
- Exported variable list with 38 variables
- Exported 3 code mapping files
