In [1]:
import pandas as pd

# Build one graduation-rate row per LEA from the raw EDFacts LEA file (long format).
#
# The transform logic lives in two small, pure helpers so it can be inspected
# and checked on an in-memory frame; the driver at the bottom does the file
# I/O and binds the same cell-level names as before (input_path, df, df_lea,
# output_path).
#
# NOTE(review): LEAID is left exactly as pandas parses it. The recorded output
# shows values such as 100005, i.e. a numeric parse; if the identifiers are
# zero-padded in the source file the padding is lost here -- confirm before
# joining this file to other datasets on LEAID.

# Label in "Data Description" that marks the rows of interest.
ACGR_DESCRIPTION = "Four-Year Adjusted-Cohort Graduation Rates"

# Columns of the raw file that the pipeline reads.
REQUIRED_COLUMNS = ("Data Description", "NCES LEA ID", "Value")


def select_valid_acgr_rows(
    raw: pd.DataFrame,
    description: str = ACGR_DESCRIPTION,
    min_rate: float = 0,
    max_rate: float = 100,
) -> pd.DataFrame:
    """Return the usable graduation-rate rows of the raw long-format frame.

    Keeps rows whose "Data Description" equals ``description``, renames
    "NCES LEA ID" to "LEAID", and adds a numeric "grad_rate" column parsed
    from "Value". Rows whose "Value" is not a plain number, or whose rate
    falls outside ``[min_rate, max_rate]`` (both ends inclusive), are dropped.
    All other columns and the original row index are carried through.

    Parameters
    ----------
    raw : pd.DataFrame
        Frame as loaded from the raw file; it is not modified.
    description : str
        Value of "Data Description" to keep.
    min_rate, max_rate : float
        Inclusive bounds for an acceptable graduation rate.

    Returns
    -------
    pd.DataFrame
        Filtered copy with the extra "grad_rate" column.

    Raises
    ------
    KeyError
        If any of ``REQUIRED_COLUMNS`` is absent from ``raw``.
    """
    # Fail fast with a readable message; without this a missing
    # "NCES LEA ID" would make the rename a silent no-op and only surface
    # later as a KeyError on "LEAID" during aggregation.
    missing = [col for col in REQUIRED_COLUMNS if col not in raw.columns]
    if missing:
        raise KeyError(f"Input is missing required column(s): {missing}")

    # .copy() so that adding grad_rate below never writes into `raw`.
    acgr = raw.loc[raw["Data Description"] == description].copy()
    acgr = acgr.rename(columns={"NCES LEA ID": "LEAID"})

    # The rate is stored directly in Value; anything non-numeric becomes NaN
    # and is removed on the next line.
    acgr["grad_rate"] = pd.to_numeric(acgr["Value"], errors="coerce")
    acgr = acgr.dropna(subset=["grad_rate"])

    # Series.between is inclusive on both ends by default.
    return acgr[acgr["grad_rate"].between(min_rate, max_rate)]


def mean_grad_rate_by_lea(acgr: pd.DataFrame) -> pd.DataFrame:
    """Collapse the filtered rows to one row per LEAID.

    Multiple rows per LEA exist due to reporting categories; this takes the
    plain mean of "grad_rate" over all of an LEA's rows.

    NOTE(review): the mean is unweighted. If the rows of an LEA describe
    groups of different sizes, the result is not the LEA's overall rate --
    consider restricting to a single all-students row instead; verify against
    the file's schema.

    Parameters
    ----------
    acgr : pd.DataFrame
        Frame with "LEAID" and "grad_rate" columns.

    Returns
    -------
    pd.DataFrame
        Two columns, "LEAID" and "grad_rate", one row per distinct LEAID.
    """
    return (
        acgr.groupby("LEAID", as_index=False)
            .agg({"grad_rate": "mean"})
    )


# In a notebook kernel __name__ is "__main__", so this driver runs whenever
# the cell is executed; the guard only keeps the file I/O from firing if the
# helpers above are imported from elsewhere.
if __name__ == "__main__":
    input_path = "SY1018_FS150_FS151_DG695_DG696_LEA.csv"

    df = pd.read_csv(
        input_path,
        low_memory=False
    )
    print("Raw shape:", df.shape)

    # Keep only usable Adjusted Cohort Graduation Rate rows.
    df = select_valid_acgr_rows(df)
    print("After filtering ACGR rows:", df.shape)

    # Deduplicate: one graduation rate per LEA.
    df_lea = mean_grad_rate_by_lea(df)
    print("Final LEA-level shape:", df_lea.shape)

    # Save final clean file
    output_path = "edfacts_lea_graduation_2017_18_final.csv"
    df_lea.to_csv(output_path, index=False)

    print(f"Saved cleaned graduation file to {output_path}")
    print(df_lea.head())


Raw shape: (706595, 18)
After filtering ACGR rows: (59832, 19)
Final LEA-level shape: (4246, 2)
Saved cleaned graduation file to edfacts_lea_graduation_2017_18_final.csv
    LEAID  grad_rate
0  100005  88.625000
1  100006  85.750000
2  100007  93.052632
3  100008  95.750000
4  100013  95.400000
