## Step 0: Understand the Data
Get the info of errors and missing values

In [1]:
!pip install numpy pandas matplotlib scikit-learn seaborn

import pandas as pd
import numpy as np
s = pd.read_csv("Public_School_Characteristics_2022-23.csv")
s_b = s.copy()
s.shape

Defaulting to user installation because normal site-packages is not writeable


  s = pd.read_csv("Public_School_Characteristics_2022-23.csv")


(101390, 77)

### Pattern for each column
TOTAL = TOTMENROL + TOTFENROL
TOTAL = PK + KG + G01 + G02 + G03 + G04 + G05
      + G06 + G07 + G08
      + G09 + G10 + G11 + G12
      + UG + AE
      
TOTAL = AM + AS + HI + BL + WH + HP + TR

Lunch:
TOTFRL = FRELCH + REDLCH

STUTERATIO = TOTAL / FTE

Elementary Schools:
G09 + G10 + G11 + G12 ≈ 0

High Schools:
PK + KG + G01–G08 ≈ 0

AMALM + AMALF = AM

ASALM + ASALF = AS

BLALM + BLALF = BL

HPALM + HPALF = HP

HIALM + HIALF = HI

TRALM + TRALF = TR

WHALM + WHALF = WH

### Missing Value info

In [2]:
special_values = [-1, -2, -9, "M", "N"]
s = s.replace(r'^\s*$', np.nan, regex=True) # Find any string that is completely empty ('') or only spaces (' '), replace it with np.nan

rows_with_special = s.isin(special_values).any(axis=1).sum()
print(rows_with_special)
rows_with_missing_value = s.isna().any(axis=1).sum()
print(rows_with_missing_value)
total_missing = s.isna().sum().sum()
print(total_missing)

62134
101390
1352278


## Step 1: Data Cleaning
### 1.1 Handling missing 0's for Students Each Grade

In [3]:

cols = s.loc[:, 'PK':'AE'].columns  # all columns from PK to AE

# 1️⃣ Row-wise sum of PK to AE (ignores NaN by default)
row_sum = s[cols].sum(axis=1)

# 2️⃣ Find rows where sum equals TOTAL
mask = row_sum.eq(s['TOTAL'])

# 3️⃣ Fill NaN with 0 only for those rows
s.loc[mask, cols] = s.loc[mask, cols].fillna(0)
print(s.isna().any(axis=1).sum())
print(s.isna().sum().sum())

101244
238819


### 2.2 Removing Redundant Rows and Features

#### Dropping Rows and Features with Too less value

In [4]:
# reference for rows missing value threshold
s_b.loc[49:55,:].isna().mean(axis=1)
for col in ["CHARTER_TEXT", "STATUS"]:
    if col in s.columns:
        s = s.drop(columns=[col])

In [5]:
s = s[s.isna().mean(axis=1) <= 0.55]
print(s.isna().any(axis=1).sum())
print(s.isna().sum().sum())
print(s.isna().sum().sort_values(ascending=False))

99573
164801
LSTREET2          99215
LZIP4             41866
FTE                2565
STUTERATIO         1814
HPALM               937
                  ...  
SY_STATUS_TEXT        0
TOTAL                 0
MEMBER                0
LATCOD                0
LONCOD                0
Length: 75, dtype: int64


In [6]:
s = s.loc[:, s.isna().mean() <= 0.40]
print(s.isna().any(axis=1).sum())
print(s.isna().sum().sum())
print(s.isin(special_values).any(axis=1).sum())

5399
23720
59288


In [7]:
s.to_csv("Deliverable2.csv", index=False)

### 2.3 Imputate Continuous Values

In [8]:
# NCES placeholder codes (-1, -2, -9) are non observations (missing / not applicable / low quality),
# so they shouldn't be used as real numeric values in statistics and be replaced.

placeholder_codes = [-1, -2, -9, "M", "N"]

# Replace placeholder codes with NaN (only for columns where codes appear)
for c in ["STUTERATIO", "TOTFRL", "FRELCH", "REDLCH", "DIRECTCERT"]:
    if c in s.columns:
        s[c] = s[c].replace(placeholder_codes, np.nan)

# Ensure numeric columns are numeric
numeric_fix_cols = [
    "TOTAL", "FTE", "STUTERATIO",
    "TOTFRL", "FRELCH", "REDLCH", "DIRECTCERT"
]
for c in numeric_fix_cols:
    if c in s.columns:
        s[c] = pd.to_numeric(s[c], errors="coerce")

print("Missing counts (after placeholder -> NaN):")
print(s[numeric_fix_cols].isna().sum().sort_values(ascending=False))

# TOTAL: If TOTAL is missing, rebuild from grade level columns (PK to AE) when possible.
grade_cols = [c for c in s.columns if c in list(s.loc[:, "PK":"AE"].columns)]
if "TOTAL" in s.columns and grade_cols:
    total_from_grades = s[grade_cols].sum(axis=1, min_count=1)
    missing_total = s["TOTAL"].isna()
    s.loc[missing_total, "TOTAL"] = total_from_grades[missing_total]
    print("Filled TOTAL from grade sums:", int(missing_total.sum()))

# STUTERATIO: If missing, compute using TOTAL and FTE when both exist.
# First, we check that the relationship holds for most records.
if "TOTAL" in s.columns and "FTE" in s.columns and "STUTERATIO" in s.columns:
    s["RATIO_FROM_TOTAL_FTE"] = s["TOTAL"] / s["FTE"]
    s.loc[s["FTE"] == 0, "RATIO_FROM_TOTAL_FTE"] = None
    temp = s.dropna(subset=["STUTERATIO", "RATIO_FROM_TOTAL_FTE"])
    temp["ABS_DIFF"] = (temp["STUTERATIO"] - temp["RATIO_FROM_TOTAL_FTE"]).abs()
    print("mean abs diff =", round(float(temp["ABS_DIFF"].mean()), 4))
    before = s["STUTERATIO"].isna().sum()
    s.loc[s["STUTERATIO"].isna(), "STUTERATIO"] = s["RATIO_FROM_TOTAL_FTE"]
    after = s["STUTERATIO"].isna().sum()
    print("Filled STUTERATIO values:", int(before - after))
    s = s.drop(columns=["RATIO_FROM_TOTAL_FTE"], errors="ignore")
# To verify consistency internally, we compared the STUTERATIO to the computed value TOTAL/FTE. 
# We calculated the mean absolute difference to measure the avg deviation between the two values. 
# A small mean absolute difference shows that the ratios are consistent with actual enrollment and teacher counts.

# FTE: If missing, compute using TOTAL / STUTERATIO.
if "TOTAL" in s.columns and "FTE" in s.columns and "STUTERATIO" in s.columns:
    s["FTE_FROM_RATIO"] = s["TOTAL"] / s["STUTERATIO"]
    s.loc[s["STUTERATIO"] == 0, "FTE_FROM_RATIO"] = None
    before = s["FTE"].isna().sum()
    s.loc[s["FTE"].isna(), "FTE"] = s["FTE_FROM_RATIO"]
    after = s["FTE"].isna().sum()
    print("Filled FTE values:", int(before - after))
    s = s.drop(columns=["FTE_FROM_RATIO"], errors="ignore")

# Remaining missing in continuous columns: use group wise median since less sensitive to outliers
# Group by SCHOOL_LEVEL and STABR if available because school staffing and enrollment patterns are different depending on level and state.
for c in ["STUTERATIO", "FTE"]:
    if c in s.columns:
        # If both grouping columns exist, fill by SCHOOL_LEVEL and STABR
        if "SCHOOL_LEVEL" in s.columns and "STABR" in s.columns:
            for level in s["SCHOOL_LEVEL"].dropna().unique():
                for state in s["STABR"].dropna().unique():
                    # rows in this level and this state
                    condition = (s["SCHOOL_LEVEL"] == level) & (s["STABR"] == state)
                    group_values = s.loc[condition, c]
                    # calculate median if the group has at least one proper value
                    if group_values.notna().sum() > 0:
                        group_median = group_values.median()
                        # fill only missing values inside this group
                        s.loc[condition & s[c].isna(), c] = group_median
        # Otherwise, fill it with overall median
        else:
            overall_median = s[c].median()
            s.loc[s[c].isna(), c] = overall_median
# Remaining missing values in continuous variables (STUTERATIO and FTE) were imputed using group-wise median imputation. 
# Schools were grouped by SCHOOL_LEVEL and STABR because staffing and enrollment differ across different school types and states. 
# Median was chosen instead of mean to minimize sensitivity to extreme outliers in the enrollment and staffing counters. 
# This helps to uphold differences in the structure of the dataset but preventing too much data loss from deleting rows. 
# If grouping variables were unavailable, overall median imputation was used.

print("\nMissing counts after continuous imputations:")
print(s[["TOTAL", "FTE", "STUTERATIO"]].isna().sum())

Missing counts (after placeholder -> NaN):
DIRECTCERT    49544
FRELCH        18956
REDLCH        18956
TOTFRL        13094
STUTERATIO     4379
FTE            2565
TOTAL             0
dtype: int64
Filled TOTAL from grade sums: 0
mean abs diff = 0.0063
Filled STUTERATIO values: 0
Filled FTE values: 0

Missing counts after continuous imputations:
TOTAL          0
FTE            4
STUTERATIO    51
dtype: int64


### 2.4 Imputate Discrete Values

In [9]:
import re

# Fix postal codes: if LZIP is numeric, leading zeros are lost in python. This converts them back to 5 character string.
if "LZIP" in s.columns:
    s["LZIP"] = s["LZIP"].astype("Int64").astype(str).str.zfill(5)

# Address field: if missing, fill with 'Unknown'
for c in ["LSTREET1", "PHONE"]:
    if c in s.columns:
        s[c] = s[c].fillna("Unknown")
# Missing values in categorical variables not included in the analysis such as street address and phone number were replaced 
# with the placeholder value "Unknown". These variables are descriptive identifiers and are not used for analysis. 
# Replacing missing values prevents needing to delete rows unnecessarily due to empty values and ensures consistency in the dataset.

# Free/Reduced lunch fields: these are count fields and can be 0 for schools with no participants.
# We used enrollment percentage as a consideration to avoid producing impossible counts.
if "TOTFRL" in s.columns and "TOTAL" in s.columns:
    # Create FRL_PCT (only when TOTAL > 0)
    s["FRL_PCT"] = s["TOTFRL"] / s["TOTAL"]
    s.loc[s["TOTAL"] == 0, "FRL_PCT"] = None
    # Fill missing FRL_PCT with overall median
    frl_median = s["FRL_PCT"].median()
    s.loc[s["FRL_PCT"].isna(), "FRL_PCT"] = frl_median
    # missing TOTFRL using FRL_PCT * TOTAL
    s.loc[s["TOTFRL"].isna(), "TOTFRL"] = (s["FRL_PCT"] * s["TOTAL"]).round()
    # keep TOTFRL within valid range [0, TOTAL]
    s.loc[s["TOTFRL"] < 0, "TOTFRL"] = 0
    s.loc[s["TOTFRL"] > s["TOTAL"], "TOTFRL"] = s["TOTAL"]
# if FRELCH and REDLCH exist, fill missing by splitting TOTFRL 50/50
if "FRELCH" in s.columns and "REDLCH" in s.columns and "TOTFRL" in s.columns:
    # If missing, assign half of TOTFRL to free lunch
    s.loc[s["FRELCH"].isna(), "FRELCH"] = (0.5 * s["TOTFRL"]).round()
    # Reduced lunch is whatever is left
    s.loc[s["REDLCH"].isna(), "REDLCH"] = (s["TOTFRL"] - s["FRELCH"]).round()
    # Prevent negatives
    s.loc[s["REDLCH"] < 0, "REDLCH"] = 0
# The Free/Reduced Lunch variables are count fields, so they must be not negative and not greater than total enrollment.
# To handle missing values first, we converted the total FRL count into a percentage of enrollment so that any imputed values 
# would scale properly with school size. Missing percentages were filled using the median to avoid being influenced by extreme values. 
# Then we rebuilt missing FRL counts using the percentage and total enrollment, rounding to keep whole numbers. 
# More checks made sure values stayed between 0 and total enrollment. When the free and reduced breakdown was missing,
# we used a 50/50 split to maintain consistency. These steps ensured logical modification while keeping as much data as possible.

# Ensure counts are non-negative
count_cols = ["TOTAL", "TOTFRL", "FRELCH", "REDLCH", "DIRECTCERT", "TOTMENROL", "TOTFENROL", "MEMBER"]

for c in count_cols:
    if c in s.columns:
        s[c] = pd.to_numeric(s[c], errors="coerce")
        s.loc[s[c] < 0, c] = 0

print("Done discrete repairs. Remaining missing values (top 15):")
print(s.isna().sum().sort_values(ascending=False).head(15))


Done discrete repairs. Remaining missing values (top 15):
DIRECTCERT    49544
HPALM           937
HPALF           936
AMALM           910
AMALF           908
HP              890
AM              862
BLALF           826
BLALM           823
ASALM           821
ASALF           819
BL              816
TRALM           816
TRALF           814
TR              813
dtype: int64


In [10]:
# Handling DIRECTCERT
## make sure to convert all to numeric
s['DIRECTCERT'] = pd.to_numeric(s['DIRECTCERT'], errors='coerce')
s['FRELCH'] = pd.to_numeric(s['FRELCH'], errors='coerce')

## FRELCH = Free lunch eligible, is high related to DIRECTCERT, so we use it to give an estimation of DIRECTCERT
## get the median ratio of "DIRECTVERT"/"FRELCH"
D_ratio = s['DIRECTCERT'] / s['FRELCH']
D_median_ratio = D_ratio[
    (s['DIRECTCERT'].notna()) & 
    (s['FRELCH'].notna()) &
    (s['FRELCH'] != 0)
].median()

## give missing value of missing DIRECTCERT only if FRELCH exists
D_mask = s['DIRECTCERT'].isna() & s['FRELCH'].notna()
s.loc[D_mask, 'DIRECTCERT'] = s.loc[D_mask, 'FRELCH'] * D_median_ratio

# Handling CHARTER_TEXT
## the value is mostly NO, so check the constant extent
#from sklearn.feature_selection import VarianceThreshold
#constant_threshold = 0.2
#s['CHARTER_TEXT'] = s['CHARTER_TEXT'].map({
#    'Yes': 1,
#    'No': 0
#})
#num_df = s.select_dtypes(include='number')
#selector = VarianceThreshold(constant_threshold)
#selector.fit(num_df)
#kept_cols = num_df.columns[selector.get_support()]
#constant_cols = num_df.columns[~selector.get_support()]
#print("Constant columns:", list(constant_cols))
# it's constant so will drop the feature at the step of handling constant feature

In [11]:
# Cleaning up the races value
# Goal: ensure totals are internally consistent and remove/repair impossible records.

race_cols = ["AM","AS","BL","HI","HP","TR","WH"]
race_cols = [c for c in race_cols if c in s.columns]

# Convert to numeric
for c in race_cols:
    s[c] = pd.to_numeric(s[c], errors="coerce")

# Race counts: if ALL race columns are missing, ethnicity composition cannot be computed.
# Since our problem statement uses ethnicity composition, we drop these rows.
all_race_missing = s[race_cols].isna().all(axis=1) if len(race_cols) > 0 else pd.Series(False, index=s.index)
print("Rows with all race counts missing:", int(all_race_missing.sum()))
s = s.loc[~all_race_missing].copy()

# If some race categories are missing but the known categories already sum to TOTAL, then missing categories must be 0.
if "TOTAL" in s.columns:

    race_sum_known = s[race_cols].sum(axis=1, min_count=1)

    # Loop through each row
    race_na_any = s[race_cols].isna().any(axis=1)
    for i in s.index:
        if (race_na_any.at[i] and pd.notna(s.at[i, "TOTAL"]) and race_sum_known.at[i] == s.at[i, "TOTAL"]):
            s.loc[i, race_cols] = s.loc[i, race_cols].fillna(0)

    # If exactly ONE race category is missing and TOTAL is known, fill the missing one as the remainder.
    missing_counts = s[race_cols].isna().sum(axis=1)
    one_missing = (missing_counts == 1) & s["TOTAL"].notna()
    remainder = s["TOTAL"] - s[race_cols].sum(axis=1, min_count=1)

    # Identify which column is missing per row and fill it
    for c in race_cols:
        mask = one_missing & s[c].isna() & (remainder >= 0)
        s.loc[mask, c] = remainder[mask]

    # Any remaining negative remainder means race totals exceed TOTAL (inconsistent).
    ##remainder_after = s["TOTAL"] - s[race_cols].sum(axis=1, min_count=1)
    ##inconsistent_race = remainder_after != 0
    
    ##s = s.loc[~inconsistent_race].copy()
    remainder_after = s["TOTAL"] - s[race_cols].sum(axis=1, min_count=1)
    s['TOTAL_RACE_CONS'] = remainder_after == 0  # True if sum equals TOTAL, False otherwise
    inconsistent_race = remainder_after != 0
    
    inconsistent_mask = ~s['TOTAL_RACE_CONS']  # rows where sum != TOTAL
    cols_to_fill = s.loc[:, 'AMALM':'WH'].columns  # columns from AMALM to WH
    s.loc[inconsistent_mask, cols_to_fill] = s.loc[inconsistent_mask, cols_to_fill].fillna(0)
    
    print("Rows where race sum not equal TOTAL (flagged):", int(inconsistent_race.sum()))
    
# Gender totals vs TOTAL: if TOTMENROL + TOTFENROL != TOTAL, we keep as is but flag for awareness.
if "TOTMENROL" in s.columns and "TOTFENROL" in s.columns and "TOTAL" in s.columns:
    condition = (s["TOTMENROL"].notna() & s["TOTFENROL"].notna() & s["TOTAL"].notna())
    # Among those rows, check if male + female does NOT equal TOTAL
    inconsistent = s.loc[condition, "TOTMENROL"] + s.loc[condition, "TOTFENROL"] != s.loc[condition, "TOTAL"]
    print("Rows where male+female != TOTAL:", int(inconsistent.sum()))
    
# Final missing snapshot
print("\nRemaining missing values (top 15):")
print(s.isna().sum().sort_values(ascending=False).head(15))


Rows with all race counts missing: 809
Rows where race sum not equal TOTAL (flagged): 5900
Rows where male+female != TOTAL: 5978

Remaining missing values (top 15):
HPALF         61
HPALM         60
STUTERATIO    49
AMALM         41
AMALF         36
G13            5
BLALF          5
UG             5
AE             5
ASALM          5
G10            4
G12            4
G11            4
BLALM          4
G09            4
dtype: int64


In [12]:
grade_cols = s.loc[:, 'PK':'AE'].columns  # columns from PK to AE
remainder_after_grades = s[grade_cols].sum(axis=1, min_count=1) - s['TOTAL']  # or whatever your total column is for grades
s['TOTAL_GRADE_CONS'] = remainder_after_grades == 0  # True if sum equals TOTAL, False otherwise
inconsistent_grade = remainder_after_grades != 0

# --- Step 2: Fill NaN with 0 for inconsistent rows only ---
inconsistent_grade_mask = ~s['TOTAL_GRADE_CONS']  # rows where sum != TOTAL
s.loc[inconsistent_grade_mask, grade_cols] = s.loc[inconsistent_grade_mask, grade_cols].fillna(0)

# --- Step 3: Print summary ---
print("Rows where grade sum not equal TOTAL (flagged):", int(inconsistent_grade.sum()))

Rows where grade sum not equal TOTAL (flagged): 5


In [13]:
# Handling the rest of race values
## for race_cols = ["AM","AS","BL","HI","HP","TR","WH"], 
## get the median of the genders of each race and replace the NAN value if the race_cols are not NAN
for race in race_cols:
    total_col = race
    male_col = f"{race}ALM"
    female_col = f"{race}ALF"
    
    # --- MALE IMPUTATION ---
    male_ratio = s[male_col] / s[total_col]
    
    male_median_ratio = male_ratio[
        (s[male_col].notna()) &
        (s[total_col].notna()) &
        (s[total_col] != 0)
    ].median()
    
    male_mask = s[male_col].isna() & s[total_col].notna()
    s.loc[male_mask, male_col] = (s.loc[male_mask, total_col] * male_median_ratio).round().astype(int)
    
    # --- FEMALE IMPUTATION ---
    female_ratio = s[female_col] / s[total_col]
    
    female_median_ratio = female_ratio[
        (s[female_col].notna()) &
        (s[total_col].notna()) &
        (s[total_col] != 0)
    ].median()
    
    female_mask = s[female_col].isna() & s[total_col].notna()
    s.loc[female_mask, female_col] = (s.loc[female_mask, total_col] * female_median_ratio).round().astype(int)
    
    # --- ADJUST TO ENSURE MALE + FEMALE = TOTAL ---
    # For rows where total is known and one or both genders were imputed
    adjust_mask = s[total_col].notna() & s[male_col].notna() & s[female_col].notna()
    difference = s.loc[adjust_mask, total_col] - (s.loc[adjust_mask, male_col] + s.loc[adjust_mask, female_col])
    
    # Add/subtract difference to female to balance exactly
    s.loc[adjust_mask, female_col] += difference.astype(int)
    
print(s.isna().sum().sort_values(ascending=False).head(10))

STUTERATIO    49
FTE            2
X              0
Y              0
SURVYEAR       0
STABR          0
LEAID          0
ST_LEAID       0
LEA_NAME       0
SCH_NAME       0
dtype: int64


In [14]:
# replace the rest NAN STUTERATIO according to TOTAL and FTE
## Fill in the rest of the missing FTE

## fill in the STUTERATIO
mask_fte_known = s['STUTERATIO'].isna() & s['FTE'].notna()
s.loc[mask_fte_known, 'STUTERATIO'] = s.loc[mask_fte_known].apply(
    lambda row: row['TOTAL'] / row['FTE'] if row['FTE'] != 0 else 0,
    axis=1
)

# Optional: round STUTERATIO
s['STUTERATIO'] = s['STUTERATIO'].round(2)

# --- Step 2: Fill missing STUTERATIO using group median ---
mask_fte_missing = s['STUTERATIO'].isna()
group_median_stu = s.groupby(['STABR', 'SCHOOL_LEVEL'])['STUTERATIO'].transform('median')
s.loc[mask_fte_missing, 'STUTERATIO'] = s.loc[mask_fte_missing].apply(
    lambda row: group_median_stu[row.name] if not pd.isna(group_median_stu[row.name]) else 0,
    axis=1
)

# Optional: round STUTERATIO
s['STUTERATIO'] = s['STUTERATIO'].round(2)

# --- Step 3: Compute missing FTE from STUTERATIO ---
mask_fte_nan = s['FTE'].isna() & s['STUTERATIO'].notna()
s.loc[mask_fte_nan, 'FTE'] = s.loc[mask_fte_nan].apply(
    lambda row: row['TOTAL'] / row['STUTERATIO'] if row['STUTERATIO'] != 0 else 0,
    axis=1
)

# Optional: round FTE
s['FTE'] = s['FTE'].round(2)
print(s.isna().sum().sort_values(ascending=False).head(15))
s.loc[s["G13"].isna(), "PK":"TOTAL"]
# s.loc[s["HP"].isna(), "TOTAL":"WH"]

X           0
Y           0
OBJECTID    0
NCESSCH     0
SURVYEAR    0
STABR       0
LEAID       0
ST_LEAID    0
LEA_NAME    0
SCH_NAME    0
LSTREET1    0
LCITY       0
LSTATE      0
LZIP        0
PHONE       0
dtype: int64


Unnamed: 0,PK,KG,G01,G02,G03,G04,G05,G06,G07,G08,G09,G10,G11,G12,G13,UG,AE,TOTMENROL,TOTFENROL,TOTAL


### 2.5 Handling Edge Cases¶

In [15]:
# Calculated using box plot bounds derived from IQR
q1 = s["STUTERATIO"].quantile(0.25)
q3 = s["STUTERATIO"].quantile(0.75)
iqr = q3 - q1

lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr

outliers = s["STUTERATIO"] > upper
print("STUTERATIO outliers removed:", int(outliers.sum()))

s.loc[outliers, "STUTERATIO"] = None

STUTERATIO outliers removed: 2916


### 2.6 Feature Engineering
The features added through the imputation

In [34]:
# Ethnicity composition as proportions (robust to different school sizes)
race_cols = ["AM","AS","BL","HI","HP","TR","WH"]
race_cols = [c for c in race_cols if c in s.columns]

for c in race_cols:
    s[c] = pd.to_numeric(s[c], errors="coerce")

if "TOTAL" in s.columns:
    for c in race_cols:
        s[f"{c}_PCT"] = np.where(s["TOTAL"] > 0, s[c] / s["TOTAL"], np.nan)

    # minority share (everything except White)
    if "WH" in s.columns:
        s["MINORITY_PCT"] = 1 - s["WH_PCT"]

# School level (ordinal encoding) to support correlation
level_map = {
    "Primary": 1,
    "Middle": 2,
    "High": 3,
    "Other": 0
}
if "SCHOOL_LEVEL" in s.columns:
    s["SCHOOL_LEVEL_CODE"] = s["SCHOOL_LEVEL"].map(level_map).fillna(0).astype(int)

# Create FRL_PCT
if "FRL_PCT" not in s.columns and "TOTFRL" in s.columns and "TOTAL" in s.columns:
    s["FRL_PCT"] = s["TOTFRL"] / s["TOTAL"]

# Create MINORITY_PCT
race_cols = ["AM","AS","BL","HI","HP","TR","WH"]
race_cols = [c for c in race_cols if c in s.columns]

if "MINORITY_PCT" not in s.columns and "WH" in s.columns and "TOTAL" in s.columns:
    s["MINORITY_PCT"] = 1 - (s["WH"] / s["TOTAL"])

### 2.7 Dropping Constant Features and Duplicates

In [35]:
from sklearn.feature_selection import VarianceThreshold
# Drop fully-constant columns providing no information for model
constant_threshold = 0.3
num_df = s.select_dtypes(include='number')
selector = VarianceThreshold(constant_threshold)
selector.fit(num_df)
kept_cols = num_df.columns[selector.get_support()]
constant_cols = num_df.columns[~selector.get_support()]
print("Constant columns:", list(constant_cols))
s = s.drop(columns=constant_cols)

# >= 99.5% of rows share the same value
#quasi_constant_cols = []
#for col in s.columns:
#    top_freq = s[col].value_counts(dropna=False, normalize=True).iloc[0]
    # if top_freq >= 0.995:
    #     quasi_constant_cols.append(col)

# print(">=99.5% same:", quasi_constant_cols)

# Remove duplicate records (exact duplicates or duplicate school IDs)
before = s.shape[0]
s = s.drop_duplicates()
after = s.shape[0]
print(f"Exact duplicate rows removed: {before-after}")

if "NCESSCH" in s.columns:
    before = s.shape[0]
    s = s.drop_duplicates(subset=["NCESSCH"])
    after = s.shape[0]
    print(f"Duplicate NCESSCH rows removed: {before-after}")
print(s.isna().sum().sort_values(ascending=False).head(10))

Constant columns: []
Exact duplicate rows removed: 0
STUTERATIO    2916
X                0
OBJECTID         0
Y                0
STABR            0
ST_LEAID         0
LEA_NAME         0
SURVYEAR         0
SCH_NAME         0
LSTREET1         0
dtype: int64


### 2.8 Correlation Analysis

In [36]:
import seaborn as sns
#Using Pearson Correlation

s_corr = s.copy()
if "STUTERATIO" in s_corr.columns:
    s_corr = s_corr.drop(columns=["STUTERATIO"])
    
num_df = s_corr.select_dtypes(include='number')

# Compute correlation
# cor = num_df.corr()

# # Plot heatmap
# plt.figure(figsize=(8,6))
# sns.heatmap(cor, annot=True, cmap=plt.cm.CMRmap_r)
# plt.show()



def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr
corr_features = correlation(num_df, 0.7)
corr_features

set()

In [37]:
s.drop(corr_features,axis=1,inplace=True)


### 2.9 Data Shape After Cleaning

In [38]:
print("Shape after cleaning:", s.shape)
print("Total missing values remaining:", int(s.isna().sum().sum()))


Shape after cleaning: (98910, 40)
Total missing values remaining: 2916


## Step 3: Distribution

In [48]:
import matplotlib.pyplot as plt

plt.figure()
plt.hist(s["TOTAL"].dropna(), bins=30)
plt.title("Distribution of TOTAL")
plt.xlabel("TOTAL")
plt.ylabel("Frequency")
plt.show()

KeyError: 'TOTAL'

<Figure size 640x480 with 0 Axes>

In [41]:
for col in key_features:
    if col in s.columns:
        plt.figure()
        plt.boxplot(s[col].dropna())
        plt.title(f"Boxplot of {col}")
        plt.ylabel(col)
        plt.show()

In [45]:
if "MINORITY_PCT" in s.columns and "FTE" in s.columns:
    plt.figure()
    plt.scatter(s["MINORITY_PCT"], s["FTE"], alpha=0.3)
    plt.title("FTE vs MINORITY_PCT")
    plt.xlabel("MINORITY_PCT")
    plt.ylabel("FTE")
    plt.show()


In [46]:
if "FRL_PCT" in s.columns and "TOTAL" in s.columns:
    plt.figure()
    plt.scatter(s["FRL_PCT"], s["TOTAL"], alpha=0.3)
    plt.title("TOTAL vs FRL_PCT")
    plt.xlabel("FRL_PCT")
    plt.ylabel("TOTAL")
    plt.show()

## Step 3: Normalization and Transformation

In [None]:
import matplotlib.pyplot as plt

# Transformation (log1p) for heavy-tailed count features to reduce skew

# Standardization (z-score) for continuous features used in modeling

# Select a few representative count features
count_features = [c for c in ["TOTAL","FTE","TOTFRL"] if c in s.columns]
for c in count_features:
    s[f"LOG1P_{c}"] = np.log1p(s[c])

# Standardize continuous features
cont_features = [c for c in ["STUTERATIO","FRL_PCT","MINORITY_PCT","TOTAL","FTE"] if c in s.columns]
for c in cont_features:
    mean = s[c].mean()
    std = s[c].std()
    s[f"Z_{c}"] = (s[c] - mean) / std

# Correlation analysis: which engineered variables correlate with STUTERATIO?


### 3.1 Export Cleaned Dataset


In [None]:
s.to_csv("Deliverable2_cleaned.csv", index=False)
print("Saved Deliverable2_cleaned_final.csv")
