In [1]:
import pandas as pd
import numpy as np
import sys

In [3]:
def read_text_file(textfilepath):
    """
    Load a tab-separated text file into a DataFrame.

    Input: textfilepath, path to the text file to be loaded
    Output: Pandas DataFrame holding the contents of that file
    """
    # The file is parsed as tab-delimited text decoded as ISO-8859-1.
    parse_options = {"sep": "\t", "encoding": "ISO-8859-1"}
    return pd.read_csv(textfilepath, **parse_options)

In [4]:
# Load one enrollment export per school year into its own DataFrame
# (read_text_file parses each file as tab-separated text).
df2016 = read_text_file("./txt_files/Enrollment_txt_files/2016-17_Enrollment.txt")
df2017 = read_text_file("./txt_files/Enrollment_txt_files/2017-18_Enrollment.txt")
df2018 = read_text_file("./txt_files/Enrollment_txt_files/2018-19_Enrollment.txt")

In [5]:
# Display the 2016-17 frame to inspect its columns and a sample of rows.
df2016

Unnamed: 0,CDS_CODE,COUNTY,DISTRICT,SCHOOL,ETHNIC,GENDER,KDGN,GR_1,GR_2,GR_3,...,GR_7,GR_8,UNGR_ELM,GR_9,GR_10,GR_11,GR_12,UNGR_SEC,ENR_TOTAL,ADULT
0,4755070433946,Butte,Gridley Unified,Esperanza High (Continuation),0,F,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
1,4755070433946,Butte,Gridley Unified,Esperanza High (Continuation),5,F,0,0,0,0,...,0,0,0,0,0,4,4,0,8,0
2,4755070433946,Butte,Gridley Unified,Esperanza High (Continuation),7,F,0,0,0,0,...,0,0,0,0,0,1,2,0,3,0
3,4755070433946,Butte,Gridley Unified,Esperanza High (Continuation),6,M,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
4,4755070433946,Butte,Gridley Unified,Esperanza High (Continuation),5,M,0,0,0,0,...,0,0,0,0,1,1,7,0,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129808,36750443630407,San Bernardino,Hesperia Unified,Hesperia High,9,F,0,0,0,0,...,0,0,0,1,1,0,0,0,2,0
129809,36750443630407,San Bernardino,Hesperia Unified,Hesperia High,7,F,0,0,0,0,...,0,0,0,28,25,21,26,0,100,0
129810,36750443630407,San Bernardino,Hesperia Unified,Hesperia High,2,M,0,0,0,0,...,0,0,0,3,2,4,2,0,11,0
129811,36750443630407,San Bernardino,Hesperia Unified,Hesperia High,6,M,0,0,0,0,...,0,0,0,17,10,10,11,0,48,0


Preprocessing steps:
- Determine male/female total enrollment for each school
- Determine total enrollment by race
    + Code 0 = Not reported
    + Code 1 = American Indian or Alaska Native, Not Hispanic
    + Code 2 = Asian, Not Hispanic
    + Code 3 = Pacific Islander, Not Hispanic
    + Code 4 = Filipino, Not Hispanic
    + Code 5 = Hispanic or Latino
    + Code 6 = African American, not Hispanic
    + Code 7 = White, not Hispanic
    + Code 9 = Two or More Races, Not Hispanic
- Drop non-high schools 
- Produce 1 row of data for each school with fields like "ENR_F", "ENR_M", "ENR_0", "ENR_1", etc.

In [35]:
#extract school codes
def get_unique_school_codes(df):
    """
    Derive the school code of every row and return the distinct codes.

    The school code is the last 7 characters of the CDS_CODE value's string
    form, converted to an integer.

    Input: Pandas DataFrame with a "CDS_CODE" column
    Output: list of the distinct integer school codes (in no particular order)
    Side effect: adds (or overwrites) a "School_code" column on df in place.

    Values whose trailing characters do not parse as an integer (for example
    missing values) raise an error.
    """
    # Vectorised form of int(str(code)[-7:]) applied to every row.
    school_codes = df["CDS_CODE"].astype(str).str[-7:].astype("int64")
    # Assign as a plain array so the values are placed by position and the
    # frame's index labels are never consulted.
    df["School_code"] = school_codes.to_numpy()
    return list(set(school_codes.tolist()))  # remove duplicates

# Adds the "School_code" column to df2016 in place and collects the distinct codes.
codes = get_unique_school_codes(df2016)

#drop rows with invalid school codes
def drop_rows(df):
    """
    Remove rows that do not carry a usable school code.

    Input: Pandas DataFrame with a "School_code" column
    Output: new Pandas DataFrame without the rows whose School_code is null,
            0 or 1; the input frame is left unchanged

    Note: this step does not filter by grade level; schools with
    elementary/middle enrollment are handled by drop_little_ones.
    """
    codes = df["School_code"]
    # Select rows with a boolean mask instead of collecting row positions:
    # df.drop() interprets its argument as index *labels*, so positions only
    # match when the frame still has its default 0..n-1 index.
    invalid = codes.isnull() | codes.isin([0, 1])
    return df.loc[~invalid].copy()

# Rebind df2016 to the frame returned by drop_rows (rows without a usable school code removed).
df2016 = drop_rows(df2016)

#get associated rows for each school code 
def determine_school_row_associations(df):
    """
    Group the row labels of df by school code.

    Input: Pandas DataFrame with a "School_code" column
    Output: dict mapping each school code to the list of index labels of the
            rows carrying that code; codes appear in order of first
            occurrence and labels in row order
    """
    associated_rows = {}
    # Walk the index labels and codes side by side in one pass. This avoids a
    # label-based lookup per row, which is slow and returns a Series (not a
    # scalar) when index labels repeat.
    for label, code in zip(df.index, df["School_code"]):
        associated_rows.setdefault(code, []).append(label)
    return associated_rows

# Map each school code in df2016 to the index labels of its rows.
associated_rows = determine_school_row_associations(df2016)

#drop schools with non-high school enrollment 
def drop_little_ones(
    df,
    lower_grade_cols=("KDGN", "GR_1", "GR_2", "GR_3", "GR_4", "GR_5", "GR_6", "GR_7", "GR_8"),
):
    """
    Remove every school that has enrollment below high-school level.

    A school is removed entirely (all of its rows) as soon as any one of its
    rows has a non-zero value in any of the lower-grade columns.

    Input: df, Pandas DataFrame with a "School_code" column and the
               lower-grade enrollment columns
           lower_grade_cols, names of the columns that count as non-high-school
               enrollment (defaults to kindergarten through grade 8)
    Output: new Pandas DataFrame containing only the rows of the remaining
            schools; the input frame is left unchanged

    NOTE(review): UNGR_ELM is not part of the default column list - confirm
    whether ungraded elementary enrollment should also disqualify a school.
    """
    # Rows with any non-zero lower-grade count (a missing count also compares
    # as "not equal to 0").
    has_lower_enrollment = df[list(lower_grade_cols)].ne(0).any(axis=1)
    # Every school code that owns at least one such row.
    little_schools = df.loc[has_lower_enrollment, "School_code"].unique()
    # Keep only rows belonging to schools that were never flagged.
    keep = ~df["School_code"].isin(little_schools)
    return df.loc[keep].copy()

# Keep only the schools with no KDGN-GR_8 enrollment; df2016 itself is not rebound.
new_df2016 = drop_little_ones(df2016)

#get male/female total enrollment by summing ENR_TOTAL over the rows whose GENDER is M/F respectively
#get total enrollment by race by summing ENR_TOTAL over the rows whose ETHNIC code is 0, 1, 2, ...