# Preprocessing

In [1]:
import pandas as pd
import numpy as np
import sys

### Step 1
Write a function that processes a single .txt file. It must:
* drop rows that do not contain a unique school identifier.
* drop rows that correspond to elementary/middle school education. We are focusing on high school data.

In [2]:
def read_text_file(textfilepath):
    """
    Load one tab-delimited text export into a DataFrame.

    Input: textfilepath, path of the .txt file to parse (tab-separated values,
           decoded as ISO-8859-1)
    Output: Pandas DataFrame holding the contents of that file
    """
    parse_options = {"sep": "\t", "encoding": "ISO-8859-1"}
    return pd.read_csv(textfilepath, **parse_options)

In [3]:
# Load the two chronic-absenteeism exports (one per academic year) in chronological order.
df1, df2 = (
    read_text_file(path)
    for path in (
        "./txt_files/absenteeism_txt_files/2016-17_ChronAbsenteeism.txt",
        "./txt_files/absenteeism_txt_files/2017-18_ChronAbsenteeism.txt",
    )
)

In [4]:
def drop_rows(df):
    """
    Remove rows that cannot be attributed to a school or that hold grade-span breakdowns.

    Input: Pandas DataFrame with "SchoolCode" and "ReportingCategory" columns
    Output: new Pandas DataFrame (the input is left untouched) without
            (a) rows whose SchoolCode is missing or equal to 0, and
            (b) rows whose ReportingCategory is one of the grade-span codes listed below

    Rows are selected with a boolean mask rather than by collecting positions and
    dropping them as labels, so the result is correct for any index (filtered,
    non-default, or containing duplicate labels), not only a fresh 0..n-1 index.
    The surviving rows keep their original index labels.
    """
    # NOTE(review): this list removes every grade-span category, including "GR912".
    # The surrounding notebook text describes the step as removing elementary/middle
    # school rows only -- confirm that dropping "GR912" as well is intended.
    grade_span_categories = ["GRKN", "GRK", "GR13", "GR46", "GR78", "GRK8", "GR912", "GRUG"]

    school_code = df["SchoolCode"]
    has_no_school_code = school_code.isnull() | (school_code == 0)
    is_grade_span_row = df["ReportingCategory"].isin(grade_span_categories)
    return df.loc[~(has_no_school_code | is_grade_span_row)]

In [5]:
# Filter both yearly frames with the same row-dropping rules.
df1, df2 = map(drop_rows, (df1, df2))

### Step 2
We only want one row per school. As of now, the DataFrame has multiple rows per school to give metrics across different reporting groups.

To do this, we will construct a matrix where rows are individual schools and columns are 
["RB", "RI", "RA", "RF", "RH", "RD", "RP", "RT", "RW", "GM", "GF", "GX", "GZ", "SE", "SD", "SS", "SM", "SF", "SH", "TA"]

These are each of the reporting categories.

In [6]:
def generate_column_index_mapping(columns):
    """
    Input: columns, an ordered sequence of hashable column identifiers
    Output: dict mapping each identifier to its position in `columns`
            (if an identifier repeats, its last position is kept)
    """
    return {name: position for position, name in enumerate(columns)}
# Position of every reporting category within the per-school block of rate columns.
mapping = generate_column_index_mapping(
    [
        "RB", "RI", "RA", "RF", "RH", "RD", "RP", "RT", "RW",
        "GM", "GF", "GX", "GZ",
        "SE", "SD", "SS", "SM", "SF", "SH",
        "TA",
    ]
)

In [7]:
# Chronic-absenteeism-rate columns, one per reporting category code.
reporting_category_columns = [
    "CAR_RB", "CAR_RI", "CAR_RA", "CAR_RF", "CAR_RH",
    "CAR_RD", "CAR_RP", "CAR_RT", "CAR_RW",
    "CAR_GM", "CAR_GF", "CAR_GX", "CAR_GZ",
    "CAR_SE", "CAR_SD", "CAR_SS", "CAR_SM", "CAR_SF", "CAR_SH",
    "CAR_TA",
]

# Full layout of the one-row-per-school frame: identifying columns first, then the rate columns.
# NOTE(review): "CountryCode" is populated from the source column "CountyCode" in colapse_df,
# so the name looks like a typo; it is kept as-is because later cells may rely on it.
all_columns = [
    "AcademicYear", "AggregateLevel", "CountryCode", "DistrictCode", "SchoolCode",
    "CountyName", "DistrictName", "SchoolName", "CharterYN",
] + reporting_category_columns

In [8]:
def colapse_df(df):
    """
    Collapse a long-format frame (one row per school and reporting category)
    into a wide frame with exactly one row per school.

    Input: Pandas DataFrame containing the identifying columns named in
           `source_id_columns` below plus "ReportingCategory" and
           "ChronicAbsenteeismRate". Every ReportingCategory value must be a key
           of the module-level `mapping` (a KeyError is raised otherwise).
    Output: Pandas DataFrame with the columns of the module-level `all_columns`
            (object dtype), one row per distinct SchoolCode in order of first
            appearance. Identifying values are taken from the school's first row;
            each rate column holds the ChronicAbsenteeismRate of the matching
            reporting category (the last row wins if a category repeats) and is
            None when the school has no row for that category.

    Rows are addressed by position throughout, so the index of `df` may be
    arbitrary (filtered, non-contiguous, or containing duplicate labels).
    Rows whose SchoolCode is missing are ignored because they cannot be
    attributed to a school.
    """
    # Source columns copied verbatim from a school's first row; they fill the
    # leading columns of `all_columns` in this order.
    source_id_columns = ["AcademicYear", "AggregateLevel", "CountyCode", "DistrictCode", "SchoolCode",
                         "CountyName", "DistrictName", "SchoolName", "CharterYN"]
    first_rate_column = len(source_id_columns)

    # Group row positions by school; dict insertion order keeps first-appearance order.
    rows_by_school = {}
    for position, code in enumerate(df["SchoolCode"].to_numpy()):
        if pd.isnull(code):
            continue
        rows_by_school.setdefault(code, []).append(position)

    # Pre-allocate the whole result once (object arrays start out filled with None)
    # instead of re-copying a growing array for every school.
    table = np.empty((len(rows_by_school), len(all_columns)), dtype=object)
    for out_row, positions in enumerate(rows_by_school.values()):
        first_row = df.iloc[positions[0]]
        for column, name in enumerate(source_id_columns):
            table[out_row, column] = first_row[name]
        for position in positions:
            entry = df.iloc[position]
            rate_column = first_rate_column + mapping[entry["ReportingCategory"]]
            table[out_row, rate_column] = entry["ChronicAbsenteeismRate"]
    return pd.DataFrame(table, columns=all_columns)

In [9]:
# Build the one-row-per-school frame for each academic year.
new_df1, new_df2 = (colapse_df(yearly_frame) for yearly_frame in (df1, df2))

In [10]:
# Show the collapsed 2016-17 frame as this cell's output.
new_df1

Unnamed: 0,AcademicYear,AggregateLevel,CountryCode,DistrictCode,SchoolCode,CountyName,DistrictName,SchoolName,CharterYN,CAR_RB,...,CAR_GF,CAR_GX,CAR_GZ,CAR_SE,CAR_SD,CAR_SS,CAR_SM,CAR_SF,CAR_SH,CAR_TA
0,2016-17,S,1,10017,112607,Alameda,Alameda County Office of Education,Envision Academy for Arts & Technology,All,23.5,...,26.2,,,22.9,19.6,24.3,,,,23.9
1,2016-17,S,1,10017,123968,Alameda,Alameda County Office of Education,Community School for Creative Education,All,27.6,...,17.3,,,16.1,14.7,17.7,,,,19.8
2,2016-17,S,1,10017,124172,Alameda,Alameda County Office of Education,Yu Ming Charter,All,21.4,...,5.6,,,4.3,4.3,12.1,,,,4.2
3,2016-17,S,1,10017,125567,Alameda,Alameda County Office of Education,Urban Montessori Charter,All,12.7,...,13.5,,,14.7,15.6,17.7,,,,12.9
4,2016-17,S,1,10017,130401,Alameda,Alameda County Office of Education,Alameda County Juvenile Hall/Court,All,7,...,10,,,9.5,11.9,8.8,,11.9,8,8.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10012,2016-17,S,58,72751,6.05683e+06,Yuba,Wheatland,Lone Tree Elementary,All,2.6,...,6.8,,,7.7,9.4,7.5,,,,5.5
10013,2016-17,S,58,72751,6.05684e+06,Yuba,Wheatland,Wheatland Elementary,All,,...,6.9,,,3,7.7,10.5,,,27.3,7.1
10014,2016-17,S,58,72751,6.11881e+06,Yuba,Wheatland,Wheatland Charter Academy,All,,...,1.9,,,,7.1,1.9,,,,2.9
10015,2016-17,S,58,72769,123570,Yuba,Wheatland Union High,Wheatland Community Day High,All,,...,,,,,,,,,,55.6


### Step 3
Write a function that generates a single DataFrame given multiple dataframes from different time periods of the same category. The resultant DataFrame should organize each school's data in chronological order.