In [42]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

### Additional Preprocessing

The format of the data for an RNN or HMM is: num_samples $\times$ time_series_length $\times$ num_features. 

In our case, this means that the format should be:

number of schools $\times$ 3 years of data $\times$ number of school features

Since our current preprocessed dataframe is 2-dimensional, we will need to do some additional preprocessing to ensure that our data is of the correct shape. 

In [43]:
df = pd.read_csv("./preprocessed_data/2016-19ChronAbsenteeism.csv", sep=",")
df = df.drop(["Unnamed: 0"], axis=1)

In [44]:
def get_indices_where_new_schools_start(df):
    """
    Returns a list of the row indices for which new schools' data starts. For example, if
    data for school 1 takes up rows 0-2, data for school 2 takes up rows 3-4, and data for 
    school 3 takes up rows 5-7, then this function would return this list [0,3,5].
    Input:
    df: merged dataframe output from Preprocessing script
    Output:
    new_school_indices: Python list of row indices for which a new schools' data starts
    """
    schools = df["SchoolCode"] 
    i, new_school_indices = 0, []
    current_school = schools[i]
    while i < df.shape[0]-1: 
        while i < df.shape[0]-1 and current_school == schools[i]:
            i += 1
        # inner while loop terminates whenever a new school is hit, so we append that index 
        # and reset the current_school
        new_school_indices.append(i)
        current_school = schools[i]
    return new_school_indices

In [45]:
new_school_indices = get_indices_where_new_schools_start(df)

In [46]:
def stack_data(df, time_series_length):
    """
    Stacks a 2-dimensional data frame into a 3-dimensional NumPy array of shape:
    num_schools x time_series_length x num_features. Uses get_indices_where_new_schools_start()
    above to determine how to divide input df.
    Inputs:
    df: merged dataframe output from Preprocessing script
    time_series_length: the number of years of data we have for each school 
    Outputs:
    master_data: 3-dimensional NumPy array with shape described above
    """
    array = df.to_numpy() #convert input dataframe to numpy array 
    new_school_indices = get_indices_where_new_schools_start(df) #get output of get_indices_where_new_schools_start()
    for i, idx in enumerate(new_school_indices): 
        if i < len(new_school_indices)-1: #check to avoid indexing errors 
            #rows_to_fill = number of rows that need to be filled with dummy data 
            #if we have time_series_length years of data for some school, then rows_to_fill = 0
            rows_to_fill = time_series_length - (new_school_indices[i+1] - idx)
s            if rows_to_fill > 0:
                fill_data = np.zeros((rows_to_fill,sub_df.shape[1])) #create fill_data to fill missing data
                sub_df = np.vstack([sub_df, fill_data]) #stack real data with fill data
            if i == 0: #for first iteration of for loop, need to initialize master_data
                master_data = np.expand_dims(sub_df,axis=0)
            else: #for all other iterations, we need to concatenate the next schools data
                master_data = np.concatenate((master_data,np.expand_dims(sub_df,axis=0)))
    return master_data

In [50]:
master = stack_data(df,3)

In [51]:
print(master.shape)

(2241, 3, 29)


In [52]:
master.shape[0]

2241

As we can see, the shape of the data above is what we desired. We have data for 2241 schools, 3 timesteps (2016, 2017, and 2018). We have 29 school features for each year.