In [1]:
import pandas as pd

In [2]:
def dat_to_csv_converter(dat_file_path):
    """Convert a comma-separated ``.dat`` file into a ``.csv`` file beside it.

    The output path is the input path with its extension replaced by ``.csv``.

    Parameters
    ----------
    dat_file_path : str
        Path of the comma-separated input file.

    Returns
    -------
    str or None
        Path of the CSV file that was written, or ``None`` when reading or
        writing failed (the error is printed instead of being raised).
    """
    import os  # local import: only needed to split off the file extension

    try:
        # The .dat file is assumed to be comma separated.
        data = pd.read_csv(dat_file_path, delimiter=',')
        # splitext copes with extensions of any length; slicing off a fixed
        # number of characters would mangle names such as "file.data".
        csv_file_path = os.path.splitext(dat_file_path)[0] + ".csv"
        # index=False keeps the row index out of the written file.
        data.to_csv(csv_file_path, index=False)
        return csv_file_path
    except Exception as e:
        # Best effort: report the problem and signal failure with None.
        print(e)
        return None

# Relative directory that the yearly raw input files are read from.
base_file_path = '../raw_data/'

# Collects one dataframe per year before they are concatenated.
final_dataframes = list()

In [3]:
# Columns (and their order) of the exported dataset.
final_cols = [
    'schoolcode', 'year', 'charter', 'mathpass',
    'schoolmode', 'virtualper', 'hybridper',
    'totaltested', 'lowincome',
    'white', 'black', 'hispanic', 'asian',
]

# Columns kept from the yearly test-score data before the learning-model
# columns are merged in.
main_dataframe_cols = [
    'schoolcode', 'year', 'mathpass',
    'totaltested', 'lowincome',
    'white', 'black', 'hispanic', 'asian',
]

def final_data_generator(csv_file_path, year):
    """Load one year's CSV and derive the standardised analysis columns.

    Column names are lower-cased, then these columns are added:

    * ``year`` - the ``year`` argument.
    * ``schoolcode`` - copy of ``campus``.
    * ``totaltested`` - copy of ``a1_all_d`` (kept as a count).
    * ``asian``, ``black``, ``white``, ``hispanic`` - ``a1_etha_d``,
      ``a1_ethb_d``, ``a1_ethw_d``, ``a1_ethh_d`` divided by ``totaltested``.
    * ``lowincome`` - ``a1_eco1_d + a1_eco2_d`` divided by ``totaltested``.
    * ``mathpass`` - ``a1_all_meetsgl_nm`` when that column exists, otherwise
      ``a1_all_satis_rec_nm``, divided by ``totaltested``.

    Rows whose ``totaltested`` is 0 get NaN (not ``inf``) in the ratio
    columns, so they are caught by NaN-based filtering downstream.

    Parameters
    ----------
    csv_file_path : str
        Path of the CSV file to read.
    year : int
        Value stored in the ``year`` column of every row.

    Returns
    -------
    pandas.DataFrame
        All columns of the input file (lower-cased) plus the derived ones.

    Raises
    ------
    KeyError
        If one of the source columns listed above is missing.
    """
    frame = pd.read_csv(csv_file_path)
    frame.columns = frame.columns.str.lower()

    frame['year'] = year
    frame['schoolcode'] = frame['campus']
    frame['totaltested'] = frame['a1_all_d']
    frame['asian'] = frame['a1_etha_d']
    frame['black'] = frame['a1_ethb_d']
    frame['white'] = frame['a1_ethw_d']
    frame['hispanic'] = frame['a1_ethh_d']
    frame['lowincome'] = frame['a1_eco2_d'] + frame['a1_eco1_d']

    # The pass-count column is not named the same in every file.
    if 'a1_all_meetsgl_nm' in frame.columns:
        pass_column = 'a1_all_meetsgl_nm'
    else:
        pass_column = 'a1_all_satis_rec_nm'
    frame['mathpass'] = frame[pass_column]

    # Mask zero denominators with NaN so the ratios become NaN instead of inf.
    denominator = frame['totaltested'].where(frame['totaltested'] != 0)
    for column in ('asian', 'black', 'white', 'hispanic', 'mathpass', 'lowincome'):
        frame[column] = frame[column] / denominator

    return frame

In [4]:
# Convert each year's .dat file to CSV and load it as a per-year dataframe.
# The list is rebuilt from scratch here so that re-running this cell does not
# append the same years a second time.
final_dataframes = []
for year in range(2015, 2023):
    if year == 2020:
        continue  # no file is loaded for 2020
    current_file_path = base_file_path + "texas_math_" + str(year) + ".dat"
    final_csv_path = dat_to_csv_converter(current_file_path)
    if final_csv_path is None:
        # The converter signals failure with None; stop with a clear message
        # instead of passing None on to pd.read_csv.
        raise RuntimeError("Could not convert " + current_file_path + " to CSV")
    final_dataframes.append(final_data_generator(final_csv_path, year))

# Stack all years and keep only the columns used for the analysis.
final_dataframe = pd.concat(final_dataframes)
final_dataframe = final_dataframe[main_dataframe_cols]

final_dataframe.shape

(24255, 9)

In [None]:
# Every year that occurs anywhere in the combined data.
unique_years = final_dataframe['year'].unique()


def filter_complete_data(group):
    """Tell whether one school's rows form a complete record.

    A group is complete when its set of years equals the module-level
    ``unique_years`` and none of its cells is NaN.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single ``schoolcode``.

    Returns
    -------
    bool
        True if the group should be kept, False otherwise.
    """
    years_present = set(group['year'].unique())
    if years_present != set(unique_years):
        return False

    # Complete only when every cell of the group is populated.
    return bool(group.notna().all().all())


# Keep only the schools ('schoolcode' groups) that pass the completeness check.
final_dataframe = final_dataframe.groupby('schoolcode').filter(filter_complete_data)

final_dataframe.shape

In [None]:
# Display the number of missing values left in each column.
final_dataframe.isna().sum()

In [None]:
# Display the (rows, columns) size of the dataframe at this point.
final_dataframe.shape

# Import Virtual Data

In [None]:
# Load the learning-model file; thousands=',' makes pandas parse numbers
# written with comma group separators.
virtual = pd.read_csv(
    "../raw_data/Texas_Schools_LearningModelData_Final.csv",
    thousands=',',
)

# Recode the Charter column from 'Yes'/'No' to 1/0.
virtual['Charter'] = virtual['Charter'].replace(
    {'Yes': 1, 'No': 0}
)

In [None]:
# Keep only the rows whose TimePeriodStart string ends in '21' or '22'.
# na=False treats a missing start date as "no match" (a NaN in the mask would
# make boolean indexing fail), and .copy() makes `virtual` an independent
# frame so the column assignments in later cells do not raise
# SettingWithCopyWarning.
virtual = virtual[
    virtual['TimePeriodStart'].str.endswith(('21', '22'), na=False)
].copy()
virtual.head()

In [None]:
# Parse both period boundaries into datetime values.
for date_column in ('TimePeriodStart', 'TimePeriodEnd'):
    virtual[date_column] = pd.to_datetime(virtual[date_column])

# The year of a period is taken from its start date.
virtual['year'] = virtual['TimePeriodStart'].dt.year

# Rows without a learning model are labelled 'InPerson'.
# NOTE(review): a later cell drops a pivot column named "In-person", which
# suggests the data may already use that spelling for in-person rows - confirm
# that 'InPerson' is the intended label here.
virtual['LearningModel'] = virtual['LearningModel'].fillna('InPerson')

In [None]:
# Length of each reporting period in whole days (end minus start).
virtual['Days'] = virtual['TimePeriodEnd'].sub(virtual['TimePeriodStart']).dt.days

# Total days per school, year, learning model and charter flag.
grouped = (
    virtual
    .groupby(['StateAssignedSchoolID', 'year', 'LearningModel', 'Charter'])['Days']
    .sum()
    .reset_index()
)

# One column per learning model; combinations that never occur are filled with 0.
pivot = grouped.pivot_table(
    index=['StateAssignedSchoolID', 'year', 'Charter'],
    columns='LearningModel',
    values='Days',
    fill_value=0,
)

In [None]:
# Turn the (school, year, charter) index levels back into ordinary columns.
pivot = pivot.reset_index()

# Days counted per row: the 'Virtual' column plus the 'InPerson' column.
# NOTE(review): a later cell drops a separate "In-person" column; if that
# column holds real in-person days they are not part of TotalDays and the
# virtual share below is overstated - confirm against the data.
pivot['TotalDays'] = pivot['Virtual'] + pivot['InPerson']

# Share of the counted days that were virtual.
pivot['VirtualPercent'] = pivot['Virtual'] / pivot['TotalDays']

# The score is the same ratio as VirtualPercent.
pivot['Score'] = pivot['VirtualPercent']

# Remove the leftover 'LearningModel' label from the column axis.
pivot.columns.name = None

In [None]:
# Drop the per-model day counts now that the ratios have been computed.
pivot = pivot.drop(columns=['InPerson', "In-person", 'Virtual', 'TotalDays'])

# fillna returns a new frame, so the result has to be assigned back for the
# missing ratios (e.g. 0 / 0 days) to actually become 0.
pivot = pivot.fillna(0)

pivot.head()

# Begin Merge Process

In [None]:
# School codes that occur in both the test-score data and the learning-model data.
common_schoolcodes = (
    set(final_dataframe['schoolcode'].unique())
    & set(virtual['StateAssignedSchoolID'].unique())
)

# Restrict both dataframes to those shared schools.
final_dataframe = final_dataframe.loc[
    final_dataframe['schoolcode'].isin(common_schoolcodes)
]
virtual = virtual.loc[
    virtual['StateAssignedSchoolID'].isin(common_schoolcodes)
]



In [None]:
# Display the (rows, columns) size left after restricting to the shared schools.
final_dataframe.shape

In [None]:
# Attach the learning-model ratios in `pivot` to the test-score rows.
# A left join keeps every test-score row; school/year pairs without a match
# get NaN in the columns coming from `pivot`.
export_dataframe = pd.merge(
    final_dataframe,
    pivot,
    how='left',
    left_on=['schoolcode', 'year'],
    right_on=['StateAssignedSchoolID', 'year'],
)


In [None]:
# Order the rows by school and then by year.
export_dataframe = export_dataframe.sort_values(by=['schoolcode', 'year'])

export_dataframe.head(10)


In [None]:
# Spread Charter and StateAssignedSchoolID to the years that had no match in
# the learning-model data. Both the forward and the backward fill run inside
# each school's own group, so a value can never leak in from a neighbouring
# school; schools with no value at all stay NaN and become 0 below.
for column in ('Charter', 'StateAssignedSchoolID'):
    export_dataframe[column] = (
        export_dataframe
        .groupby('schoolcode')[column]
        .transform(lambda values: values.ffill().bfill())
    )

# Copy the merged columns to the names used in the exported dataset.
export_dataframe["virtualper"] = export_dataframe["VirtualPercent"]
export_dataframe["schoolmode"] = export_dataframe["Score"]
export_dataframe["charter"] = export_dataframe["Charter"]
# hybridper is set to a constant 0 for every row.
export_dataframe["hybridper"] = 0

# Remaining gaps become 0, then keep only the export columns in their final order.
export_dataframe = export_dataframe.fillna(0)
export_dataframe = export_dataframe.reset_index(drop=True)
export_dataframe = export_dataframe[final_cols]
export_dataframe.head(10)

In [None]:
# Write the final dataset to disk.
# NOTE(review): index=False is not passed, so the row index is written as an
# unnamed first column - confirm that readers of this file expect it.
export_dataframe.to_csv("../final_data_components/mathpass_texas.csv")