In [1]:
import pandas as pd
import os

In [2]:
def dat_to_csv_converter(dat_file_path):
    # Step 1: Read the .dat file using pandas
    # Assume .dat file is comma seperated
    try:
        data = pd.read_csv(dat_file_path, delimiter=',', on_bad_lines='skip')
        csv_file_path = dat_file_path[:-4] + ".csv"
        # Step 3: Write the data to a .csv file
        data.to_csv(csv_file_path, index=False)  # Set index=False to omit row indices in the CSV
        return csv_file_path
    except Exception as e:
        print(e)

base_file_path = '../raw_data/'

final_dataframes = []

In [3]:
final_cols = [
 'schoolcode',
 'year',
 'charter',
 'mathpass',
 'schoolmode',
 'virtualper',
 'hybridper',
 'totaltested',
 'lowincome',
 'white',
 'black',
 'hispanic',
 'asian',
]

main_dataframe_cols = [
 'schoolcode',
 'year',
 'mathpass',
 'totaltested',
 'lowincome',
 'white',
 'black',
 'hispanic',
 'asian',
]

def final_data_generator(csv_file_path, year):
    raw_dataframe = pd.read_csv(csv_file_path)
    working_dataframe = raw_dataframe.copy()

    working_dataframe.columns = working_dataframe.columns.str.lower()
    working_dataframe['year'] = year
    working_dataframe['schoolcode'] = working_dataframe['campus']
    working_dataframe['district'] = working_dataframe['district']
    working_dataframe['totaltested'] = working_dataframe['e1_all_d'] 
    working_dataframe['asian'] = working_dataframe['e1_etha_d']
    working_dataframe['black'] = working_dataframe['e1_ethb_d']
    working_dataframe['white'] = working_dataframe['e1_ethw_d']
    working_dataframe['hispanic'] = working_dataframe['e1_ethh_d']
    working_dataframe['lowincome'] = working_dataframe['e1_eco2_d'] + working_dataframe['e1_eco1_d']
    working_dataframe['mathpass'] = working_dataframe['e1_all_meetsgl_nm'] if 'e1_all_meetsgl_nm' in working_dataframe else working_dataframe ["e1_all_satis_rec_nm"]  
    
    working_dataframe['totaltested'] = pd.to_numeric(working_dataframe['totaltested'], errors='coerce')
    working_dataframe['asian'] = pd.to_numeric(working_dataframe['asian'] , errors='coerce')
    working_dataframe['black'] = pd.to_numeric(working_dataframe['black'] , errors='coerce')
    working_dataframe['white'] = pd.to_numeric(working_dataframe['white'] , errors='coerce')
    working_dataframe['hispanic'] = pd.to_numeric(working_dataframe['hispanic'] , errors='coerce')
    working_dataframe['mathpass'] = pd.to_numeric(working_dataframe['mathpass'] , errors='coerce')
    working_dataframe['lowincome'] = pd.to_numeric(working_dataframe['lowincome'] , errors='coerce')

    working_dataframe['asian'] /= working_dataframe['totaltested']
    working_dataframe['black'] /= working_dataframe['totaltested']
    working_dataframe['white'] /= working_dataframe['totaltested']
    working_dataframe['hispanic'] /= working_dataframe['totaltested']
    working_dataframe['mathpass'] /= working_dataframe['totaltested']
    working_dataframe['lowincome'] /= working_dataframe['totaltested']

    final_dataframe = working_dataframe

    return final_dataframe

In [4]:
# generate csv data from dat files
for year in range(2015, 2022):
    if year in [2020]:
        continue

    dat_file_path = base_file_path + "texas_ela_" + str(year) + ".dat"
    csv_file_path = base_file_path + "texas_ela_" + str(year) + ".csv"

    if not os.path.exists(csv_file_path):
        # If the .csv file does not exist, convert .dat to .csv
        final_csv_path = dat_to_csv_converter(dat_file_path)
    else:
        # If the .csv file does exist, use it directly
        final_csv_path = csv_file_path
        
    final_dataframes.append(final_data_generator(final_csv_path, year))
    print(f"{year} year is done")

final_dataframe = pd.concat(final_dataframes)
final_dataframe = final_dataframe[main_dataframe_cols]

final_dataframe.shape

2015 year is done
2016 year is done
2017 year is done
2018 year is done
2019 year is done
2021 year is done


(13052, 9)

In [5]:
# Find unique years in the DataFrame
unique_years = final_dataframe['year'].unique()

# Function to filter schools with complete data across all years
def filter_complete_data(group):
    group_years = group['year'].unique()
    
    # Check that the school has an entry for each year
    if set(group_years) != set(unique_years):
        return False
    
    # Check that there are no NaN values
    return not group.isna().any().any()

# Group by 'SchoolCode' and filter
final_dataframe = final_dataframe.groupby('schoolcode').filter(filter_complete_data)

final_dataframe.shape

(10290, 9)

In [6]:
final_dataframe.isna().sum()

schoolcode     0
year           0
mathpass       0
totaltested    0
lowincome      0
white          0
black          0
hispanic       0
asian          0
dtype: int64

In [7]:
final_dataframe.shape

(10290, 9)

# Import Virtual Data

In [8]:
virtual = pd.read_csv("../raw_data/Texas_Schools_LearningModelData_Final.csv", thousands=',')
virtual['Charter'] = virtual['Charter'].replace({'Yes': 1, 'No': 0})

In [9]:
virtual = virtual[virtual['TimePeriodStart'].str.endswith(('21', '22'))]
virtual.head()

Unnamed: 0,StateName,StateAbbrev,DataLevel,Charter,SchoolName,SchoolType,NCESSchoolID,StateAssignedSchoolID,DistrictName,DistrictType,...,LearningModelGr912,LearningModelStateCat,LearningModelStateCatGrK5,LearningModelStateCatGr68,LearningModelStateCatGr912,EnrollmentInPerson,EnrollmentHybrid,EnrollmentVirtual,StaffCount,StaffCountInPerson
1,Texas,TX,School,0,Crosbyton Elementary,Regular school,480000101145,54901101,Crosbyton CISD,Regular local school district,...,,,,,,141.0,0.0,2.0,,
3,Texas,TX,School,0,Crosbyton Secondary,Regular school,480000101146,54901001,Crosbyton CISD,Regular local school district,...,,,,,,165.0,0.0,4.0,,
5,Texas,TX,School,0,Sp Ed Co-Op,Other/alternative school,480000103621,54901200,Crosbyton CISD,Regular local school district,...,,,,,,5.0,0.0,0.0,,
7,Texas,TX,School,0,Spur School,Regular school,480000204732,63903001,Spur ISD,Regular local school district,...,,,,,,219.0,0.0,13.0,,
9,Texas,TX,School,0,Rocksprings K-12,Regular school,480000304219,69901001,Rocksprings ISD,Regular local school district,...,,,,,,255.0,0.0,7.0,,


In [10]:
# Convert the date columns to datetime format.
virtual['TimePeriodStart'] = pd.to_datetime(virtual['TimePeriodStart'])
virtual['TimePeriodEnd'] = pd.to_datetime(virtual['TimePeriodEnd'])

# Create a new column for year
virtual['year'] = virtual['TimePeriodStart'].dt.year

# Fill in any missing values in LearningModel with 'InPerson'
virtual['LearningModel'] = virtual['LearningModel'].fillna('InPerson')

In [11]:
# Calculate the number of days for each row
virtual['Days'] = (virtual['TimePeriodEnd'] - virtual['TimePeriodStart']).dt.days

# Group by School, Year, LearningModel, and Charter and sum the number of days
grouped = virtual.groupby(['StateAssignedSchoolID', 'year', 'LearningModel', 'Charter'])['Days'].sum().reset_index()

# Pivot the data so we have separate columns for each learning model
pivot = grouped.pivot_table(index=['StateAssignedSchoolID', 'year', 'Charter'], columns='LearningModel', values='Days', fill_value=0)

In [12]:
# Reset the index
pivot.reset_index(inplace=True)

# Calculate the total days in each year
pivot['TotalDays'] = pivot['Virtual'] + pivot['InPerson']

# Calculate the percentage of days that are virtual and hybrid for each year
pivot['VirtualPercent'] = pivot['Virtual'] / pivot['TotalDays']

# Calculate the score for each year
pivot['Score'] = (pivot['Virtual'] / pivot['TotalDays'])

pivot.columns.name = None

In [13]:
pivot = pivot.drop(columns=['InPerson', "In-person", 'Virtual', 'TotalDays'])

pivot.fillna(0)

Unnamed: 0,StateAssignedSchoolID,year,Charter,VirtualPercent,Score
0,1902001,2021,0,0.0,0.0
1,1902041,2021,0,0.0,0.0
2,1902103,2021,0,0.0,0.0
3,1903001,2021,0,0.0,0.0
4,1903041,2021,0,0.0,0.0
...,...,...,...,...,...
8459,254901101,2021,0,0.0,0.0
8460,254901104,2021,0,0.0,0.0
8461,254901106,2021,0,0.0,0.0
8462,254902001,2021,0,1.0,1.0


# Begin Merge Process

In [14]:
# Find the intersection of unique SchoolCodes in both DataFrames
common_schoolcodes = set(final_dataframe['schoolcode'].unique()).intersection(set(virtual['StateAssignedSchoolID'].unique()))

# Filter both DataFrames to only include these SchoolCodes
final_dataframe = final_dataframe[final_dataframe['schoolcode'].isin(common_schoolcodes)]
virtual = virtual[virtual['StateAssignedSchoolID'].isin(common_schoolcodes)]



In [15]:
final_dataframe.shape

(10104, 9)

In [16]:
# Merge dropout_df with final_data
export_dataframe = pd.merge(final_dataframe, pivot, left_on=['schoolcode', 'year'], right_on=['StateAssignedSchoolID', 'year'], how='left')


In [17]:
export_dataframe.sort_values(by=['schoolcode', 'year'], inplace=True)

export_dataframe.head(10)


Unnamed: 0,schoolcode,year,mathpass,totaltested,lowincome,white,black,hispanic,asian,StateAssignedSchoolID,Charter,VirtualPercent,Score
0,1902001,2015,0.6,45,0.288889,0.822222,0.022222,0.044444,0.044444,,,,
1684,1902001,2016,0.473684,38,0.263158,0.868421,0.0,0.0,0.0,,,,
3368,1902001,2017,0.414634,41,0.365854,0.804878,0.073171,0.073171,0.0,,,,
5052,1902001,2018,0.595745,47,0.297872,0.87234,0.021277,0.06383,0.0,,,,
6736,1902001,2019,0.630435,46,0.586957,0.782609,0.021739,0.086957,0.0,,,,
8420,1902001,2021,0.682927,41,0.439024,0.829268,0.02439,0.121951,0.02439,1902001.0,0.0,,
1,1903001,2015,0.550562,89,0.47191,0.876404,0.033708,0.044944,0.022472,,,,
1685,1903001,2016,0.684211,95,0.547368,0.873684,0.021053,0.094737,0.010526,,,,
3369,1903001,2017,0.518868,106,0.518868,0.735849,0.075472,0.103774,0.009434,,,,
5053,1903001,2018,0.488372,129,0.527132,0.837209,0.031008,0.077519,0.0,,,,


In [18]:
export_dataframe['Charter'] = export_dataframe.groupby('schoolcode')['Charter'].ffill().bfill()
export_dataframe['StateAssignedSchoolID'] = export_dataframe.groupby('schoolcode')['StateAssignedSchoolID'].ffill().bfill()
export_dataframe["virtualper"] = export_dataframe["VirtualPercent"]
export_dataframe["schoolmode"] = export_dataframe["Score"]
export_dataframe["charter"] = export_dataframe["Charter"]
export_dataframe["hybridper"] = 0

export_dataframe = export_dataframe.fillna(0)
export_dataframe = export_dataframe.reset_index(drop=True)
export_dataframe = export_dataframe[final_cols]
export_dataframe.head(10)

Unnamed: 0,schoolcode,year,charter,mathpass,schoolmode,virtualper,hybridper,totaltested,lowincome,white,black,hispanic,asian
0,1902001,2015,0.0,0.6,0.0,0.0,0,45,0.288889,0.822222,0.022222,0.044444,0.044444
1,1902001,2016,0.0,0.473684,0.0,0.0,0,38,0.263158,0.868421,0.0,0.0,0.0
2,1902001,2017,0.0,0.414634,0.0,0.0,0,41,0.365854,0.804878,0.073171,0.073171,0.0
3,1902001,2018,0.0,0.595745,0.0,0.0,0,47,0.297872,0.87234,0.021277,0.06383,0.0
4,1902001,2019,0.0,0.630435,0.0,0.0,0,46,0.586957,0.782609,0.021739,0.086957,0.0
5,1902001,2021,0.0,0.682927,0.0,0.0,0,41,0.439024,0.829268,0.02439,0.121951,0.02439
6,1903001,2015,0.0,0.550562,0.0,0.0,0,89,0.47191,0.876404,0.033708,0.044944,0.022472
7,1903001,2016,0.0,0.684211,0.0,0.0,0,95,0.547368,0.873684,0.021053,0.094737,0.010526
8,1903001,2017,0.0,0.518868,0.0,0.0,0,106,0.518868,0.735849,0.075472,0.103774,0.009434
9,1903001,2018,0.0,0.488372,0.0,0.0,0,129,0.527132,0.837209,0.031008,0.077519,0.0


In [19]:
export_dataframe.to_csv("../final_data_components/elapass_texas.csv")