In [1]:
import os

import pandas as pd

In [2]:
def dat_to_csv_converter(dat_file_path):
    """Convert a comma-separated ``.dat`` file into a ``.csv`` file beside it.

    The input is parsed with pandas (a comma delimiter is assumed) and written
    back out without the row index. The output path is the input path with
    its extension, whatever its length, replaced by ``.csv``.

    Parameters
    ----------
    dat_file_path : str
        Path of the comma-separated input file.

    Returns
    -------
    str
        Path of the CSV file that was written.

    Raises
    ------
    Exception
        Errors from ``pd.read_csv`` / ``DataFrame.to_csv`` (missing file,
        parse error, ...) propagate to the caller instead of being printed
        and turned into a ``None`` return value, which previously made the
        next pipeline step fail with an unrelated error.
    """
    # Step 1: read the .dat file (assumed to be comma separated).
    data = pd.read_csv(dat_file_path, delimiter=',')

    # Step 2: derive the output path by swapping the extension for ".csv".
    # splitext only strips a real extension, so names that do not end in a
    # 4-character suffix are handled correctly.
    root, _extension = os.path.splitext(dat_file_path)
    csv_file_path = root + ".csv"

    # Step 3: write the data out; index=False omits the row indices.
    data.to_csv(csv_file_path, index=False)
    return csv_file_path

# Directory (relative to the notebook) that holds the raw yearly .dat files.
base_file_path = "../raw_data/"

# Receives one processed DataFrame per school year.
final_dataframes = list()

In [3]:
# Columns of the exported dataset, in output order.
final_cols = [
    'schoolcode', 'year', 'charter',
    'mathpass',
    'schoolmode', 'virtualper', 'hybridper',
    'totaltested', 'lowincome',
    'white', 'black', 'hispanic', 'asian',
]

# Columns kept from the yearly math files: everything in final_cols except
# the charter and learning-mode fields, which are added after the merge.
main_dataframe_cols = [
    'schoolcode', 'year', 'mathpass',
    'totaltested', 'lowincome',
    'white', 'black', 'hispanic', 'asian',
]

def final_data_generator(csv_file_path, year):
    """Load one year's math CSV and derive the standardised analysis columns.

    Column names are lower-cased, then the following columns are added:
    ``year``, ``schoolcode`` (from ``campus``), ``totaltested`` (from
    ``a1_all_d``), and the shares ``asian``, ``black``, ``white``,
    ``hispanic``, ``lowincome`` and ``mathpass``, each divided by
    ``totaltested``.

    Parameters
    ----------
    csv_file_path : str
        Path of the CSV file to read.
    year : int
        Value stored in the ``year`` column of every row.

    Returns
    -------
    pandas.DataFrame
        The loaded frame (all original columns, lower-cased) plus the derived
        columns described above.

    Raises
    ------
    KeyError
        If one of the required source columns is missing from the file.
    """
    frame = pd.read_csv(csv_file_path)
    frame.columns = frame.columns.str.lower()

    frame['year'] = year
    frame['schoolcode'] = frame['campus']
    frame['totaltested'] = frame['a1_all_d']
    frame['asian'] = frame['a1_etha_d']
    frame['black'] = frame['a1_ethb_d']
    frame['white'] = frame['a1_ethw_d']
    frame['hispanic'] = frame['a1_ethh_d']
    frame['lowincome'] = frame['a1_eco2_d'] + frame['a1_eco1_d']

    # The pass-count column has two possible names; prefer 'a1_all_meetsgl_nm'
    # and fall back to 'a1_all_satis_rec_nm' when it is absent.
    if 'a1_all_meetsgl_nm' in frame.columns:
        mathpass_source = 'a1_all_meetsgl_nm'
    else:
        mathpass_source = 'a1_all_satis_rec_nm'
    frame['mathpass'] = frame[mathpass_source]

    # Convert counts to shares of the tested population. Rows with zero tested
    # students get NaN rather than +/-inf, so that a downstream NaN check can
    # catch them (inf would pass an isna() test unnoticed).
    denominator = frame['totaltested'].where(frame['totaltested'] != 0)
    for column in ('asian', 'black', 'white', 'hispanic', 'mathpass', 'lowincome'):
        frame[column] = frame[column] / denominator

    return frame

In [4]:
# Generate CSV data from the yearly .dat files and build one frame per year.
# The list is rebuilt from scratch here so that re-running this cell does not
# append the same years a second time.
final_dataframes = []
for year in range(2015, 2022):
    if year == 2020:
        # No file is read for 2020.
        continue
    current_file_path = f"{base_file_path}texas_math_{year}.dat"
    final_csv_path = dat_to_csv_converter(current_file_path)
    if final_csv_path is None:
        # Fail with a message that names the offending file instead of
        # passing None on to read_csv.
        raise RuntimeError(f"Could not convert {current_file_path} to CSV")
    final_dataframes.append(final_data_generator(final_csv_path, year))

# Stack all years and keep only the analysis columns.
final_dataframe = pd.concat(final_dataframes)
final_dataframe = final_dataframe[main_dataframe_cols]

final_dataframe.shape

(20624, 9)

In [5]:
# Distinct years present across the stacked yearly data.
unique_years = final_dataframe.loc[:, 'year'].unique()

# Function to filter schools with complete data across all years
def filter_complete_data(group, required_years=None):
    """Tell whether one school's rows cover every required year with no NaN.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single school; must contain a ``year`` column.
    required_years : iterable, optional
        Years the school must have. When omitted, the module-level
        ``unique_years`` is used, which keeps existing
        ``groupby(...).filter(filter_complete_data)`` calls working.

    Returns
    -------
    bool
        ``True`` only if the set of years in ``group`` equals the required
        set and no cell in ``group`` is NaN.
    """
    if required_years is None:
        required_years = unique_years

    # The school must have an entry for exactly the required years.
    if set(group['year'].unique()) != set(required_years):
        return False

    # And none of its values may be missing.
    return not group.isna().any().any()

# Keep only the schools whose group of rows passes filter_complete_data.
final_dataframe = (
    final_dataframe
    .groupby(by='schoolcode')
    .filter(filter_complete_data)
)

final_dataframe.shape

(16644, 9)

In [6]:
# Sanity check: number of missing values left in each column.
final_dataframe.isna().sum()

schoolcode     0
year           0
mathpass       0
totaltested    0
lowincome      0
white          0
black          0
hispanic       0
asian          0
dtype: int64

In [7]:
# Show (rows, columns) of final_dataframe.
final_dataframe.shape

(16644, 9)

# Import Virtual Data

In [8]:
# Load the learning-model data; thousands=',' lets pandas parse numbers that
# are written with comma separators.
virtual = pd.read_csv(
    "../raw_data/Texas_Schools_LearningModelData_Final.csv",
    thousands=',',
)

# Recode the Charter flag from 'Yes'/'No' to 1/0 (other values are untouched).
virtual['Charter'] = virtual['Charter'].replace(to_replace={'Yes': 1, 'No': 0})

In [9]:
# Keep only reporting periods whose start-date string ends in '21' or '22'.
# .copy() makes the result an independent frame, so the column assignments
# in the following cells do not operate on a slice of the original
# (avoids SettingWithCopyWarning).
virtual = virtual[virtual['TimePeriodStart'].str.endswith(('21', '22'))].copy()
virtual.head()

Unnamed: 0,StateName,StateAbbrev,DataLevel,Charter,SchoolName,SchoolType,NCESSchoolID,StateAssignedSchoolID,DistrictName,DistrictType,...,LearningModelGr912,LearningModelStateCat,LearningModelStateCatGrK5,LearningModelStateCatGr68,LearningModelStateCatGr912,EnrollmentInPerson,EnrollmentHybrid,EnrollmentVirtual,StaffCount,StaffCountInPerson
1,Texas,TX,School,0,Crosbyton Elementary,Regular school,480000101145,54901101,Crosbyton CISD,Regular local school district,...,,,,,,141.0,0.0,2.0,,
3,Texas,TX,School,0,Crosbyton Secondary,Regular school,480000101146,54901001,Crosbyton CISD,Regular local school district,...,,,,,,165.0,0.0,4.0,,
5,Texas,TX,School,0,Sp Ed Co-Op,Other/alternative school,480000103621,54901200,Crosbyton CISD,Regular local school district,...,,,,,,5.0,0.0,0.0,,
7,Texas,TX,School,0,Spur School,Regular school,480000204732,63903001,Spur ISD,Regular local school district,...,,,,,,219.0,0.0,13.0,,
9,Texas,TX,School,0,Rocksprings K-12,Regular school,480000304219,69901001,Rocksprings ISD,Regular local school district,...,,,,,,255.0,0.0,7.0,,


In [10]:
# Parse both period boundaries into datetime values.
for date_col in ('TimePeriodStart', 'TimePeriodEnd'):
    virtual[date_col] = pd.to_datetime(virtual[date_col])

# Calendar year in which each reporting period starts.
virtual['year'] = virtual['TimePeriodStart'].dt.year

# Rows without a learning model are labelled 'InPerson'.
# NOTE(review): a later cell drops both an 'InPerson' and an 'In-person'
# column, so this fill label appears to differ from the in-person label
# already present in the data — confirm that is intended.
virtual['LearningModel'] = virtual['LearningModel'].fillna(value='InPerson')

In [11]:
# Length of every reporting period in whole days (end minus start).
virtual['Days'] = virtual['TimePeriodEnd'].sub(virtual['TimePeriodStart']).dt.days

# Total days for each (school, year, learning model, charter) combination.
grouped = (
    virtual
    .groupby(['StateAssignedSchoolID', 'year', 'LearningModel', 'Charter'])['Days']
    .sum()
    .reset_index()
)

# One row per (school, year, charter) with a column of days per learning
# model; combinations that never occur are filled with 0.
pivot = pd.pivot_table(
    grouped,
    index=['StateAssignedSchoolID', 'year', 'Charter'],
    columns='LearningModel',
    values='Days',
    fill_value=0,
)

In [12]:
# Move StateAssignedSchoolID / year / Charter out of the index into columns.
pivot = pivot.reset_index()

# Total days per school-year. The pivot carries two in-person columns:
# 'In-person' and 'InPerson' (the label used to fill missing LearningModel
# values); both are dropped together in the next cell. Summing only
# 'Virtual' + 'InPerson' left the 'In-person' days out of the denominator,
# so every learning-model column that is present is included here.
day_columns = [label for label in ('Virtual', 'InPerson', 'In-person') if label in pivot.columns]
pivot['TotalDays'] = pivot[day_columns].sum(axis=1)

# Share of days that were virtual (NaN when a school-year has zero total days).
pivot['VirtualPercent'] = pivot['Virtual'] / pivot['TotalDays']

# The score is defined as the same virtual share.
pivot['Score'] = pivot['VirtualPercent']

# Remove the leftover 'LearningModel' label from the column index.
pivot.columns.name = None

In [13]:
# Only the derived columns are needed from here on; drop the raw day counts.
pivot = pivot.drop(['InPerson', "In-person", 'Virtual', 'TotalDays'], axis=1)

# Preview with NaN shown as 0. The result is only displayed; pivot itself
# is not modified by this line.
pivot.fillna(value=0)

Unnamed: 0,StateAssignedSchoolID,year,Charter,VirtualPercent,Score
0,1902001,2021,0,0.0,0.0
1,1902041,2021,0,0.0,0.0
2,1902103,2021,0,0.0,0.0
3,1903001,2021,0,0.0,0.0
4,1903041,2021,0,0.0,0.0
...,...,...,...,...,...
8459,254901101,2021,0,0.0,0.0
8460,254901104,2021,0,0.0,0.0
8461,254901106,2021,0,0.0,0.0
8462,254902001,2021,0,1.0,1.0


# Begin Merge Process

In [14]:
# School codes that appear in both the math results and the learning-model data.
common_schoolcodes = (
    set(final_dataframe['schoolcode'].unique())
    & set(virtual['StateAssignedSchoolID'].unique())
)

# Restrict each DataFrame to those shared school codes.
final_dataframe = final_dataframe.loc[final_dataframe['schoolcode'].isin(common_schoolcodes)]
virtual = virtual.loc[virtual['StateAssignedSchoolID'].isin(common_schoolcodes)]



In [15]:
# Show (rows, columns) after restricting to schools present in both sources.
final_dataframe.shape

(16446, 9)

In [16]:
# Left-join the learning-mode summary (pivot) onto the math results by school
# and year; math rows without a match keep NaN in the pivot columns.
export_dataframe = final_dataframe.merge(
    pivot,
    how='left',
    left_on=['schoolcode', 'year'],
    right_on=['StateAssignedSchoolID', 'year'],
)


In [17]:
# Order rows by school, then chronologically within each school.
export_dataframe = export_dataframe.sort_values(by=['schoolcode', 'year'])

export_dataframe.head(n=10)


Unnamed: 0,schoolcode,year,mathpass,totaltested,lowincome,white,black,hispanic,asian,StateAssignedSchoolID,Charter,VirtualPercent,Score
0,1902001,2015,0.577778,45,0.288889,0.844444,0.022222,0.022222,0.044444,,,,
2741,1902001,2016,0.75,40,0.25,0.875,0.0,0.0,0.0,,,,
5482,1902001,2017,0.589744,39,0.358974,0.820513,0.051282,0.076923,0.0,,,,
8223,1902001,2018,0.697674,43,0.325581,0.860465,0.023256,0.069767,0.0,,,,
10964,1902001,2019,0.680851,47,0.595745,0.787234,0.021277,0.085106,0.0,,,,
13705,1902001,2021,0.65,40,0.45,0.825,0.025,0.125,0.025,1902001.0,0.0,,
1,1903001,2015,0.338028,71,0.478873,0.901408,0.042254,0.028169,0.0,,,,
2742,1903001,2016,0.260274,73,0.547945,0.876712,0.027397,0.082192,0.013699,,,,
5483,1903001,2017,0.383721,86,0.581395,0.744186,0.093023,0.093023,0.0,,,,
8224,1903001,2018,0.612245,98,0.530612,0.826531,0.030612,0.091837,0.0,,,,


In [18]:
# Spread Charter / StateAssignedSchoolID to every year of the same school.
# Both the forward and the backward fill run inside each school's group:
# chaining .bfill() after a grouped .ffill() would back-fill across the whole
# column and let a school with no match inherit the next school's values.
export_dataframe['Charter'] = (
    export_dataframe
    .groupby('schoolcode')['Charter']
    .transform(lambda values: values.ffill().bfill())
)
export_dataframe['StateAssignedSchoolID'] = (
    export_dataframe
    .groupby('schoolcode')['StateAssignedSchoolID']
    .transform(lambda values: values.ffill().bfill())
)

# Rename the merged columns to the export schema.
export_dataframe["virtualper"] = export_dataframe["VirtualPercent"]
export_dataframe["schoolmode"] = export_dataframe["Score"]
export_dataframe["charter"] = export_dataframe["Charter"]
# No hybrid share is computed in this notebook; the column is a constant 0.
export_dataframe["hybridper"] = 0

# Remaining NaN (e.g. years without learning-mode data) become 0.
export_dataframe = export_dataframe.fillna(0)
export_dataframe = export_dataframe.reset_index(drop=True)
export_dataframe = export_dataframe[final_cols]
export_dataframe.head(10)

Unnamed: 0,schoolcode,year,charter,mathpass,schoolmode,virtualper,hybridper,totaltested,lowincome,white,black,hispanic,asian
0,1902001,2015,0.0,0.577778,0.0,0.0,0,45,0.288889,0.844444,0.022222,0.022222,0.044444
1,1902001,2016,0.0,0.75,0.0,0.0,0,40,0.25,0.875,0.0,0.0,0.0
2,1902001,2017,0.0,0.589744,0.0,0.0,0,39,0.358974,0.820513,0.051282,0.076923,0.0
3,1902001,2018,0.0,0.697674,0.0,0.0,0,43,0.325581,0.860465,0.023256,0.069767,0.0
4,1902001,2019,0.0,0.680851,0.0,0.0,0,47,0.595745,0.787234,0.021277,0.085106,0.0
5,1902001,2021,0.0,0.65,0.0,0.0,0,40,0.45,0.825,0.025,0.125,0.025
6,1903001,2015,0.0,0.338028,0.0,0.0,0,71,0.478873,0.901408,0.042254,0.028169,0.0
7,1903001,2016,0.0,0.260274,0.0,0.0,0,73,0.547945,0.876712,0.027397,0.082192,0.013699
8,1903001,2017,0.0,0.383721,0.0,0.0,0,86,0.581395,0.744186,0.093023,0.093023,0.0
9,1903001,2018,0.0,0.612245,0.0,0.0,0,98,0.530612,0.826531,0.030612,0.091837,0.0


In [19]:
# Write the final dataset to disk. to_csv is called with its defaults, so the
# row index is written as an extra, unnamed first column.
export_dataframe.to_csv("../final_data_components/mathpass_texas.csv")