# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import csv
import pyodbc
import os
import warnings

# Constants

In [2]:
# Define the years for each dataframe.
# The order matters: this list is zipped positionally with the per-year
# dropout frames and with MAIN_NAMES further down in the notebook.
years = [2019, 2021, 2022]

# Initialize Useful Functions

In [3]:
def safe_convert(val):
    """Convert ``val`` to ``int``, returning ``None`` when that is impossible.

    Parameters
    ----------
    val : object
        Any value accepted by ``int()`` (numeric strings, floats, ints, ...).

    Returns
    -------
    int or None
        The converted integer, or ``None`` (after printing a notice) when the
        value cannot be converted.
    """
    try:
        return int(val)
    # int() raises TypeError for None and OverflowError for +/-infinity; both
    # are "cannot convert" cases just like ValueError, so report them the
    # same way instead of aborting the caller's .apply().
    except (ValueError, TypeError, OverflowError):
        print(f"Value {val} can't be converted to int")
        return None
    
def import_mdb(MDBs, DRV, PWD, NAMES):
    """Read every table of several ODBC databases into pandas DataFrames.

    Parameters
    ----------
    MDBs : iterable of str
        Database locations, substituted into the ``DBQ=`` part of the
        connection string.
    DRV : str
        ODBC driver name, substituted into ``DRIVER=``.
    PWD : str
        Database password, substituted into ``PWD=``.
    NAMES : iterable of str
        Keys for the returned dictionary, paired positionally with ``MDBs``.

    Returns
    -------
    dict
        ``{name: {table_name: DataFrame}}``. Tables that cannot be read are
        reported on stdout and left out of the inner dictionary.
    """
    databases = {}

    for MDB, NAME in zip(MDBs, NAMES):
        # connect to db
        con = pyodbc.connect('DRIVER={};DBQ={};PWD={}'.format(DRV, MDB, PWD))
        try:
            cur = con.cursor()

            # List all tables in the database
            tables = [t.table_name for t in cur.tables(tableType='TABLE')]

            # Holds one DataFrame per successfully read table
            database = {}

            # Try to read each table one by one; a failing table must not
            # prevent the remaining tables from loading.
            for table in tables:
                try:
                    # enclose table name in brackets
                    df = pd.read_sql(f'SELECT * FROM [{table}]', con)
                    database[table] = df
                    print(f"Successfully read table: {table}")
                except Exception as e:
                    print(f"Failed to read table: {table}")
                    print(f"Error: {e}")
        finally:
            # Release the connection even if listing the tables fails.
            con.close()

        databases[NAME] = database

    return databases

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Silence warnings that are expected in this notebook.
warnings.filterwarnings('ignore', 'pandas only support SQLAlchemy connectable.*')
warnings.filterwarnings('ignore', category=pd.errors.DtypeWarning)

# SettingWithCopyWarning is not found in the same place across pandas
# releases (pd.core.common in older ones, pd.errors in newer ones, and it may
# be absent altogether), so look it up defensively instead of failing with
# AttributeError at import time.
_setting_with_copy_warning = getattr(pd.errors, 'SettingWithCopyWarning', None)
if _setting_with_copy_warning is None:
    _setting_with_copy_warning = getattr(pd.core.common, 'SettingWithCopyWarning', None)
if _setting_with_copy_warning is not None:
    warnings.filterwarnings('ignore', category=_setting_with_copy_warning)

# Import Main Data

In [4]:
# Main Data Filepaths WINDOWS
# MAIN_PATH = [
#                '../data/SRC2019/SRC2019.mdb;',
#                '../data/SRC2021/SRC2021.mdb;',
#                '../data/SRC2022/SRC2022.mdb;',
#               ]

# Main Data Filepaths MAC
# CSV exports of the 'Annual Regents Exams' table, one file per year.
# All paths are relative to the notebook's working directory; the first entry
# previously started with '.../', which is not a parent-directory reference.
MAIN_PATH = [
               '../data/SRC2019/ARE2019.csv',
               '../data/SRC2021/ARE2021.csv',
               '../data/SRC2022/ARE2022.CSV',
              ]
# Dictionary keys for the loaded data, paired positionally with MAIN_PATH.
MAIN_NAMES = [
               'MAIN2019',
               'MAIN2021',
               'MAIN2022',
              ]

# Each entry wraps its frame in a one-key dict keyed by table name, the same
# {name: {table: DataFrame}} layout that import_mdb returns.
main_data = {}
for name, filepath in zip(MAIN_NAMES, MAIN_PATH):
    main_data[name] = {'Annual Regents Exams':pd.read_csv(filepath, thousands=',')}

# Main data Windows
# main_data = import_mdb(MAIN_PATH, DRV, PWD, MAIN_NAMES)

FileNotFoundError: [Errno 2] No such file or directory: '../data/SRC2019/ARE2019.csv'

# Import Enrollment Data

In [None]:
# DRV = '{Microsoft Access Driver (*.mdb, *.accdb)}'
# PWD = 'pw'

# # Enroll Data Filepaths
# ENROLL_PATH = [
#                '../data/enrollment_2019/ENROLL2019.mdb;',
#                '../data/enrollment_2021/ENROLL2021.mdb;',
#                '../data/enrollment_2022/ENROLL2022.mdb;',
#               ]
# ENROLL_NAMES = [
#                'ENROLL2019',
#                'ENROLL2021',
#                'ENROLL2022',
#               ]


# enroll_data = import_mdb(ENROLL_PATH, DRV, PWD, ENROLL_NAMES)

# Import Dropout Rates

In [None]:
# Graduation-rate/outcome extracts, one CSV per reporting year.
# Numbers in these files use ',' as the thousands separator.
csv_options = {'thousands': ','}

data22 = pd.read_csv("./raw_data/GRAD_RATE_AND_OUTCOMES_2022.csv", **csv_options)
data21 = pd.read_csv("./raw_data/GRAD_RATE_AND_OUTCOMES_2021.csv", **csv_options)
data19 = pd.read_csv("./raw_data/GRAD_RATE_AND_OUTCOMES_2019.csv", **csv_options)

# Chronological order, matching the `years` list.
dropout_dfs = [
    data19,
    data21,
    data22,
]

# Combine Dropout Data

In [None]:
# Remove districts, only keep schools
dropout_dfs = [df[df['aggregation_type'] == 'School'] for df in dropout_dfs]

#   Only keep schools which are present in all years    #
#########################################################

# Intersect the school codes of every yearly frame - i.e., the common IDs.
# Works for any number of frames rather than exactly three.
common_ids = set.intersection(*(set(df['aggregation_code']) for df in dropout_dfs))

# Filter each DataFrame to only include rows with a common ID
dropout_dfs = [df[df['aggregation_code'].isin(common_ids)] for df in dropout_dfs]

###########################################################

# Tag every frame with its year. assign() returns a new frame, so no column
# is written onto a filtered slice of the raw data.
dropout_dfs = [df.assign(year=year) for year, df in zip(years, dropout_dfs)]
updated_dfs = list(dropout_dfs)

# Concatenate the updated dataframes together
dropout_df = pd.concat(updated_dfs)

# Drop districts from the dataframe: codes whose last four digits are 0000.
# Comparing numerically works whether the codes were parsed as ints or as
# floats; codes that are not numeric become NaN here and are kept.
numeric_codes = pd.to_numeric(dropout_df['aggregation_code'], errors='coerce')
dropout_df = dropout_df[~(numeric_codes % 10000 == 0)]

# Reset the index of the combined dataframe
dropout_df = dropout_df.reset_index(drop=True)

# School codes that survived every filter above
common_ids = set(dropout_df['aggregation_code'])

In [None]:
# Define a function to check if 'comparison' year is in 'membership_desc'
def check_year_in_desc(row):
    """Tell whether a row's cohort description names the comparison year.

    Parameters
    ----------
    row : mapping
        Must provide ``'comparison'`` and ``'membership_desc'``.

    Returns
    -------
    bool
        ``True`` when ``str(row['comparison'])`` occurs in the description
        and the description does not mention "August". ``False`` otherwise,
        including when the description is missing or not a string.
    """
    desc = row['membership_desc']
    # A missing description (NaN/None) would make the `in` test raise
    # TypeError; such rows simply do not match.
    if not isinstance(desc, str):
        return False
    return str(row['comparison']) in desc and "August" not in desc

def _school_year_end(label):
    """Return the four-digit ending year of a label such as '2021-22'.

    A value without a '-' is taken as the year itself, so re-running this
    cell on already-converted data does not fail.
    """
    end_year = int(str(label).split('-')[-1])
    return end_year + 2000 if end_year < 100 else end_year


# Work on an explicit copy so the column assignments below do not target a
# filtered slice of the combined frame.
dropout_df = dropout_df[dropout_df['subgroup_name'] == 'All Students'].copy()
dropout_df['report_school_year'] = dropout_df['report_school_year'].apply(_school_year_end)

# The comparison year is the report year minus 4
dropout_df['comparison'] = dropout_df['report_school_year'] - 4

# Keep only the rows whose description names the comparison year
dropout_df = dropout_df[dropout_df.apply(check_year_in_desc, axis=1)]

# Combine Main Data

### Remove Districts, Keep Schools Common Across Databases

In [None]:
# Start from the school codes kept in the dropout data and narrow the set
# down with every exam database, so that only schools present in ALL of them
# remain. Previously the set was overwritten on each pass and never used.
common_entity_ids = set(common_ids)

for database in main_data:
    current_data = main_data[database]['Annual Regents Exams'].copy()
    current_data['ENTITY_CD'] = current_data['ENTITY_CD'].apply(safe_convert)
    # Codes ending in 0000 are treated as districts and removed
    current_data = current_data[~current_data['ENTITY_CD'].astype(str).str.endswith('0000')]

    # Store the cleaned frame so the district filter is not lost
    main_data[database]['Annual Regents Exams'] = current_data
    common_entity_ids &= set(current_data['ENTITY_CD'])

for database in main_data:
    current_data = main_data[database]['Annual Regents Exams']
    main_data[database]['Annual Regents Exams'] = current_data[current_data['ENTITY_CD'].isin(common_entity_ids)]

In [None]:
# Replace each database's single exam table with one frame per SUBJECT value.
for database in main_data:
    exams = main_data[database]['Annual Regents Exams']
    frames_by_subject = {}
    for subject in exams['SUBJECT'].unique():
        subject_rows = exams['SUBJECT'] == subject
        frames_by_subject[subject] = exams[subject_rows]
    main_data[database] = frames_by_subject

In [None]:
# Short subject codes mapped to the long subject names.
old_to_new = dict([
    ('REG_PHYS_PS', 'Regents Phy Set/Physics'),
    ('REG_NF_GLHIST', 'Regents NF Global History'),
    ('REG_COMENG', 'Regents Common Core English Language Art'),
    ('REG_ESCI_PS', 'Regents Phy Set/Earth Sci'),
    ('REG_CHEM_PS', 'Regents Phy Set/Chemistry'),
    ('REG_COMALG1', 'Regents Common Core Algebra I'),
    ('REG_COMGEOM', 'Regents Common Core Geometry'),
    ('REG_LENV', 'Regents Living Environment'),
    ('REG_USHG_RV', "Regents US History&Gov't"),
])

# Reverse lookup: long subject name -> short code.
new_to_old = {long_name: code for code, long_name in old_to_new.items()}

# Long subject names that appear as keys in both the 2021 and 2022 data.
tests = set(old_to_new.values()).intersection(
    main_data['MAIN2021'],
    main_data['MAIN2022'],
)

### Calculate Demographic Percentages

In [None]:
# Settings that do not change from year to year, built once before the loop.
ELA_SUBJECT = 'Regents Common Core English Language Art'

# List of subgroups of interest
KEPT_SUBGROUPS = ['Male', 'Female', 'White', 'Hispanic or Latino', 'Black or African American', 'Asian or Native Hawaiian/Other Pacific Islander','Economically Disadvantaged']

# Columns carried over from each school's 'All Students' row
columns = "ENTITY_CD  ENTITY_NAME YEAR SUBJECT TESTED TESTED_total NUM_LEVEL1 PER_LEVEL1 NUM_LEVEL2 PER_LEVEL2 NUM_LEVEL3 PER_LEVEL3 NUM_LEVEL4 PER_LEVEL4 NUM_LEVEL5 PER_LEVEL5 NUM_PROF PER_PROF"
columns = columns.split()

# Score columns in which the literal 's' is replaced by 0
# (presumably a suppression marker in the source data -- TODO confirm)
cols = "NUM_LEVEL1 PER_LEVEL1 NUM_LEVEL2 PER_LEVEL2 NUM_LEVEL3 PER_LEVEL3 NUM_LEVEL4 PER_LEVEL4 NUM_LEVEL5 PER_LEVEL5 NUM_PROF PER_PROF".split()

for year, database in zip(years, MAIN_NAMES):
    # The ELA frame is stored under either its long subject name or its short
    # code; resolve the key once and reuse it for both the read and the write.
    ela_key = ELA_SUBJECT if ELA_SUBJECT in main_data[database] else new_to_old[ELA_SUBJECT]
    current_df = main_data[database][ela_key]

    # Filter the DataFrame to only include rows where SUBGROUP_NAME == 'All Students'
    total_students_df = current_df[(current_df['SUBGROUP_NAME'] == 'All Students') & (current_df['YEAR'] == year)][['ENTITY_CD', 'TESTED', 'YEAR']]
    print("checkpoint 1")

    # Merge the total students for 'All Students' back into the original DataFrame
    current_df = pd.merge(current_df, total_students_df, on=['ENTITY_CD', 'YEAR'], how='left', suffixes=('', '_total'))
    print("checkpoint 2")

    # List to store DataFrames
    df_list = []

    # Get ENTITY_NAME (and the other carried-over columns) for 'All Students' subgroup
    multiple_df = current_df[current_df['SUBGROUP_NAME'] == 'All Students'][columns]
    print("checkpoint 3")

    # Loop over each subgroup and calculate its share of all tested students
    for subgroup in KEPT_SUBGROUPS:
        temp_df = current_df[(current_df['SUBGROUP_NAME'] == subgroup) & (current_df['YEAR'] == year)].copy()
        subgroup = subgroup.upper()
        temp_df[subgroup + '_PCT'] = temp_df['TESTED'] / temp_df['TESTED_total'] * 100
        temp_df = temp_df[['ENTITY_CD', subgroup + '_PCT']]  # Keep 'ENTITY_CD' in each temp_df
        df_list.append(temp_df)
    print("checkpoint 4")

    # Merge all DataFrames on ENTITY_CD
    result_df = multiple_df
    for temp_df in df_list:
        result_df = result_df.merge(temp_df, on='ENTITY_CD', how='outer')
    print("checkpoint 5")


    # Drop observations where fewer than 2 students were tested overall.
    # Rows without a total for this year (TESTED_total is NaN) are dropped
    # as well, because NaN >= 2 is False.
    result_df = result_df[result_df['TESTED_total'] >= 2]
    print("checkpoint 6")

    # Fill NaN values with 0
    result_df = result_df.fillna(0)
    print("checkpoint 7")

    for col in cols:
        result_df[col] = result_df[col].replace('s', 0)
    print("checkpoint 8")

    # Overwrite the ELA entry in main_data with the per-school summary.
    # NOTE: this makes the cell non-idempotent -- re-running it without
    # reloading main_data fails because SUBGROUP_NAME is no longer present.
    main_data[database][ela_key] = result_df

In [None]:
# current_df = None
# if 'Regents Common Core English Language Art' in main_data['MAIN2021']:
#     current_df = main_data['MAIN2021']['Regents Common Core English Language Art']
# else:
#     current_df = main_data['MAIN2021'][new_to_old['Regents Common Core English Language Art']]

# # Filter the DataFrame to only include rows where SUBGROUP_NAME == 'All Students'
# total_students_df = current_df[(current_df['SUBGROUP_NAME'] == 'All Students') & (current_df['YEAR'] == 2021)][['ENTITY_CD', 'TESTED', 'YEAR']]

# # Merge the total students for 'All Students' back into the original DataFrame
# current_df = pd.merge(current_df, total_students_df, on=['ENTITY_CD', 'YEAR'], how='left', suffixes=('', '_total'))

# # List of subgroups of interest
# KEPT_SUBGROUPS = ['Male', 'Female', 'White', 'Hispanic or Latino', 'Black or African American', 'Asian or Native Hawaiian/Other Pacific Islander','Economically Disadvantaged']

# # List to store DataFrames
# df_list = []
# columns = "ENTITY_CD  ENTITY_NAME YEAR SUBJECT TESTED TESTED_total NUM_LEVEL1 PER_LEVEL1 NUM_LEVEL2 PER_LEVEL2 NUM_LEVEL3 PER_LEVEL3 NUM_LEVEL4 PER_LEVEL4 NUM_LEVEL5 PER_LEVEL5 NUM_PROF PER_PROF"
# columns = columns.split()

# # Get ENTITY_NAME for 'All Students' subgroup
# multiple_df = current_df[current_df['SUBGROUP_NAME'] == 'All Students'][columns]

# # Loop over each subgroup and calculate percentage
# for subgroup in KEPT_SUBGROUPS:
#     temp_df = current_df[(current_df['SUBGROUP_NAME'] == subgroup) & (current_df['YEAR'] == 2021)].copy()
#     subgroup = subgroup.upper()
#     temp_df[subgroup + '_PCT'] = temp_df['TESTED'] / temp_df['TESTED_total'] * 100
#     temp_df = temp_df[['ENTITY_CD', subgroup + '_PCT']]  # Keep 'ENTITY_CD' in each temp_df
#     df_list.append(temp_df)

# # Merge all DataFrames on ENTITY_CD
# result_df = multiple_df
# for temp_df in df_list:
#     result_df = result_df.merge(temp_df, on='ENTITY_CD', how='outer')
    

# # Drop observations where TESTED is less than 4
# result_df = result_df[result_df['TESTED_total'] >= 2]

# # Fill NaN values with 0
# result_df = result_df.fillna(0)

# cols = "NUM_LEVEL1 PER_LEVEL1 NUM_LEVEL2 PER_LEVEL2 NUM_LEVEL3 PER_LEVEL3 NUM_LEVEL4 PER_LEVEL4 NUM_LEVEL5 PER_LEVEL5 NUM_PROF PER_PROF".split()
# for col in cols:
#     result_df[col] = result_df[col].replace('s', 0)

# main_data['MAIN2021']['Regents Common Core English Language Art'] = result_df  # store results in new dictionary instead of main_data

In [None]:
# current_df = None
# if 'Regents Common Core English Language Art' in main_data['MAIN2022']:
#     current_df = main_data['MAIN2022']['Regents Common Core English Language Art']
# else:
#     current_df = main_data['MAIN2022'][new_to_old['Regents Common Core English Language Art']]

# # Filter the DataFrame to only include rows where SUBGROUP_NAME == 'All Students'
# total_students_df = current_df[(current_df['SUBGROUP_NAME'] == 'All Students') & (current_df['YEAR'] == 2022)][['ENTITY_CD', 'TESTED', 'YEAR']]

# # Merge the total students for 'All Students' back into the original DataFrame
# current_df = pd.merge(current_df, total_students_df, on=['ENTITY_CD', 'YEAR'], how='left', suffixes=('', '_total'))

# # List of subgroups of interest
# KEPT_SUBGROUPS = ['Male', 'Female', 'White', 'Hispanic or Latino', 'Black or African American', 'Asian or Native Hawaiian/Other Pacific Islander','Economically Disadvantaged']

# # List to store DataFrames
# df_list = []
# columns = "ENTITY_CD  ENTITY_NAME YEAR SUBJECT TESTED TESTED_total NUM_LEVEL1 PER_LEVEL1 NUM_LEVEL2 PER_LEVEL2 NUM_LEVEL3 PER_LEVEL3 NUM_LEVEL4 PER_LEVEL4 NUM_LEVEL5 PER_LEVEL5 NUM_PROF PER_PROF"
# columns = columns.split()

# # Get ENTITY_NAME for 'All Students' subgroup
# multiple_df = current_df[current_df['SUBGROUP_NAME'] == 'All Students'][columns]

# # Loop over each subgroup and calculate percentage
# for subgroup in KEPT_SUBGROUPS:
#     temp_df = current_df[(current_df['SUBGROUP_NAME'] == subgroup) & (current_df['YEAR'] == 2022)].copy()
#     subgroup = subgroup.upper()
#     temp_df[subgroup + '_PCT'] = temp_df['TESTED'] / temp_df['TESTED_total'] * 100
#     temp_df = temp_df[['ENTITY_CD', subgroup + '_PCT']]  # Keep 'ENTITY_CD' in each temp_df
#     df_list.append(temp_df)

# # Merge all DataFrames on ENTITY_CD
# result_df = multiple_df
# for temp_df in df_list:
#     result_df = result_df.merge(temp_df, on='ENTITY_CD', how='outer')
    

# # Drop observations where TESTED is less than 4
# result_df = result_df[result_df['TESTED_total'] >= 2]

# # Fill NaN values with 0
# result_df = result_df.fillna(0)

# cols = "NUM_LEVEL1 PER_LEVEL1 NUM_LEVEL2 PER_LEVEL2 NUM_LEVEL3 PER_LEVEL3 NUM_LEVEL4 PER_LEVEL4 NUM_LEVEL5 PER_LEVEL5 NUM_PROF PER_PROF".split()
# for col in cols:
#     result_df[col] = result_df[col].replace('s', 0)

# main_data['MAIN2022']['Regents Common Core English Language Art'] = result_df  # store results in new dictionary instead of main_data

In [None]:
# current_df = None
# if 'Regents Common Core English Language Art' in main_data['MAIN2019']:
#     current_df = main_data['MAIN2019']['Regents Common Core English Language Art']
# else:
#     current_df = main_data['MAIN2019'][new_to_old['Regents Common Core English Language Art']]

# # Filter the DataFrame to only include rows where SUBGROUP_NAME == 'All Students'
# total_students_df = current_df[(current_df['SUBGROUP_NAME'] == 'All Students') & (current_df['YEAR'] == 2019)][['ENTITY_CD', 'TESTED', 'YEAR']]

# # Merge the total students for 'All Students' back into the original DataFrame
# current_df = pd.merge(current_df, total_students_df, on=['ENTITY_CD', 'YEAR'], how='left', suffixes=('', '_total'))

# # List of subgroups of interest
# KEPT_SUBGROUPS = ['Male', 'Female', 'White', 'Hispanic or Latino', 'Black or African American', 'Asian or Native Hawaiian/Other Pacific Islander','Economically Disadvantaged']

# # List to store DataFrames
# df_list = []
# columns = "ENTITY_CD  ENTITY_NAME YEAR SUBJECT TESTED TESTED_total NUM_LEVEL1 PER_LEVEL1 NUM_LEVEL2 PER_LEVEL2 NUM_LEVEL3 PER_LEVEL3 NUM_LEVEL4 PER_LEVEL4 NUM_LEVEL5 PER_LEVEL5 NUM_PROF PER_PROF"
# columns = columns.split()

# # Get ENTITY_NAME for 'All Students' subgroup
# multiple_df = current_df[current_df['SUBGROUP_NAME'] == 'All Students'][columns]

# # Loop over each subgroup and calculate percentage
# for subgroup in KEPT_SUBGROUPS:
#     temp_df = current_df[(current_df['SUBGROUP_NAME'] == subgroup) & (current_df['YEAR'] == 2019)].copy()
#     subgroup = subgroup.upper()
#     temp_df[subgroup + '_PCT'] = temp_df['TESTED'] / temp_df['TESTED_total'] * 100
#     temp_df = temp_df[['ENTITY_CD', subgroup + '_PCT']]  # Keep 'ENTITY_CD' in each temp_df
#     df_list.append(temp_df)

# # Merge all DataFrames on ENTITY_CD
# result_df = multiple_df
# for temp_df in df_list:
#     result_df = result_df.merge(temp_df, on='ENTITY_CD', how='outer')
    

# # Drop observations where TESTED is less than 4
# result_df = result_df[result_df['TESTED_total'] >= 2]

# # Fill NaN values with 0
# result_df = result_df.fillna(0)

# cols = "NUM_LEVEL1 PER_LEVEL1 NUM_LEVEL2 PER_LEVEL2 NUM_LEVEL3 PER_LEVEL3 NUM_LEVEL4 PER_LEVEL4 NUM_LEVEL5 PER_LEVEL5 NUM_PROF PER_PROF".split()
# for col in cols:
#     result_df[col] = result_df[col].replace('s', 0)

# main_data['MAIN2019'][new_to_old['Regents Common Core English Language Art']] = result_df  # store results in new dictionary instead of main_data

### Concatenate All Dataframes

In [None]:
# Stack the per-year ELA summaries. The 2019 frame is looked up by its short
# subject code, the 2021 and 2022 frames by the long subject name.
ela_frames = [
    main_data['MAIN2019'][new_to_old['Regents Common Core English Language Art']],
    main_data['MAIN2021']['Regents Common Core English Language Art'],
    main_data['MAIN2022']['Regents Common Core English Language Art'],
]
final_data = pd.concat(ela_frames)

In [None]:
# Give every row the same long-form subject label.
final_data = final_data.assign(SUBJECT='Regents Common Core English Language Art')

In [None]:
def _parse_pct(value):
    """Turn a dropout percentage such as '12%' into 12.0.

    '-' and missing values become NaN. Values that are already numeric pass
    through, so this also works when the merged column is not text (for
    example when no dropout row matched and the column is entirely NaN).
    """
    if pd.isna(value) or value == '-':
        return np.nan
    return float(str(value).rstrip('%'))


# Convert 'YEAR' in final_data to int
final_data['YEAR'] = final_data['YEAR'].astype(int)

# Select specific columns from dropout_df
dropout_subset = dropout_df[['aggregation_code', 'report_school_year', 'dropout_pct']]

# Merge dropout_df with final_data on school code and year
final_data = pd.merge(final_data, dropout_subset, left_on=['ENTITY_CD', 'YEAR'], right_on=['aggregation_code', 'report_school_year'], how='left')

# Replace '-' with NaN, remove '%' and convert to float
final_data['dropout_pct'] = final_data['dropout_pct'].map(_parse_pct).astype('float')

# Schools without a dropout figure are recorded as 0
final_data['dropout_pct'] = final_data['dropout_pct'].fillna(0)

In [None]:
# Display the merged frame (exam summary plus dropout columns) for inspection.
final_data

# Import Virtual Mode Data

In [None]:
# Learning-model records; 'Charter' arrives as Yes/No and is recoded to 1/0.
virtual = pd.read_csv("../data/New_York_Schools_LearningModelData_Final.csv", thousands=',')
charter_flags = {'Yes': 1, 'No': 0}
virtual['Charter'] = virtual['Charter'].replace(charter_flags)

In [None]:
# Keep only periods whose start-date text ends in '21' or '22'.
# na=False treats a missing start date as "no match"; without it a NaN in the
# mask makes the boolean indexing fail. The copy keeps the column assignments
# in the following cells from targeting a slice of the loaded frame.
starts_in_21_or_22 = virtual['TimePeriodStart'].str.endswith(('21', '22'), na=False)
virtual = virtual[starts_in_21_or_22].copy()
virtual.head()

In [None]:
# Convert the date columns to datetime format.
virtual['TimePeriodStart'] = pd.to_datetime(virtual['TimePeriodStart'])
virtual['TimePeriodEnd'] = pd.to_datetime(virtual['TimePeriodEnd'])

# Create a new column for year (calendar year of the period start)
virtual['YEAR'] = virtual['TimePeriodStart'].dt.year

# Fill in any missing values in LearningModel with 'InPerson'
virtual['LearningModel'] = virtual['LearningModel'].fillna('InPerson')

# Replace 'In-person' with 'InPerson'
virtual['LearningModel'] = virtual['LearningModel'].replace('In-person', 'InPerson')

# Calculate the number of days for each row (end minus start)
virtual['Days'] = (virtual['TimePeriodEnd'] - virtual['TimePeriodStart']).dt.days

# Group by School, Year, LearningModel, Charter and District and sum the number of days.
# NOTE(review): groupby skips rows whose key columns are NaN by default, so a
# record with a missing Charter or DistrictName is not counted -- confirm
# this is acceptable.
grouped = virtual.groupby(['StateAssignedSchoolID', 'YEAR', 'LearningModel', 'Charter', 'DistrictName'])['Days'].sum().reset_index()

# Pivot the data so we have separate columns for each learning model
pivot = grouped.pivot_table(index=['StateAssignedSchoolID', 'YEAR', 'Charter', 'DistrictName'], columns='LearningModel', values='Days', fill_value=0)

# A learning model that never occurs in the data produces no column; add it
# with zero days so the totals below cannot fail with KeyError.
for learning_model in ('Virtual', 'Hybrid', 'InPerson'):
    if learning_model not in pivot.columns:
        pivot[learning_model] = 0

# # Group by School, Year, and LearningModel and sum the number of days
# grouped = virtual.groupby(['StateAssignedSchoolID', 'YEAR', 'LearningModel'])['Days'].sum().reset_index()

# # Pivot the data so we have separate columns for each learning model
# pivot = grouped.pivot_table(index=['StateAssignedSchoolID', 'YEAR'], columns='LearningModel', values='Days', fill_value=0)

# Reset the index
pivot = pivot.reset_index()

# Calculate the total days in each year
pivot['TotalDays'] = pivot['Virtual'] + pivot['Hybrid'] + pivot['InPerson']

# Calculate the share of days that are virtual and hybrid for each year
# (a fraction between 0 and 1; NaN when TotalDays is 0)
pivot['VirtualPercent'] = pivot['Virtual'] / pivot['TotalDays']
pivot['HybridPercent'] = pivot['Hybrid'] / pivot['TotalDays']

# Calculate the score for each year: virtual days count fully, hybrid days half
pivot['Score'] = (pivot['Virtual'] + 0.5 * pivot['Hybrid']) / pivot['TotalDays']

# Reset the column names after pivot
pivot.columns.name = None

In [None]:
# Only the derived shares and the score are needed from here on.
day_count_columns = ['InPerson', 'Hybrid', 'Virtual', 'TotalDays']
pivot = pivot.drop(columns=day_count_columns)
pivot

In [None]:
# Attach the learning-mode summary (pivot) to final_data by school and year.
final_data = pd.merge(
    final_data,
    pivot,
    how='left',
    left_on=['ENTITY_CD', 'YEAR'],
    right_on=['StateAssignedSchoolID', 'YEAR'],
)

In [None]:
# Columns that are not part of the final dataset.
unused_columns = [
    'ENTITY_NAME', 'TESTED_total',
    'NUM_LEVEL1', 'PER_LEVEL1',
    'NUM_LEVEL2', 'PER_LEVEL2',
    'NUM_LEVEL3', 'PER_LEVEL3',
    'NUM_LEVEL4', 'PER_LEVEL4',
    'NUM_LEVEL5', 'PER_LEVEL5',
    'NUM_PROF',
    'aggregation_code', 'report_school_year', 'StateAssignedSchoolID',
    'MALE_PCT', 'FEMALE_PCT',
    'dropout_pct', 'SUBJECT',
]
final_data = final_data.drop(columns=unused_columns)

In [None]:
# Rename multiple columns to the short lowercase names used in the export.
# (The 'Charter' entry previously lacked its trailing comma, which made the
# whole dictionary literal a SyntaxError.)
final_data = final_data.rename(columns={'ENTITY_CD': 'schoolcode',
                        'TESTED': 'totalenroll',
                        'PER_PROF': 'elapass',
                        'WHITE_PCT': 'white',
                        'HISPANIC OR LATINO_PCT': 'hispanic',
                        'BLACK OR AFRICAN AMERICAN_PCT': 'black',
                        'ASIAN OR NATIVE HAWAIIAN/OTHER PACIFIC ISLANDER_PCT': 'asian',
                        'ECONOMICALLY DISADVANTAGED_PCT': 'lowincome',
                        'VirtualPercent': 'virtualper',
                        'HybridPercent': 'hybridper',
                        'Score': 'schoolmode',
                        'YEAR': 'year',
                        'Charter': "charter",
                        'DistrictName': "district",
                                       })

In [None]:
# Schools whose 2021 row has no 'schoolmode' value are removed entirely,
# i.e. for every year, not just 2021.
missing_mode_in_2021 = (final_data['year'] == 2021) & (final_data['schoolmode'].isna())
schoolcodes_to_remove = final_data.loc[missing_mode_in_2021, 'schoolcode'].unique()

keep_row = ~final_data['schoolcode'].isin(schoolcodes_to_remove)
final_data = final_data.loc[keep_row]

In [None]:
# Every remaining missing value, in any column, becomes 0.
final_data = final_data.fillna(value=0)
# Columns kept in the final dataset, in output order.
# 'district' is included because the district clean-up cell further down
# reads final_data['district'] after the frame has been reduced to these
# columns; without it that cell fails with KeyError on a fresh Run All.
final_cols = [
'schoolcode',
 'year',
 'charter',
 'elapass',
 'schoolmode',
 'virtualper',
 'hybridper',
 'totalenroll',
 'lowincome',
 'white',
 'black',
 'hispanic',
 'asian',
 'district',
      ]

In [None]:
# Reduce the frame to the output columns. The explicit copy makes the
# rounding assignment below operate on an independent frame rather than on a
# column selection of the previous one.
final_data = final_data[final_cols].copy()

# Numeric columns rounded to four decimal places
rounding_cols = [
 'schoolmode',
 'virtualper',
 'hybridper',
 'totalenroll',
 'lowincome',
 'white',
 'black',
 'hispanic',
 'asian',
      ]

final_data[rounding_cols] = final_data[rounding_cols].round(4)

In [None]:
# Sort the data by school and year (returns a new frame; nothing is mutated
# in place). This cell requires final_data to still have a 'district' column.
final_data = final_data.sort_values(['schoolcode', 'year'])

# Replace 0 with NA in 'district' column. The result is assigned back
# explicitly: an inplace replace on a selected column is not guaranteed to
# write through to the frame.
final_data['district'] = final_data['district'].replace(0, pd.NA)

# Fill each school's missing district from its other years (forward, then
# backward). transform() keeps the original row index, so the result aligns
# with final_data; groupby.apply may return a group-keyed index that does not.
final_data['district'] = final_data.groupby('schoolcode')['district'].transform(lambda group: group.ffill().bfill())

# Schools with no district in any year are dropped
final_data = final_data.dropna(subset=['district'])

In [None]:
# Display the finished dataset for a last visual check before exporting.
final_data

# Export NYC Data

In [None]:
# Export every year except 2022. The CSV is written with the DataFrame index
# as its first column.
is_not_2022 = final_data["year"] != 2022
first_export = final_data[is_not_2022]

first_export.to_csv("./final_data_components/elapass_new_york.csv")

In [None]:
# final_data.to_csv("for_running.csv")