In [1]:
import pandas as pd
import numpy as np
import os

# Import Virtual Mode Data

In [2]:
virtual = pd.read_csv("./raw_data/Texas_Schools_LearningModelData_Final.csv", thousands=',')
virtual.head()

Unnamed: 0,StateName,StateAbbrev,DataLevel,Charter,SchoolName,SchoolType,NCESSchoolID,StateAssignedSchoolID,DistrictName,DistrictType,...,LearningModelGr912,LearningModelStateCat,LearningModelStateCatGrK5,LearningModelStateCatGr68,LearningModelStateCatGr912,EnrollmentInPerson,EnrollmentHybrid,EnrollmentVirtual,StaffCount,StaffCountInPerson
0,Texas,TX,School,No,Crosbyton Elementary,Regular school,480000101145,54901101,Crosbyton CISD,Regular local school district,...,,,,,,125.0,0.0,13.0,,
1,Texas,TX,School,No,Crosbyton Elementary,Regular school,480000101145,54901101,Crosbyton CISD,Regular local school district,...,,,,,,141.0,0.0,2.0,,
2,Texas,TX,School,No,Crosbyton Secondary,Regular school,480000101146,54901001,Crosbyton CISD,Regular local school district,...,,,,,,161.0,0.0,10.0,,
3,Texas,TX,School,No,Crosbyton Secondary,Regular school,480000101146,54901001,Crosbyton CISD,Regular local school district,...,,,,,,165.0,0.0,4.0,,
4,Texas,TX,School,No,Sp Ed Co-Op,Other/alternative school,480000103621,54901200,Crosbyton CISD,Regular local school district,...,,,,,,7.0,0.0,0.0,,


In [3]:
virtual = pd.read_csv("./raw_data/Texas_Schools_LearningModelData_Final.csv", thousands=',')

virtual['Charter'] = virtual['Charter'].replace({'Yes': 1, 'No': 0})

virtual = virtual[virtual['TimePeriodStart'].str.endswith(('21', '22'))]

# Convert the date columns to datetime format.
virtual['TimePeriodStart'] = pd.to_datetime(virtual['TimePeriodStart'])
virtual['TimePeriodEnd'] = pd.to_datetime(virtual['TimePeriodEnd'])

# Create a new column for year
virtual['YEAR'] = virtual['TimePeriodStart'].dt.year

# Fill in any missing values in LearningModel with 'InPerson'
virtual['LearningModel'] = virtual['LearningModel'].fillna('InPerson')

# Replace 'In-person' with 'InPerson'
virtual['LearningModel'] = virtual['LearningModel'].replace('In-person', 'InPerson')

# Calculate the number of days for each row
virtual['Days'] = (virtual['TimePeriodEnd'] - virtual['TimePeriodStart']).dt.days

# Group by School, Year, LearningModel, and Charter and sum the number of days
grouped = virtual.groupby(['StateAssignedSchoolID', 'YEAR', 'LearningModel', 'Charter'])['Days'].sum().reset_index()

# Pivot the data so we have separate columns for each learning model
pivot = grouped.pivot_table(index=['StateAssignedSchoolID', 'YEAR', 'Charter'], columns='LearningModel', values='Days', fill_value=0)

# Reset the index
pivot.reset_index(inplace=True)

# Calculate the total days in each year
pivot['TotalDays'] = pivot['Virtual'] + pivot['InPerson']

# Calculate the percentage of days that are virtual and hybrid for each year
pivot['VirtualPercent'] = pivot['Virtual'] / pivot['TotalDays']

# Manually create a 'HybridPercent' column and set it to zero
pivot['HybridPercent'] = 0  # This line added

# Calculate the score for each year
pivot['Score'] = pivot['VirtualPercent']

# Reset the column names after pivot
pivot.columns.name = None

In [4]:
common_ids = set(pivot["StateAssignedSchoolID"])

# Import Dropout Data

In [5]:
### Regressor Data
data21 = pd.read_excel("./raw_data/campus-data-download-drop-2021.xlsx", thousands=',', sheet_name=2)
data19 = pd.read_excel("./raw_data/campus-data-download-drop-1819.xlsx", thousands=',', sheet_name=2)
data18 = pd.read_excel("./raw_data/campus-data-download-drop-1718.xlsx", thousands=',', sheet_name=2)
data19.head(5)

Unnamed: 0,CALC_FOR_STATE_ACCT,Gradespan,campus,campname,DISTRICT,distname,COUNTY,cntyname,REGION,regnname,...,CAMP_OVRR,CAMP_SPED,CAMP_SPEN,CAMP_SPER,CAMP_SE5D,CAMP_SE5N,CAMP_SE5R,CAMP_TTLD,CAMP_TTLN,CAMP_TTLR
0,No,712,1902001,Cayuga H S,1902,Cayuga ISD,1,Anderson County,7,Kilgore,...,1.6,41,0,0.0,<100,-,5.6,<200,-,0.5
1,Yes,712,1902001,Cayuga H S,1902,Cayuga ISD,1,Anderson County,7,Kilgore,...,1.6,41,0,0.0,<100,-,5.6,<200,-,0.5
2,No,912,1902001,Cayuga H S,1902,Cayuga ISD,1,Anderson County,7,Kilgore,...,1.6,41,0,0.0,<100,-,5.6,<200,-,0.5
3,Yes,912,1902001,Cayuga H S,1902,Cayuga ISD,1,Anderson County,7,Kilgore,...,1.6,41,0,0.0,<100,-,5.6,<200,-,0.5
4,No,712,1902041,Cayuga Middle,1902,Cayuga ISD,1,Anderson County,7,Kilgore,...,0.0,15,0,0.0,14,0,0.0,90,0,0.0


In [6]:
data = [data18, data19, data21]
year = [2018, 2019, 2021]
calc_columns = ["CAMPUS","CAMP_ALLD","CAMP_ALLR","CAMP_AAD","CAMP_WHD","CAMP_HSD","CAMP_ASD","CAMP_ECND", "year"]
final_columns = ["schoolcode","year","totalenroll","dropout","lowincome","black","white","hispanic","asian"]

for i in range(3):
    data[i].columns = data[i].columns.str.upper()

for i, (d, y) in enumerate(zip(data, year)):
    d['year'] = y
    data[i] = data[i][data[i]['GRADESPAN'] == 912]    
    data[i] = data[i][data[i]['CALC_FOR_STATE_ACCT'] == "Yes"]
    data[i] = data[i][calc_columns]
    mask = data[i].applymap(lambda x: '<' in str(x)).any(axis=1)
    data[i] = data[i][~mask]

common_ids = common_ids & set(data[0]['CAMPUS']) & set(data[1]['CAMPUS']) & set(data[2]['CAMPUS'])

for i in range(2):
    data[i] = data[i][data[i]['CAMPUS'].isin(common_ids)]

for i, (d, y) in enumerate(zip(data, year)):
    data[i]["schoolcode"] = data[i]["CAMPUS"].astype(float).astype(int)
    data[i]["totalenroll"] = data[i]["CAMP_ALLD"].astype(float).astype(int)
    data[i]["dropout"] = data[i]["CAMP_ALLR"].astype(float).astype(int)
    data[i]["black"] = data[i]["CAMP_AAD"].astype(float) / data[i]["CAMP_ALLD"].astype(float)
    data[i]["white"] = data[i]["CAMP_WHD"].astype(float) / data[i]["CAMP_ALLD"].astype(float)
    data[i]["hispanic"] = data[i]["CAMP_HSD"].astype(float) / data[i]["CAMP_ALLD"].astype(float)
    data[i]["asian"] = data[i]["CAMP_ASD"].astype(float) / data[i]["CAMP_ALLD"].astype(float)
    data[i]["lowincome"] = data[i]["CAMP_ECND"].astype(float) / data[i]["CAMP_ALLD"].astype(float)

    
for i, (d, y) in enumerate(zip(data, year)):
    data[i] = data[i][final_columns]

In [7]:
pivot = pivot.drop(columns=['InPerson', 'Virtual', 'TotalDays'])
pivot

Unnamed: 0,StateAssignedSchoolID,YEAR,Charter,VirtualPercent,HybridPercent,Score
0,1902001,2021,0,0.0,0,0.0
1,1902041,2021,0,0.0,0,0.0
2,1902103,2021,0,0.0,0,0.0
3,1903001,2021,0,0.0,0,0.0
4,1903041,2021,0,0.0,0,0.0
...,...,...,...,...,...,...
8459,254901101,2021,0,0.0,0,0.0
8460,254901104,2021,0,0.0,0,0.0
8461,254901106,2021,0,0.0,0,0.0
8462,254902001,2021,0,1.0,0,1.0


In [8]:
# Concatenate all the dataframes
final_data = pd.concat(data)

In [9]:
# Merge dropout_df with final_data
final_data = pd.merge(final_data, pivot, left_on=['schoolcode', 'year'], right_on=['StateAssignedSchoolID', 'YEAR'], how='left')

In [10]:
final_data = final_data.fillna(0)
final_data = final_data.drop(columns=["StateAssignedSchoolID", "YEAR"])
# Rename multiple columns
final_data = final_data.rename(columns={               
                        'VirtualPercent': 'virtualper',    
                        'HybridPercent': 'hybridper',    
                        'Score': 'schoolmode',
                        'Charter': 'charter'
                                       })

final_cols = final_data.columns.tolist()
final_cols

['schoolcode',
 'year',
 'totalenroll',
 'dropout',
 'lowincome',
 'black',
 'white',
 'hispanic',
 'asian',
 'charter',
 'virtualper',
 'hybridper',
 'schoolmode']

In [11]:
final_cols = [
'schoolcode',
 'year',
 'charter',
 'dropout',
 'schoolmode',
 'virtualper',
 'hybridper',
 'totalenroll',
 'lowincome',
 'white',
 'black',
 'hispanic',
 'asian',
      ]

In [12]:
final_data = final_data[final_cols]
for column in final_cols[4:]:
    if column != "totalenroll":
        final_data[column] = final_data[column] * 100

In [13]:
problematic_cols = [
 'white',
 'black',
 'hispanic',
 'asian',
      ]
rounding_cols = [
 'schoolmode',
 'virtualper',
 'hybridper',
 'totalenroll',
 'lowincome',
 'white',
 'black',
 'hispanic',
 'asian',
      ]

final_data[problematic_cols] = final_data[problematic_cols].abs()
final_data[rounding_cols] = final_data[rounding_cols].round(4)

In [14]:
final_data

Unnamed: 0,schoolcode,year,charter,dropout,schoolmode,virtualper,hybridper,totalenroll,lowincome,white,black,hispanic,asian
0,5902001,2018,0.0,0,0.0,0.0,0.0,317,24.2902,90.2208,0.6309,7.5710,0.0000
1,7905004,2018,0.0,0,0.0,0.0,0.0,2,100.0000,0.0000,0.0000,100.0000,0.0000
2,11901005,2018,0.0,0,0.0,0.0,0.0,224,62.5000,25.0000,1.7857,70.5357,1.7857
3,11905001,2018,0.0,0,0.0,0.0,0.0,42,59.5238,28.5714,2.3810,66.6667,0.0000
4,14803004,2018,0.0,0,0.0,0.0,0.0,72,12.5000,69.4444,5.5556,15.2778,5.5556
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3462,252902002,2021,0.0,0,0.0,0.0,0.0,61,52.4590,81.9672,0.0000,4.9180,0.0000
3463,252903001,2021,0.0,0,0.0,0.0,0.0,216,56.4815,54.6296,3.7037,38.4259,0.4630
3464,253901001,2021,0.0,1,100.0,100.0,0.0,1000,87.2000,0.9000,0.0000,99.1000,0.0000
3465,254901001,2021,0.0,2,100.0,100.0,0.0,538,80.4833,0.1859,0.1859,98.3271,0.1859


# Export Texas Data

In [15]:
final_data.to_csv("./final_data_components/dropout_texas.csv")