In [1]:
import pandas as pd
import numpy as np
import os

# Import Virtual Mode Data

In [68]:
virtual = pd.read_csv("./raw_data/Texas_Schools_LearningModelData_Final.csv", thousands=',')
virtual.head()

Unnamed: 0,StateName,StateAbbrev,DataLevel,Charter,SchoolName,SchoolType,NCESSchoolID,StateAssignedSchoolID,DistrictName,DistrictType,...,LearningModelGr912,LearningModelStateCat,LearningModelStateCatGrK5,LearningModelStateCatGr68,LearningModelStateCatGr912,EnrollmentInPerson,EnrollmentHybrid,EnrollmentVirtual,StaffCount,StaffCountInPerson
0,Texas,TX,School,No,Crosbyton Elementary,Regular school,480000101145,54901101,Crosbyton CISD,Regular local school district,...,,,,,,125.0,0.0,13.0,,
1,Texas,TX,School,No,Crosbyton Elementary,Regular school,480000101145,54901101,Crosbyton CISD,Regular local school district,...,,,,,,141.0,0.0,2.0,,
2,Texas,TX,School,No,Crosbyton Secondary,Regular school,480000101146,54901001,Crosbyton CISD,Regular local school district,...,,,,,,161.0,0.0,10.0,,
3,Texas,TX,School,No,Crosbyton Secondary,Regular school,480000101146,54901001,Crosbyton CISD,Regular local school district,...,,,,,,165.0,0.0,4.0,,
4,Texas,TX,School,No,Sp Ed Co-Op,Other/alternative school,480000103621,54901200,Crosbyton CISD,Regular local school district,...,,,,,,7.0,0.0,0.0,,


In [62]:
virtual = pd.read_csv("./raw_data/Texas_Schools_LearningModelData_Final.csv", thousands=',')
virtual.head()

virtual = virtual[virtual['TimePeriodStart'].str.endswith(('21', '22'))]

# Convert the date columns to datetime format.
virtual['TimePeriodStart'] = pd.to_datetime(virtual['TimePeriodStart'])
virtual['TimePeriodEnd'] = pd.to_datetime(virtual['TimePeriodEnd'])

# Create a new column for year
virtual['YEAR'] = virtual['TimePeriodStart'].dt.year

# Fill in any missing values in LearningModel with 'InPerson'
virtual['LearningModel'] = virtual['LearningModel'].fillna('InPerson')

# Replace 'In-person' with 'InPerson'
virtual['LearningModel'] = virtual['LearningModel'].replace('In-person', 'InPerson')

# Calculate the number of days for each row
virtual['Days'] = (virtual['TimePeriodEnd'] - virtual['TimePeriodStart']).dt.days

# Group by School, Year, and LearningModel and sum the number of days
grouped = virtual.groupby(['StateAssignedSchoolID', 'YEAR', 'LearningModel'])['Days'].sum().reset_index()

# Pivot the data so we have separate columns for each learning model
pivot = grouped.pivot_table(index=['StateAssignedSchoolID', 'YEAR'], columns='LearningModel', values='Days', fill_value=0)

# Reset the index
pivot.reset_index(inplace=True)

# Calculate the total days in each year
pivot['TotalDays'] = pivot['Virtual'] + pivot['InPerson']

# Calculate the percentage of days that are virtual and hybrid for each year
pivot['VirtualPercent'] = pivot['Virtual'] / pivot['TotalDays']

# Manually create a 'HybridPercent' column and set it to zero
pivot['HybridPercent'] = 0  # This line added

# Calculate the score for each year
pivot['Score'] = pivot['VirtualPercent']

# Reset the column names after pivot
pivot.columns.name = None

In [None]:
common_ids = set(pivot["StateAssignedSchoolID"])

# Import Dropout Data

In [17]:
### Regressor Data
data21 = pd.read_excel("./raw_data/campus-data-download-drop-2021.xlsx", thousands=',', sheet_name=2)
data19 = pd.read_excel("./raw_data/campus-data-download-drop-1819.xlsx", thousands=',', sheet_name=2)
data18 = pd.read_excel("./raw_data/campus-data-download-drop-1718.xlsx", thousands=',', sheet_name=2)
data19.head(5)

Unnamed: 0,CALC_FOR_STATE_ACCT,Gradespan,campus,campname,DISTRICT,distname,COUNTY,cntyname,REGION,regnname,...,CAMP_OVRR,CAMP_SPED,CAMP_SPEN,CAMP_SPER,CAMP_SE5D,CAMP_SE5N,CAMP_SE5R,CAMP_TTLD,CAMP_TTLN,CAMP_TTLR
0,No,712,1902001,Cayuga H S,1902,Cayuga ISD,1,Anderson County,7,Kilgore,...,1.6,41,0,0.0,<100,-,5.6,<200,-,0.5
1,Yes,712,1902001,Cayuga H S,1902,Cayuga ISD,1,Anderson County,7,Kilgore,...,1.6,41,0,0.0,<100,-,5.6,<200,-,0.5
2,No,912,1902001,Cayuga H S,1902,Cayuga ISD,1,Anderson County,7,Kilgore,...,1.6,41,0,0.0,<100,-,5.6,<200,-,0.5
3,Yes,912,1902001,Cayuga H S,1902,Cayuga ISD,1,Anderson County,7,Kilgore,...,1.6,41,0,0.0,<100,-,5.6,<200,-,0.5
4,No,712,1902041,Cayuga Middle,1902,Cayuga ISD,1,Anderson County,7,Kilgore,...,0.0,15,0,0.0,14,0,0.0,90,0,0.0


In [40]:
data = [data18, data19, data21]
year = [2018, 2019, 2021]
calc_columns = ["CAMPUS","CAMP_ALLD","CAMP_ALLR","CAMP_AAD","CAMP_WHD","CAMP_HSD","CAMP_ASD","CAMP_ECND", "year"]
final_columns = ["schoolcode","year","totalenroll","dropout","lowincome","black","white","hispanic","asian"]

for i in range(3):
    data[i].columns = data[i].columns.str.upper()

for i, (d, y) in enumerate(zip(data, year)):
    d['year'] = y
    data[i] = data[i][data[i]['GRADESPAN'] == 912]    
    data[i] = data[i][data[i]['CALC_FOR_STATE_ACCT'] == "Yes"]
    data[i] = data[i][calc_columns]
    mask = data[i].applymap(lambda x: '<' in str(x)).any(axis=1)
    data[i] = data[i][~mask]

common_ids = common_ids & set(data[0]['CAMPUS']) & set(data[1]['CAMPUS']) & set(data[2]['CAMPUS'])

for i in range(2):
    data[i] = data[i][data[i]['CAMPUS'].isin(common_ids)]

for i, (d, y) in enumerate(zip(data, year)):
    data[i]["schoolcode"] = data[i]["CAMPUS"].astype(float).astype(int)
    data[i]["totalenroll"] = data[i]["CAMP_ALLD"].astype(float).astype(int)
    data[i]["dropout"] = data[i]["CAMP_ALLR"].astype(float).astype(int)
    data[i]["black"] = data[i]["CAMP_AAD"].astype(float) / data[i]["CAMP_ALLD"].astype(float)
    data[i]["white"] = data[i]["CAMP_WHD"].astype(float) / data[i]["CAMP_ALLD"].astype(float)
    data[i]["hispanic"] = data[i]["CAMP_HSD"].astype(float) / data[i]["CAMP_ALLD"].astype(float)
    data[i]["asian"] = data[i]["CAMP_ASD"].astype(float) / data[i]["CAMP_ALLD"].astype(float)
    data[i]["lowincome"] = data[i]["CAMP_ECND"].astype(float) / data[i]["CAMP_ALLD"].astype(float)
    
for i, (d, y) in enumerate(zip(data, year)):
    data[i] = data[i][final_columns]

In [34]:
len(common_ids)

713

Unnamed: 0,schoolcode,year,totalenroll,dropout,lowincome,black,white,hispanic,asian
3,1902001,2021,175,0,0.365714,-0.017143,0.834286,0.080000,-0.005714
11,1903001,2021,371,0,0.469003,0.053908,0.773585,0.121294,-0.002695
19,1904001,2021,239,0,0.447699,0.112971,0.778243,0.071130,-0.004184
27,1906002,2021,99,0,0.424242,-0.030303,0.797980,0.101010,0.000000
35,1907001,2021,1086,0,0.787293,0.230203,0.307551,0.415285,-0.002762
...,...,...,...,...,...,...,...,...,...
17944,252902002,2021,61,0,0.524590,0.000000,0.819672,-0.049180,0.000000
17948,252903001,2021,216,0,0.564815,0.037037,0.546296,0.384259,-0.004630
17956,253901001,2021,1000,1,0.872000,0.000000,0.009000,0.991000,0.000000
17964,254901001,2021,538,2,0.804833,-0.001859,-0.001859,0.983271,-0.001859


In [67]:
len(common_ids)

713

555