In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw ELA dataset from a CSV file located relative to the working directory.
data = pd.read_csv("ELA_data_2013_2023.csv")

### **Data Feasibility Review & Data Cleaning/Preprocessing**

The initial data contains 626,462 rows and 18 columns. After filtering for years 2018, 2019, and 2022, the data contains 220,443 rows. Almost all columns have no missing values, with the exception of school name. This is because the data includes citywide- and district-level metrics, for which school name is not applicable.

The data was filtered to contain only the years surrounding the COVID-19 pandemic, as this is the time period of interest. Unfortunately, data was not given for the years 2020 and 2021, due to cancelled and optional testing, respectively. All scores and levels (counts and percentages) were converted to floats. The year was converted to an object. Groups with five or fewer tested students were suppressed and recorded with the value "s". Within this data, 58,872 rows are missing observations for scores and level breakdowns. These observations are retained as missing values (NaN) and are therefore left out of the score and level calculations below.

In [3]:
from data_cleaning import cleaning_data, change_variable_type
# Build the working DataFrame from the raw data.
# NOTE(review): cleaning_data lives in the project-local data_cleaning module and
# is not visible here; the trailing note below is the author's own summary — confirm.
school_data = cleaning_data(data) # filter for year, rename some vars

In [4]:
# school_data.info() # check

In [5]:
# Score, count, and percentage columns to convert to numeric.
# Each column appears exactly once, in the same order as the DataFrame's columns.
cols = [
    'mean_scale_score',
    'level_1_count', 'level_1_percentage',
    'level_2_count', 'level_2_percentage',
    'level_3_count', 'level_3_percentage',
    'level_4_count', 'level_4_percentage',
    'level_3_4_count', 'level_3_4_percentage',
]
school_data = change_variable_type(school_data, cols)
# Store Year as object dtype so it is not treated as a numeric measure.
# NOTE(review): presumably so plots/groupings treat it as a label — confirm intent.
school_data['Year'] = school_data['Year'].astype('object')

In [6]:
# Check column dtypes and non-null counts after the conversion.
school_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 220443 entries, 7 to 626427
Data columns (total 18 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Report Category         220443 non-null  object 
 1   Geographic Subdivision  220443 non-null  object 
 2   school_name             207431 non-null  object 
 3   Grade                   220443 non-null  object 
 4   Year                    220443 non-null  object 
 5   Student Category        220443 non-null  object 
 6   number_tested           220443 non-null  int64  
 7   mean_scale_score        161571 non-null  float64
 8   level_1_count           161571 non-null  float64
 9   level_1_percentage      161571 non-null  float64
 10  level_2_count           161571 non-null  float64
 11  level_2_percentage      161571 non-null  float64
 12  level_3_count           161571 non-null  float64
 13  level_3_percentage      161571 non-null  float64
 14  level_4_count           1

In [7]:
# Preview the cleaned data (the rendered table shows only the first and last rows).
school_data

Unnamed: 0,Report Category,Geographic Subdivision,school_name,Grade,Year,Student Category,number_tested,mean_scale_score,level_1_count,level_1_percentage,level_2_count,level_2_percentage,level_3_count,level_3_percentage,level_4_count,level_4_percentage,level_3_4_count,level_3_4_percentage
7,Citywide,Citywide,,3,2022,All Students,50967,600.0,9711.0,19.1,16155.0,31.7,20432.0,40.1,4669.0,9.2,25101.0,49.2
8,Citywide,Citywide,,4,2022,All Students,53196,597.0,13506.0,25.4,16492.0,31.0,12166.0,22.9,11032.0,20.7,23198.0,43.6
9,Citywide,Citywide,,5,2022,All Students,54122,602.0,16256.0,30.0,16782.0,31.0,10907.0,20.2,10177.0,18.8,21084.0,39.0
10,Citywide,Citywide,,6,2022,All Students,53390,603.0,13422.0,25.1,9928.0,18.6,11306.0,21.2,18734.0,35.1,30040.0,56.3
11,Citywide,Citywide,,7,2022,All Students,55650,606.0,10364.0,18.6,16007.0,28.8,15572.0,28.0,13707.0,24.6,29279.0,52.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
626423,School,32K562,EVERGREEN MIDDLE SCHOOL FOR URBAN EXPLORATION,7,2018,SWD,27,587.0,18.0,66.7,7.0,25.9,2.0,7.4,0.0,0.0,2.0,7.4
626424,School,32K562,EVERGREEN MIDDLE SCHOOL FOR URBAN EXPLORATION,8,2018,Not SWD,67,593.0,12.0,17.9,41.0,61.2,12.0,17.9,2.0,3.0,14.0,20.9
626425,School,32K562,EVERGREEN MIDDLE SCHOOL FOR URBAN EXPLORATION,8,2018,SWD,23,586.0,12.0,52.2,7.0,30.4,4.0,17.4,0.0,0.0,4.0,17.4
626426,School,32K562,EVERGREEN MIDDLE SCHOOL FOR URBAN EXPLORATION,All Grades,2018,Not SWD,253,595.0,68.0,26.9,119.0,47.0,54.0,21.3,12.0,4.7,66.0,26.1


### **Ethics and Bias Considerations**

### **Plots, Statistical Summaries, and Insights**

In [8]:
import numpy as np
import holoviews as hv
import hvplot.pandas

In [9]:
def plotting_sums(df):
    '''Plot the average mean scale score for each year.

    Only rows whose Grade is 'All Grades' and whose Year is 2018, 2019,
    or 2022 contribute to the averages. Returns the scatter plot overlaid
    with the line plot, titled 'Average Mean Scale Score, per Year'.
    '''
    target_years = [2018, 2019, 2022]
    keep = (df['Grade'] == 'All Grades') & df['Year'].isin(target_years)

    # One row per year holding the mean of mean_scale_score.
    yearly_mean = (
        df.loc[keep]
        .groupby('Year', as_index = False)['mean_scale_score']
        .mean()
    )

    # Both layers share the same axes and colour.
    shared = dict(x = 'Year', y = 'mean_scale_score', color = 'teal')
    curve = yearly_mean.hvplot.line(**shared)
    points = yearly_mean.hvplot.scatter(**shared)

    overlay = points * curve
    overlay.opts(title = 'Average Mean Scale Score, per Year')
    return overlay


In [10]:
# Render the per-year average score plot for the cleaned data.
plotting_sums(school_data)

In [None]:
# Filter for a single year, school level, all grades.
# One helper serves both years so the two filters cannot drift apart.
def _school_level_all_grades(df, year):
    '''Return the rows of df for the given year where Report Category is
    'School' and Grade is 'All Grades'.'''
    return df[
        (df['Year'] == year) &
        (df['Report Category'] == 'School') &
        (df['Grade'] == 'All Grades')
    ]

school_data_2019 = _school_level_all_grades(school_data, 2019)
school_data_2022 = _school_level_all_grades(school_data, 2022)

In [31]:
from holoviews import opts

In [48]:
# Side-by-side histograms of mean scale score for the 2019 and 2022 subsets.
hist_settings = dict(y = 'mean_scale_score', bins = 30)

p1 = school_data_2019.hvplot.hist(
    title = 'Mean Scale Score Distribution - 2019',
    **hist_settings
)
p2 = school_data_2022.hvplot.hist(
    title = 'Mean Scale Score Distribution - 2022',
    **hist_settings
)

# Combine into one layout and give every histogram the same x-axis label.
plot = p1 + p2
plot = plot.opts(opts.Histogram(xlabel = "Mean Scale Score"))
plot

In [47]:
# Distribution of mean scale scores for each year.
# NOTE(review): this uses every row of school_data (all report categories,
# grades, and student categories), not only school-level rows — confirm intended.
school_data.hvplot.box(
    y='mean_scale_score',
    by='Year',
    title = 'Mean Scale Score Distribution, per Year',
    ylabel = 'Mean Scale Score',
    height = 600,
    width = 700)

In [None]:
# Levels over years: unweighted mean of each level's percentage across all rows
# of school_data, grouped by year (rows with missing percentages are skipped by mean()).
school_data.groupby('Year')[
    ['level_1_percentage',
     'level_2_percentage',
     'level_3_percentage',
     'level_4_percentage']
].mean().hvplot.line(
    title = 'Average Level Percentages, per Year',
    xlabel = 'Year',
    ylabel = 'Average Percentage'
)

In [None]:
# Does school size affect score?
# Size is the explanatory variable, so it goes on the x-axis; the score
# (the outcome being asked about) goes on the y-axis.
school_data_2019.hvplot.scatter(
    x = 'number_tested',
    y = 'mean_scale_score',
    title = 'Mean Scale Score vs. Number Tested - 2019',
    xlabel = 'Number Tested',
    ylabel = 'Mean Scale Score'
)

In [51]:
# Relationship between mean scale score and the level 3-4 percentage
# in the 2019 school-level, all-grades subset.
school_data_2019.hvplot.scatter(
    x  = 'mean_scale_score',
    y = 'level_3_4_percentage',
    title = 'Level 3-4 Percentage vs. Mean Scale Score - 2019',
    xlabel = 'Mean Scale Score',
    ylabel = 'Level 3-4 Percentage'
)

### **Data Limitations**

### **Implications for Methods**