# mkdata

This notebook extracts data from the Postgres database into Feather files (`merged.feather` and `demo_labels.feather`) for analysis. The data comes from the SFUSD schools project.

In [1]:
import pandas as pd
from utils import sqlalchemy_engine

# Shared connection object passed to every pd.read_sql_query call in this notebook.
# NOTE(review): sqlalchemy_engine comes from the local utils module; presumably it
# returns a SQLAlchemy engine for the project's Postgres database — confirm there.
engine = sqlalchemy_engine()

## demographic dataframe

In [2]:
# Create the demographic dataframe: enrollment per school, grade and student group.
# The GROUP BY / max() collapses multiple score rows for the same
# (school_code, student_group_id, grade) into a single enrollment figure.
demo_query = """
SELECT 
    school_code,
	student_group_id,
	grade,
	max(total_students_enrolled) as total_students_enrolled
FROM scores
WHERE type_id IN (7, 9)
AND total_students_enrolled is not null
GROUP BY school_code, student_group_id, grade"""

demo_df = pd.read_sql_query(demo_query, engine)

# Pivot to one row per (school_code, grade) with one column per student_group_id.
demo_pt = demo_df.pivot_table(
    index=['school_code', 'grade'],
    columns='student_group_id',
    values='total_students_enrolled',
    aggfunc="max",
).reset_index()

# Convert each group's enrollment count into a share of group 1's enrollment.
# NOTE(review): group 1 is presumably the "all students" group — confirm against
# the demographics table.
TOTAL_GROUP_ID = 1
key_cols = ('school_code', 'grade')
# Select the group columns by label rather than by position so the loop does not
# depend on where the pivot happens to place the total column.
group_cols = [col for col in demo_pt.columns if col not in key_cols and col != TOTAL_GROUP_ID]
for col in group_cols:
    # Sanity-check the raw counts *before* they are overwritten with ratios;
    # checking afterwards would compare a total count against a ratio.
    # Rows with a missing count compare as False and are not reported.
    n_exceeding = int((demo_pt[col] > demo_pt[TOTAL_GROUP_ID]).sum())
    if n_exceeding > 0:
        print(f"Warning: {n_exceeding} rows where col {TOTAL_GROUP_ID} < col {col}")
    # Missing counts stay NaN because NaN divided by anything is NaN.
    demo_pt[col] = demo_pt[col].div(demo_pt[TOTAL_GROUP_ID])

## scores dataframe

In [3]:
# Load the score rates for student group 1 (divided by 100 in SQL) and reshape
# them to one row per (school_code, grade).
score_query = """SELECT 
 	school_code,
	 test_id,
	 grade,
	 (pct_std_exceeded::FLOAT / 100::FLOAT) as pct_std_exceeded,
	 (pct_std_met::FLOAT / 100::FLOAT) as pct_std_met,
	 (pct_std_met_and_above::FLOAT / 100::FLOAT) as pct_std_met_and_above
FROM scores
WHERE type_id IN (7, 9)
AND total_students_enrolled is not null
AND student_group_id = 1"""

score_df = pd.read_sql_query(sql=score_query, con=engine)

# Swap the numeric test ids for readable labels so they can become column suffixes.
test_labels = {1: 'MATH', 2: 'ENG'}
score_df['test_id'] = score_df['test_id'].replace(test_labels)

# Wide layout: one (metric, test) column pair per combination, keyed by school and grade.
metric_cols = ['pct_std_exceeded', 'pct_std_met', 'pct_std_met_and_above']
score_pt = score_df.pivot(
    index=['school_code', 'grade'],
    columns=['test_id'],
    values=metric_cols,
).reset_index()

# Flatten the two-level column labels into lowercase "metric_test" names. The key
# columns pick up a trailing underscore from their empty second level, which the
# rename below strips off again.
score_pt.columns = score_pt.columns.map(lambda parts: '_'.join(parts).lower())
score_pt = score_pt.rename(columns={'school_code_': 'school_code', 'grade_': 'grade'})

In [None]:
# Merge the score table with the demographic ratios. The left join keeps every
# score row even when no demographic row matches (those ratio columns become NaN).
merged_df = pd.merge(score_pt, demo_pt, on=['school_code', 'grade'], how='left')

# demo_pt's group columns are labelled with the raw student_group_id values
# (integers), so the merged frame mixes string and non-string column labels.
# DataFrame.to_feather in pandas < 2.1 raises "feather must have string column
# names" for such frames, so normalise every label to str before writing.
merged_df.columns = merged_df.columns.map(str)
merged_df.to_feather("merged.feather")

In [None]:

# Export the full demographics table unchanged to its own Feather file.
# NOTE(review): presumably this table maps student_group_id values to
# human-readable labels — verify against the database schema.
demo_labels_query = "SELECT * FROM demographics"
demo_labels_df = pd.read_sql_query(demo_labels_query, engine)
demo_labels_df.to_feather("demo_labels.feather")