In [None]:
!conda install seaborn -y

Collecting package metadata (current_repodata.json): done
Solving environment: / 

In [1]:
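# Load the uploaded CSV, inspect it, and prepare convenience columns for analysis and plotting.
# We coerce the class counts to numeric, add a total-offerings column, make the HOLC grade an
# ordered categorical, and preview the data. (The Yes/No flags are recoded later, where they are plotted.)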
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read the CSV with the provided encoding (strict ASCII).
la_df = pd.read_csv('LA_Schools_with_AdvancedSTEM.csv', encoding='ascii')

# Fail early, naming every absent column, instead of stopping at the first
# missing one somewhere further down the cell.
count_cols = ['Num_Calc_Classes', 'Num_CS_Classes', 'Num_Physics_Classes']
missing_cols = [name for name in count_cols + ['grade'] if name not in la_df.columns]
if missing_cols:
    raise KeyError(f"LA_Schools_with_AdvancedSTEM.csv is missing expected columns: {missing_cols}")

# Coerce the class counts to numeric; entries that cannot be parsed become NaN.
for c in count_cols:
    la_df[c] = pd.to_numeric(la_df[c], errors='coerce')

# Total advanced STEM offerings as a simple baseline outcome.
# Row-wise sum skips NaN, so a missing count contributes 0 to the total.
la_df['Total_Adv_STEM'] = la_df[count_cols].sum(axis=1)

# Normalize the HOLC grade (trim whitespace, upper-case) and store it as an
# ordered categorical A < B < C < D. Anything that is not one of these four
# labels after normalization, including missing values, becomes NaN.
grade_clean = la_df['grade'].astype(str).str.strip().str.upper()
la_df['grade'] = pd.Categorical(grade_clean, categories=['A','B','C','D'], ordered=True)

# Preview the first rows and summary statistics for every column.
print(la_df.head())
print(la_df.describe(include='all'))

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# Baseline regression: predict Total_Adv_STEM from HOLC grade alone via OLS, with the ordered grade
# entered as integer codes (A=0 ... D=3), i.e. a single linear trend across grades.
# Then produce visualizations: stacked bar of offering shares, point/bar plots by grade,
# distributions, and a scatter with jitter.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Use existing la_df in memory

# Regression frame: keep only schools whose HOLC grade is known, and expose the
# ordered category as its integer position (A=0, B=1, C=2, D=3).
reg_df = (
    la_df
    .dropna(subset=['grade'])
    .assign(grade_code=lambda frame: frame['grade'].cat.codes)
)

# OLS of the total on the grade code (the formula adds the intercept),
# fitted with the HC1 covariance estimator.
baseline_formula = 'Total_Adv_STEM ~ grade_code'
model = smf.ols(baseline_formula, data=reg_df).fit(cov_type='HC1')
print(model.summary())

# Stacked bar chart: fraction of schools with Has_Calc/CS/Physics by grade.
flag_cols = ['Has_Calc', 'Has_CS', 'Has_Physics']
flag_map = {'Yes': 1, 'No': 0}

# Recode 'Yes'/'No' as 1/0 so that a group mean is a share. Any other value
# maps to NaN and is therefore ignored by mean().
flag_df = la_df.copy()
for col in flag_cols:
    flag_df[col] = flag_df[col].map(flag_map)

# 'grade' is categorical: pass observed=False explicitly so every declared
# grade keeps a row regardless of the pandas default, and so recent pandas
# versions do not emit a FutureWarning about that default.
share_by_grade = (
    flag_df
    .groupby('grade', observed=False)[flag_cols]
    .mean()
    .reindex(['A', 'B', 'C', 'D'])
)

# Plot stacked bar of shares. The three subject shares are stacked on top of
# each other, so a bar can reach a height of up to 3.
ax = share_by_grade.plot(kind='bar', stacked=True, figsize=(8,5), colormap='tab20')
ax.set_title('Share of Schools Offering Calc/CS/Physics by HOLC Grade')
ax.set_ylabel('Share (stacked across subjects)')
ax.set_xlabel('HOLC Grade')
ax.legend(title='Subject', bbox_to_anchor=(1.02,1), loc='upper left')
plt.tight_layout()
plt.show()

# Bar/point plots by HOLC grade for average counts.
subject_cols = ['Num_Calc_Classes', 'Num_CS_Classes', 'Num_Physics_Classes']

# 'grade' is categorical: observed=False keeps a row for every declared grade
# regardless of the pandas default, and avoids the FutureWarning that recent
# pandas versions raise when the argument is omitted.
avg_counts = (
    la_df
    .groupby('grade', observed=False)[subject_cols + ['Total_Adv_STEM']]
    .mean()
    .reset_index()
)

# Point plot of the per-grade mean total. The input is already aggregated to
# one row per grade, so each point is that single mean.
plt.figure(figsize=(8,5))
sns.pointplot(data=avg_counts, x='grade', y='Total_Adv_STEM')
plt.title('Average Total Advanced STEM by HOLC Grade')
plt.ylabel('Avg Total Classes')
plt.xlabel('HOLC Grade')
plt.show()

# Long format (one row per school and subject) so seaborn can split bars by subject.
# plot_df is kept under this name because the distribution plots reuse it.
plt.figure(figsize=(8,5))
plot_df = la_df.melt(id_vars='grade', value_vars=subject_cols, var_name='Subject', value_name='Classes')
# Bars show the mean; error bars show the standard error.
sns.barplot(data=plot_df, x='grade', y='Classes', hue='Subject', estimator=np.mean, errorbar='se')
plt.title('Average Classes by Subject and HOLC Grade')
plt.ylabel('Average number of classes')
plt.xlabel('HOLC Grade')
plt.legend(title='Subject')
plt.show()

# Distribution of the per-school total: 20-bin histogram with a KDE overlay.
fig_total, ax_total = plt.subplots(figsize=(8, 5))
sns.histplot(data=la_df, x='Total_Adv_STEM', bins=20, kde=True, ax=ax_total)
ax_total.set_title('Distribution of Total Advanced STEM Classes')
ax_total.set_xlabel('Total Advanced STEM')
ax_total.set_ylabel('Count of schools')
plt.show()

# Per-subject class counts: one filled KDE curve per subject, each normalized
# on its own (common_norm=False) so the curves are comparable in shape.
fig_subject, ax_subject = plt.subplots(figsize=(8, 5))
sns.kdeplot(
    data=plot_df,
    x='Classes',
    hue='Subject',
    common_norm=False,
    fill=True,
    ax=ax_subject,
)
ax_subject.set_title('Distribution of Class Counts by Subject')
ax_subject.set_xlabel('Number of classes')
plt.show()

# Jittered scatter of Total_Adv_STEM against HOLC grade. A small uniform offset
# in [-0.1, 0.1) is added to the integer grade code so overlapping points
# separate horizontally; the seed keeps the offsets reproducible.
np.random.seed(0)
jitter = np.random.uniform(low=-0.1, high=0.1, size=len(reg_df))
x_jitter = reg_df['grade_code'] + jitter

fig_scatter, ax_scatter = plt.subplots(figsize=(8, 5))
ax_scatter.scatter(x_jitter, reg_df['Total_Adv_STEM'], alpha=0.6)

# Label the integer codes 0..3 with their grade letters.
ax_scatter.set_xticks([0, 1, 2, 3])
ax_scatter.set_xticklabels(['A', 'B', 'C', 'D'])
ax_scatter.set_xlabel('HOLC Grade')
ax_scatter.set_ylabel('Total Advanced STEM')
ax_scatter.set_title('Total Advanced STEM by HOLC Grade (with jitter)')
plt.show()