# Logic for open/closed school dates
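- If a school's open or closed date is "No Data", assume the school was open: a missing open date is treated as 1/1/1990 and a missing closed date as 1/1/2025.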

In [1]:
import pandas as pd 
import numpy as np
import os 
import datetime
from tqdm.notebook import tqdm
from datetime import datetime

import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.ticker as mticker
import plotly.express as px


# Notebook-wide pandas settings: show every column of wide frames and
# switch off the chained-assignment check.
pd.options.display.max_columns = None
pd.set_option('mode.chained_assignment', None)

In [2]:
# Selects which machine's folder layout to use. Every branch sets:
#   gdrive_path - root of the capstone project folder
#   env_path    - location of the .env file
path_source = 'local'

if path_source == 'gdrive':
  # Colab: mount Google Drive first (Colab exposes it as "MyDrive", no space)
  from google.colab import drive
  drive.mount('/content/gdrive')
  gdrive_path = '/content/gdrive/MyDrive/Classes/W210_capstone'
  env_path = '/content/gdrive/MyDrive/.env'

elif path_source == 'local':
  gdrive_path = '/Users/tj/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone'
  # was ".../MyDrive/.env"; the local mount is "My Drive" (with a space), as in gdrive_path above
  env_path = '/Users/tj/trevorj@berkeley.edu - Google Drive/My Drive/.env'

elif path_source == 'work':
  gdrive_path = '/Users/trevorjohnson/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone'
  env_path = '/Users/trevorjohnson/trevorj@berkeley.edu - Google Drive/My Drive/.env'

else:
  # fail here with a clear message instead of a NameError on gdrive_path further down
  raise ValueError(f"unknown path_source: {path_source!r} (expected 'gdrive', 'local' or 'work')")

In [3]:
# Read the joined dataset from the project folder under gdrive_path.
df_all = pd.read_parquet(
  os.path.join(
    gdrive_path,
    'W210_Capstone/Data/joined_data/joined_data.parquet',
  )
)

In [4]:
# some schools don't have wind data, and thus didn't get mapped to a nearest pollution source
#
# Count rows per school. The output column names are pinned explicitly:
#   'index'   - the cdscode value
#   'cdscode' - the number of rows for that cdscode
# pandas >= 2.0 changed the names that value_counts().to_frame().reset_index()
# produces (to 'cdscode' / 'count'), and later cells look these columns up by name.
df_counts = (
  df_all['cdscode']
  .value_counts()
  .rename_axis('index')
  .reset_index(name='cdscode')
)
df_counts

Unnamed: 0,index,cdscode
0,1100170000000,240
1,36679343638012,240
2,36679346059562,240
3,36679346114698,240
4,36679590000000,240
...,...,...
13292,24658700111294,1
13293,24658706025829,1
13294,24658706025837,1
13295,24658706109425,1


In [5]:
# Preview rows for schools whose cdscode occurs exactly once in df_all
# (in df_counts, 'index' holds the cdscode and 'cdscode' holds its row count).
df_all.loc[
  df_all['cdscode'].isin(
    df_counts.loc[df_counts['cdscode'] == 1, 'index'].to_list()
  )
].head(3)

Unnamed: 0,cdscode,school_active_status,school_county,school_street,school_zip,school_open_date,school_closed_date,school_type,school_grades_offered,school_lat,school_lon,school_last_updated_date,wind_lat,wind_lon,ZCTA10,u,v,wdir,wspd,year_month,year,zip,total_population,total_population_male,total_population_female,population_0_4,population_0_4_male,population_0_4_female,population_5_9,population_5_9_male,population_5_9_female,population_10_14,population_10_14_male,population_10_14_female,population_15_19,population_15_19_male,population_15_19_female,total_pop_under19,ZIP10,pm25,pollution_source_id,pollution_source_lat,pollution_source_lon,PM25_emissions_TPY,pollution_school_distance
480,1100170112607,Active,Alameda,1515 Webster Street,94612,8/28/2006,No Data,Elementary-High Combination,4-12,37.80452,-122.26815,7/18/2022,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2881,1100170136101,Active,Alameda,703 C Street,94587,8/16/2017,No Data,Elementary-High Combination,K-12,37.603623,-122.0253,8/13/2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
38162,1611920131334,Closed,Alameda,2021 Highland Boulevard,94540,8/21/2014,7/30/2021,Elementary,K-12,37.661939,-122.05792,9/22/2021,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [6]:
# func to fill in the dates
def date_fill(x, start_or_end='start'):
  """Swap the 'No Data' placeholder for a sentinel date string.

  x: raw date value; anything other than 'No Data' is returned unchanged.
  start_or_end: 'start' maps 'No Data' to '1/1/1990', 'end' maps it to
    '1/1/2025'. Any other value leaves 'No Data' as it is.
  """
  if not (x == 'No Data'):
    return x

  if start_or_end == 'start':
    return '1/1/1990'
  if start_or_end == 'end':
    return '1/1/2025'
  return x

In [7]:
# for now, keep only the rows that actually have a year_month value
df_all = df_all[df_all['year_month'].notna()]

In [8]:
# registers the progress_map / progress_apply methods used in this notebook
tqdm.pandas()

# for all missing dates, just assume the school was always open.
df_all['school_open_date'] = df_all['school_open_date'].progress_map(lambda x: date_fill(x, 'start'))
df_all['school_closed_date'] = df_all['school_closed_date'].progress_map(lambda x: date_fill(x, 'end'))

# convert to datetime format.
# pd.to_datetime parses each column in one vectorised call, instead of calling
# datetime.strptime once per row. The explicit format keeps parsing strict
# (month/day/year), and an unparseable string still raises.
df_all['school_open_date'] = pd.to_datetime(df_all['school_open_date'], format='%m/%d/%Y')
df_all['school_closed_date'] = pd.to_datetime(df_all['school_closed_date'], format='%m/%d/%Y')

# do the same for the year_month field. But use this function in case there are NAs
def strptime2(x):
  """Parse a 'YYYY-MM' string into a datetime; return None when it cannot be parsed.

  x: value to parse. Non-string values (e.g. None / NaN) and strings that do
    not match '%Y-%m' both yield None.
  """
  try:
    return datetime.strptime(x, '%Y-%m')
  except (TypeError, ValueError):
    # TypeError: x is not a string; ValueError: string does not match the format.
    # Anything else (e.g. KeyboardInterrupt) is deliberately left to propagate.
    return None

# Parse 'year_month' into a datetime column; strptime2 gives None for any value it cannot parse.
df_all['year_month_date'] = (
  df_all['year_month']
  .progress_map(strptime2)
)

  0%|          | 0/2982240 [00:00<?, ?it/s]

  0%|          | 0/2982240 [00:00<?, ?it/s]

  0%|          | 0/2982240 [00:00<?, ?it/s]

  0%|          | 0/2982240 [00:00<?, ?it/s]

  0%|          | 0/2982240 [00:00<?, ?it/s]

In [9]:
# preview the three date columns before deriving the open/closed indicator
df_all.loc[:, ['school_open_date', 'school_closed_date', 'year_month_date']].head()

Unnamed: 0,school_open_date,school_closed_date,year_month_date
0,1990-01-01,2025-01-01,2000-01-01
1,1990-01-01,2025-01-01,2000-02-01
2,1990-01-01,2025-01-01,2000-03-01
3,1990-01-01,2025-01-01,2000-04-01
4,1990-01-01,2025-01-01,2000-05-01


In [10]:
# confirm the three date columns were parsed into datetime dtypes
df_all.dtypes.loc[['school_open_date', 'school_closed_date', 'year_month_date']]

school_open_date      datetime64[ns]
school_closed_date    datetime64[ns]
year_month_date       datetime64[ns]
dtype: object

In [11]:
def is_school_open(start, end, current):
  """Return 1 when `current` lies within [start, end] (both ends inclusive), otherwise 0."""
  return 1 if (start <= current) and (end >= current) else 0

# Flag each row 1 when school_open_date <= year_month_date <= school_closed_date
# (both bounds inclusive), else 0 - the same rule as is_school_open().
# Whole-column comparisons replace the row-wise apply(axis=1), which made one
# Python call per row. Comparisons against a missing date (NaT) evaluate to
# False, so such rows get 0, matching what the row-wise function returns.
df_all['is_school_open'] = (
  (df_all['school_open_date'] <= df_all['year_month_date'])
  & (df_all['school_closed_date'] >= df_all['year_month_date'])
).astype('int64')

  0%|          | 0/2982240 [00:00<?, ?it/s]

In [13]:
# tally rows flagged open (1) versus not open (0)
(
  df_all['is_school_open']
  .value_counts()
)

1    2471552
0     510688
Name: is_school_open, dtype: int64

In [16]:
# drop the observations where the school was not open (is_school_open == 0)
df_all = df_all.loc[df_all['is_school_open'].ne(0)]