# EDA on Full dataset

Brief exploratory data analysis (EDA) and data-quality checks on the joined dataset.

In [2]:
import pandas as pd 
import numpy as np
import os 
import datetime
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.ticker as mticker
import plotly.express as px


# Notebook-wide pandas settings:
#   - show every column when a wide frame is displayed (no column truncation)
#   - turn off the chained-assignment (SettingWithCopy) warning
pd.options.display.max_columns = None
pd.set_option('mode.chained_assignment', None)

In [3]:
# Which machine the notebook is running on; selects where the shared Google Drive
# folder lives. One of: 'gdrive' (Google Colab), 'local', 'work'.
path_source = 'local'

if path_source == 'gdrive':
  # On Colab the Drive has to be mounted first; the mount exposes it as 'MyDrive' (no space).
  from google.colab import drive
  drive.mount('/content/gdrive')
  gdrive_path = '/content/gdrive/MyDrive/Classes/W210_capstone'
  env_path = '/content/gdrive/MyDrive/.env'

elif path_source == 'local':
  gdrive_path = '/Users/tj/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone'
  # Uses 'My Drive' (with a space) to match gdrive_path above and the 'work' paths below;
  # the Colab-style 'MyDrive' spelling does not match the folder name used elsewhere here.
  env_path = '/Users/tj/trevorj@berkeley.edu - Google Drive/My Drive/.env'

elif path_source == 'work':
  gdrive_path = '/Users/trevorjohnson/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone'
  env_path = '/Users/trevorjohnson/trevorj@berkeley.edu - Google Drive/My Drive/.env'

else:
  # Fail here rather than with a NameError on gdrive_path in a later cell.
  raise ValueError(
    f"Unknown path_source {path_source!r}; expected 'gdrive', 'local' or 'work'"
  )

In [4]:
# Load the joined dataset from the shared Drive folder selected above.
joined_data_path = os.path.join(gdrive_path, 'W210_Capstone/Data/joined_data/joined_data.parquet')
df_all = pd.read_parquet(joined_data_path)

In [5]:
# Number of rows per school (cdscode). Some schools don't have wind data and so were
# never mapped to a nearest pollution source; the next cell treats schools with a
# single row as the ones that were not joined.
#
# The column names are set explicitly ('index' = the cdscode value, 'cdscode' = its
# row count) because pandas >= 2.0 changed how value_counts() names its result:
# .to_frame().reset_index() would then produce 'cdscode'/'count' instead, and the
# lookup on df_counts['index'] in the next cell would fail.
df_counts = (
  df_all['cdscode']
  .value_counts()
  .rename_axis('index')
  .reset_index(name='cdscode')
)
df_counts

Unnamed: 0,index,cdscode
0,1100170000000,240
1,36679343638012,240
2,36679346059562,240
3,36679346114698,240
4,36679590000000,240
...,...,...
13292,24658700111294,1
13293,24658706025829,1
13294,24658706025837,1
13295,24658706109425,1


In [6]:
# Rows for the schools that weren't joined: those whose cdscode occurs exactly once.
# In df_counts, 'index' holds the cdscode value and 'cdscode' holds its row count.
single_row_codes = df_counts.loc[df_counts['cdscode'] == 1, 'index'].to_list()
is_unjoined = df_all['cdscode'].isin(single_row_codes)
df_all[is_unjoined]

Unnamed: 0,cdscode,school_active_status,school_county,school_street,school_zip,school_open_date,school_closed_date,school_type,school_grades_offered,school_lat,school_lon,school_last_updated_date,wind_lat,wind_lon,ZCTA10,u,v,wdir,wspd,year_month,year,zip,total_population,total_population_male,total_population_female,population_0_4,population_0_4_male,population_0_4_female,population_5_9,population_5_9_male,population_5_9_female,population_10_14,population_10_14_male,population_10_14_female,population_15_19,population_15_19_male,population_15_19_female,total_pop_under19,ZIP10,pm25,pollution_source_id,pollution_source_lat,pollution_source_lon,PM25_emissions_TPY,pollution_school_distance
480,1100170112607,Active,Alameda,1515 Webster Street,94612,8/28/2006,No Data,Elementary-High Combination,4-12,37.804520,-122.26815,7/18/2022,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2881,1100170136101,Active,Alameda,703 C Street,94587,8/16/2017,No Data,Elementary-High Combination,K-12,37.603623,-122.02530,8/13/2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
38162,1611920131334,Closed,Alameda,2021 Highland Boulevard,94540,8/21/2014,7/30/2021,Elementary,K-12,37.661939,-122.05792,9/22/2021,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
38403,1611920133520,Active,Alameda,22100 Princeton Street,94540,7/1/1980,No Data,Adult,Adult,37.672622,-122.09814,8/16/2019,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
42004,1611926001101,Active,Alameda,411 Larchmont Street,94540,7/1/1980,No Data,Elementary,K-6,37.652934,-122.09406,7/12/2021,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895746,56724700000000,Active,Ventura,3901 North Mesa School Road,93066,No Data,No Data,No Data,No Data,34.262484,-119.09424,7/1/2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2896227,56724706055123,Active,Ventura,3901 North Mesa School Road,93066,7/1/1980,No Data,Elementary,K-8,34.262484,-119.09424,8/13/2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2925508,56726110000000,Active,Ventura,5268 North Street,93066,No Data,No Data,No Data,No Data,34.258820,-118.99756,8/5/2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2925749,56726116055834,Active,Ventura,5268 North Street,93066,7/1/1980,No Data,Elementary,K-8,34.258820,-118.99756,1/21/2022,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Logic for open/closed school dates
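- If an open or closed date is "No Data", assume the school was open: a missing open date is filled with 1/1/1990 and a missing closed date with 1/1/2025.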

In [7]:
def date_fill(x, start_or_end='start'):
  """Replace the 'No Data' placeholder in a school date with a sentinel date.

  Args:
    x: a single date value; the string 'No Data' marks a missing date.
    start_or_end: 'start' when x is an open date, 'end' when x is a closed date.

  Returns:
    '1/1/1990' for a missing start date, '1/1/2025' for a missing end date,
    otherwise x unchanged.

  Raises:
    ValueError: if start_or_end is neither 'start' nor 'end'. Without this check a
      mistyped mode would silently leave 'No Data' in place.
  """
  sentinel_dates = {'start': '1/1/1990', 'end': '1/1/2025'}
  if start_or_end not in sentinel_dates:
    raise ValueError(
      f"start_or_end must be 'start' or 'end', got {start_or_end!r}"
    )

  if x == 'No Data':
    return sentinel_dates[start_or_end]
  return x

In [None]:
# Fill the 'No Data' placeholders so a school with no recorded open/closed date is
# treated as open: open dates use the 'start' sentinel, closed dates the 'end' sentinel.
df_all['school_open_date'] = df_all['school_open_date'].apply(date_fill, start_or_end='start')
df_all['school_closed_date'] = df_all['school_closed_date'].apply(date_fill, start_or_end='end')