# ETL Code

In [2]:
# Importing dependencies
import pandas as pd, requests
from michelle_config import WEATHER_API_KEY
import time
from datetime import datetime

ModuleNotFoundError: No module named 'michelle_config'

In [None]:
# Load the spreadsheet that lists every state with its latitude and longitude
states_df = pd.read_excel(io='Resources/All_States.xlsx')
states_df.head()

In [None]:
# Keep only the rows for our 4 preferred states: AK, CA, DC and MA
pref_states = states_df.loc[
    states_df['state'].isin(['DC', 'CA', 'MA', 'AK'])
]
pref_states

In [None]:
# Creating the urls of the 4 states
# Start date set as 1606453200 or 11/27/20 5am UTC (earliest historical data available in OpenWeather for AQI)
# End date set as 1630468800 or 9/1/21 4am UTC
for index, row in pref_states.iterrows():
    # Read the coordinates by column label: integer keys such as row[1] on a
    # label-indexed Series are deprecated positional access in pandas.
    url = (
        "http://api.openweathermap.org/data/2.5/air_pollution/history?lat="
        + str(row['latitude'])
        + "&lon=" + str(row['longitude'])
        + "&start=1606453200&end=1630468800"
        + "&appid=" + WEATHER_API_KEY
    )
    response = requests.get(url).json()
    # NOTE: the printed URL contains the API key; clear this cell's output
    # before sharing or committing the notebook.
    print(url)

In [None]:
# Making get requests and creating json files for 4 states
# The API key is taken from WEATHER_API_KEY (imported from the local config
# module) instead of being written into the URLs, so no credential is stored
# in the source. Any key that was previously committed here should be revoked.
def _fetch_aqi_history(lat, lon):
    """Return the parsed JSON air-pollution history for one coordinate pair.

    Parameters
    ----------
    lat, lon : str
        Latitude and longitude, passed as strings so the exact digits are
        sent in the query string.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the service answers with a 4xx/5xx status, so that an error
        payload is reported here rather than failing later when the 'list'
        records are read.
    """
    reply = requests.get(
        'http://api.openweathermap.org/data/2.5/air_pollution/history',
        params={
            'lat': lat,
            'lon': lon,
            'start': '1606453200',
            'end': '1630468800',
            'appid': WEATHER_API_KEY,
        },
        timeout=60,  # do not hang forever on a stalled connection
    )
    reply.raise_for_status()
    return reply.json()


ak_data = _fetch_aqi_history('63.588753', '-154.493062')
ma_data = _fetch_aqi_history('42.407211', '-71.382437')
dc_data = _fetch_aqi_history('38.905985', '-77.033418')
ca_data = _fetch_aqi_history('36.778261', '-119.417932')

In [None]:
# Creating AK dataframe & converting time to correct format
# Select the AK row by its state code rather than by row position, so the
# frame is labelled correctly whatever order the rows of pref_states are in.
ak = pref_states[pref_states['state'] == 'AK']
# Flatten the records under the 'list' key of the response into columns.
ak_frame = pd.json_normalize(ak_data, record_path=['list'])
ak_frame['State'] = ak['state'].iloc[0]
ak_frame['Lat'] = ak['latitude'].iloc[0]
ak_frame['Long'] = ak['longitude'].iloc[0]
# 'dt' is a Unix timestamp in seconds; render it as a UTC
# 'YYYY-MM-DD HH:MM:SS' string. pd.to_datetime is used in place of
# datetime.utcfromtimestamp, which is deprecated since Python 3.12.
ak_frame['date'] = pd.to_datetime(ak_frame['dt'], unit='s').dt.strftime('%Y-%m-%d %H:%M:%S')
ak_frame.head()

In [None]:
# Creating CA dataframe & converting time to correct format
# Select the CA row by its state code rather than by row position, so the
# frame is labelled correctly whatever order the rows of pref_states are in.
ca = pref_states[pref_states['state'] == 'CA']
# Flatten the records under the 'list' key of the response into columns.
ca_frame = pd.json_normalize(ca_data, record_path=['list'])
ca_frame['State'] = ca['state'].iloc[0]
ca_frame['Lat'] = ca['latitude'].iloc[0]
ca_frame['Long'] = ca['longitude'].iloc[0]
# 'dt' is a Unix timestamp in seconds; render it as a UTC
# 'YYYY-MM-DD HH:MM:SS' string. pd.to_datetime is used in place of
# datetime.utcfromtimestamp, which is deprecated since Python 3.12.
ca_frame['date'] = pd.to_datetime(ca_frame['dt'], unit='s').dt.strftime('%Y-%m-%d %H:%M:%S')
ca_frame.head()

In [None]:
# Creating DC dataframe & converting time to correct format
# Select the DC row by its state code rather than by row position, so the
# frame is labelled correctly whatever order the rows of pref_states are in.
dc = pref_states[pref_states['state'] == 'DC']
# Flatten the records under the 'list' key of the response into columns.
dc_frame = pd.json_normalize(dc_data, record_path=['list'])
dc_frame['State'] = dc['state'].iloc[0]
dc_frame['Lat'] = dc['latitude'].iloc[0]
dc_frame['Long'] = dc['longitude'].iloc[0]
# 'dt' is a Unix timestamp in seconds; render it as a UTC
# 'YYYY-MM-DD HH:MM:SS' string. pd.to_datetime is used in place of
# datetime.utcfromtimestamp, which is deprecated since Python 3.12.
dc_frame['date'] = pd.to_datetime(dc_frame['dt'], unit='s').dt.strftime('%Y-%m-%d %H:%M:%S')
dc_frame.head()

In [None]:
# Creating MA dataframe & converting time to correct format
# Select the MA row by its state code rather than by row position, so the
# frame is labelled correctly whatever order the rows of pref_states are in.
ma = pref_states[pref_states['state'] == 'MA']
# Flatten the records under the 'list' key of the response into columns.
ma_frame = pd.json_normalize(ma_data, record_path=['list'])
ma_frame['State'] = ma['state'].iloc[0]
ma_frame['Lat'] = ma['latitude'].iloc[0]
ma_frame['Long'] = ma['longitude'].iloc[0]
# 'dt' is a Unix timestamp in seconds; render it as a UTC
# 'YYYY-MM-DD HH:MM:SS' string. pd.to_datetime is used in place of
# datetime.utcfromtimestamp, which is deprecated since Python 3.12.
ma_frame['date'] = pd.to_datetime(ma_frame['dt'], unit='s').dt.strftime('%Y-%m-%d %H:%M:%S')
ma_frame.head()

In [None]:
# Stack the four per-state frames (AK, DC, CA, MA order) into one dataframe
AQI_data = pd.concat(
    objs=[
        ak_frame,
        dc_frame,
        ca_frame,
        ma_frame,
    ]
)
AQI_data.columns

In [None]:
# Give the columns presentation-friendly names.
# "State" and "dt" already have their final names, so they need no entry.
AQI_data_Renamed = AQI_data.rename(
    columns={
        "date": "Date",
        "Lat": "Latitude",
        "Long": "Longitude",
        "main.aqi": "AQI",
        # "components.co" -> "CO", "components.pm2_5" -> "PM2_5", and so on
        **{
            f"components.{gas}": gas.upper()
            for gas in ("co", "no", "no2", "o3", "so2", "pm2_5", "pm10", "nh3")
        },
    }
)
AQI_data_Renamed.head()

In [None]:
# Put the columns in their final order: identifying fields first, then the
# AQI and pollutant readings, with the raw "dt" timestamp last
AQI_data_Renamed = AQI_data_Renamed.loc[
    :,
    [
        "Date", "State", "Latitude", "Longitude",
        "AQI",
        "CO", "NO", "NO2", "O3", "SO2", "PM2_5", "PM10", "NH3",
        "dt",
    ],
]
AQI_data_Renamed.head()

In [None]:
# Write the combined data to CSV (no index column) for visualization in
# Tableau and for the ML model
AQI_data_Renamed.to_csv(
    path_or_buf='Resources/AQI_data.csv',
    index=False,
)

In [None]:
# Write each state's frame to its own CSV (no index column) for ML model
# impact testing while scaled
for state_frame, csv_path in (
    (ma_frame, 'Resources/MA_data.csv'),
    (dc_frame, 'Resources/DC_data.csv'),
    (ak_frame, 'Resources/AK_data.csv'),
    (ca_frame, 'Resources/CA_data.csv'),
):
    state_frame.to_csv(csv_path, index=False)