In [1]:
# !pip install geopy
# !pip install boto3
# !pip install requests
# !pip install mysql-connector-python
# !pip install arcgis
# !pip install -U SQLAlchemy

In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import geopy as gp
import os
from dotenv import load_dotenv
import json
import requests
import zipfile
import boto3
import botocore
from botocore.exceptions import NoCredentialsError, PartialCredentialsError
from io import StringIO, BytesIO
from sqlalchemy import create_engine
from zipfile import ZipFile, BadZipFile

In [3]:
from arcgis.geocoding import reverse_geocode
from arcgis.geometry import Geometry
from arcgis.gis import GIS
import pandas as pd

#method to get values of environment varaibles from .env file 
load_dotenv('.env')

GIS_URL = os.getenv("GIS_URL")
GIS_USER = os.getenv("GIS_USER")
GIS_PASSWORD = os.getenv("GIS_PASSWORD")

gis = GIS(GIS_URL, GIS_USER, GIS_PASSWORD)

def get_zip(df, lon_field, lat_field):
    location = reverse_geocode((Geometry({"x":float(df[lon_field]), "y":float(df[lat_field]), "spatialReference":{"wkid": 4326}})))
    return location['address']['Postal']

In [4]:
AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")

# Function to download a ZIP file from a GitHub repository
def download_zip_from_github(github_url):
    try:
        response = requests.get(github_url)
        response.raise_for_status()  # Check if the request was successful
        return response.content  # Return the ZIP content as bytes
    except requests.exceptions.RequestException as e:
        print(f"Error downloading ZIP from GitHub: {e}")
        return None
    
# Function to extract a ZIP file and return the list of extracted files
def extract_zip(zip_content):
    extract_to="temp_extracted_files"
    try:
        # Create a BytesIO object from the ZIP content
        zip_buffer = BytesIO(zip_content)

        # Create the extraction directory if it doesn't exist
        if not os.path.exists(extract_to):
            os.makedirs(extract_to)

        # Open and extract the ZIP file
        with zipfile.ZipFile(zip_buffer, "r") as zip_ref:
            zip_ref.extractall(extract_to)

        # List the extracted files
        return [os.path.join(extract_to, f) for f in os.listdir(extract_to)]
    
    except zipfile.BadZipFile:
        print("Error: The ZIP content is not a valid ZIP file.")
        return []
    except Exception as e:
        print(f"Error extracting ZIP file: {e}")
        return []



# Function to upload a folder to an S3 bucket
def upload_folder_to_s3(folder_path, bucket_name, s3_folder_name=None):
    # Initialize the S3 client
    s3 = boto3.client("s3",
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY)

    # Walk through the folder and upload each file
    for root, _, files in os.walk(folder_path):
        for filename in files:
            file_path = os.path.join(root, filename)

            # If a specific S3 folder name is provided, use it; otherwise, use the same structure
            if s3_folder_name:
                # Construct the object key (S3 path) relative to the specified folder
                object_key = os.path.join(s3_folder_name, os.path.relpath(file_path, folder_path))
            else:
                # Default object key using the folder structure from the given folder_path
                object_key = os.path.relpath(file_path, folder_path)

            # Normalize object_key for consistent S3 path format
            object_key = object_key.replace("\\", "/")

            try:
                # Upload the file to S3
                s3.upload_file(file_path, bucket_name, object_key)
                print(f"File '{file_path}' uploaded to '{bucket_name}/{object_key}'.")

            except NoCredentialsError:
                print("AWS credentials not found. Please configure your AWS credentials.")
            except PartialCredentialsError:
                print("Incomplete AWS credentials. Please check your configuration.")
            except Exception as e:
                print(f"Error uploading file '{file_path}' to S3: {e}")

# Validate the files uploaded in S3 bucket        
def print_bucket_content(bucketname):
    s3 = boto3.client("s3",
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY)
    response = s3.list_objects_v2(Bucket = bucketname)
    if 'Contents' in response:
        print('Contents of the {} bucket'.format(bucketname))
        for obj in response['Contents']:
            print(obj['Key'])
    else:
        print('bucket is empty')
        
# Function to read all files from an S3 bucket and convert to DataFrames
def read_s3_files_as_dataframes(bucket_name, s3_folder=None, file_extension=".csv"):
    s3 = boto3.client("s3",
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY)

    dataframes = []

    try:
        # List objects in the specified S3 bucket (and folder, if specified)
        if s3_folder:
            response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
        else:
            response = s3.list_objects_v2(Bucket=bucket_name)

        # Check if the bucket contains files
        if "Contents" not in response:
            print(f"No files found in the specified S3 '{bucket_name}'/'{s3_folder}'.")
            return dataframes

        # Iterate over the objects in the bucket
        for item in response["Contents"]:
            key = item["Key"]

            # Filter by file extension (if required)
            if key.endswith(file_extension):
                # Get the file content from S3
                file_obj = s3.get_object(Bucket=bucket_name, Key=key)

                # Read the content into a pandas DataFrame
                file_content = file_obj["Body"].read().decode("utf-8")
                dataframe = pd.read_csv(StringIO(file_content))

                # Store the DataFrame
                dataframes.append(dataframe)
                print(f"File '{key}' read and stored as DataFrame.")

    except NoCredentialsError:
        print("AWS credentials not found. Please configure your AWS credentials.")
    except PartialCredentialsError:
        print("Incomplete AWS credentials. Please check your configuration.")
    except Exception as e:
        print(f"Error reading from S3: {e}")

    return dataframes

# Air Quality Data

In [5]:
# GitHub repository URL
github_repo_url = "https://github.com/mimam786/InformationArchitecture/raw/main/Air_Quality_Data.zip"  # Update with repo URL

# Load Air Quality data folder into S3 bucket
folder_path = "Air_Quality_Data"  # Path to the folder with files to upload
bucket_name = "mission-green-bucket"  # Name of the S3 bucket

# Step 1: Download the ZIP file from GitHub
zip_content = download_zip_from_github(github_repo_url)

# Step 2: Extract the ZIP content and get the list of extracted files
if zip_content:
    extracted_files = extract_zip(zip_content)
    print(extracted_files)
    # Step 3: Upload the extracted files to the S3 bucket
    for i in range(len(extracted_files)):
        s3_folder_name = "Air_Quality_Data"  # Optional, to set a specific S3 folder structure
        folder_name = extracted_files[i].split('\\')[1]
        s3_folder_name = s3_folder_name + '\\' + folder_name
        upload_folder_to_s3(extracted_files[i], bucket_name, s3_folder_name)

['temp_extracted_files\\CO', 'temp_extracted_files\\NO2', 'temp_extracted_files\\Ozone', 'temp_extracted_files\\PM10', 'temp_extracted_files\\PM2.5', 'temp_extracted_files\\SO2']
File 'temp_extracted_files\CO\ad_viz_plotval_data_CO_2020.csv' uploaded to 'mission-green-bucket/Air_Quality_Data/CO/ad_viz_plotval_data_CO_2020.csv'.
File 'temp_extracted_files\CO\ad_viz_plotval_data_CO_2021.csv' uploaded to 'mission-green-bucket/Air_Quality_Data/CO/ad_viz_plotval_data_CO_2021.csv'.
File 'temp_extracted_files\CO\ad_viz_plotval_data_CO_2022.csv' uploaded to 'mission-green-bucket/Air_Quality_Data/CO/ad_viz_plotval_data_CO_2022.csv'.
File 'temp_extracted_files\CO\ad_viz_plotval_data_CO_2023.csv' uploaded to 'mission-green-bucket/Air_Quality_Data/CO/ad_viz_plotval_data_CO_2023.csv'.
File 'temp_extracted_files\NO2\ad_viz_plotval_data_NO2_2020.csv' uploaded to 'mission-green-bucket/Air_Quality_Data/NO2/ad_viz_plotval_data_NO2_2020.csv'.
File 'temp_extracted_files\NO2\ad_viz_plotval_data_NO2_2021.cs

In [6]:
# Validate the files stored in s3 bucket
print_bucket_content("mission-green-bucket")

Contents of the mission-green-bucket bucket
Air_Quality_Data/CO/ad_viz_plotval_data_CO_2020.csv
Air_Quality_Data/CO/ad_viz_plotval_data_CO_2021.csv
Air_Quality_Data/CO/ad_viz_plotval_data_CO_2022.csv
Air_Quality_Data/CO/ad_viz_plotval_data_CO_2023.csv
Air_Quality_Data/NO2/ad_viz_plotval_data_NO2_2020.csv
Air_Quality_Data/NO2/ad_viz_plotval_data_NO2_2021.csv
Air_Quality_Data/NO2/ad_viz_plotval_data_NO2_2022.csv
Air_Quality_Data/NO2/ad_viz_plotval_data_NO2_2023.csv
Air_Quality_Data/Ozone/ad_viz_plotval_data_Ozone_2020.csv
Air_Quality_Data/Ozone/ad_viz_plotval_data_Ozone_2021.csv
Air_Quality_Data/Ozone/ad_viz_plotval_data_Ozone_2022.csv
Air_Quality_Data/Ozone/ad_viz_plotval_data_Ozone_2023.csv
Air_Quality_Data/Ozone/ad_viz_plotval_data_Ozone_2024.csv
Air_Quality_Data/PM10/ad_viz_plotval_data_PM10_2020.csv
Air_Quality_Data/PM10/ad_viz_plotval_data_PM10_2021.csv
Air_Quality_Data/PM10/ad_viz_plotval_data_PM10_2022.csv
Air_Quality_Data/PM10/ad_viz_plotval_data_PM10_2023.csv
Air_Quality_Data/P

In [7]:
# Example S3 bucket information

def read_data_from_folder(bucket_name, s3_folder):
    # Read all files from the S3 bucket and store them as DataFrames
    dataframes = read_s3_files_as_dataframes(bucket_name, s3_folder)
    
    final_df = pd.DataFrame(columns=['Date', 'Source', 'Site ID', 'POC', 'Gases_Concentration',
       'UNITS', 'DAILY_AQI_VALUE', 'Site Name', 'DAILY_OBS_COUNT',
       'PERCENT_COMPLETE', 'AQS_PARAMETER_CODE', 'AQS_PARAMETER_DESC',
       'CBSA_CODE', 'CBSA_NAME', 'STATE_CODE', 'STATE', 'COUNTY_CODE',
       'COUNTY', 'SITE_LATITUDE', 'SITE_LONGITUDE'])
    for i, df in enumerate(dataframes):
        arr1 = final_df.to_numpy()
        arr2 = df.to_numpy()
        result = np.concatenate([arr1, arr2])
        final_df = pd.DataFrame(result)
    
    final_df.columns = ['Date', 'Source', 'Site ID', 'POC', 'Gases_Concentration',
       'UNITS', 'DAILY_AQI_VALUE', 'Site Name', 'DAILY_OBS_COUNT',
       'PERCENT_COMPLETE', 'AQS_PARAMETER_CODE', 'AQS_PARAMETER_DESC',
       'CBSA_CODE', 'CBSA_NAME', 'STATE_CODE', 'STATE', 'COUNTY_CODE',
       'COUNTY', 'SITE_LATITUDE', 'SITE_LONGITUDE']
    return final_df

In [8]:
df_CO = read_data_from_folder('mission-green-bucket', 'Air_Quality_Data/CO')
df_NO2 = read_data_from_folder('mission-green-bucket', 'Air_Quality_Data/NO2')
df_Ozone = read_data_from_folder('mission-green-bucket', 'Air_Quality_Data/Ozone')
df_PM10 = read_data_from_folder('mission-green-bucket', 'Air_Quality_Data/PM10')
df_PM2 = read_data_from_folder('mission-green-bucket', 'Air_Quality_Data/PM2.5')
df_SO2 = read_data_from_folder('mission-green-bucket', 'Air_Quality_Data/SO2')

File 'Air_Quality_Data/CO/ad_viz_plotval_data_CO_2020.csv' read and stored as DataFrame.
File 'Air_Quality_Data/CO/ad_viz_plotval_data_CO_2021.csv' read and stored as DataFrame.
File 'Air_Quality_Data/CO/ad_viz_plotval_data_CO_2022.csv' read and stored as DataFrame.
File 'Air_Quality_Data/CO/ad_viz_plotval_data_CO_2023.csv' read and stored as DataFrame.
File 'Air_Quality_Data/NO2/ad_viz_plotval_data_NO2_2020.csv' read and stored as DataFrame.
File 'Air_Quality_Data/NO2/ad_viz_plotval_data_NO2_2021.csv' read and stored as DataFrame.
File 'Air_Quality_Data/NO2/ad_viz_plotval_data_NO2_2022.csv' read and stored as DataFrame.
File 'Air_Quality_Data/NO2/ad_viz_plotval_data_NO2_2023.csv' read and stored as DataFrame.
File 'Air_Quality_Data/Ozone/ad_viz_plotval_data_Ozone_2020.csv' read and stored as DataFrame.
File 'Air_Quality_Data/Ozone/ad_viz_plotval_data_Ozone_2021.csv' read and stored as DataFrame.
File 'Air_Quality_Data/Ozone/ad_viz_plotval_data_Ozone_2022.csv' read and stored as DataFr

In [9]:
merged_df = pd.concat([df_CO, df_NO2, df_Ozone, df_PM10, df_PM2, df_SO2], axis = 0)
merged_df

Unnamed: 0,Date,Source,Site ID,POC,Gases_Concentration,UNITS,DAILY_AQI_VALUE,Site Name,DAILY_OBS_COUNT,PERCENT_COMPLETE,AQS_PARAMETER_CODE,AQS_PARAMETER_DESC,CBSA_CODE,CBSA_NAME,STATE_CODE,STATE,COUNTY_CODE,COUNTY,SITE_LATITUDE,SITE_LONGITUDE
0,01/01/2020,AQS,360050133,1,0.2,ppm,2,PFIZER LAB SITE,19,79.0,42101,Carbon monoxide,35620,"New York-Newark-Jersey City, NY-NJ-PA",36,New York,5,Bronx,40.8679,-73.87809
1,01/02/2020,AQS,360050133,1,0.5,ppm,6,PFIZER LAB SITE,24,100.0,42101,Carbon monoxide,35620,"New York-Newark-Jersey City, NY-NJ-PA",36,New York,5,Bronx,40.8679,-73.87809
2,01/03/2020,AQS,360050133,1,0.6,ppm,7,PFIZER LAB SITE,24,100.0,42101,Carbon monoxide,35620,"New York-Newark-Jersey City, NY-NJ-PA",36,New York,5,Bronx,40.8679,-73.87809
3,01/04/2020,AQS,360050133,1,0.7,ppm,8,PFIZER LAB SITE,24,100.0,42101,Carbon monoxide,35620,"New York-Newark-Jersey City, NY-NJ-PA",36,New York,5,Bronx,40.8679,-73.87809
4,01/05/2020,AQS,360050133,1,0.3,ppm,3,PFIZER LAB SITE,24,100.0,42101,Carbon monoxide,35620,"New York-Newark-Jersey City, NY-NJ-PA",36,New York,5,Bronx,40.8679,-73.87809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21248,12/27/2023,AQS,361010003,2,0.0,ppb,0,PINNACLE STATE PARK,22,92.0,42401,Sulfur dioxide,18500.0,"Corning, NY",36,New York,101,Steuben,42.09142,-77.20978
21249,12/28/2023,AQS,361010003,2,0.0,ppb,0,PINNACLE STATE PARK,22,92.0,42401,Sulfur dioxide,18500.0,"Corning, NY",36,New York,101,Steuben,42.09142,-77.20978
21250,12/29/2023,AQS,361010003,2,0.0,ppb,0,PINNACLE STATE PARK,24,100.0,42401,Sulfur dioxide,18500.0,"Corning, NY",36,New York,101,Steuben,42.09142,-77.20978
21251,12/30/2023,AQS,361010003,2,0.0,ppb,0,PINNACLE STATE PARK,24,100.0,42401,Sulfur dioxide,18500.0,"Corning, NY",36,New York,101,Steuben,42.09142,-77.20978


In [10]:
coordinates = merged_df[['SITE_LATITUDE','SITE_LONGITUDE']]
coordinates

Unnamed: 0,SITE_LATITUDE,SITE_LONGITUDE
0,40.8679,-73.87809
1,40.8679,-73.87809
2,40.8679,-73.87809
3,40.8679,-73.87809
4,40.8679,-73.87809
...,...,...
21248,42.09142,-77.20978
21249,42.09142,-77.20978
21250,42.09142,-77.20978
21251,42.09142,-77.20978


In [11]:
# Remove duplicate combinations of latitude and longitude
unique_coordinates = coordinates.drop_duplicates()

# Display the DataFrame with unique combinations
print(unique_coordinates)

      SITE_LATITUDE SITE_LONGITUDE
0           40.8679      -73.87809
356       42.876907     -78.809526
694       42.921107     -78.765825
1053      43.145013      -77.55728
1399       43.14618      -77.54817
1765       40.81976      -73.94825
2130       40.73614      -73.82153
2475      40.739264     -73.817694
2800       42.09142      -77.20978
6344      40.961017     -73.139046
0            40.816        -73.902
0          42.68075      -73.75733
1018       42.49963      -79.31881
1379       41.78555      -73.74136
1708       42.99328      -78.77153
2041       44.36608      -73.90312
2362       44.39308       -73.8589
2683        43.9731       -74.2232
3029       43.44957      -74.51625
3383       44.08747      -75.97316
4455       43.22386      -78.47888
4811       43.05235      -76.05921
5165       41.52375      -74.21534
5487       43.28428      -76.46324
5842       41.45589      -73.70977
6494       40.58027      -74.19832
6741       41.18208      -74.02819
7100       43.01209 

In [12]:
geolocator = gp.Nominatim(user_agent="zipcode_finder")

def get_zipcode(latitude, longitude):  # Provide a valid user agent
    location = geolocator.reverse((latitude, longitude))
    address = location.raw['address']
    zipcode = address.get('postcode')
    if zipcode:
        return zipcode
    else:
        return '12852'


In [13]:
unique_coordinates['zipcode'] = unique_coordinates.apply(get_zip, axis=1, lat_field='SITE_LATITUDE', lon_field='SITE_LONGITUDE')

In [14]:
unique_coordinates

Unnamed: 0,SITE_LATITUDE,SITE_LONGITUDE,zipcode
0,40.8679,-73.87809,10458.0
356,42.876907,-78.809526,14206.0
694,42.921107,-78.765825,14225.0
1053,43.145013,-77.55728,14610.0
1399,43.14618,-77.54817,14610.0
1765,40.81976,-73.94825,10031.0
2130,40.73614,-73.82153,11367.0
2475,40.739264,-73.817694,11367.0
2800,42.09142,-77.20978,14801.0
6344,40.961017,-73.139046,11733.0


In [15]:
merged_df = pd.merge(merged_df, unique_coordinates, on=['SITE_LATITUDE', 'SITE_LONGITUDE'],how='inner')
merged_df

Unnamed: 0,Date,Source,Site ID,POC,Gases_Concentration,UNITS,DAILY_AQI_VALUE,Site Name,DAILY_OBS_COUNT,PERCENT_COMPLETE,...,AQS_PARAMETER_DESC,CBSA_CODE,CBSA_NAME,STATE_CODE,STATE,COUNTY_CODE,COUNTY,SITE_LATITUDE,SITE_LONGITUDE,zipcode
0,01/01/2020,AQS,360050133,1,0.2,ppm,2,PFIZER LAB SITE,19,79.0,...,Carbon monoxide,35620,"New York-Newark-Jersey City, NY-NJ-PA",36,New York,5,Bronx,40.8679,-73.87809,10458
1,01/02/2020,AQS,360050133,1,0.5,ppm,6,PFIZER LAB SITE,24,100.0,...,Carbon monoxide,35620,"New York-Newark-Jersey City, NY-NJ-PA",36,New York,5,Bronx,40.8679,-73.87809,10458
2,01/03/2020,AQS,360050133,1,0.6,ppm,7,PFIZER LAB SITE,24,100.0,...,Carbon monoxide,35620,"New York-Newark-Jersey City, NY-NJ-PA",36,New York,5,Bronx,40.8679,-73.87809,10458
3,01/04/2020,AQS,360050133,1,0.7,ppm,8,PFIZER LAB SITE,24,100.0,...,Carbon monoxide,35620,"New York-Newark-Jersey City, NY-NJ-PA",36,New York,5,Bronx,40.8679,-73.87809,10458
4,01/05/2020,AQS,360050133,1,0.3,ppm,3,PFIZER LAB SITE,24,100.0,...,Carbon monoxide,35620,"New York-Newark-Jersey City, NY-NJ-PA",36,New York,5,Bronx,40.8679,-73.87809,10458
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141671,12/27/2023,AQS,360890005,1,1.7,ppb,1,Alcoa East,23,96.0,...,Sulfur dioxide,36300.0,"Ogdensburg-Massena, NY",36,New York,89,St. Lawrence,44.965412,-74.875,13662
141672,12/28/2023,AQS,360890005,1,2.3,ppb,3,Alcoa East,24,100.0,...,Sulfur dioxide,36300.0,"Ogdensburg-Massena, NY",36,New York,89,St. Lawrence,44.965412,-74.875,13662
141673,12/29/2023,AQS,360890005,1,1.9,ppb,1,Alcoa East,23,96.0,...,Sulfur dioxide,36300.0,"Ogdensburg-Massena, NY",36,New York,89,St. Lawrence,44.965412,-74.875,13662
141674,12/30/2023,AQS,360890005,1,1.0,ppb,1,Alcoa East,24,100.0,...,Sulfur dioxide,36300.0,"Ogdensburg-Massena, NY",36,New York,89,St. Lawrence,44.965412,-74.875,13662


In [16]:
from datetime import datetime
merged_df['Date'] = pd.to_datetime(merged_df['Date'])

In [17]:
merged_df['Date']

0        2020-01-01
1        2020-01-02
2        2020-01-03
3        2020-01-04
4        2020-01-05
            ...    
141671   2023-12-27
141672   2023-12-28
141673   2023-12-29
141674   2023-12-30
141675   2023-12-31
Name: Date, Length: 141676, dtype: datetime64[ns]

# Store the Air data into RDS

In [18]:
# Connection details for RDS
db_type = "mysql"
db_user = ""
db_password = ""
db_host = "database-1.cnq4kwysqjdb.us-east-1.rds.amazonaws.com"  # RDS endpoint
db_port = 3306 # Default port
db_name = "mission_green"

# Create a database engine using SQLAlchemy
connection_string = f"{db_type}://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
engine = create_engine(connection_string)

# Write DataFrame to the RDS table
table_name1 = "Air_Quality_Data"  # Name of the RDS table to write to

# Write DataFrame to the SQL database
merged_df.to_sql(table_name1, engine, if_exists="replace", index=False)  # index=False to avoid creating index column

print(f"DataFrame written to {table_name1} in RDS database.")

DataFrame written to Air_Quality_Data in RDS database.
