In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd, datetime
import numpy as np
import requests
import time
from scipy.stats import linregress
from ydata_profiling import ProfileReport
from pathlib import Path


# Import the OpenWeatherMap API key
from api_keys import airpollution_api_key 


# Import citipy to determine the cities based on latitude and longitude
from citipy import citipy

airpollution_api_key = airpollution_api_key or "YOUR_API_KEY"
print(airpollution_api_key)

cffa2b2ac0605c550af5b2dd12540b15


In [2]:
# Load the CSV file into a Pandas DataFrame
ntn_metadata = pd.read_csv("../Resources/cleaned-ntn-metadata.csv")
ntn_all = pd.read_csv("../Resources/cleaned-NTN-ALL-a-s-ueq.csv")

# Combine the data into a single dataset on the 'siteID' column
ntn_data = pd.merge(ntn_metadata, ntn_all, how="left", on="siteID")
ntn_data.head()



Unnamed: 0,network,siteID,siteName,status,startDate_x,stopDate,county,state,latitude,longitude,...,Cl,SO4,H,Conduc,svol,ppt,fullChemLab,daysSample,startDate_y,lastDate
0,NTN,AK01,Poker Creek,A,12/29/92,,Fairbanks North Star,AK,65.155,-147.491,...,1.15661,2.60375,5.984,3.488,17184.5,32.619,29.0,371.0,Dec 28 2021 10:50PM,Jan 3 2023 11:00PM
1,NTN,AK02,Juneau,A,6/22/04,,Juneau,AK,58.5139,-134.7843,...,5.67021,2.22881,4.656,3.124,140951.0,246.034,46.0,371.0,Dec 28 2021 11:19PM,Jan 3 2023 8:42PM
2,NTN,AK96,Toolik Field Station,A,10/12/17,,North Slope Borough,AK,68.6257,-149.6069,...,1.72081,2.87454,5.689,3.668,13546.8,31.318,35.0,371.0,Dec 28 2021 8:45PM,Jan 3 2023 6:40PM
3,NTN,AK97,Katmai National Park - King Salmon,A,11/2/09,,Bristol Bay,AK,58.6794,-156.6664,...,13.23049,5.43663,7.211,5.246,47153.99,87.417,42.0,372.0,Dec 28 2021 7:30PM,Jan 4 2023 7:54PM
4,NTN,AL10,Black Belt Research & Extension Center,A,8/31/83,,Dallas,AL,32.4583,-87.2422,...,9.5914,8.62362,5.495,5.877,92996.8,153.01,37.0,371.0,Dec 28 2021 3:00PM,Jan 3 2023 3:45PM


In [4]:
# review data types
ntn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162 entries, 0 to 161
Data columns (total 34 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   network      162 non-null    object 
 1   siteID       162 non-null    object 
 2   siteName     162 non-null    object 
 3   status       162 non-null    object 
 4   startDate_x  162 non-null    object 
 5   stopDate     2 non-null      object 
 6   county       162 non-null    object 
 7   state        162 non-null    object 
 8   latitude     162 non-null    float64
 9   longitude    162 non-null    float64
 10  elevation    161 non-null    float64
 11  stateName    162 non-null    object 
 12  siteClass    162 non-null    object 
 13  seas         160 non-null    object 
 14  yr           160 non-null    float64
 15  Criteria1    160 non-null    float64
 16  Criteria2    160 non-null    float64
 17  Criteria3    160 non-null    float64
 18  Ca           160 non-null    float64
 19  Mg      

In [5]:
# describe the data
ntn_data.describe()

Unnamed: 0,latitude,longitude,elevation,yr,Criteria1,Criteria2,Criteria3,Ca,Mg,K,...,NH4,NO3,Cl,SO4,H,Conduc,svol,ppt,fullChemLab,daysSample
count,162.0,162.0,161.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,...,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0
mean,40.144677,-91.536893,681.931677,2022.0,85.3625,98.84375,90.88125,10.373426,2.949839,0.724913,...,15.568128,9.067584,8.375543,7.092085,3.411206,6.14445,56569.451187,98.915944,36.675,370.0125
std,5.983882,17.318329,842.588401,0.0,5.780565,1.48598,6.118712,11.540159,3.932386,0.913519,...,10.478205,3.83286,16.13774,3.330336,2.165165,2.940796,26747.386863,40.547575,8.641388,2.550096
min,25.39,-156.6664,1.0,2022.0,59.0,90.0,62.0,-9.0,-9.0,-9.0,...,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,6493.1,13.513,11.0,357.0
25%,36.193275,-105.587075,174.0,2022.0,82.0,99.0,87.0,3.87195,1.295595,0.48583,...,8.37144,6.718145,1.80544,5.02003,1.9495,4.531,36510.575,75.04625,32.75,371.0
50%,40.33545,-87.04025,307.0,2022.0,85.0,99.0,92.0,6.9969,1.97424,0.652035,...,12.39084,8.93602,2.93384,6.62394,3.277,5.5635,58917.7,103.701,38.5,371.0
75%,43.4348,-78.547025,753.0,2022.0,89.25,100.0,95.25,12.2508,3.310965,0.89495,...,20.52666,11.617632,7.348705,9.050635,4.7915,6.8585,75072.975,125.96475,43.0,371.0
max,68.6257,-67.6308,3520.0,2022.0,100.0,100.0,100.0,90.0384,32.32818,2.65928,...,66.02904,21.09804,115.77384,16.4557,9.036,19.667,140951.0,246.034,51.0,372.0


In [6]:
# check for missing values
ntn_data.isnull().sum()

network          0
siteID           0
siteName         0
status           0
startDate_x      0
stopDate       160
county           0
state            0
latitude         0
longitude        0
elevation        1
stateName        0
siteClass        0
seas             2
yr               2
Criteria1        2
Criteria2        2
Criteria3        2
Ca               2
Mg               2
K                2
Na               2
NH4              2
NO3              2
Cl               2
SO4              2
H                2
Conduc           2
svol             2
ppt              2
fullChemLab      2
daysSample       2
startDate_y      2
lastDate         2
dtype: int64

In [7]:
# rename columns latitude and longitude to capitalize the first letter
ntn_data.rename(columns={"latitude": "Latitude", "longitude": "Longitude"}, inplace=True)
ntn_data.head()

Unnamed: 0,network,siteID,siteName,status,startDate_x,stopDate,county,state,Latitude,Longitude,...,Cl,SO4,H,Conduc,svol,ppt,fullChemLab,daysSample,startDate_y,lastDate
0,NTN,AK01,Poker Creek,A,12/29/92,,Fairbanks North Star,AK,65.155,-147.491,...,1.15661,2.60375,5.984,3.488,17184.5,32.619,29.0,371.0,Dec 28 2021 10:50PM,Jan 3 2023 11:00PM
1,NTN,AK02,Juneau,A,6/22/04,,Juneau,AK,58.5139,-134.7843,...,5.67021,2.22881,4.656,3.124,140951.0,246.034,46.0,371.0,Dec 28 2021 11:19PM,Jan 3 2023 8:42PM
2,NTN,AK96,Toolik Field Station,A,10/12/17,,North Slope Borough,AK,68.6257,-149.6069,...,1.72081,2.87454,5.689,3.668,13546.8,31.318,35.0,371.0,Dec 28 2021 8:45PM,Jan 3 2023 6:40PM
3,NTN,AK97,Katmai National Park - King Salmon,A,11/2/09,,Bristol Bay,AK,58.6794,-156.6664,...,13.23049,5.43663,7.211,5.246,47153.99,87.417,42.0,372.0,Dec 28 2021 7:30PM,Jan 4 2023 7:54PM
4,NTN,AL10,Black Belt Research & Extension Center,A,8/31/83,,Dallas,AL,32.4583,-87.2422,...,9.5914,8.62362,5.495,5.877,92996.8,153.01,37.0,371.0,Dec 28 2021 3:00PM,Jan 3 2023 3:45PM


In [8]:
# check for duplicates in the data
duplicates = ntn_data.duplicated(subset=["Latitude", "Longitude"])
duplicates.sum()

0

In [9]:
# Run the Profiling Report using ydata_profiling
# profile = ProfileReport(ntn_data,title = "Acid Rain Profile Report")
# profile.to_notebook_iframe()

In [10]:
# view the latitude and longitude datatypes to ensure they are numeric
ntn_data[["Latitude", "Longitude"]].dtypes

Latitude     float64
Longitude    float64
dtype: object

In [11]:
# pull in data from the OpenWeatherMap API
# Create an empty list to store the weather data
acid_rain_air_pollution = []
for i in ntn_data.index:
    # Get the latitude and longitude
    lat = ntn_data.loc[i, "Latitude"]
    lon = ntn_data.loc[i, "Longitude"]
    
    # Use the OpenWeatherMap API to get the weather data
    url = f"http://api.openweathermap.org/data/2.5/air_pollution?lat={lat}&lon={lon}&appid={airpollution_api_key}"
    acid_rain_ap = requests.get(url).json()
    
    
    # Append the weather data to the list
    coordinates = acid_rain_ap["coord"]
    components = acid_rain_ap['list'][0]["components"]
    acid_rain_air_pollution.append([coordinates['lat'], coordinates['lon'], components['co'], components['no'], components['no2'], components['o3'], components['so2'], components['pm2_5'], components['pm10'], components['nh3']])
    # # Print the city name and the index
    # print(f"Processing Record {i} | {ntn_data.loc[i, 'siteID']}")
    
    acid_rain_air_pollution


In [12]:


# Create an empty list to store the weather data
acid_rain_air_pollution = []

for i in ntn_data.index:
    # Get the latitude and longitude
    lat = ntn_data.loc[i, "Latitude"]
    lon = ntn_data.loc[i, "Longitude"]
    
    # Use the OpenWeatherMap API to get the weather data
    url = f"http://api.openweathermap.org/data/2.5/air_pollution?lat={lat}&lon={lon}&appid={airpollution_api_key}"
    try:
        acid_rain_ap = requests.get(url).json()
        
        # Check if response contains expected fields
        if "coord" in acid_rain_ap and "list" in acid_rain_ap:
            coordinates = acid_rain_ap["coord"]
            components = acid_rain_ap['list'][0]["components"]
            acid_rain_air_pollution.append([
                coordinates['lat'], coordinates['lon'],
                components['co'], components['no'],
                components['no2'], components['o3'],
                components['so2'], components['pm2_5'],
                components['pm10'], components['nh3']
            ])
        else:
            print(f"No data for index {i}")
    except Exception as e:
        print(f"Error fetching data for index {i}: {e}")

acid_rain_air_pollution


[[65.155, -147.491, 226.97, 0.48, 1.27, 94.41, 0.7, 0.5, 0.53, 0.03],
 [58.5139, -134.7843, 216.96, 0.21, 0.56, 88.69, 0.17, 0.5, 0.52, 0.73],
 [68.6257, -149.6069, 226.97, 0.03, 0.07, 95.84, 0.01, 0.5, 0.51, 0.01],
 [58.6794, -156.6664, 233.65, 0.03, 0.12, 103, 0.18, 0.5, 0.63, 0.01],
 [32.4583, -87.2422, 216.96, 0.02, 1.63, 91.55, 0.92, 5.52, 9.63, 0.88],
 [34.2886, -85.9699, 230.31, 0.02, 3.6, 70.81, 0.67, 7.48, 15.39, 13.93],
 [33.605, -92.0972, 210.29, 0, 1.23, 71.53, 0.39, 3.53, 7.39, 2.22],
 [34.1795, -93.0992, 208.62, 0.01, 1.93, 71.53, 1.13, 4.96, 10.42, 0.77],
 [36.0842, -92.5868, 206.95, 0, 0.75, 86.55, 0.28, 4.42, 9.96, 0.86],
 [36.1011, -94.1737, 203.61, 0.02, 2.1, 77.96, 0.54, 3.98, 10.08, 4.5],
 [36.0586, -112.184, 176.91, 0.01, 0.08, 108.72, 0.04, 0.6, 3.28, 0.67],
 [31.9492, -112.802, 208.62, 0.06, 0.39, 108.72, 0.06, 1.44, 7.06, 0.42],
 [34.8224, -109.8925, 183.58, 0.09, 0.58, 120.16, 0.75, 2.57, 17.51, 0.33],
 [32.0097, -109.3889, 191.93, 0.04, 0.25, 125.89, 0.15, 1.

In [13]:
# create a dataframe from the list of weather data
columns = ["Latitude", "Longitude", "CO", "NO", "NO2", "O3", "SO2", "PM2.5", "PM10", "NH3"]
rain_air_pollution_df = pd.DataFrame(acid_rain_air_pollution, columns=columns)   
rain_air_pollution_df.head()

Unnamed: 0,Latitude,Longitude,CO,NO,NO2,O3,SO2,PM2.5,PM10,NH3
0,65.155,-147.491,226.97,0.48,1.27,94.41,0.7,0.5,0.53,0.03
1,58.5139,-134.7843,216.96,0.21,0.56,88.69,0.17,0.5,0.52,0.73
2,68.6257,-149.6069,226.97,0.03,0.07,95.84,0.01,0.5,0.51,0.01
3,58.6794,-156.6664,233.65,0.03,0.12,103.0,0.18,0.5,0.63,0.01
4,32.4583,-87.2422,216.96,0.02,1.63,91.55,0.92,5.52,9.63,0.88


In [14]:
rain_air_pollution_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162 entries, 0 to 161
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Latitude   162 non-null    float64
 1   Longitude  162 non-null    float64
 2   CO         162 non-null    float64
 3   NO         162 non-null    float64
 4   NO2        162 non-null    float64
 5   O3         162 non-null    float64
 6   SO2        162 non-null    float64
 7   PM2.5      162 non-null    float64
 8   PM10       162 non-null    float64
 9   NH3        162 non-null    float64
dtypes: float64(10)
memory usage: 12.8 KB


In [15]:
# merge the acid rain data with the air pollution data
acid_rain_ap_df = pd.concat([ntn_data, rain_air_pollution_df], axis=1)
acid_rain_ap_df.head()

Unnamed: 0,network,siteID,siteName,status,startDate_x,stopDate,county,state,Latitude,Longitude,...,Latitude.1,Longitude.1,CO,NO,NO2,O3,SO2,PM2.5,PM10,NH3
0,NTN,AK01,Poker Creek,A,12/29/92,,Fairbanks North Star,AK,65.155,-147.491,...,65.155,-147.491,226.97,0.48,1.27,94.41,0.7,0.5,0.53,0.03
1,NTN,AK02,Juneau,A,6/22/04,,Juneau,AK,58.5139,-134.7843,...,58.5139,-134.7843,216.96,0.21,0.56,88.69,0.17,0.5,0.52,0.73
2,NTN,AK96,Toolik Field Station,A,10/12/17,,North Slope Borough,AK,68.6257,-149.6069,...,68.6257,-149.6069,226.97,0.03,0.07,95.84,0.01,0.5,0.51,0.01
3,NTN,AK97,Katmai National Park - King Salmon,A,11/2/09,,Bristol Bay,AK,58.6794,-156.6664,...,58.6794,-156.6664,233.65,0.03,0.12,103.0,0.18,0.5,0.63,0.01
4,NTN,AL10,Black Belt Research & Extension Center,A,8/31/83,,Dallas,AL,32.4583,-87.2422,...,32.4583,-87.2422,216.96,0.02,1.63,91.55,0.92,5.52,9.63,0.88


In [17]:
# save acid_rain_ap_df to a csv file
acid_rain_ap_df.to_csv("../acid_rain_cleaned_data/acid_rain_air_pollution.csv", index=False)