In [17]:
import os
import json
import time
import uuid
import boto3
import psycopg2
import inquirer
import pandas as pd
import urllib.request
from selenium import webdriver
from collections import defaultdict
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class AURNScraper:
    """ 
    This class will navigate the website to collect air quality monitoring site information 
    an individual or all sites. For a single site the information will be stored as a dictionary
    and site images can be downloaded. For all sites the information will be saved as a .pkl
    file. From this database sites can be selected by distance from user specified coordinates
    to retrieve the air quality monitoring data for a specified year. Below is a step by step 
    example.

    Example
    -------
    Example 1 - Single Site: 
        AURN = AURNScraper()
        AURN.single_site_info(site_name='Port Talbot Margam', download_imgs=True)
    Example 2 - Multiple Sites:
        Step 1 - Retrieve site information for all AURN sites to .pkl file:
            AURN = AURNScraper()
            AURN.all_sites_info()
        Step 2 - Find sites within specified distance from specified coordinates:
            sites_for_download = AURN.find_sites_by_distance(X=394366, Y=807397, 
                                distance_m=50000) 
        Step 3 - Download monitoring data for chosen sites:
            AURN.download_monitoring_data(sites_for_download)

    Parameters
    ----------
    url : str
        URL address to the AURN website 
    """
    def __init__(self, url: str ='https://uk-air.defra.gov.uk/interactive-map'):
        self.url = url
        current_dir = os.getcwd()
        self.new_dir = os.path.join(current_dir, r'monitoring_files')
        if not os.path.exists(self.new_dir):
            os.makedirs(self.new_dir)
        chromeOptions = webdriver.ChromeOptions()
        prefs = {"download.default_directory" : self.new_dir}
        chromeOptions.add_experimental_option("prefs",prefs)
        chromeOptions.add_argument('--headless') # UNHASH WHEN COMPLETE
        chromeOptions.add_argument('--disable-gpu') # UNHASH WHEN COMPLETE
        self.driver = webdriver.Chrome(options=chromeOptions)
        self.driver.get(url)

# accept cookies
    def _accept_cookies(self):
        """ 
        This is a private method which accepts cookies when the webpage is initiated.
        """
        WebDriverWait(self.driver, 5).until(EC.presence_of_element_located((By.ID, 'global-cookie-message')))
        cookie_window = self.driver.find_element_by_xpath("//div[@id='global-cookie-message']")
        cookie_window.find_element_by_xpath(".//button[@name='submit']").click()

# find specified site
    def single_site_info(self, site_name: str, download_imgs:bool = False) -> dict:
        """ 
        Returns site information (Environment type, X and Y Coordinates, Location and URL link) 
        for a specified single site.

        Parameters
        ----------
        site_name : str
            site name as per the AURN website
        
        download_imgs : bool
            Choose if you want to download pictures of this site. If set to True, images will be 
            downloaded to current directory. The default value is False.

        Returns
        -------
        Dictionary
            {'Site': f'{site_name}', 'Site Info: ': {'Env_Type': '[env type]', 'X_and_Y': '[coordinates]', 
            'Location': '[location]', 'Web Link': f'{site_info_link}'}}
        """
        site_info_dict = {'Name': [], 'Environment Type': [], 'Coordinates':[], 'Address':[], 'Web Link': [], 'Image Names':[]}
        self._accept_cookies()
        api_df = self._dataframe_API()
        my_site = api_df.loc[api_df['site_name'] == site_name, 'site info link'].iloc[0]
        site_info = self._retrieve_site_info(site_name, my_site, download_imgs)
        site_info_dict['Name'].append(site_name)
        site_info_dict['Environment Type'].append(site_info[0])
        site_info_dict['Coordinates'].append(site_info[1])
        site_info_dict['Address'].append(site_info[2])
        site_info_dict['Web Link'].append(site_info[3])
        site_info_dict['Image Names'].append(self._check_for_image_download(site_name))
        print(site_info_dict)
        return site_info_dict
    
    def _retrieve_site_info(self, site_name, this_site, retrieve_img=False):
        """ 
        This is a private method which collates the site information for each site.
        """

        self.driver.get(this_site)
        #retrieve data in dictionary: Site Name, Location, Environment Type, eastings, northings, pollutants measured?
        WebDriverWait(self.driver, 5).until(EC.presence_of_element_located((By.ID, 'tab_info')))
        tab_info = self.driver.find_element_by_xpath("//div[@id='tab_info']")
        print(tab_info)
        my_tags = tab_info.find_elements_by_tag_name('p')
        for info in my_tags:
            if 'Environment Type' in info.text:
                env_type = info.text.split(': ')[1]
            elif 'Easting/Northing' in info.text:
                try:
                    X_co = int(info.text.split(': ')[1].split(', ')[0])
                    Y_co = int(info.text.split(': ')[1].split(', ')[1])
                    site_xy = [X_co, Y_co]
                except:
                    site_xy = info.text.split(': ')[1]
            elif 'Site Address' in info.text:
                site_address = info.text.split(': ')[1]
        if retrieve_img == True:
            self._retrieve_images(site_name)
        #print(env_type, site_xy, site_address)
        return [env_type, site_xy, site_address, this_site]
    
    def _retrieve_images(self, site_name, folder_name='image_files'):
        """
        This is a private method which downloads images for a single site if the
        user specifies to do this.
        """
        current_dir = os.getcwd()
        new_dir = os.path.join(current_dir, r'{f}'.format(f = folder_name))
        if not os.path.exists(new_dir):
            os.makedirs(new_dir)
        site_photos = self.driver.find_element_by_xpath("//div[@class='carousel-inner']")
        all_photos = site_photos.find_elements_by_xpath("./div[@class='item']/*")
        filenumber = 0
        for link in all_photos:
            filename = f"{site_name}{filenumber}"
            if os.path.exists(f'{new_dir}/{filename}.jpg')==True: # Do not download if image already exists
                print(f"{filename} already exists. Image not replaced.")
            else:
                src = link.get_attribute('src')
                urllib.request.urlretrieve(src,f"{new_dir}/{filename}.jpg")
                filenumber += 1
        print ("Number of images downloaded: ", filenumber)
        return "Number of images downloaded: ", filenumber

    def all_sites_info(self) -> dict:
        """ 
        Returns site information (Environment type, X and Y Coordinates, Location and URL link) 
        for all sites on the AURN website.

        Parameters
        ----------
        N/A

        Returns
        -------
        Dictionary
            {'Site': f'{site_name}', 'Site Info: ': {'Env_Type': '[env type]', 'X_and_Y': '[coordinates]', 
            'Location': '[location]', 'Web Link': f'{site_info_link}'}}
            
            All_Sites_Output.pkl: Output file of all sites
        """
        try: # If All_Sites_Ouput.pkl already exists ask user if they want to overwrite it
            if os.path.isfile('All_Sites_Outputs.pkl') == True:
                question = {inquirer.Confirm('confirmed',
                    message="It looks like an output file with all the sites in has already been created. Do you want to overwrite this?",
                    default=True),}
                ans = inquirer.prompt(question)
            if ans['confirmed']==False:
                print("Ok. Method has ended.")
                return
        except: # Above question won't work unless file being called is a .py file
            pass
        self._accept_cookies()
        site_info_dict = {'UUID': [],'Name': [], 'Environment Type': [], 'Coordinates':[], 'Address':[], 'Web Link': [], 'Image Names':[]}
        api_df = self._dataframe_API()
        for i in range(len(api_df)):
            try:
                name_of = api_df['site_name'][i]
                print(type(name_of), name_of)
                site_info_dict['UUID'].append(uuid.uuid4())
                site_info_dict['Name'].append(name_of)
                api_site_link = api_df['site info link'][i]
                print(api_site_link)
                site_info = self._retrieve_site_info(name_of, api_site_link)
                print(site_info)
                site_info_dict['Environment Type'].append(site_info[0])
                site_info_dict['Coordinates'].append(site_info[1])
                site_info_dict['Address'].append(site_info[2])
                site_info_dict['Web Link'].append(site_info[3])
                site_info_dict['Image Names'].append(self._check_for_image_download(name_of))
                print(site_info_dict)
            except IndexError:
                print("Index Error", i)
                print (site_info_dict)
                pass
            except Exception as E:
                print("Error",E , i)
                pass
            finally:
                site_info_df = pd.DataFrame.from_dict(site_info_dict)
        self.driver.quit()    
        site_info_df = pd.DataFrame.from_dict(site_info_dict)
        site_info_df.to_pickle("All_Sites_Output.pkl")
        return site_info_df
        
    # find all sites within x distance
    def find_sites_by_distance(self, X: float, Y: float, distance_m: int) -> pd.DataFrame:
        """ 
        This method will find all the sites within a specified distance of specified
        X and Y coordinates.

        Example
        -------
        sites_for_download = AURN.find_sites_by_distance(X=394366, Y=807397, distance_m=50000)  

        Parameters
        ----------
        X : float
            The X axis coordinate of your specified point.

        Y : float
            The Y axis coordinate of your specified point.

        distance_m : int
            The distance from your specified point from which site information will be obtained. 

        Returns
        -------
        df : pd.DataFrame
            A dataframe of all the sites within the specified distance of the specified 
            coordinates.
        """
        try:
            all_sites_file = pd.read_pickle(r'All_Sites_Outputs.pkl')
        except:
            print("All_Sites_Ouputs.pkl not in current directory. Move this file to current directory or run 'all_sites_info' method to retrieve data")
            return
        if 'X' not in all_sites_file and 'Y' not in all_sites_file:
            all_sites_file.insert(2, 'X', 0)
            all_sites_file.insert(3, 'Y', 0)
            all_sites_file.insert(5, 'distance from point', 0)

        all_sites_file['X'] = all_sites_file['Coordinates'].str[0]
        all_sites_file['Y'] = all_sites_file['Coordinates'].str[1]
        ans = ((((X-all_sites_file['X'])**2)+((Y-all_sites_file['Y'])**2))**0.5)
        all_sites_file['distance from point'] = ans
        df = all_sites_file[all_sites_file['distance from point']< distance_m]
        print(f"Below are all sites within {distance_m / 1000}km of specified points")
        print(df)
        return df
    
    #download data
    def download_monitoring_data(self, dataframe: pd.DataFrame, year: int):
        """ 
        This will download All Hourly Pollutant Data for a specified year 
        from the AURN website for all sites in a specified dataframe.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Dataframe of sites you wish to download data for. Format of this parameter
            should match the output of the 'find_sites_by_distance' method.

        year : int
            The year you wish to download data for. 

        Returns
        -------
        download_report : dict
            Dictionary of how many files were successfully downloaded and which (if any)
            sites were unsuccessfully downloaded.
            
        Downloaded Data : .csv files
            The downloaded data will appear in the current directory as individual csv 
            files.
        """
        self._accept_cookies()
        download_report = {'Successful Downloads Count': 0, 'Unsuccessful Download list': []}
        for index, row in dataframe.iterrows():
            self.driver.get(row['Web Link'])
            WebDriverWait(self.driver, 5).until(EC.presence_of_element_located((By.XPATH, "//div[@class='scrtabs-tab-scroll-arrow scrtabs-tab-scroll-arrow-left']")))
            path = self.driver.find_element_by_xpath("//ul[@class='nav nav-tabs nav-tabs-responsive']")
            networks = path.find_element_by_xpath("./li[@id='li_tab_networks']")
            tag_a = networks.find_element_by_tag_name("a")
            self.driver.execute_script("arguments[0].click();", tag_a)
            tab_networks = path.find_element_by_xpath("//div[@id='tab_networks']")
            formatted_data = tab_networks.find_element_by_link_text('Pre-Formatted Data Files')
            self.driver.execute_script("arguments[0].click();", formatted_data)
            table = self.driver.find_elements_by_xpath("//div[@class='table-responsive']/*")
            element = table[0]
            all_years = element.find_elements_by_tag_name('a')
            downloads_before_loop = download_report['Successful Downloads Count']
            for AURN_year, loop_no in zip(all_years, range(len(all_years))):
                if AURN_year.text == str(year):
                    linkname = AURN_year.get_attribute('href')
                    split_list = linkname.split('/')
                    name = split_list[len(split_list)-1].split('?')[0]
                    if os.path.exists(f'{self.new_dir}/{name}') == True:
                        print(f'{name} monitoring file already exists. File not replaced.')
                    else:
                        self.driver.execute_script("arguments[0].click();", AURN_year)
                        download_report['Successful Downloads Count'] += 1
                if (loop_no+1) == len(all_years):
                    if downloads_before_loop == download_report['Successful Downloads Count']:
                        download_report['Unsuccessful Download list'].append(row['Name'])
        print(download_report)
        return download_report
    
    # convert pkl records to json file
    def pkl_to_json(self, folder_name : str='json_files'):
        ''' 
        This will convert individual records in the All_Sites_Outputs.pkl file into json records
        and save these in the "json files" directory or create this directory if it doesn't 
        already exist.

        Parameters
        ----------
        folder_name : str
            Name of folder you wish to save json files in. The default is "json files"
        
        Returns
        -------
        json files : .json
            Individual records from All_Sites_Outputs.pkl as .json.       
        '''
        current_dir = os.getcwd()
        new_dir = os.path.join(current_dir, r'{f}'.format(f = folder_name))
        if not os.path.exists(new_dir):
            os.makedirs(new_dir)
        
        output_file = pd.read_pickle(r'All_Sites_Outputs.pkl')
        for index, row in output_file.iterrows():
            site_name = output_file.iloc[index]['Name']
            output_file.iloc[index].to_json(r'{nd}/{si}.json'.format(nd = new_dir, si = site_name))
        print(f'individual record of output file have been converted to json files in {new_dir}')
    
    def upload_directory_to_s3(self, folder : str, bucketname : str='airqualitywebscraperbucket'):
        ''' 
        This method will upload the contents of your chosen folder to an AWS S3 bucket.
        Chosen folder is likely to be either "image_files", "json_files", "monitoring_files".

        Parameters
        ----------
        folder : str
            The name of the folder from which you wish to upload the contents to AWS S3 bucket.

        Returns
        -------
        Folder contents inputted to AWS S3 bucket. 

        '''
        current_dir = os.getcwd()
        path = os.path.join(current_dir,folder)
        s3_client = boto3.client('s3')
        for root,dirs,files in os.walk(path):
            for file in files:
                s3_client.upload_file(os.path.join(root,file),bucketname,file)
        return
    
    def _check_for_image_download(self, site_name : str) -> list:
        current_dir = os.getcwd()
        image_dir = os.path.join(current_dir, r'{f}'.format(f = 'image_files'))    
        contin = True
        val = 0
        image_name_list = []
        while contin == True:
            if os.path.exists(f'{image_dir}/{site_name}{val}.jpg'):
                image_name_list.append(f'{site_name}{val}')
                val += 1
            else:
                contin = False
        if len(image_name_list) == 0:
            image_name_list.append("No Downloaded Images")
        return image_name_list

    def _check_site_in_RDS(self, site_name):
        connection = psycopg2.connect(user='postgres',
                                password='mysecretpassword',
                                host='airqualityscraper.clbqzprnzcak.eu-west-2.rds.amazonaws.com',
                                port=5432,
                                # server='scraper_data',
                                database='postgres')
        cursor = connection.cursor()
        postgreSQL_select_Query = f"""Select "Name" FROM aq_data WHERE "Name" IN ('{site_name}');"""
        cursor.execute(postgreSQL_select_Query)
        my_val = cursor.fetchall()
        if site_name in my_val[0]:
            return True
        else:
            return False
    
    # def _dataframe_API(self, API_loc="AURN_API.json"):
    #     json_file_path = API_loc
    #     with open(json_file_path, 'r') as j:
    #         contents = json.loads(j.read())
    #     df = pd.DataFrame(contents['aurn'])
    #     site_info_link = 'https://uk-air.defra.gov.uk/networks/site-info?uka_id='
    #     df['site info link'] = site_info_link + df['uka_id']
    #     return df
    def _dataframe_API(self,API_loc="AURN_API.json"):
        '''
        This is a private method that converts the json API with the
        link to all AURN sites into a usable dataframe.
        '''
        json_file_path = API_loc
        with open(json_file_path, 'r') as j:
            contents = json.loads(j.read())
        my_list = []
        for i in contents.keys():
            #print(f"key: {i}", len(contents[i]))
            df = pd.DataFrame(contents[i])
            #df.replace("", numpy.NaN, inplace=True)
            my_list.append(df)
        df = pd.concat(my_list, axis=0)
        df.drop(columns=['exception','parameter_ids',
                'network_name', 'network_id','site_status',
                'overall_index','environment_id','country_id']
                ,axis=0, inplace=True)
        site_info_link = 'https://uk-air.defra.gov.uk/networks/site-info?uka_id='
        df['site info link'] = site_info_link + df['uka_id']
        df = df[['site_name', 'site info link']]
        return df

In [18]:
AURN = AURNScraper()
#AURN._dataframe_API()
#AURN.single_site_info('Sheffield Barnsley Road',download_imgs=False)
AURN.all_sites_info()

<class 'pandas.core.series.Series'> 0                       Aberdeen Erroll Park
0    Anglesey Penhesgyn 3 (Isle of Anglesey)
0                           Auchencorth Moss
0                            Barnsley Gawber
0                           Auchencorth Moss
0                           Auchencorth Moss
0                           Auchencorth Moss
0                           Auchencorth Moss
0                           Auchencorth Moss
0                                    Belfast
0                                  Goonhilly
0                           Auchencorth Moss
0                                  Goonhilly
0                   Ainsdale Dunes and Sands
0                           Auchencorth Moss
0             Aberdeen Union Street Roadside
Name: site_name, dtype: object
0    https://uk-air.defra.gov.uk/networks/site-info...
0    https://uk-air.defra.gov.uk/networks/site-info...
0    https://uk-air.defra.gov.uk/networks/site-info...
0    https://uk-air.defra.gov.uk/networks/site-i

ValueError: All arrays must be of the same length

In [6]:
print(my_site)

{'Name': ['Port Talbot Margam'], 'Environment Type': ['Urban Industrial'], 'Coordinates': [[277406, 188719]], 'Address': ['Port Talbot'], 'Web Link': ['https://uk-air.defra.gov.uk/networks/site-info?uka_id=UKA00501']}


In [13]:
print(type(my_site['Coordinates'][0][0]))

<class 'int'>


In [5]:
AURN = AURNScraper('https://uk-air.defra.gov.uk/interactive-map')
AURNScraper.single_site_info.__doc__

" \n        Returns site information (Environment type, X and Y Coordinates, Location and URL link) \n        for a specified single site.\n\n        Parameters\n        ----------\n        site_name : str\n            site name as per the AURN website\n\n        Returns\n        -------\n        Dictionary\n            {'Site': f'{site_name}', 'Site Info: ': {'Env_Type': '[env type]', 'X_and_Y': '[coordinates]', \n            'Location': '[location]', 'Web Link': f'{site_info_link}'}}\n        "

In [39]:
AURN = AURNScraper('https://uk-air.defra.gov.uk/interactive-map')
my_df = AURN.all_sites_info()

In [3]:
AURN = AURNScraper()
sites_for_download = AURN.find_sites_by_distance(X=394366, Y=807397, distance_m=50000)

Below are all sites within 50.0km of specified points
                              Name  Environment Type       X       Y  \
0             Aberdeen Erroll Park  Urban Background  394366  807397   
19  Aberdeen Union Street Roadside     Urban Traffic  393656  805968   
20        Aberdeen Wellington Road     Urban Traffic  394397  804779   

         Coordinates  distance from point        Address  \
0   [394366, 807397]             0.000000  Not available   
19  [393656, 805968]          1595.663185       Aberdeen   
20  [394397, 804779]          2618.183531       Aberdeen   

                                             Web Link  
0   https://uk-air.defra.gov.uk/networks/site-info...  
19  https://uk-air.defra.gov.uk/networks/site-info...  
20  https://uk-air.defra.gov.uk/networks/site-info...  


In [6]:
for index, row in sites_for_download.iterrows():
    print(row['Coordinates'])

[394366, 807397]
[393656, 805968]
[394397, 804779]


In [9]:
print(sites_for_download.iloc[0]['Coordinates'])

[394366, 807397]


In [13]:
AURN = AURNScraper('https://uk-air.defra.gov.uk/interactive-map')
AURN.download_monitoring_data(sites_for_download)

{'Sucessful Downloads Count': 2,
 'Unsuccessful Download list': ['Aberdeen Erroll Park']}

In [23]:
import pandas as pd
object = pd.read_pickle(r'All_Sites_Outputs.pkl')
object.head()

Unnamed: 0,Name,Environment Type,Coordinates,Address,Web Link
0,Aberdeen Erroll Park,Urban Background,"[394366, 807397]",Not available,https://uk-air.defra.gov.uk/networks/site-info...
1,Aston Hill,Rural Background,"[329899, 290053]",Aston Hill,https://uk-air.defra.gov.uk/networks/site-info...
2,Auchencorth Moss,Rural Background,"[322166, 656128]",Auchencorth,https://uk-air.defra.gov.uk/networks/site-info...
3,Blackpool Marton,Urban Background,"[333768, 434759]",Blackpool,https://uk-air.defra.gov.uk/networks/site-info...
4,Bush Estate,Rural Background,"[324629, 663891]",Bush Estate,https://uk-air.defra.gov.uk/networks/site-info...


In [7]:
import os
import inquirer
try:
    if os.path.isfile('All_Sites_Outputs.pkl') == True:
        question = {inquirer.Confirm('confirmed',
                                    message="It looks like an output file with all the sites in has already been created. Do you want to overwrite this?",
                                    default=True),}
        ans = inquirer.prompt(question)
    if ans['confirmed']==False:
        quit()
    print("Good job!")
except:
    print("just run")
    pass
print("and keep running")





[?] It looks like an output file with all the sites in has already been cre...: 
just run
and keep running


In [34]:
# Download images of site
import urllib.request
url = 'https://uk-air.defra.gov.uk/interactive-map'
driver = webdriver.Chrome()
driver.get(url)

WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//div[@style='position: absolute; left: 0px; top: 0px; z-index: 106; width: 100%;']")))
my_map = driver.find_elements_by_xpath("//div[@style='position: absolute; left: 0px; top: 0px; z-index: 106; width: 100%;']/*")
site_info_dict = {'name': [], 'Environment Type': [], 'Coordinates':[], 'Address':[], 'Web Link': []}
print(len(my_map))
link = my_map[0]
driver.execute_script("arguments[0].click();", link)
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, 'popupContent')))
popup = driver.find_element_by_xpath("//div[@id='popupContent']")
my_tags = popup.find_elements_by_tag_name('a')
for link in my_tags:
    if link.text == 'Site Information':
        site_info_link = link.get_attribute('href')
        #print(site_info_link)
driver.get(site_info_link)

#### images
site_photos = driver.find_element_by_xpath("//div[@class='carousel-inner']")
all_photos = site_photos.find_elements_by_xpath("./div[@class='item']/*")
print(len(all_photos))
filename = 1
for link in all_photos:
    src = link.get_attribute('src')
    print(src)
    urllib.request.urlretrieve(src,f"{filename}.jpg")
    filename += 1
driver.quit()


171
4
https://uk-air.defra.gov.uk/assets/site-photos/BPLE_n.jpg
https://uk-air.defra.gov.uk/assets/site-photos/BPLE_e.jpg
https://uk-air.defra.gov.uk/assets/site-photos/BPLE_s.jpg
https://uk-air.defra.gov.uk/assets/site-photos/BPLE_w.jpg


In [None]:
# Memoization of dictionary atrtibutes

def download_all_sites_memo(memo=None):
    if memo is None:
        memo = {'Name': [], 'Environment Type': [], 'Coordinates':[], 'Address':[], 'Web Link': []}
    
    link = my_map2[i]
    self.driver.execute_script("arguments[0].click();", link) # This is the updated link.click() to overcome spatial error on map with click
    name_of = link.get_attribute('title')  
    if memo['Name'] == name_of:
        print("This name is already in dictionary")
    else:    
        site_info_dict['Name'].append(name_of)
        site_info = self._retrieve_site_info2(name_of, link)

In [4]:
import pandas as pd
object = pd.read_pickle(r'All_Sites_Outputs.pkl')
len(object) #.head()
# dict(object)

171

In [31]:
import pandas as pd
import json
import os
object1 = pd.read_pickle(r'All_Sites_Outputs.pkl')
folder_name = 'json files'

# site_name = object.iloc[0]['Name']
# result = object.iloc[0].to_json(r'site_name')
# parsed = json.loads(result)
# json.dumps(parsed)

current_dir = os.getcwd()
new_dir = os.path.join(current_dir, r'{f}'.format(f = folder_name))
if not os.path.exists(new_dir):
    os.makedirs(new_dir)

for index, row in object1.iterrows():
    print ("index", index)
    print ("row",  row)
    site_name = object1.iloc[index]['Name']
    object1.iloc[index].to_json(r'{nd}/{si}'.format(nd = new_dir, si = site_name))
    ;ogkjpiyb


index 0
row Name                                             Aberdeen Erroll Park
Environment Type                                     Urban Background
Coordinates                                          [394366, 807397]
Address                                                 Not available
Web Link            https://uk-air.defra.gov.uk/networks/site-info...
Name: 0, dtype: object


NameError: name 'ogkjpiyb' is not defined

In [32]:
import boto3
s3_client = boto3.client('s3')

boto3.client('s3').upload

response = s3_client.upload_folder()

In [None]:
def uploadDirectory(folder : str, bucketname : str='airqualitywebscraperbucket'):
    s3_client = boto3.client('s3')
    for root,dirs,files in os.walk(path):
        for file in files:
            s3_client.upload_file(os.path.join(root,file),bucketname,file)
    return


In [6]:
filename = 'https://uk-air.defra.gov.uk/data_files/site_data/WAL4_2022.csv?v=1'
val = filename.split('/')
#my_len = len(val)
name = val[len(val)-1]
name = name.split('?')
name = name[0]
print(name)


WAL4_2022.csv


In [7]:
split_list = filename.split('/')
name = split_list[len(split_list)-1].split('?')[0]
print(name)

WAL4_2022.csv


In [1]:
import json
import numpy
import pandas as pd
# json_file_path = "AURN_API.json"
# with open(json_file_path, 'r') as j:
#      contents = json.loads(j.read())
# #print(contents.keys())
# #my_list = []
# df1 = pd.DataFrame(contents['aurn'])
# df2 = pd.DataFrame(contents['nondefraaqmon'])


# df = pd.concat([df1,df2])
# site_info_link = 'https://uk-air.defra.gov.uk/networks/site-info?uka_id='
# df['site info link'] = site_info_link + df['uka_id']
# print(df)
# df.head()
# len(df)

def _dataframe_API(API_loc="AURN_API.json"):
     '''
     This is a private method that converts the json API with the
     link to all AURN sites into a usable dataframe.
     '''
     json_file_path = API_loc
     with open(json_file_path, 'r') as j:
          contents = json.loads(j.read())
     my_list = []
     for i in contents.keys():
          #print(f"key: {i}", len(contents[i]))
          df = pd.DataFrame(contents[i])
          #df.replace("", numpy.NaN, inplace=True)
          my_list.append(df)
     df = pd.concat(my_list, axis=0)
     df.drop(columns=['exception','parameter_ids',
            'network_name', 'network_id','site_status',
            'overall_index','environment_id','country_id']
            ,axis=0, inplace=True)
     site_info_link = 'https://uk-air.defra.gov.uk/networks/site-info?uka_id='
     df['site info link'] = site_info_link + df['uka_id']
     df = df[['site_name', 'site info link']]
     # df.to_csv()

     # df = dict(df)
     # for key, value in df.items(): 
     #      print(key, len(value), sep=" | ")
     return df

_dataframe_API()

# import pandas as pd
# import numpy as np
# import json
# json_file_path = "AURN_API.json"
# with open(json_file_path, 'r') as j:
#      contents = json.loads(j.read())

# my_list = []
# for i in contents.keys():
#     df= pd.DataFrame(contents[i])
#     my_list.append(df)

# big_df = pd.concat(my_list)
# # big_df.replace(r'^\s*$',np.nan,regex=True,inplace=True)
# big_df.drop(columns=['exception','parameter_ids',
#             'network_name', 'network_id','site_status',
#             'overall_index','environment_id','country_id']
#             ,axis=0, inplace=True) #,'parameter_ids','network_name'])
# #df = pd.DataFrame(contents['aurn'])
# big_df.to_csv('big_df.csv')


# my_site = df['site info link'].loc[df['site_name']=='Sheffield Barnsley Road']
# #type(my_site[0])
# my_site = df.loc[df['site_name']=='Sheffield Barnsley Road'].index
# df['site info link'].iloc[my_site]
# my_s = dict(df['site info link'].iloc[my_site])
# my_s[str(my_site)]

#df.loc[df['site_name'] == 'Sheffield Barnsley Road', 'site info link'].iloc[0]

# df['site_name'].iloc[0]
# len(df)
#

Unnamed: 0,site_name,site info link
0,Aberdeen Erroll Park,https://uk-air.defra.gov.uk/networks/site-info...
1,Aberdeen Union Street Roadside,https://uk-air.defra.gov.uk/networks/site-info...
2,Aberdeen Wellington Road,https://uk-air.defra.gov.uk/networks/site-info...
3,Armagh Roadside,https://uk-air.defra.gov.uk/networks/site-info...
4,Aston Hill,https://uk-air.defra.gov.uk/networks/site-info...
...,...,...
301,UUNN_WELW_002,https://uk-air.defra.gov.uk/networks/site-info...
302,UUNN_WIGA_001,https://uk-air.defra.gov.uk/networks/site-info...
303,UUNN_WIGA_002,https://uk-air.defra.gov.uk/networks/site-info...
304,UUNN_WIGA_003,https://uk-air.defra.gov.uk/networks/site-info...


In [32]:
data = len(_dataframe_API()['network_name'])
print(data)

1234


In [14]:
import requests
url = 'https://uk-air.defra.gov.uk/js/map_data.php?c=833f31fa9e72923ac4402e26f4afd5572460ad92'
response = requests.get(url)
response


<Response [403]>