In [3]:
from curses.ascii import alt
import selenium
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException, TimeoutException
from tqdm import tqdm
import time
import uuid
import os
import json
import urllib.request
import sys
sys.path.append('../')
from src.aws import AwsScraper


In [15]:

class Scraper:
    """ This class contains all the navigation & data collection methods
    of a webscraper.
    
    Parameters
    ----------
    url : str 
        the url of the desired website  
    """

#to access website
    def __init__(self):
        self.driver = webdriver.Chrome(ChromeDriverManager().install())
        url = "https://www.medexpress.co.uk"
        self.driver.get(url)
        time.sleep(2)
        self.delay = 10
        

        self.aws = AwsScraper()
        # set storage location
        self.data_store = "./raw_data"

    
    def get_drug_class(self): 
    
        """
        This method gets the drug class to be scraped.

        """
        self.class_choice = input("Enter drug class: ")
        return self.class_choice

    def get_class_links(self):


        """ This methods returns the link of the desired class and 
        puts them in a dictionary corresponding to their classes

        Returns
        --------
        str
            a string with the url of the class of drugs
        """
        #get user choice
        self.get_drug_class()
        # get drug class container
        class_container = self.driver.find_element(By.XPATH, '//div[@class="row margintop20"]')
        class_a_tag = class_container.find_elements(By.TAG_NAME, 'a')
        most_pop_class_tags = class_a_tag[0:8]
        drug_classes = [item.get_attribute('href') for item in most_pop_class_tags]
        # print(drug_classes)
        
        class_dictionary = {0 :'erectile dysfunction', 1 : 'covid', 2 : 'migraine', 3 : 'period delay', 4 : 'asthma', 5 : 'herpes', 6 : 'acne', 7 : 'hair loss'}

        for key, value in class_dictionary.items():
            # if class is given
            if value == self.class_choice.lower():
                drug_class = drug_classes[key]
                return drug_class
    
    def get_drug_links(self):
        """
        This method gets the links of all drugs in each class and the names of the drugs in the list.

        Returns
        --------
        lists
            list containing the name of drugs in the class
            list containing hyperlinks to all drugs in the class
        """
        global drugs_list
        drug_class = self.get_class_links()
        # access class link
        self.driver.get(drug_class)
        time.sleep(2)
        # access drug list and links on class page 
        drug_links_container = self.driver.find_element(By.XPATH, '//div[@class="panel-treatment-row"]')
        link_tag = drug_links_container.find_elements(By.TAG_NAME, 'a')
        drugs_links = [item.get_attribute('href') for item in link_tag]
        drugs_list = [name.split('/')[-1] for name in drugs_links]
        dict_self = {i : point.split('/')[-1] for i, point in enumerate(drugs_links)}
        # print(dict_self)
        return (drugs_links)

    def get_metadata(self):
        """
        This method collects metadata from each drug page
        Returns
        --------
        Dictionary with meta data on each drug
        """
        global drug_name
        drug_dictionary = {}
        self.metadata_list = []
        # Goes to each drug link and scrapes relevant data from it
        drug_links = self.get_drug_links()
        for i in drug_links:
            self.driver.get(i)
            time.sleep(2)
            drug_name = self.driver.find_element(By.XPATH, '//div[@class="col-sm-7 product-row-title"]/h1').text
            #create drug folder inside raw_data folder and begin appending key/value pairs to dictionary
            self._create_metadata_folders(f'raw_data/{drug_name}')
            drug_dictionary["DRUG NAME"] = drug_name
            drug_dictionary["ALTERNATIVES"] = drugs_list
            time.sleep(2)
            # try and except statemnets used to bypass heterogeneity in websites htmls
            try:
                doses = self.driver.find_element(By.XPATH, '//ul[@class="nav nav-tabs strengthMenuTab"]')
                doze = doses.find_elements(By.XPATH, '//span[@class="tab-dosage"]')
                dosages = [items.text for items in doze]
                dose = dosages[0:3]
            except:
                dose = 'only one dose available'
            drug_dictionary["DOSAGES AVAILABLE"] = dose
            time.sleep(2)
            #Scrape quantity for each drug
            quant = self.driver.find_element(By.XPATH, '//span[@class="select-container select-container-product"]')
            quanti = quant.find_elements(By.XPATH, '//select[@class="quantityMenu"]/option')
            quantity = [q.text for q in quanti]
            quanitities = [x for x in quantity if x != '']
            drug_dictionary["QUANTITY AVAILABLE"] = quanitities
            #apppen uuid and unique id of code which in this case is the website link as the drug page had no visible unique id
            product_id_num = str(uuid.uuid4())
            product_id_num = product_id_num[:8]
            drug_dictionary["UUID"] = product_id_num
            time.sleep(2)
            # get the price
            pri = self.driver.find_element(By.XPATH, '//div[@class="sitewide-sale-price-wrapper"]')
            price = pri.find_element(By.TAG_NAME, 'span')
            drug_dictionary["PRICE"] = price.text
            #get the reviews with a try and except block for drugs without reviews
            try:
                reviews = self.driver.find_element(By.XPATH, '//div[@class="feefo-rating-big"]/span').text
            except:
                reviews = 'No reviews available'
            
            drug_dictionary["REVIEWS"] = reviews
            time.sleep(2)
            # get the drug information
            drug_inf= self.driver.find_element(By.XPATH, '//amp-accordion[@class="i-amphtml-element i-amphtml-layout-container i-amphtml-built i-amphtml-layout"]')
            
            drug_info = drug_inf.find_element(By.XPATH, '//div[@class="tab-pane i-amphtml-accordion-content"]')
            try:
                drun = drug_info.find_elements(By.TAG_NAME, 'p')
            except:
                self.driver.implicitly_wait(10)
                drun = drug_info.find_elements(By.TAG_NAME, 'h2')
                drug_infoo = [drug.text for drug in drun]
            drug_infoo = [drug.text for drug in drun]
            drug_dictionary["INFORMATION"] = drug_infoo
            drug_dictionary["DRUG URL"] = i
            return drug_dictionary

    def save_data(self):
   
        # create the data.json from the above dictionary
        with open(f"raw_data/{drug_name}/data.json", "w") as f:
            drug_dictionary = self.get_metadata()
            json_output = json.dump(drug_dictionary, f)
            return json_output
    

    @staticmethod  
    def _create_metadata_folders(folder_name: str):
        """This method creates different folders for data storage
        
        Parameters
        ----------
        directory_name : str
            a string representing the name of a new folder to be created and cd into
        """
        if not os.path.exists(folder_name):
            os.makedirs(folder_name)
       
    def get_image(self):
        """
        This Method created a folder called 'images' in the raw data folder
        scrapes images for each drug and stores them in the folder.

        Returns
        --------
        images stored as jpg files

        """
        drug_links = self.get_drug_links()
        for i in drug_links:
            self.driver.get(i)
            time.sleep(2)
            # get image 
            self._create_metadata_folders(f'raw_data/images')
            image = self.driver.find_element(By.XPATH, '//div[@class="i-amphtml-carousel-scroll"]')
            image_img = image.find_elements(By.TAG_NAME, 'img')
            
            for i in image_img:
                try:
                    images_src = i.get_attribute('src') 
                    self.alt = i.get_attribute('alt')
                    urllib.request.urlretrieve(images_src, f"raw_data/images/{self.alt}.jpg")

                    s3_url = self.aws.upload_file_method(f"raw_data/images/{self.alt}.jpg", self.alt)
                    print(s3_url)
                except Exception as e:
                    print(e)
                    return None

                finally:
                    
                    if os.path.exists(f"raw_data/images/{self.alt}"):
                    
                        os.remove(f"raw_data/images/{self.alt}")


    # def data_dump(self, drug_name):
    #     """
    #     Parameters:
    #     ----------
    #     folder_name: str
    #         String value of the folder path for each player's data store
    #     Returns:
    #     -------
    #     None
    #     """
    #     try:
    #         pic_file = (f"raw_data/images/{self.alt}.jpg")
    #     except Exception:

    #         pic_file = ""

    #     self.upload_file_method((f"raw_data/{drug_name}/data.json"), drug_name, pic_file)
    #     # return AwsScraper.upload_file_method()
       

if __name__ == "__main__":
    bot = Scraper()
    # bot.get_metadata()
    # bot.save_data()
    # bot.get_image()
    # bot.data_dump()
    bot.get_image()




[WDM] - Current google-chrome version is 104.0.5112
[WDM] - Get LATEST chromedriver version for 104.0.5112 google-chrome
[WDM] - Driver [/Users/pearl/.wdm/drivers/chromedriver/mac64_m1/104.0.5112.79/chromedriver] found in cache
  self.driver = webdriver.Chrome(ChromeDriverManager().install())


upload_file_method() missing 1 required positional argument: 'drug_name'


In [None]:



class Scraper:
    """ This class contains all the navigation & data collection methods
    of a webscraper.
    
    Parameters
    ----------
    url : str 
        the url of the desired website  
    """

#to access website
    def __init__(self):
        self.driver = webdriver.Chrome(ChromeDriverManager().install())
        url = "https://www.medexpress.co.uk"
        self.driver.get(url)
        time.sleep(2)
        self.delay = 10

    # def load_page(self, url) -> None:
    #     """Loads the webpage with a given url in Google Chrome.
    #     Args:
    #         url (str): URL of target webpage to be loaded.
    #     Returns:
    #         None: Page loads but returns nothing.
    #     """
    #     self.driver.get(url)

        
       


    def get_drug_class(self): 
    
        """
        This method gets the drug class to be scraped.

        """
        self.class_choice = input("Enter drug class: ")
        return self.class_choice

    def get_class_links(self):


        """ This methods returns the link of the desired class and 
        puts them in a dictionary corresponding to their classes

        Returns
        --------
        str
            a string with the url of the class of drugs
        """
        #get user choice
        self.get_drug_class()
        # get drug class container
        class_container = self.driver.find_element(By.XPATH, '//div[@class="row margintop20"]')
        class_a_tag = class_container.find_elements(By.TAG_NAME, 'a')
        most_pop_class_tags = class_a_tag[0:8]
        drug_classes = [item.get_attribute('href') for item in most_pop_class_tags]
        # print(drug_classes)
        
        class_dictionary = {0 :'erectile dysfunction', 1 : 'covid', 2 : 'migraine', 3 : 'period delay', 4 : 'asthma', 5 : 'herpes', 6 : 'acne', 7 : 'hair loss'}

        for key, value in class_dictionary.items():
            # if class is given
            if value == self.class_choice.lower():
                drug_class = drug_classes[key]
                return drug_class
    
    def get_drug_links(self):
        """
        This method gets the links of all drugs in each class and the names of the drugs in the list.

        Returns
        --------
        lists
            list containing the name of drugs in the class
            list containing hyperlinks to all drugs in the class
        """
        global drugs_list
        drug_class = self.get_class_links()
        # access class link
        self.driver.get(drug_class)
        time.sleep(2)
        # access drug list and links on class page 
        drug_links_container = self.driver.find_element(By.XPATH, '//div[@class="panel-treatment-row"]')
        link_tag = drug_links_container.find_elements(By.TAG_NAME, 'a')
        drugs_links = [item.get_attribute('href') for item in link_tag]
        drugs_list = [name.split('/')[-1] for name in drugs_links]
        dict_self = {i : point.split('/')[-1] for i, point in enumerate(drugs_links)}
        # print(dict_self)
        return (drugs_links)

    def get_metadata(self):
        """
        This method collects metadata from each drug page
        Returns
        --------
        Dictionary with meta data on each drug
        """
        global drug_name
        drug_dictionary = {}
        self.metadata_list = []
        # Goes to each drug link and scrapes relevant data from it
        drug_links = self.get_drug_links()
        for i in drug_links:
            self.driver.get(i)
            time.sleep(2)
            drug_name = self.driver.find_element(By.XPATH, '//div[@class="col-sm-7 product-row-title"]/h1').text
            #create drug folder inside raw_data folder and begin appending key/value pairs to dictionary
            self._create_metadata_folders(f'raw_data/{drug_name}')
            drug_dictionary["DRUG NAME"] = drug_name
            drug_dictionary["ALTERNATIVES"] = drugs_list
            time.sleep(2)
            # try and except statemnets used to bypass heterogeneity in websites htmls
            try:
                doses = self.driver.find_element(By.XPATH, '//ul[@class="nav nav-tabs strengthMenuTab"]')
                doze = doses.find_elements(By.XPATH, '//span[@class="tab-dosage"]')
                dosages = [items.text for items in doze]
                dose = dosages[0:3]
            except:
                dose = 'only one dose available'
            drug_dictionary["DOSAGES AVAILABLE"] = dose
            time.sleep(2)
            #Scrape quantity for each drug
            quant = self.driver.find_element(By.XPATH, '//span[@class="select-container select-container-product"]')
            quanti = quant.find_elements(By.XPATH, '//select[@class="quantityMenu"]/option')
            quantity = [q.text for q in quanti]
            quanitities = [x for x in quantity if x != '']
            drug_dictionary["QUANTITY AVAILABLE"] = quanitities
            #apppen uuid and unique id of code which in this case is the website link as the drug page had no visible unique id
            product_id_num = str(uuid.uuid4())
            product_id_num = product_id_num[:8]
            drug_dictionary["UUID"] = product_id_num
            time.sleep(2)
            # get the price
            pri = self.driver.find_element(By.XPATH, '//div[@class="sitewide-sale-price-wrapper"]')
            price = pri.find_element(By.TAG_NAME, 'span')
            drug_dictionary["PRICE"] = price.text
            #get the reviews with a try and except block for drugs without reviews
            try:
                reviews = self.driver.find_element(By.XPATH, '//div[@class="feefo-rating-big"]/span').text
            except:
                reviews = 'No reviews available'
            
            drug_dictionary["REVIEWS"] = reviews
            time.sleep(2)
            # get the drug information
            drug_inf= self.driver.find_element(By.XPATH, '//amp-accordion[@class="i-amphtml-element i-amphtml-layout-container i-amphtml-built i-amphtml-layout"]')
            
            drug_info = drug_inf.find_element(By.XPATH, '//div[@class="tab-pane i-amphtml-accordion-content"]')
            try:
                drun = drug_info.find_elements(By.TAG_NAME, 'p')
            except:
                self.driver.implicitly_wait(10)
                drun = drug_info.find_elements(By.TAG_NAME, 'h2')
                drug_infoo = [drug.text for drug in drun]
            drug_infoo = [drug.text for drug in drun]
            drug_dictionary["INFORMATION"] = drug_infoo
            drug_dictionary["DRUG URL"] = i
            print(drug_dictionary, '\n')

    def save_data(self):
   
        # create the data.json from the above dictionary
        with open(f"raw_data/{drug_name}/data.json", "w") as f:
            drug_dictionary = self.get_metadata()
            json_output = json.dump(drug_dictionary, f)
            return json_output
    

    @staticmethod  
    def _create_metadata_folders(folder_name: str):
        """This method creates different folders for data storage
        
        Parameters
        ----------
        directory_name : str
            a string representing the name of a new folder to be created and cd into
        """
        if not os.path.exists(folder_name):
            os.makedirs(folder_name)
       
    def get_image(self):
        """
        This Method created a folder called 'images' in the raw data folder
        scrapes images for each drug and stores them in the folder.

        Returns
        --------
        images stored as jpg files

        """
        drug_links = self.get_drug_links()
        for i in drug_links:
            self.driver.get(i)
            time.sleep(2)
            # get image 
            self._create_metadata_folders(f'raw_data/images')
            image = self.driver.find_element(By.XPATH, '//div[@class="i-amphtml-carousel-scroll"]')
            image_img = image.find_elements(By.TAG_NAME, 'img')
            
            for i in image_img:
                images_src = i.get_attribute('src') 
                alt = i.get_attribute('alt')
                urllib.request.urlretrieve(images_src, f"raw_data/images/{alt}.jpg")
               

if __name__ == "__main__":
    bot = Scraper()
    bot.get_metadata()
    # bot.get_image()




[WDM] - Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/pearl/.wdm/drivers/chromedriver/mac64_m1/103.0.5060.134/chromedriver] found in cache
  self.driver = webdriver.Chrome(ChromeDriverManager().install())


{'DRUG NAME': 'COVID-19 Home Test Kit', 'ALTERNATIVES': ['coronavirus-covid19-home-testing-kit-pcr', 'coronavirus-covid19-antibody-test', 'lateral-flow-test', 'personal-lateral-flow-test'], 'DOSAGES AVAILABLE': 'only one dose available', 'QUANTITY AVAILABLE': ['1 test kit', '2 test kits', '3 test kits', '4 test kits', '5 test kits'], 'UUID': '2d44c4c8', 'PRICE': '£110.00', 'REVIEWS': '4.4', 'INFORMATION': ['It’s quick, reliable and accurate to test for COVID-19 Coronavirus from MedExpress — your fully regulated, London, United Kingdom based online pharmacy.', 'Order up to 5 Coronavirus (Covid-19) home collection test kits online. The tests will be sent to you by next day delivery.', 'Use the self collection throat and nasal swabs as directed and send the kits back using the prepaid envelope from a priority post box Monday - Friday.', 'Your samples will be analysed in the lab and you will be notified of the results by SMS and email. Results can be checked online.', 'If you are experienc

In [3]:
from curses.ascii import alt
import pwd
from token import AWAIT
import selenium
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException, TimeoutException
from tqdm import tqdm
import datetime as datetime

import time
import uuid
import os
import json
import urllib.request
import sys
sys.path.append('../')
from src.aws import AwsScraper








class Scraper:
    """ This class contains all the navigation & data collection methods
    of a webscraper.
    
    Parameters
    ----------
    url : str 
        the url of the desired website  
    """
#to access website
    def __init__(self):
        self.driver = webdriver.Chrome(ChromeDriverManager().install())
        url = "https://www.medexpress.co.uk"
        self.driver.get(url)
        time.sleep(2)
        self.delay = 10
        self.aws = AwsScraper()
        self.drug_dictionary = {}
       
        
        # set storage location
        self.data_store = "./raw_data"

    
    def get_drug_class(self): 
    
        """
        This method gets the drug class to be scraped.

        """
        
        self.class_choice = input("Enter drug class: ")
        return self.class_choice

    def get_class_links(self, class_choice):


        """ This methods returns the link of the desired class and 
        puts them in a dictionary corresponding to their classes

        Returns
        --------
        str
            a string with the url of the class of drugs
        """
        
        # get drug class container
        class_container = self.driver.find_element(By.XPATH, '//div[@class="row margintop20"]')
        class_a_tag = class_container.find_elements(By.TAG_NAME, 'a')
        most_pop_class_tags = class_a_tag[0:8]
        drug_classes = [item.get_attribute('href') for item in most_pop_class_tags]
        # print(drug_classes)
        
        class_dictionary = {0 :'erectile dysfunction', 1 : 'covid', 2 : 'migraine', 3 : 'period delay', 4 : 'asthma', 5 : 'herpes', 6 : 'acne', 7 : 'hair loss'}

        for key, value in class_dictionary.items():
            # if class is given
            if value == class_choice.lower():
                self.drug_class = drug_classes[key]
                return self.drug_class
    
    def get_drug_links(self):
        """
        This method gets the links of all drugs in each class and the names of the drugs in the list.

        Returns
        --------
        lists
            list containing the name of drugs in the class
            list containing hyperlinks to all drugs in the class
        """
        
        
        # access class link
        self.driver.get(self.drug_class)
        time.sleep(2)
        # access drug list and links on class page 
        drug_links_container = self.driver.find_element(By.XPATH, '//div[@class="panel-treatment-row"]')
        link_tag = drug_links_container.find_elements(By.TAG_NAME, 'a')
        self.drugs_links = [item.get_attribute('href') for item in link_tag]
        return self.drugs_links

    def get_drugs_list(self):
        """
        This method gets the names of the drugs in the list.

        Returns
        --------
        lists
            list containing the name of drugs in the class
        """
        self.drugs_list = [name.split('/')[-1] for name in self.drugs_links]
        dict_self = {i : point.split('/')[-1] for i, point in enumerate(self.drugs_links)}
        return self.drugs_list
        

    def get_drug_name(self, webpage_driver):
        """
        This gets the name of the drug and updates the drug dictionary

        Args:
            webpage_driver: The webdriver for the current page,

        Returns:
            None
        """

        drug_name = webpage_driver.find_element(By.XPATH, '//div[@class="col-sm-7 product-row-title"]/h1').text
        self.drug_dictionary["DRUG NAME"] = drug_name


    def get_drug_dosage(self, webpage_driver) -> None:
        """ gets drug dosage 

        Args:
            webpage_driver (chrome_Webdriver): chrome_webdriver for that specific page
        """
        try:
            doses = webpage_driver.find_element(By.XPATH, '//ul[@class="nav nav-tabs strengthMenuTab"]')
            doze = doses.find_elements(By.XPATH, '//span[@class="tab-dosage"]')
            dosages = [items.text for items in doze]
            dose = dosages[0:3]
        except:
            dose = 'only one dose available'
        self.drug_dictionary["DOSAGES AVAILABLE"] = dose


    def get_drug_quantity(self, webpage_driver):
        """ gets the quantity of drugs needed

        Args:
            webpage_driver (chrome_Webdriver): chrome_webdriver for that specific page
        """
        quant = webpage_driver.find_element(By.XPATH, '//span[@class="select-container select-container-product"]')
        quanti = quant.find_elements(By.XPATH, '//select[@class="quantityMenu"]/option')
        quantity = [q.text for q in quanti]
        quanitities = [x for x in quantity if x != '']
        self.drug_dictionary["QUANTITY AVAILABLE"] = quanitities


    def get_unique_code(self):
        """ 
        generates a unique ID for each drug

        """
        product_id_num = str(uuid.uuid4())
        product_id_num = product_id_num[:8]
        self.drug_dictionary["UUID"] = product_id_num

    def get_price(self, webpage_driver):
        

        pri = webpage_driver.find_element(By.XPATH, '//div[@class="sitewide-sale-price-wrapper"]')
        price = pri.find_element(By.TAG_NAME, 'span')
        self.drug_dictionary["PRICE"] = price.text

    def get_drug_review(self, webpage_driver):
        """get's drug review from page

        Args:
            webpage_driver (chrome_Webdriver): chrome_webdriver for that specific page)
        """
        
        try:
            reviews = webpage_driver.find_element(By.XPATH, '//div[@class="feefo-rating-big"]/span').text
        except:
            reviews = 'No reviews available'
            
        self.drug_dictionary["REVIEWS"] = reviews

    def get_drug_info(self, webpage_driver):
        """ gets drug information from each drug page

        Args:
            webpage_driver (chrome_Webdriver): chrome_webdriver for that specific page
        """
        
        drug_inf= webpage_driver.find_element(By.XPATH, '//amp-accordion[@class="i-amphtml-element i-amphtml-layout-container i-amphtml-built i-amphtml-layout"]')
            
        drug_info = drug_inf.find_element(By.XPATH, '//div[@class="tab-pane i-amphtml-accordion-content"]')
        try:
            drun = drug_info.find_elements(By.TAG_NAME, 'p')
            drug_infoo = [drug.text for drug in drun]
        except:
            webpage_driver.implicitly_wait(10)
            drun = drug_info.find_elements(By.TAG_NAME, 'h2')
            drug_infoo = [drug.text for drug in drun]
        
        self.drug_dictionary["INFORMATION"] = drug_infoo

    def get_metadata(self, drugs_link, drug_list):
        """ This method creates different folders for data storage and gets data from each drug page
        Args:
            drug_links: the link to drugs in the class
            drug_list: the list of drugs in the class

        Returns:
            a dictionary with all the data from each drug in the class
        -----------
        directory_name : str
            a string representing the name of a new folder to be created and cd'ed into
        """

        self._create_metadata_folders(f'raw_data/{self.class_choice}')

        self.metadata_list = []
        for i in tqdm(drugs_link):
            self.driver.get(i)
            webpage_driver = self.driver
            time.sleep(2)
            self.drug_dictionary["ALTERNATIVES"] = drug_list

            # get the drug name and update the drug dictionary
            self.get_drug_name(webpage_driver=webpage_driver)
            #get drug dosage
            self.get_drug_dosage(webpage_driver)
            # get drug quantity
            self.get_drug_quantity(webpage_driver)
            # get unique code
            self.get_unique_code()
            # get drug price
            self.get_price(webpage_driver)
            # get drug review
            self.get_drug_review(webpage_driver)
            # get drug info
            self.get_drug_info(webpage_driver)
            # get drug dictionary
            self.drug_dictionary["DRUG URL"] = i
            # make a copy of the dictionary
            dictionary_copy = self.drug_dictionary.copy()

            self.metadata_list.append(dictionary_copy)
        print(self.metadata_list) 
        return self.metadata_list

   
    def save_data(self):
   
        # create the data.json from the above dictionary
        with open(f"raw_data/{self.class_choice}/data.json", "w") as f:
                
            json_output = json.dump(self.drug_dictionary, f)
            #upload to S3
            s3_url = self.aws.upload_file_method(self.class_choice)
            return(s3_url)
            
           
    

    @staticmethod  
    def _create_metadata_folders(folder_name: str):
        """This method creates different folders for data storage
        
        Parameters
        ----------
        directory_name : str
            a string representing the name of a new folder to be created and cd into
        """
        if not os.path.exists(folder_name):
            os.makedirs(folder_name)
       
    def get_image(self, drugs_link):
        """
        This Method created a folder called 'images' in the raw data folder
        scrapes images for each drug and stores them in the folder.

        Returns
        --------
        images stored as jpg files

        """
        
        for i in drugs_link:
            self.driver.get(i)
            time.sleep(2)
            # get image 
            self._create_metadata_folders(f'raw_data/images')
            image = self.driver.find_element(By.XPATH, '//div[@class="i-amphtml-carousel-scroll"]')
            image_img = image.find_elements(By.TAG_NAME, 'img')
            
            for i in image_img:
                try:
                    images_src = i.get_attribute('src') 
                    self.alt = i.get_attribute('alt')
                    urllib.request.urlretrieve(images_src, f"raw_data/images/{self.alt}.jpg")

                    s3_url = self.aws.upload_file_method(f"raw_data/images/{self.alt}.jpg", self.alt)
                    return(s3_url)
                except Exception as e:
                    print(e)
                    return None

                finally:
                    
                    if os.path.exists(f"raw_data/images/{self.alt}"):
                    
                        os.remove(f"raw_data/images/{self.alt}")


    def _quit_scraper(self):
        ''' 
        The quit_scraper function will close the scraper once the data is collected and saved.
        '''
        self.driver.quit()

    

        
        
    

if __name__ == "__main__":
    bot = Scraper()
    
    class_choice = bot.get_drug_class()
    bot.get_class_links(class_choice)
    drugs_link = bot.get_drug_links()
    drugs_list = bot.get_drugs_list()
    bot.get_metadata(drugs_link, drugs_list)
    bot.save_data()
    bot.get_image(drugs_link)
    bot._quit_scraper()


    
    




[WDM] - Current google-chrome version is 106.0.5249
[WDM] - Get LATEST chromedriver version for 106.0.5249 google-chrome
[WDM] - There is no [mac64_m1] chromedriver for browser 106.0.5249 in cache
[WDM] - About to download new driver from https://chromedriver.storage.googleapis.com/106.0.5249.61/chromedriver_mac64_m1.zip


ValueError: There is no such driver by url https://chromedriver.storage.googleapis.com/106.0.5249.61/chromedriver_mac64_m1.zip

In [7]:
import selenium
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException, TimeoutException
from tqdm import tqdm
import datetime as datetime



class Scraper:
    """ This class contains all the navigation & data collection methods
    of a webscraper.
    
    Parameters
    ----------
    url : str 
        the url of the desired website  
    """
#to access website
    def __init__(self):
        self.driver = webdriver.Chrome(ChromeDriverManager().install())
        url = "https://www.medexpress.co.uk"
        self.driver.get(url)
        time.sleep(2)

        self.delay = 10
        self.aws = AwsScraper()
        self.drug_dictionary = {}
       
        
        # set storage location
        self.data_store = "./raw_data"

    
    def get_drug_class(self): 
    
        """
        This method gets the drug class to be scraped.

        """
        
        self.class_choice = input("Enter drug class: ")
        
        return self.class_choice

In [2]:
import numpy as np
y = np.array([2, 3, 4]).reshape(1, 3, 1, 1)
y

array([[[[2]],

        [[3]],

        [[4]]]])