In [13]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time, datetime
import pandas as pd
import os
import boto3



#------------- Scraper Class---------------
class Scraper:
    """Scrape electronics deals from hotukdeals.com using a Chrome browser.

    Scraped rows accumulate in ``self.listofdict`` as dicts with the keys
    ``Image``, ``Title``, ``Link`` and ``Price``.  ``save_data`` exports them
    to CSV/JSON and ``upload_data`` pushes the most recent CSV to S3.
    """

    #------------- Initiate Chrome Browser---------------
    def __init__(self, url: str = 'https://www.hotukdeals.com/tag/electronics?page=1'):
        """Start Chrome with automation hints disabled and open *url*."""
        # Imported here because nothing else in the class needs it; the
        # Service object replaces the deprecated positional driver path.
        from selenium.webdriver.chrome.service import Service

        self.listofdict = list()
        # Filled in by save_data() so upload_data() knows which file to send
        self.csv_path = None
        self.saved_date = None

        options = webdriver.ChromeOptions()
        options.add_argument("start-maximized")
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        options.add_argument('--disable-blink-features=AutomationControlled')
        # 2 == block browser notification prompts
        prefs = {"profile.default_content_setting_values.notifications": 2}
        options.add_experimental_option("prefs", prefs)
        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=options,
        )
        self.driver.get(url)

    #------------- Accept the Cookies---------------
    def accecept_cookies(self, timeout: float = 10):
        """Click the "accept all" cookie button if it shows up.

        Waits up to *timeout* seconds for the button to become clickable.
        When it never appears a message is printed and the method returns
        normally, so a missing banner does not abort the run.
        """
        try:
            button = WebDriverWait(self.driver, timeout).until(
                EC.element_to_be_clickable((By.XPATH, '//button[@data-t="acceptAllBtn"]'))
            )
        except TimeoutException:
            print('No Cookies Found')
            return
        button.click()

    # Correctly spelled alias; the original (misspelled) name is kept for callers.
    accept_cookies = accecept_cookies

    #------------- Helpers for reading one product card---------------
    @staticmethod
    def _first_match(element, xpaths, attribute=None):
        """Return the text (or *attribute*) of the first XPath in *xpaths* that matches.

        Returns ``''`` when none of the XPaths can be read from *element*.
        """
        from selenium.common.exceptions import WebDriverException

        for xpath in xpaths:
            try:
                found = element.find_element(By.XPATH, xpath)
                if attribute is None:
                    return found.text
                return found.get_attribute(attribute)
            except WebDriverException:
                # Best effort: a missing/stale field falls through to the next
                # candidate and finally to the empty-string default.
                continue
        return ''

    def _parse_product(self, singleproduct):
        """Build the Image/Title/Link/Price dict for one product element."""
        return {
            'Image': self._first_match(singleproduct, ['.//img'], 'src'),
            'Title': self._first_match(
                singleproduct, ['.//strong[@class="thread-title "]/a']
            ),
            # The outbound link carries one of two rel values
            'Link': self._first_match(
                singleproduct,
                ['.//a[@rel="nofollow noopener"]', './/a[@rel="nofollow"]'],
                'href',
            ),
            'Price': self._first_match(
                singleproduct, ['.//span[@class="overflow--wrap-off"]/span']
            ),
        }

    #------------- Data Scrape from Container & Move to Next Page---------------
    def find_container(self, max_pages: int = 1, page_delay: float = 5):
        """Scrape up to *max_pages* listing pages into ``self.listofdict``.

        Args:
            max_pages: number of listing pages to visit, starting at page 1.
            page_delay: seconds to sleep before starting and after each page
                load so the page content has time to render.

        The browser is always closed when this method exits, including when
        scraping raises part-way through; rows collected so far stay in
        ``self.listofdict``.
        """
        driver = self.driver
        try:
            time.sleep(page_delay)
            for pagenum in range(1, max_pages + 1):
                driver.get(f'https://www.hotukdeals.com/tag/electronics?page={pagenum}')
                time.sleep(page_delay)
                #list the page container
                productcontainer = driver.find_elements(By.XPATH, '//article[@data-t="thread"]')
                print(len(productcontainer))
                if not productcontainer:
                    # An empty page means there is nothing further to read
                    break
                for singleproduct in productcontainer:
                    datadict = self._parse_product(singleproduct)
                    print(datadict)
                    #append data in main list
                    self.listofdict.append(datadict)
        finally:
            # Never leave a Chrome process behind, even on failure
            driver.quit()

    #------------- Save Data ---------------
    def save_data(self):
        """Write the scraped rows to timestamped CSV and JSON files.

        The files are created in the current working directory.  The CSV
        path and timestamp are remembered on the instance for upload_data().
        """
        date = datetime.datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
        csv_path = f'HotukDeals-{date}.csv'
        df = pd.DataFrame.from_dict(self.listofdict)
        print('\n\n  ************DATA FRAME**************')
        print(df)
        df.to_csv(csv_path, index=False)
        df.to_json(f'HotukDeals-{date}.json')
        self.csv_path = csv_path
        self.saved_date = date
        print('Saved Data in CSV!!')

    def upload_data(self):
        """Upload the CSV written by the last save_data() call to S3.

        Raises:
            FileNotFoundError: if save_data() has not produced a CSV yet.
        """
        csv_path = getattr(self, 'csv_path', None)
        if not csv_path:
            raise FileNotFoundError('No CSV has been saved yet; call save_data() first')
        s3 = boto3.client('s3')
        s3.upload_file(csv_path, 'dealsuk2022', f'Hotukdeals-{self.saved_date}.csv')
        
            



if __name__ == "__main__":
    # Script entry point: scrape, export once, then try to upload.
    bot = Scraper()
    bot.accecept_cookies()
    try:
        bot.find_container()
    except Exception as exc:
        # Keep whatever was collected before the failure, but say what went wrong
        print(f'Scraping stopped early: {exc}')
    # Save exactly once, whether scraping finished or failed part-way through
    bot.save_data()
    try:
        bot.upload_data()
    except Exception as exc:
        # The local files are already written; an upload problem is reported, not hidden
        print(f'Upload failed: {exc}')




[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [/home/zain/.wdm/drivers/chromedriver/linux64/102.0.5005.61/chromedriver] found in cache
  self.driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options)


20
{'Image': 'https://images.hotukdeals.com/threads/raw/j2sgo/3955168_1/re/300x300/qt/60/3955168_1.jpg', 'Title': 'BRAIDED 20AWG USB PD Type C To 8 Pin Lightning FAST CHARGE Charging Cable 1m - £3.63 Delivered @ kenable', 'Link': 'https://www.hotukdeals.com/visit/thread/3955168', 'Price': '£3.63'}
{'Image': 'https://images.hotukdeals.com/threads/raw/MMmPg/3955147_1/re/300x300/qt/60/3955147_1.jpg', 'Title': 'MyMemory 128GB USB 3.0 Flash Drive - Red - 100MB/s - £9.99 @ MyMemory', 'Link': 'https://www.hotukdeals.com/visit/thread/3955147', 'Price': '£9.99'}
{'Image': 'https://images.hotukdeals.com/threads/raw/zmiQ5/3955099_1/re/300x300/qt/60/3955099_1.jpg', 'Title': 'Sony SRS-RA3000 Home Speaker with Wi-Fi - £189 @ John Lewis', 'Link': 'https://www.hotukdeals.com/visit/thread/3955099', 'Price': '£189'}
{'Image': 'https://images.hotukdeals.com/threads/raw/yR5D0/3955041_1/re/300x300/qt/60/3955041_1.jpg', 'Title': 'Belkin RT3200 WiFi 6 Router AX3200 - £49.98 @ Amazon', 'Link': 'https://www.ho