In [1]:
from bs4 import BeautifulSoup
import re
import requests
from PIL import Image
from io import BytesIO

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import csv
import html

import pandas as pd
import numpy as np

In [2]:

# Precompile regex
price_regex = re.compile(".*beans-price__text$")

# Configure webdriver
options = Options()
#options.headless = True # I tried to hide the GUI but Tesco detects and blocks this
options.add_argument("--window-size=1920,1080")  # set window size to native GUI size
options.add_argument("start-maximized")  # ensure window is full-screen
options.add_argument("--disable-blink-features=AutomationControlled")


In [3]:
# Function to save image from URL
def save_image(urls, file_path):
    try:
        for url in urls: 
            decoded_url = html.escape(url)

            # print(file_path, 'trying url:', decoded_url)
            try:
                response = requests.get(decoded_url)

            except Exception as e:
                print(f"{file_path} Failed to save image for an url: \n   {url} \nwhere all urls are: \n   {urls}. \n Error: {e}")

            if response.status_code == 200:
                image = Image.open(BytesIO(response.content))
                image.save(file_path, format="PNG")
                # print(file_path, 'successfully saved image' )
                return response, image
            else:
                pass
                # print(file_path, 'response: ', response.status_code, response.text)

            
        print(file_path, f"Failed to save image all urls are tried: {urls}! ")
        return None, None

    except Exception as e:
        print(file_path, f"Failed to save image from {urls}. Error: {e}")
        return None, None
        
        

In [4]:
def add_line_to_csv_file(file_path, data):

    # Open the CSV file in append mode
    with open(file_path, mode='a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(data)
    

In [None]:
data_file_path = 'product_data.csv'

category_list = ['marketplace', 'summer', 'fresh-food', 'bakery', 
                 'frozen-food', 'treats-and-snacks', 'food-cupboard', 'drinks']

# Load page
driver = webdriver.Chrome(options=options)

for _category in category_list:
    print(f'Starting {_category}', 'category')
    should_continue = True
    _page = 1
    while should_continue:
        _page_start_time = time.time()
        driver.get(f"https://www.tesco.com/groceries/en-GB/shop/{_category}/all?page={_page}")
        _page += 1
        time.sleep(5) # Let the page load

        # Parse page
        html_driver = driver.page_source
        soup = BeautifulSoup(html_driver, 'html.parser')

        # Get the product grid
        grid = soup.find("ul", attrs={"data-auto":"product-list"})
        
        if grid == None:
            should_continue = False
            break
            
        products = grid.find_all("li")

        # For each product...
        for i, product in enumerate(products):
            try:
                # Zoom in
                try:
                    product = product.div.div.div.div # The relevant info is 4 divs deep
                except:
                    continue
                
                # Product id
                product_id = product.find('a').get('href').split('/')[-1].strip()
                
                # Product name
                name = product.find("a", attrs={"data-auto":"product-tile--title"}).span.get_text().strip()

                # Product price
                try:
                    price = product.find("p", attrs={"class":price_regex}).get_text().strip()[1:] # remove the £
                except:
                    price = None
                    
                # Category info:
                rest_of_shelf_links = product.select('a[href*="/groceries/en-GB/shop/"]')

                category_info = None
                for href_link in rest_of_shelf_links:
                    try:
                        category_info = href_link.get('href')
                        break
                    except:
                        pass
                
                # Print for debug
                data_to_save = {
                                'product_id': product_id, 
                                'name': name, 
                                'price': price, 
                                'category_info': category_info,
                                'broad_category': _category, 
                                'page': _page
                }
                add_line_to_csv_file(data_file_path, list(data_to_save.values()))
                
                try:
                    # Save image
                    # Image URL
                    image_urls = product.img.get('srcset').split(", ")
                    
                    file_path = f"product_images/{product_id}.png"
                    img_response, image = save_image(image_urls, file_path)
                except Exception as e:
                    print(f'Error while getting the image for product id {product_id}! ', e) 

            except Exception as e:
                print("Scraping failed. Error:", e)
                
        print(_category, 'category page', _page -1, 'completed for ', i, 'products!', 
              'time_cost for the page:', time.time() - _page_start_time, 'seconds')
        

driver.quit()


Starting marketplace category
marketplace category page 1 completed for  38 products! time_cost for the page: 10.153500080108643 seconds
marketplace category page 2 completed for  27 products! time_cost for the page: 12.301840782165527 seconds
marketplace category page 3 completed for  31 products! time_cost for the page: 11.145864963531494 seconds
marketplace category page 4 completed for  31 products! time_cost for the page: 9.545467853546143 seconds
marketplace category page 5 completed for  27 products! time_cost for the page: 9.468164205551147 seconds
marketplace category page 6 completed for  25 products! time_cost for the page: 9.480066061019897 seconds
marketplace category page 7 completed for  27 products! time_cost for the page: 11.17659592628479 seconds
marketplace category page 8 completed for  27 products! time_cost for the page: 17.993318796157837 seconds
marketplace category page 9 completed for  29 products! time_cost for the page: 9.516498804092407 seconds
marketplace 

marketplace category page 70 completed for  31 products! time_cost for the page: 20.040951251983643 seconds
marketplace category page 71 completed for  29 products! time_cost for the page: 18.755363941192627 seconds
marketplace category page 72 completed for  23 products! time_cost for the page: 21.559303998947144 seconds
marketplace category page 73 completed for  27 products! time_cost for the page: 18.17423391342163 seconds
marketplace category page 74 completed for  27 products! time_cost for the page: 19.007956981658936 seconds
marketplace category page 75 completed for  23 products! time_cost for the page: 20.313886880874634 seconds
marketplace category page 76 completed for  33 products! time_cost for the page: 21.304385900497437 seconds
marketplace category page 77 completed for  23 products! time_cost for the page: 19.443281173706055 seconds
marketplace category page 78 completed for  29 products! time_cost for the page: 20.35775899887085 seconds
marketplace category page 79 c

marketplace category page 125 completed for  25 products! time_cost for the page: 28.982837915420532 seconds
marketplace category page 126 completed for  23 products! time_cost for the page: 20.541521072387695 seconds
marketplace category page 127 completed for  27 products! time_cost for the page: 22.733927965164185 seconds
marketplace category page 128 completed for  31 products! time_cost for the page: 24.334196090698242 seconds
marketplace category page 129 completed for  27 products! time_cost for the page: 22.871845960617065 seconds
marketplace category page 130 completed for  25 products! time_cost for the page: 22.782976865768433 seconds
marketplace category page 131 completed for  27 products! time_cost for the page: 18.600369930267334 seconds
product_images/325091626.png Failed to save image for an url: 
   data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw== 4000w 
where all urls are: 
   ['https://digitalcontent.api.tesco.com/v2/media/ghs-mktg/b0b04216

marketplace category page 172 completed for  23 products! time_cost for the page: 24.944249153137207 seconds
marketplace category page 173 completed for  29 products! time_cost for the page: 21.00968289375305 seconds
marketplace category page 174 completed for  27 products! time_cost for the page: 19.632494926452637 seconds
marketplace category page 175 completed for  26 products! time_cost for the page: 20.727362155914307 seconds
marketplace category page 176 completed for  27 products! time_cost for the page: 20.693253993988037 seconds
marketplace category page 177 completed for  23 products! time_cost for the page: 20.06188988685608 seconds
product_images/325088484.png Failed to save image for an url: 
   data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw== 4000w 
where all urls are: 
   ['https://digitalcontent.api.tesco.com/v2/media/ghs-mktg/b0b04216-fa73-466d-a9d7-c9fcfa1ce9b3/no-image.jpeg 768w', 'data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAA