## 1. Import Libraries

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

## 2. Function to Scraping Data

#### Scraping data to get name, year, price etc of cars
#### WebAddress: https://www.cazoo.co.uk/cars/

In [2]:
def cazoo_web_scraping(pages):

    all_data = []
    
    # Pagination
    for p in range(1, pages+1):
        if p == 1:
            full_url = 'https://www.cazoo.co.uk/cars/'
        else:
            full_url = f'https://www.cazoo.co.uk/cars/?page={p}'

        # Request to web
        response = requests.get(full_url)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find list of cars
        my_list = soup.find('ul', attrs={'class': 'grid min-h-96 grid-cols-1 content-start gap-4'})
        cars = my_list.find_all('li')

        for features in cars:
            try:
                # Get Price and clean it to save as digit
                price_tag = features.find('span', attrs={'class':'c-text-lg-semi-bold md:c-heading-xl'})
                price = price_tag.get_text() if price_tag else None
                price_clean = price.replace('£', '').replace(',', '')
                price_clean = int(price_clean) if price_clean.isdigit() else None

                # get brands- for brands with 2 or more section --> multi_word_brands
                multi_word_brands = ["Land Rover", "Alfa Romeo", "Mercedes Benz", "Aston Martin", "Rolls Royce", "Mini Cooper"]
                bm_tag = features.find('p', attrs={'data-testid': 'vehicleTitle'})
                bm = bm_tag.get_text() if bm_tag else None

                brand = None
                model = None
                if bm:
                    found = False
                    for b in multi_word_brands:
                        if bm.startswith(b):
                            brand = b
                            model = bm[len(b):].strip()
                            found = True
                            break
                    
                    if not found:
                        section = bm.split(' ', 1)
                        brand = section[0]
                        model = section[1] if len(section) > 1 else None

                # year and clean it to save as digit
                attrebutes_tag = features.find('p', attrs={'class': 'c-text-sm line-clamp-1 leading-tight'})
                attrebutes = attrebutes_tag.get_text() if attrebutes_tag else None
                parts = attrebutes.split(' ', 1)
                year = parts[0] if len(parts) > 0 else ''
                other_attr = parts[1] if len(parts) > 1 else ''
                year_clean = int(year) if year.isdigit() else None

                # Mile, Power, Manual/Auto, Energy
                properties = features.find('div', attrs={'class': 'flex flex-wrap items-center gap-2 pt-1 pb-3 lg:pt-0'})
                miles_tag = properties.find_all('div')[0]
                miles = miles_tag.get_text() if miles_tag else None
                miles_clean = int(miles.lower().replace('k', '').split()[0])
                register_tag = properties.find_all('div')[1]
                register = register_tag.get_text() if register_tag else None
                manAuto_tag = properties.find_all('div')[2]
                manAuto = manAuto_tag.get_text() if manAuto_tag else None
                fuel_tag = properties.find_all('div')[3]
                fuel = fuel_tag.get_text() if fuel_tag else None

                # Location
                location_tag = features.find('div', attrs={'class': 'c-text-xs flex items-center space-x-1'}).find('span')
                location = location_tag.get_text() if location_tag else None

                # Add all features to list
                all_data.append({
                    'Brand': brand,
                    'Model': model,
                    'Year': year_clean,
                    'Attrebutes': other_attr,
                    'Miles(k miles)': miles_clean,
                    'Registration': register,
                    'Control': manAuto,
                    'Fuel': fuel,
                    'Location': location,
                    'Price(£)': price_clean
                })

            except Exception as e:
                print(f'Error:{str(e)}')

    # Save file as CSV
    df = pd.DataFrame(all_data)
    df.to_csv('cazooCarsDataset.csv', index=False, encoding='utf-8')
    print('Data Saved in cazooCarsDataset.csv!')

## 3. Get Data

In [3]:
cazoo_web_scraping(pages=1000)

Error:invalid literal for int() with base 10: '46+'
Error:invalid literal for int() with base 10: '72.8+'
Error:invalid literal for int() with base 10: '41.2+'
Error:invalid literal for int() with base 10: '68+'
Error:invalid literal for int() with base 10: '103.3+'
Error:invalid literal for int() with base 10: '67.6+'
Error:invalid literal for int() with base 10: '99.4+'
Error:invalid literal for int() with base 10: '110.3+'
Error:invalid literal for int() with base 10: '100.9+'
Error:invalid literal for int() with base 10: '103+'
Error:invalid literal for int() with base 10: '49.2+'
Error:invalid literal for int() with base 10: '9.2+'
Error:invalid literal for int() with base 10: '12.6+'
Error:invalid literal for int() with base 10: '122.4+'
Error:invalid literal for int() with base 10: '53.5+'
Error:invalid literal for int() with base 10: '123+'
Error:invalid literal for int() with base 10: '19.8+'
Error:invalid literal for int() with base 10: '87+'
Error:invalid literal for int() w