In [1]:
# Essential Imports

import requests # To get the page content
from bs4 import BeautifulSoup as soup # To search and present page content
from datetime import datetime as dt # Managing date formats
import pandas as pd # Presenting data in CSV format
import re # Regex for some string extraction (can replace soup easily)


In [2]:
# Get the pages based on search term or product page url

BASE_URL = "https://www.amazon.co.uk/s?k="

PAGE_URL = "https://www.amazon.com/dp"

# These headers the scraper look like a normal user rather than a bot

headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) \
            AppleWebKit/537.36 (KHTML, like Gecko) \
            Chrome/47.0.2526.106 Safari/537.36'
        }

# Defining methods means we can use the code again without re-writing
def get_search_results(path):
    
    # The path . replace is used to remove ' ' from the search query
    
    encoded_path = path.replace(" ", "+")
    
    # requests is used to actually access the site and retrieve the HTML

    page = requests.get(BASE_URL + encoded_path, headers=headers)
    
    # Soup (BeautifulSoup) allows for easy navigation and presentation
    
    search_results = soup(page.content, 'html.parser')

    return search_results


def get_product_page(page):
    
    html_page = requests.get(page, headers=headers)
    
    product_page = soup(html_page.content, 'html.parser')
    
    return product_page


In [3]:
def get_first_page_products(query):

    search_results = get_search_results(query)
    
    # This is a BeautifulSoup search that looks for specific elements - returns a list
    products = search_results.select("span[class=rush-component]")
    
    # the [] defines a list of items that we can iterate through
    search_products = []

    x = 0

    for product in products: # For every item in list (match for our select)

        if str(product).__contains__('img'): # if the item in list contains text "img"
            x += 1
            try:
                search_products.append( # append adds the below (a dictionary) to the list
                    {
                        'id': x,
                        # Select is similar to a regex but using elements specifically
                        'link': product.select("a[class=a-link-normal]")[0]['href'],
                        'name': product.select("img[class=s-image]")[0]['alt'],
                        'image': product.select("img[class=s-image]")[0]['src'],
                        'asin': re.findall(
                            ".*dp/(.*?)/", # The regex pattern we're looking to match
                            product.select("a[class=a-link-normal]")[0]['href']
                        )[0] # [0] because re.findall returns a list but there's only 1
                    }
                )
            except IndexError: 
                # The try and except handles the script erroring if there's no match
                pass

    return search_products      


In [47]:
def create_dataset(query):
    pages = get_first_page_products(query)

    product_reviews = []

    for page in pages:

        url = (

            PAGE_URL 
            + "/product-reviews/"
            + page['asin'] 
            + "/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews"
        )

        full_page = get_product_page(url)

        pattern = re.compile("data-hook=\"review(.*?)review_comment_expander", re.MULTILINE|re.DOTALL)
        reviews = re.findall(pattern, str(full_page))
        
        try:

            product_details = {
                # Re is the python regex function
                    'currency': re.findall(".*a-color-price arp-price\"\>(.).*?\<", str(full_page))[0],
                    'price': re.findall(".*a-color-price arp-price\"\>.(.*?)\<", str(full_page))[0]
            }
        except IndexError: # If the regex doesn't find anything we skip it, rather than erroring.
            pass

        for review in reviews:

            try:
                product_reviews.append(
                    {
                        'asin': page['asin'],
                        'name': page['name'],
                        'link': page['link'],
                        'image': page['image'],
                        'currency': product_details['currency'],
                        'price': product_details['price'],
                        'date': dt.now(),
                        'rating': re.findall("a-icon-alt\"\>(.*?) out", review, re.MULTILINE|re.DOTALL)[0],
                        'text': re.findall("review-text-content\"><span class=\"\">(.*)</span>", review, re.MULTILINE|re.DOTALL)[0],
                        'upvotes': re.findall("<span class=\"review-votes\">(.*?)people", review, re.MULTILINE|re.DOTALL)
                    }
                )
            except IndexError:
                pass

    return product_reviews

In [48]:
query = input("Search for Amazon product: ")
create_dataset(query)

Search for Amazon product: Samsung Note 10


[{'asin': 'B07GDTHH2H',
  'name': 'Samsung Galaxy Note 9 Dual SIM 128 GB Ocean Blue 6.4-Inch Sim-Free Smartphone',
  'link': '/Samsung-Galaxy-6-4-Inch-Sim-Free-Smartphone/dp/B07GDTHH2H/ref=sr_1_4?keywords=Samsung+Note+10&qid=1566820118&s=gateway&sr=8-4',
  'image': 'https://m.media-amazon.com/images/I/71kXsVxAyEL._AC_UY218_.jpg',
  'currency': '$',
  'price': '739.97',
  'date': datetime.datetime(2019, 8, 26, 12, 48, 46, 56006),
  'rating': '5.0',
  'text': 'The phone worked perfectly my wife loves it thank you so much</span>\n            ',
  'upvotes': []},
 {'asin': 'B07GDTHH2H',
  'name': 'Samsung Galaxy Note 9 Dual SIM 128 GB Ocean Blue 6.4-Inch Sim-Free Smartphone',
  'link': '/Samsung-Galaxy-6-4-Inch-Sim-Free-Smartphone/dp/B07GDTHH2H/ref=sr_1_4?keywords=Samsung+Note+10&qid=1566820118&s=gateway&sr=8-4',
  'image': 'https://m.media-amazon.com/images/I/71kXsVxAyEL._AC_UY218_.jpg',
  'currency': '$',
  'price': '739.97',
  'date': datetime.datetime(2019, 8, 26, 12, 48, 46, 56080),
 