In [1]:
# Import the modules used in this notebook
import requests
from bs4 import BeautifulSoup
from pathlib import Path

In [2]:
# Review-listing URL template; fill the `{page}` placeholder with `url.format(page=N)`.
# Adjacent literals are joined by the parser into one string.
url = (
    'https://www.amazon.de/'
    'Relaxdays-Rankobelisk-beschichtetes-witterungsbest%C3%A4ndige-Rankhilfe/'
    'product-reviews/B004W1HEJ8/'
    'ref=cm_cr_arp_d_paging_btm_next_2'
    '?ie=UTF8&reviewerType=all_reviews&pageNumber={page}'
)

In [11]:
class AmazonScraper:
    """Small helper around a ``requests.Session`` for fetching review pages.

    Every request made through the session carries the browser-like
    ``HEADERS``. The helper methods turn a response into a BeautifulSoup tree
    and pull the review bodies out of that tree.
    """

    # Adjacent string literals are concatenated by the parser, so the
    # User-Agent is a single clean line. A backslash continuation *inside* the
    # literal would embed the indentation whitespace of the following source
    # lines in the header value.
    HEADERS = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/90.0.4430.212 Safari/537.36'
        ),
        'Accept-Language': 'en-US, en;q=0.5',
    }

    def __init__(self):
        """Create the session and install ``HEADERS`` as its default headers."""
        self.session = requests.Session()
        self.session.headers.update(self.HEADERS)

    def make_request(self, url, timeout: float = 30.0) -> requests.models.Response:
        """Make a GET request to the given url and return the response.

        Args:
            url: Address to fetch.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout a stalled connection would block forever.

        Returns:
            The response object. A non-200 status is reported on stdout but
            the response is still returned so the caller can inspect it.

        Raises:
            requests.exceptions.Timeout: If the server does not answer within
                ``timeout`` seconds.
        """
        response = self.session.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
        return response

    def get_soup(self, response: requests.models.Response) -> BeautifulSoup:
        """Return a BeautifulSoup object parsed from the response body."""
        return BeautifulSoup(response.content, 'html.parser')

    def extract_reviews(self, soup: BeautifulSoup) -> list[str]:
        """Return the stripped text of every review body found in ``soup``.

        A review body is a ``span`` with ``data-hook="review-body"``; its text
        is read from the first nested ``span``. Bodies without a nested span
        are reported on stdout and skipped.
        """
        reviews = []
        for outer_span in soup.find_all('span', attrs={'data-hook': 'review-body'}):
            inner_span = outer_span.find('span')
            if inner_span is None:
                # find() returns None when nothing matches; handle that case
                # explicitly instead of hiding every error behind a bare except.
                print(f"skipping span {outer_span}")
                continue
            reviews.append(inner_span.text.strip())
        return reviews

In [16]:
def save_response_content(response: requests.models.Response, path: Path):
    """Dump the raw body bytes of ``response`` into the file at ``path``.

    The file is opened in binary write mode, so an existing file at that
    location is overwritten.
    """
    with open(path, mode='wb') as raw_out:
        raw_out.write(response.content)

In [25]:
from dataclasses import dataclass

@dataclass
class Review:
    """One customer review: headline, body text and star rating."""

    title: str   # headline of the review
    text: str    # body of the review
    rating: int  # star rating of the review

    def to_file(self, path: Path):
        """Write the review to ``path`` as three ``key: value`` lines.

        The file is always encoded as UTF-8 so review text with umlauts or
        emoji does not depend on the platform's default encoding. Missing
        parent directories are created first. An existing file is overwritten.

        Args:
            path: Destination file; a plain string is accepted as well.
        """
        path = Path(path)
        # Create the target folder on demand so a fresh checkout does not fail
        # with FileNotFoundError on the first review.
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            f.write(f'title: {self.title} \n')
            f.write(f'text: {self.text} \n')
            f.write(f'rating: {self.rating} \n')

In [30]:
def has_customer_review_id(tag):
    """Filter predicate: True for tags whose ``id`` starts with 'customer_review'.

    Tags without an ``id`` attribute are rejected before the attribute is read.
    """
    if not tag.has_attr('id'):
        return False
    return tag['id'].startswith('customer_review')


def _parse_star_rating(stars_text: str) -> int:
    """Return the leading number of a rating label (e.g. '5.0 out of 5 stars') as an int."""
    # The score is the first token of the label; a decimal comma ('5,0 ...')
    # is normalized so float() accepts it.
    return int(float(stars_text.split()[0].replace(',', '.')))


def scrape_page(soup: BeautifulSoup, counter: int, output_dir: Path = Path('raw_reviews')) -> int:
    """Save every customer review found in ``soup`` to its own text file.

    Each element whose ``id`` starts with 'customer_review' is treated as one
    review. Title, star rating and body are extracted and written to
    ``<output_dir>/review_<counter>.txt``. Reviews whose markup lacks one of
    the expected elements, or whose rating label cannot be parsed, are
    reported on stdout and skipped instead of aborting the whole page.

    Args:
        soup: Parsed review-listing page.
        counter: Index to use for the first review saved from this page.
        output_dir: Folder receiving the review files; created if missing.

    Returns:
        The counter value to use for the next review, i.e. ``counter`` plus
        the number of reviews saved from this page.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for review_div in soup.find_all(has_customer_review_id):
        print(f"processing review {counter}")

        # find() returns None when an element is absent, so look everything up
        # first and only dereference once all parts are known to exist.
        title_tag = review_div.find('a', {'data-hook': 'review-title'})
        stars_tag = review_div.find('i', {'data-hook': 'review-star-rating'})
        stars_span = stars_tag.find('span') if stars_tag is not None else None
        # The review text lives in a span element with 'data-hook': 'review-body'.
        body_tag = review_div.find('span', {'data-hook': 'review-body'})

        if title_tag is None or stars_span is None or body_tag is None:
            print(f"skipping review {counter}: title, rating or body not found")
            continue

        stars_text = stars_span.text.strip()
        try:
            # Review.rating is declared as int, so store the number rather
            # than the full label text.
            rating = _parse_star_rating(stars_text)
        except (IndexError, ValueError):
            print(f"skipping review {counter}: cannot parse rating {stars_text!r}")
            continue

        review = Review(title=title_tag.text.strip(), text=body_tag.text.strip(), rating=rating)
        review.to_file(output_dir / f'review_{counter}.txt')
        counter += 1

    return counter

In [31]:
# Create the scraper once; its requests.Session is then reused for every page request.
scraper = AmazonScraper()

In [33]:
# Fetch review pages 1-5 in order. `counter` is threaded through scrape_page
# so the review numbering continues across pages.
counter = 0
for page in range(1, 6):
    print(f"Scraping page {page}")
    page_url = url.format(page=page)
    response = scraper.make_request(page_url)
    soup = scraper.get_soup(response)
    counter = scrape_page(soup, counter)

Scraping page 1
processing review 0
processing review 1
processing review 2
processing review 3
processing review 4
processing review 5
processing review 6
processing review 7
processing review 8
processing review 9
Scraping page 2
processing review 10
processing review 11
processing review 12
processing review 13
processing review 14
processing review 15
processing review 16
processing review 17
processing review 18
processing review 19
Scraping page 3
processing review 20
processing review 21
processing review 22
processing review 23
processing review 24
processing review 25
processing review 26
processing review 27
processing review 28
processing review 29
Scraping page 4
processing review 30
processing review 31
processing review 32
processing review 33
processing review 34
processing review 35
processing review 36
processing review 37
processing review 38
processing review 39
Scraping page 5
processing review 40
processing review 41
processing review 42
processing review 43
proces