In [1]:
# import module
import requests
from bs4 import BeautifulSoup
from pathlib import Path

In [2]:
# URL template for the product's review listing, newest reviews first.
# The {page} placeholder is filled in with str.format(page=...) before requesting.
url = (
    'https://www.amazon.de/'
    'Relaxdays-Rankobelisk-beschichtetes-witterungsbest%C3%A4ndige-Rankhilfe'
    '/product-reviews/B004W1HEJ8/ref=cm_cr_arp_d_viewopt_srt'
    '?ie=UTF8&reviewerType=all_reviews&pageNumber={page}&sortBy=recent'
)

In [3]:
class AmazonScraper:
    """Fetch Amazon review pages through one shared HTTP session and parse them.

    The session created in ``__init__`` sends ``HEADERS`` with every request.
    """

    # Headers sent with every request of the session.
    # The User-Agent is built from adjacent string literals so the value is a
    # single-spaced string; a backslash continuation inside one literal would
    # embed the indentation of the following source lines in the header.
    HEADERS = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/90.0.4430.212 Safari/537.36'
        ),
        'Accept-Language': 'en-US, en;q=0.5',
    }

    def __init__(self):
        """Create the session and install ``HEADERS`` as its default headers."""
        self.session = requests.Session()
        self.session.headers.update(self.HEADERS)

    def make_request(self, url, timeout: float = 10.0) -> requests.models.Response:
        """Make a GET request to the given url and return the response.

        Args:
            url: address to fetch.
            timeout: seconds to wait for the server before giving up; without
                a timeout a stalled connection would block the notebook forever.

        Returns:
            The response, whatever its status code. A status other than 200 is
            only reported on stdout, so callers must check it themselves.

        Raises:
            requests.exceptions.RequestException: on connection errors or when
                the timeout expires.
        """
        response = self.session.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
        return response

    def get_soup(self, response: requests.models.Response) -> BeautifulSoup:
        """Return a BeautifulSoup object parsed from the response body with 'html.parser'."""
        return BeautifulSoup(response.content, 'html.parser')

    def extract_reviews(self, soup: BeautifulSoup) -> list[str]:
        """Return the stripped text of every review body found in ``soup``.

        A review body is a ``span`` with ``data-hook="review-body"``; its text
        is read from the first nested ``span``. Bodies without a nested span
        are reported on stdout and skipped.
        """
        reviews = []
        for outer_span in soup.find_all('span', attrs={'data-hook': 'review-body'}):
            inner_span = outer_span.find('span')
            # Explicit check instead of a bare ``except`` so that unrelated
            # errors (and KeyboardInterrupt) are no longer swallowed.
            if inner_span is None:
                print(f"skipping span {outer_span}")
                continue
            reviews.append(inner_span.text.strip())
        return reviews

In [4]:
def save_response_content(response: requests.models.Response, path: Path):
    """Write the raw body bytes of ``response`` to ``path``, replacing any existing file."""
    with open(path, mode='wb') as sink:
        raw_bytes = response.content
        sink.write(raw_bytes)

In [5]:
from dataclasses import dataclass
from datetime import datetime

@dataclass
class Review:
    """One customer review as extracted from a review page."""

    title: str      # review headline
    text: str       # review body text
    rating: int     # star rating
    date: datetime  # date of the review

    def to_file(self, path: Path):
        """Write the review to ``path`` as text, one ``key: value`` line per field.

        The file is always written as UTF-8 so that non-ASCII review text
        (e.g. German umlauts) is stored the same way on every platform instead
        of depending on the platform's default encoding. An existing file at
        ``path`` is overwritten.

        Args:
            path: destination file; its parent directory must already exist.
        """
        # datetime values are stored in ISO 8601 form; anything else falls
        # back to its plain string representation.
        if isinstance(self.date, datetime):
            date_text = self.date.isoformat()
        else:
            date_text = str(self.date)

        with open(path, 'w', encoding='utf-8') as f:
            f.write(f'title: {self.title} \n')
            f.write(f'text: {self.text} \n')
            f.write(f'rating: {self.rating} \n')
            # The date used to be dropped on save; keep it with the other fields.
            f.write(f'date: {date_text} \n')

In [6]:
import locale
import re

def has_customer_review_id(tag):
    """Tell whether ``tag`` has an ``id`` attribute that starts with 'customer_review'.

    Intended as a filter function for ``find_all``.
    """
    if not tag.has_attr('id'):
        return False
    return tag['id'].startswith('customer_review')

def extract_date_str(date_element_text: str) -> str:
    """Pull the first '<day>. <word> <number>' date out of ``date_element_text``.

    Returns the matched substring, or the literal "No date found" when the
    text contains no such date.
    """
    found = re.search(r'\d+\.\s\w+\s\d+', date_element_text)
    return found.group() if found else "No date found"

# Full German month names (lower-cased) mapped to their month number.
_GERMAN_MONTHS = {
    'januar': 1, 'februar': 2, 'märz': 3, 'april': 4,
    'mai': 5, 'juni': 6, 'juli': 7, 'august': 8,
    'september': 9, 'oktober': 10, 'november': 11, 'dezember': 12,
}


def parse_to_date_object(date_str: str) -> datetime:
    """Parse a German date such as '5. März 2023' into a ``datetime``.

    The month name is looked up in a fixed table instead of switching the
    process locale: ``locale.setlocale`` changes global state for the whole
    kernel and fails on machines where the German locale is not installed.

    Args:
        date_str: text of the form '<day>. <German month name> <4-digit year>';
            surrounding whitespace and the case of the month name are ignored.

    Returns:
        The date at midnight.

    Raises:
        ValueError: if ``date_str`` does not have the expected form, names an
            unknown month, or describes an impossible date.
    """
    match = re.fullmatch(r'(\d{1,2})\.\s+(\w+)\s+(\d{4})', date_str.strip())
    if match is None:
        raise ValueError(f"date {date_str!r} does not match format '%d. %B %Y'")

    day, month_name, year = match.groups()
    month = _GERMAN_MONTHS.get(month_name.lower())
    if month is None:
        raise ValueError(f"unknown month name {month_name!r} in date {date_str!r}")

    # datetime() itself rejects impossible dates (e.g. 31. Februar) with ValueError.
    return datetime(int(year), month, int(day))

def _parse_star_rating(rating_text: str) -> int:
    """Return the leading number of a rating label as a whole star count.

    NOTE(review): assumes the label starts with the rating, e.g. '5,0 von 5
    Sternen' or '4.0 out of 5 stars' - confirm against the live markup.

    Raises:
        ValueError: if ``rating_text`` does not start with a number.
    """
    match = re.match(r'\s*(\d+)(?:[.,]\d+)?', rating_text)
    if match is None:
        raise ValueError(f"no star rating found in {rating_text!r}")
    return int(match.group(1))


def scrape_page(soup: BeautifulSoup, counter:int) -> int:
    """Save every review found in ``soup`` to ``raw_reviews/review_<n>.txt``.

    Reviews are the elements accepted by ``has_customer_review_id``. A review
    whose title, rating, body or date element is missing, or whose rating or
    date cannot be parsed, is reported on stdout and skipped without
    consuming a file number.

    Args:
        soup: parsed review-list page.
        counter: number used for the first saved review file.

    Returns:
        The number to use for the next review, i.e. ``counter`` plus the
        number of reviews saved from this page.
    """
    output_dir = Path('raw_reviews')
    # The directory does not exist on a fresh run; create it instead of failing on write.
    output_dir.mkdir(parents=True, exist_ok=True)

    for review_div in soup.find_all(has_customer_review_id):
        print(f"processing review {counter}")

        # Locate all parts first so that a missing one can be handled in one place.
        title_el = review_div.find('a', {'data-hook': 'review-title'})
        rating_icon = review_div.find('i', {'data-hook': 'review-star-rating'})
        rating_el = rating_icon.find('span') if rating_icon is not None else None
        # The review text lives in a span element with 'data-hook': 'review-body'.
        body_el = review_div.find('span', {'data-hook': 'review-body'})
        date_el = review_div.find('span', {'data-hook': 'review-date'})

        if any(el is None for el in (title_el, rating_el, body_el, date_el)):
            print(f"skipping review {counter}: expected element missing")
            continue

        try:
            # Review.rating is declared as int, so convert the label text.
            stars = _parse_star_rating(rating_el.text.strip())
            date_obj = parse_to_date_object(extract_date_str(date_el.text.strip()))
        except ValueError as exc:
            print(f"skipping review {counter}: {exc}")
            continue

        review = Review(
            title=title_el.text.strip(),
            text=body_el.text.strip(),
            rating=stars,
            date=date_obj,
        )
        review.to_file(output_dir / f'review_{counter}.txt')
        counter += 1

    return counter

In [7]:
# Create the scraper once; its HTTP session (with the class's HEADERS) is reused by the cells below.
scraper = AmazonScraper()

In [9]:
# Fetch the first review page (the {page} placeholder of the URL template is set to 1)
# and parse the returned HTML. make_request only prints non-200 statuses, so a failed
# request still yields a response/soup here.
response = scraper.make_request(url.format(page=1))
soup = scraper.get_soup(response)

In [11]:
# Save the reviews of the fetched page, numbering files from 1; the cell's value is the
# counter returned by scrape_page.
# NOTE(review): a result equal to the starting counter (1) means no review was saved -
# check that the fetched page actually contained review elements.
scrape_page(soup, 1)

1