In [1]:
from bs4 import BeautifulSoup
from pathlib import Path

Base Amazon URL: https://www.amazon.de/SHASHIBO-Formwechsel-Zauberw%C3%BCrfel-Preisgekr%C3%B6nt-Seltenerdmagnete/dp/B07W5QM4DP/ref=cm_cr_arp_d_product_top?ie=UTF8

In [7]:
from dataclasses import dataclass, asdict
from datetime import datetime

@dataclass
class Review:
    """Container for one review: its title, body text, star rating and date."""
    title: str
    text: str
    rating: int
    date: datetime

    def to_file(self, path: Path):
        """Write the review to ``path`` as the ``str()`` of its field dictionary.

        The file is always written as UTF-8 so that non-ASCII review text
        (umlauts, emoji, ...) is stored the same way on every platform instead
        of depending on the machine's default encoding. Missing parent
        directories are created first.

        Args:
            path: Destination file; an existing file is overwritten.
        """
        target = Path(path)
        # A fresh checkout has no output directory yet; create it on demand.
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, 'w', encoding='utf-8') as f:
            f.write(str(asdict(self)))

In [8]:
import locale
import re

def get_soup(content: str) -> BeautifulSoup:
    """Parse an HTML string with the ``html.parser`` backend and return the resulting tree."""
    return BeautifulSoup(content, 'html.parser')

def has_customer_review_id(tag):
    """Tell whether ``tag`` has an ``id`` attribute beginning with ``customer_review``.

    Intended as a filter function for ``find_all``.
    """
    id_present = tag.has_attr('id')
    # Only look at the id value once we know the attribute exists.
    return id_present and tag['id'].startswith('customer_review')

def extract_date_str(date_element_text: str) -> str:
    """Pull the first ``<digits>. <word> <digits>`` fragment out of a text.

    Returns:
        The matched substring, or the literal string ``"No date found"`` when
        the text contains no such fragment.
    """
    found = re.search(r'\d+\.\s\w+\s\d+', date_element_text)
    return found.group() if found else "No date found"


# German month names (lower-cased) mapped to their month numbers.
_GERMAN_MONTHS = {
    'januar': 1, 'februar': 2, 'märz': 3, 'april': 4,
    'mai': 5, 'juni': 6, 'juli': 7, 'august': 8,
    'september': 9, 'oktober': 10, 'november': 11, 'dezember': 12,
}


def parse_to_date_object(date_str: str) -> datetime:
    """Convert a German long-form date such as ``"5. März 2023"`` to a datetime.

    The month name is resolved through a fixed lookup table rather than
    ``locale.setlocale`` plus ``strptime('%B')``: switching the locale changes
    process-wide state as a side effect and raises ``locale.Error`` on machines
    where the ``de_DE`` locale is not installed.

    Args:
        date_str: Date in the form ``"<day>. <month name> <year>"``.

    Returns:
        A ``datetime`` for that day with the time part left at midnight.

    Raises:
        ValueError: If ``date_str`` does not have the expected shape, names an
            unknown month, or does not describe a valid calendar date.
    """
    parts = date_str.split()
    if len(parts) != 3 or not parts[0].endswith('.'):
        raise ValueError(f"time data {date_str!r} does not match format '%d. %B %Y'")

    day_text, month_name, year_text = parts
    # Month lookup is case-insensitive.
    month = _GERMAN_MONTHS.get(month_name.casefold())
    if month is None:
        raise ValueError(f"unknown German month name {month_name!r} in {date_str!r}")

    # int() and datetime() raise ValueError themselves for non-numeric or
    # out-of-range day/year values.
    return datetime(int(year_text), month, int(day_text[:-1]))

def _hook_text(review_div, tag_name: str, hook: str) -> str:
    """Return the stripped text of the first ``tag_name`` element with the given ``data-hook``."""
    element = review_div.find(tag_name, {'data-hook': hook})
    if element is None:
        raise ValueError(f"review element <{tag_name} data-hook={hook!r}> not found")
    return element.text.strip()


def _star_label(review_div) -> str:
    """Return the stripped text of the ``span`` inside the star-rating ``i`` element."""
    star_icon = review_div.find('i', {'data-hook': 'review-star-rating'})
    label = star_icon.find('span') if star_icon is not None else None
    if label is None:
        raise ValueError("review star rating element not found")
    return label.text.strip()


def _parse_star_rating(stars_text: str) -> int:
    """Extract the whole-number star count from the rating label text."""
    # NOTE(review): the label presumably looks like "5,0 von 5 Sternen", so the
    # first run of digits is the rating - confirm against the saved pages.
    first_number = re.search(r'\d+', stars_text)
    if first_number is None:
        raise ValueError(f"cannot read a star rating from {stars_text!r}")
    return int(first_number.group())


def scrape_page(soup: BeautifulSoup, counter:int) -> int:
    """Extract every customer review from a parsed page and write each to its own file.

    Each review is written to ``raw_reviews/review_<n>.txt``, where ``n`` starts
    at ``counter`` and increases by one per review. The output directory is
    created if it does not exist yet.

    Args:
        soup: Parsed review page.
        counter: Number to use for the first review found on this page.

    Returns:
        The next unused review number, i.e. ``counter`` plus the number of
        reviews written, so it can be passed on to the next page.

    Raises:
        ValueError: If a review lacks one of the expected elements, its star
            label contains no number, or its date cannot be parsed.
    """
    output_dir = Path('raw_reviews')
    output_dir.mkdir(parents=True, exist_ok=True)

    for review_div in soup.find_all(has_customer_review_id):
        print(f"processing review {counter}")

        title = _hook_text(review_div, 'a', 'review-title')

        # Review.rating is declared as int, so store the number rather than
        # the raw label text.
        rating = _parse_star_rating(_star_label(review_div))

        # The review body lives in a span element with data-hook 'review-body'.
        review_text = _hook_text(review_div, 'span', 'review-body')

        # The date element holds a longer sentence; cut the date out of it
        # before parsing.
        date_element_text = _hook_text(review_div, 'span', 'review-date')
        date_obj = parse_to_date_object(extract_date_str(date_element_text))

        review = Review(title=title, text=review_text, rating=rating, date=date_obj)
        review.to_file(output_dir / f'review_{counter}.txt')
        counter += 1

    return counter

In [9]:
# Load the saved first review page and preview the start of its markup.
page_path = Path('raw_pages/page1.html')
content = page_path.read_text()
content[:100]

'<!doctype html><html lang="de-de" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start '

In [10]:
# Parse the loaded page and write out its reviews, numbering the files from 1.
soup = get_soup(content)

scrape_page(soup, counter=1)

processing review 1
processing review 2
processing review 3
processing review 4
processing review 5
processing review 6
processing review 7
processing review 8
processing review 9
processing review 10


11