In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Custom headers

custom_headers = {
    "Accept-language": "en-GB,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
}


In [3]:
def get_soup(url):
    response = requests.get(url, headers=custom_headers)

    if response.status_code != 200:
        print("Error in getting webpage")
        exit(-1)

    soup = BeautifulSoup(response.text, "lxml")
    return soup

def get_reviews(soup):
    review_elements = soup.select("div.review")

    scraped_reviews = []

    for review in review_elements:
        r_author_element = review.select_one("span.a-profile-name")
        r_author = r_author_element.text if r_author_element else None

        r_rating_element = review.select_one("i.review-rating")
        r_rating = r_rating_element.text.replace("out of 5 stars", "") if r_rating_element else None

        r_title_element = review.select_one("a.review-title")
        r_title_span_element = r_title_element.select_one("span:not([class])") if r_title_element else None
        r_title = r_title_span_element.text if r_title_span_element else None

        r_content_element = review.select_one("span.review-text")
        r_content = r_content_element.text if r_content_element else None

        r_date_element = review.select_one("span.review-date")
        r_date = r_date_element.text if r_date_element else None

        r_verified_element = review.select_one("span.a-size-mini")
        r_verified = r_verified_element.text if r_verified_element else None

        r = {
            "author": r_author,
            "rating": r_rating,
            "title": r_title,
            "content": r_content,
            "date": r_date,
            "verified": r_verified
        }

        scraped_reviews.append(r)

    return scraped_reviews


In [4]:
search_url = "https://www.amazon.com/BERIBES-Cancelling-Transparent-Soft-Earpads-Charging-Black/product-reviews/B0CDC4X65Q/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
soup = get_soup(search_url)
data = get_reviews(soup)
df = pd.DataFrame(data=data)


In [5]:
df

Unnamed: 0,author,rating,title,content,date,verified
0,James Kirk,5.0,Best headphones for the $,"\nComfortable, feel sturdy, anc works great, p...","Reviewed in the United States on April 20, 2024",Verified Purchase
1,Tri,5.0,These headphones are a bargain for under thirty,\nLiterally had to write this review after get...,"Reviewed in the United States on March 30, 2024",Verified Purchase
2,Jan N,4.0,Very good headphones,\nI like these because they are lightweight an...,"Reviewed in the United States on April 11, 2024",Verified Purchase
3,Jaci,5.0,Very happy with these,\nGreat value (so far). Sound quality is good ...,"Reviewed in the United States on May 4, 2024",Verified Purchase
4,Customer,5.0,Punches above it's weight,"\nWhen I bought this, I wasn't expecting much....","Reviewed in the United States on May 3, 2024",Verified Purchase
5,rabar,3.0,You get what you pay for!,\nI really like the price of these headphones....,"Reviewed in the United States on May 8, 2024",Verified Purchase
6,"""Badger""",5.0,"Great, comfortable noise cancelling headset.","\nFor the price, I thought I'd give it a try. ...","Reviewed in the United States on November 1, 2023",Verified Purchase
7,Smart Shopper,5.0,"Great for the price, no bass, adequate noise c...",\nI needed some headphones for mowing the lawn...,"Reviewed in the United States on April 26, 2024",Verified Purchase
8,Amazon Customer,5.0,"No need to read, just buy it now!",\nI have several pairs of more expensive headp...,"Reviewed in the United States on May 1, 2024",Verified Purchase
9,Intabus,4.0,Very good sound. Somewhat fragile.,"\nI'm not an audiophile, so I am not very know...","Reviewed in the United States on April 16, 2024",Verified Purchase


In [6]:
df.to_csv("amazon.csv")