# Web Scraping Amazon Product for Sentiment Analysis


For this project, I will be web scraping an Amazon product for the usage of sentiment analysis. The data will be scraped and made into a dataset that will be primarily comprised of the User's profile name, the reviewer star rating, the review, and the review of the summary. Using the VADER and Roberta model, I should be able to analyze the sentiment of User's review and compare the reviewer star rating and the sentiment of the review.


For this project, I will primarily focus on one product, [COSRX Snail Mucin](https://www.amazon.com/COSRX-Repairing-Hydrating-Secretion-Phthalates/dp/B00PBX3L7K/ref=cm_cr_arp_d_product_top?ie=UTF8), for which I will do web scraping and sentiment anaylsis on.

In [33]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
!pip install requests_html

Defaulting to user installation because normal site-packages is not writeable


# Extracting Data

Data will be extracted here in a clean way by going through each page of the reviews and putting them into a dataframe


In [74]:
from requests_html import HTMLSession

class Extract:
    def __init__(self, asin) -> None:
        self.asin = asin
        # User Agent
        self.headers = ({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36', 'Accept-Lanugage': 'en-US, en;q=0.5'})
        # Webpage URL
        self.url = "https://www.amazon.com/COSRX-Repairing-Hydrating-Secretion-Phthalates/dp/B00PBX3L7K/ref=cm_cr_arp_d_product_top?ie=UTF8"
        self.reviews_url = f'https://www.amazon.com/product-reviews/{self.asin}/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber='
        self.webpage = requests.get(self.reviews_url, headers = self.headers)
        self.session = HTMLSession() #BeautifulSoup(webpage.content, "html.parser")

    def pagination(self, page):
        webpage = self.session.get(self.reviews_url + str(page))
        if not webpage.html.find('div[data-hook=review]'):
            return False
        else:
            return webpage.html.find('div[data-hook=review]')

    # Functions to extract data

# Function to extract Product Title
    def get_product(soup):
        try:
            # Outer Tag Object
            product = webpage.html.find('span[class=a-size-large product-title-word-break]')

            # Inner NavigatableString Object
            product_value = product.txt

            # Title as a string value
            product_string = product.strip()

        except AttributeError:
            product_string = ""

        return product_string

# Function to extract Profile Name
    def get_profile_name(soup):
        try:
            # Outer Tag Object
            profile = webpage.html.find('span[class':'a-profile-name]')

            # Profile as a string value
            profile_string = profile.text

        except AttributeError:
            profile_string = ""

        return profile_string

# Function to extract Reviewer Star Rating

    def get_rating(soup):
        try:
            rating = soup.find('i', attrs = {'data-hook':'review-star-rating'}).text

        except AttributeError:
            rating = ''
        
        return rating

# Function to extract Review

    def get_review(soup):
        try:
            # Outer Tag Object
            review = webpage.html.find('i', attrs = {'data-hook':'a-size-base review-text review-text-content'})

            # Profile as a string value
            review_string = review.text

        except AttributeError:
            review_string = ""

        return review_string[0:25]

# Function to extract Review Summary

    def get_review_summary(soup):
        try:
            # Outer Tag Object  #FIX
            summary = soup.find('span', attrs = {'class':'a-letter-space'})

            # Profile as a string value
            summary_string = summary.text

        except AttributeError:
            summary_string = ""

        return summary_string
        
    def parse(self, reviews):
        all_reviews = []
        for review in reviews:
            title_page = get_product(soup)
            profile_name = get_profile_name(soup)
            rating = get_rating(soup)
            review_string = get_review(soup)
            review_summary = get_review_summary(soup)

            data = {
                'title' : title_page,
                'profile' : profile_name,
                'rating' : rating,
                'review' : review_string,
                'review_summary' : review_summary
            }
            all_reviews.append(data)
        return all_reviews


if __name__ == '__main__':
    amz = Extract('B00PBX3L7K')
    reviews = amz.pagination(1)
    print(amz.parse(reviews))



TypeError: BaseParser.find() got an unexpected keyword argument 'attrs'