# Web Scraping Amazon Product for Sentiment Analysis


For this project, I will be web scraping an Amazon product for the usage of sentiment analysis. The data will be scraped and made into a dataset that will be primarily comprised of the User's profile name, the reviewer star rating, the review, and the review of the summary. Using the VADER and Roberta model, I should be able to analyze the sentiment of User's review and compare the reviewer star rating and the sentiment of the review.


For this project, I will primarily focus on one product, [COSRX Snail Mucin](https://www.amazon.com/COSRX-Repairing-Hydrating-Secretion-Phthalates/dp/B00PBX3L7K/ref=cm_cr_arp_d_product_top?ie=UTF8), for which I will do web scraping and sentiment anaylsis on.

In [96]:
#%pip install bs4
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from requests_html import HTMLSession



In [38]:
URL = "https://www.amazon.com/s?k=playstation+5&crid=3BV45YDFQQE2G&sprefix=playstation+4%2Caps%2C269&ref=nb_sb_noss_2"
HEADERS = ({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5332 (KHTML, like Gecko) Chrome/38.0.821.0 Mobile Safari/5332', 'Accept-Language': 'en-US, en;q=0.5'})
webpage = requests.get(URL, headers=HEADERS)
type(webpage.content)
soup = BeautifulSoup(webpage.content, "html.parser")
links = soup.find_all("a", attrs={'class':'a-size-mini a-color-base a-link-normal s-underline-text s-underline-link-text s-link-style'})
soup

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-us"><!-- sp:feature:head-start -->
<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>
<!-- sp:end-feature:head-start -->
<!-- sp:feature:csm:head-open-part1 -->
<script type="text/javascript">var ue_t0=ue_t0||+new Date();</script>
<!-- sp:end-feature:csm:head-open-part1 -->
<!-- sp:feature:cs-optimization -->
<meta content="on" http-equiv="x-dns-prefetch-control"/>
<link crossorigin="" href="https://images-na.ssl-images-amazon.com" rel="preconnect"/>
<link crossorigin="" href="https://m.media-amazon.com" rel="preconnect"/>
<link crossorigin="" href="https://completion.amazon.com" rel="preconnect"/>
<!-- sp:end-feature:cs-optimization -->
<!-- sp:feature:csm:head-open-part2 -->
<script type="text/javascript">
window.ue_ihb = (window.ue_ihb || window.ueinit || 0) + 1;
if (window.ue_ihb === 1) {

var ue_csm = window,
    ue_hob = +new Date();
(function(d){var e=d.ue=d.ue||{},f=Date.no

# Extracting Data

Data will be extracted here in a clean way by going through each page of the reviews and putting them into a dataframe


In [97]:
class Extract:
    def __init__(self, asin, page) -> None:
        self.asin = asin
        self.page = page
        
        # User Agent
        self.headers = ({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'})# 'Accept-Lanugage': 'en-US, en;q=0.5'})
        # Webpage URL
        self.reviews_url = f'https://www.tripadvisor.com/Attraction_Review-{asin}-Reviews-or{page}0'
        # HTTP Request
        self.webpage = requests.get(self.reviews_url, headers=self.headers)
        self.soup = BeautifulSoup(self.webpage.content, "lxml")
        #self.webpage = self.session.get(self.reviews_url)

    def is_page(self, page):
        print(self.soup)
        if not self.soup.find('a', attr = {'target' :'_self'}):
            print(self.soup.find('a', attr = {'target' :'_self'}))
            return False
        else:
            return self.soup.find('a', attr = {'target' :'_self'})

    # Functions to extract data

# Function to extract Product Title
    def get_product(self, review):
        try:
            # Outer Tag Object
            product = review.find('h1', attr = {'class' : 'biGQs _P fiohW ncFvv EVnyE'})

            # Inner NavigatableString Object
            product_value = product.txt

            # Title as a string value
            product_string = product.strip()

        except AttributeError:
            product_string = ""

        return product_string

# Function to extract Profile Name
    def get_profile_name(self, review):
        try:
            # Outer Tag Object
            profile = review.find('a', attr = {'target' :'_self'})

            # Profile as a string value
            profile_string = profile.text

        except AttributeError:
            profile_string = ""

        return profile_string

# Function to extract Reviewer Star Rating
    """
    def get_rating(self, review):
        try:
            rating = review.find('svg', attr = {'class' : 'UctUV d H0'})

        except AttributeError:
            rating = ''
        
        return rating
    """

# Function to extract Review

    def get_review(self, review):
        try:
            # Outer Tag Object
            review = review.find("span", attr = {"class" : "biGQs _P pZUbB KxBGd"})

            # Profile as a string value
            review_string = review.text

        except AttributeError:
            review_string = ""

        return review_string[0:25]


# Function to extract Review Summary

    def get_review_summary(self, review):
        try:
            # Outer Tag Object  #FIX
            summary = review.find('span', attr = {'class' : 'yCeTE'})

            # Profile as a string value
            summary_string = summary.text

        except AttributeError:
            summary_string = ""

        return summary_string
    
        
    def parse(self, reviews):
        all_reviews = []
        for review in reviews:
            #title_page = self.get_product(review)
            profile_name = self.get_profile_name(review)
            #rating = self.get_rating(review)
            review_string = self. get_review(review)
            review_summary = self.get_review_summary(review)

            data = {
                #'title' : title_page,
                'profile' : profile_name,
                #'rating' : rating,
                'review' : review_string,
                'review_summary' : review_summary
            }
            all_reviews.append(data)
        return all_reviews


if __name__ == '__main__':
    amz = Extract('g60763-d105125', 1)
    reviews = amz.is_page(2)
    print(reviews)
    



<!DOCTYPE html>
<html lang="en-US"><head><meta content="text/html; charset=utf-8" http-equiv="content-type"/><meta content="en" http-equiv="content-language"/><link as="font" crossorigin="anonymous" href="https://static.tacdn.com/css2/webfonts/TripSans/TripSans-VF.woff2?v1.002" rel="preload" type="font/woff2"/><link href="https://static.tacdn.com/favicon.ico?v2" id="favicon" rel="icon" type="image/x-icon"/><link color="#000000" href="https://static.tacdn.com/img2/brand_refresh/application_icons/mask-icon.svg" rel="mask-icon" sizes="any"/><meta content="#34e0a1" name="theme-color"/><meta content="telephone=no" name="format-detection"/><meta content="TripAdvisor" property="al:ios:app_name"/><meta content="284876795" property="al:ios:app_store_id"/><meta content="284876795" name="twitter:app:id:ipad" property="twitter:app:id:ipad"/><meta content="284876795" name="twitter:app:id:iphone" property="twitter:app:id:iphone"/><meta content="tripadvisor://www.tripadvisor.com/Attraction_Review-g60

In [147]:
class Ex:
    def __init__(self, asin, page, limit = 5) -> None:
        self.asin = asin
        self.page = page
        self.limit = limit
        # User Agent
        self.headers = ({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'})# 'Accept-Lanugage': 'en-US, en;q=0.5'})
        # Webpage URL
        self.reviews_url = f'https://www.tripadvisor.com/Attraction_Review-{asin}-Reviews-or{page}0'
        # HTTP Request
        self.webpage = requests.get(self.reviews_url, headers=self.headers)
        self.session = HTMLSession()
        #self.webpage = self.session.get(self.reviews_url)

    def is_page(self):
        r = self.session.get(self.reviews_url)
        if not r.html.find('span.yCeTE', first=True):
            return False
        else:
            return r.html.find('span.yCeTE', first=True)
    

    def get_review(self, r):
        return r.html.find('div.biGQs _P pZUbB KxBGd', first = False)

    def get_review_summary(self, r):
        return r.html.find('a._blank', first = False)

    def get_rating(self, r):
        return r.html.find()

    def get_products(self, r):
        return r.html.find('h1.biGQs _P fiohW eIegw', first = True).text
    
    def parse_page(self, url):
        r = self.session.get(self.reviews_url)
        # get product

        # get review

        # get review summary

        # get rating




if __name__ == '__main__':
    amz = Ex(asin = 'g60763-d105125', page = 0)
    #print(amz.reviews_url)
    while (amz.page < 1):
        amz.is_page()
        amz.page += 1

In [133]:
headers = ({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'})
reviews_url = 'https://www.tripadvisor.com/Attraction_Review-g60763-d105125-Reviews-or10-The_Metropolitan_Museum_of_Art-New_York_City_New_York.html'
webpage = requests.get(reviews_url, headers=headers)




#print(soup.find_all('div', attr = {'class' : "_T FKffI"}))
#soup.find()

r = HTMLSession().get(reviews_url)
print(r.html.find('span.yCeTE', first=False))
for x in r.html.find('span.yCeTE', first=False):
    print(x.text)

[<Element 'span' class=('yCeTE',)>, <Element 'span' class=('yCeTE',)>, <Element 'span' class=('yCeTE',)>, <Element 'span' class=('yCeTE',)>, <Element 'span' class=('yCeTE',)>, <Element 'span' class=('yCeTE',)>, <Element 'span' class=('yCeTE',)>, <Element 'span' class=('yCeTE',)>, <Element 'span' class=('yCeTE',)>, <Element 'span' class=('yCeTE',)>, <Element 'span' class=('yCeTE',)>, <Element 'span' class=('yCeTE',)>, <Element 'span' class=('yCeTE',)>, <Element 'span' class=('yCeTE',)>, <Element 'span' class=('yCeTE',)>, <Element 'span' class=('yCeTE',)>, <Element 'span' class=('yCeTE',)>, <Element 'span' class=('yCeTE',)>, <Element 'span' class=('yCeTE',)>, <Element 'span' class=('yCeTE',)>, <Element 'span' class=('yCeTE',)>, <Element 'span' class=('yCeTE',)>, <Element 'span' class=('yCeTE',)>]
Great collection of art works, if not the best collection in the world.
The Met has it all. Paintings, sculptures, and artifacts. From Picasso to Degas and Polloc to Van Gogh. All the great work