In [4]:
import requests
import numpy as np
import pandas as pd
from selectorlib import Extractor
from bs4 import BeautifulSoup
from dateutil import parser as dateparser
import psycopg2
from sqlalchemy import create_engine

url = ["https://www.amazon.com/Fire-TV-Stick-Previous-Generation/product-reviews/B00ZV9RDKK/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&sortBy=recent&pageNumber=%s",
       "https://www.amazon.com/Harry-Potter-Sorcerers-Stone-Rowling/product-reviews/059035342X/ref=cm_cr_getr_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&&sortBy=recentpageNumber=%s",
       "https://www.amazon.com/Cards-Against-Humanity-LLC-CAHUS/product-reviews/B004S8F7QM/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&&sortBy=recentpageNumber=%s",
       "https://www.amazon.com/amFilm-Screen-Protector-iPhone-Tempered/product-reviews/B01415QHYW/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&&sortBy=recentpageNumber=%s",
       "https://www.amazon.com/Mellanni-Bed-Sheet-Set-Hypoallergenic/product-reviews/B00NLLUMOE/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&&sortBy=recentpageNumber=%s"]
num_pages = 200
extractor = Extractor.from_yaml_file('selectors.yml')

In [34]:
def scrape(url):
    headers = ({'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
            AppleWebKit/537.36 (KHTML, like Gecko) \
            Chrome/90.0.4430.212 Safari/537.36',
            'Accept-Language': 'en-US, en;q=0.5'})
    # Download the page using requests
    #print("Downloading %s"%url)
    r = requests.get(url, headers=headers)
    # Simple check to check if page was blocked (Usually 503)
    if r.status_code > 500:
        if "To discuss automated access to Amazon data please contact" in r.text:
            print("Page %s was blocked by Amazon. Please try using better proxies\n"%url)
        else:
            print("Page %s must have been blocked by Amazon as the status code was %d"%(url,r.status_code))
        return None
    # Pass the HTML of the page and create 
    data = extractor.extract(r.text,base_url=url)
    reviews = []
    for r in data['reviews']:
        r["product"] = data["product_title"]
        r['url'] = url
        if 'verified_purchase' in r :
            if r['verified_purchase'] is None:
                r['verified_purchase'] = False
            else:
                r['verified_purchase'] = True
        r['rating'] = r['rating'].split(' out of')[0]
        date_posted = r['date'].split('on ')[-1]
        if r['images']:
            r['images'] = "\n".join(r['images'])
        r['date'] = dateparser.parse(date_posted).strftime('%d %b %Y')
        reviews.append(r)
    histogram = {}
    for h in data['histogram']:
        histogram[h['key']] = h['value']
    data['histogram'] = histogram
    data['average_rating'] = float(data['average_rating'].split(' out')[0])
    data['reviews'] = reviews
    data['number_of_reviews'] = int(data['number_of_reviews'].replace(',','').split(' global ratings')[0])
    return data

In [35]:
def get_reviews(url, num_pages):
    url_pages = []
    for x in np.arange(1, num_pages):
        page = str(x)
        url_pages.append(url % page)
    reviews = []
    for x in range(num_pages-1):
        reviews.append(scrape(url_pages[x]))
    return reviews

In [36]:
firestick_reviews = get_reviews(url[0],num_pages)
HP_SS_reviews = get_reviews(url[1],num_pages)
CAH_reviews = get_reviews(url[2],num_pages)
IPhone_SP_reviews = get_reviews(url[3],num_pages)
BedSheets_reviews = get_reviews(url[4],num_pages)

In [57]:
FS_review = [dict(item, product_title=firestick_reviews[0]['product_title']) for item in list(np.concatenate([a_dict['reviews'] for a_dict in firestick_reviews]).flat)]
HPSS_review = [dict(item, product_title=HP_SS_reviews[0]['product_title']) for item in list(np.concatenate([a_dict['reviews'] for a_dict in HP_SS_reviews]).flat)]
CAH_review = [dict(item, product_title=CAH_reviews[0]['product_title']) for item in list(np.concatenate([a_dict['reviews'] for a_dict in CAH_reviews]).flat)]
IPSP_review = [dict(item, product_title=IPhone_SP_reviews[0]['product_title']) for item in list(np.concatenate([a_dict['reviews'] for a_dict in IPhone_SP_reviews]).flat)]
BS_review = [dict(item, product_title=BedSheets_reviews[0]['product_title']) for item in list(np.concatenate([a_dict['reviews'] for a_dict in BedSheets_reviews]).flat)]

In [61]:
df_FSR = pd.DataFrame(FS_review, columns=list(FS_review[0].keys()))
df_HPSSR = pd.DataFrame(HPSS_review, columns=list(HPSS_review[0].keys())) 
df_CAHR = pd.DataFrame(CAH_review, columns=list(CAH_review[0].keys()))
df_IPSPR = pd.DataFrame(IPSP_review, columns=list(IPSP_review[0].keys()))
df_BSR = pd.DataFrame(BS_review,columns=list(BS_review[0].keys()))

In [62]:
all_reviews = pd.concat([df_BSR,df_CAHR,df_FSR,df_HPSSR,df_IPSPR], ignore_index=True)

In [66]:
all_reviews.head()

Unnamed: 0,title,content,date,images,author,rating,found_helpful,variant,verified_purchase,product,url,product_title
0,just a college student procrastinating,These sheets are super soft and very nice. And...,10 Apr 2018,,Deja,5.0,"6,854 people found this helpful",Size: Queen Color: Royal Blue,True,Mellanni Queen Sheet Set - Hotel Luxury 1800 B...,https://www.amazon.com/Mellanni-Bed-Sheet-Set-...,Mellanni Queen Sheet Set - Hotel Luxury 1800 B...
1,Don’t Buy These If You Already Have Nice Sheets,These might be ok if you have no other sheets ...,20 Aug 2018,,Jillian M,1.0,"3,466 people found this helpful",Size: King Color: Quatrefoil Silver - Gray,True,Mellanni Queen Sheet Set - Hotel Luxury 1800 B...,https://www.amazon.com/Mellanni-Bed-Sheet-Set-...,Mellanni Queen Sheet Set - Hotel Luxury 1800 B...
2,Low quality product,We purchased two sets of twin XL for our king ...,19 Oct 2018,https://images-na.ssl-images-amazon.com/images...,Amazon Customer,1.0,"1,284 people found this helpful",Size: Twin XL Color: Light Gray,True,Mellanni Queen Sheet Set - Hotel Luxury 1800 B...,https://www.amazon.com/Mellanni-Bed-Sheet-Set-...,Mellanni Queen Sheet Set - Hotel Luxury 1800 B...
3,"Holy sheet, what a surprise.",I have a hard time finding sheets I like. They...,08 Oct 2017,,Pragmatic Knee,5.0,"1,831 people found this helpful",,True,Mellanni Queen Sheet Set - Hotel Luxury 1800 B...,https://www.amazon.com/Mellanni-Bed-Sheet-Set-...,Mellanni Queen Sheet Set - Hotel Luxury 1800 B...
4,I'm a Microfiber Convert,Despite a highly positive recommendation from ...,27 Feb 2018,,D. T. Miller,5.0,"2,973 people found this helpful",Size: Queen Color: Gold,True,Mellanni Queen Sheet Set - Hotel Luxury 1800 B...,https://www.amazon.com/Mellanni-Bed-Sheet-Set-...,Mellanni Queen Sheet Set - Hotel Luxury 1800 B...
