In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import time

In [2]:
# Headphones to scrape, one (URL slug, product id) pair per product.
products = [
    ('Beats-Solo3-Wireless-Ear-Headphones', 'B01M0J9OD3'),
    ('Sony-Cancelling-Bluetooth-Headphone-MDR1000X', 'B01KHZ4ZYY'),
    ('beyerdynamic-770-closed-Studio-Headphone', 'B0016MNAAI'),
    ('Sennheiser-Open-Back-Professional-Headphone', 'B00004SY4H'),
    ('Bose-QuietComfort-Acoustic-Cancelling-Headphones', 'B00M1NEUKK'),
    ('Harman-Kardon-CL-Precision-Headphones', 'B00A3RVNXI'),
    ('Samsung-Level-Over-Ear-Bluetooth-Headphone', 'B00KGGK71A'),
    ('V-MODA-Crossfade-Over-Ear-Noise-Isolating-Headphone', 'B00A39PPI0'),
]

In [3]:
# Tabulate the product list: one row per product, one column per tuple field.
amazon_product = pd.DataFrame(
    data=products,
    columns=['productname', 'productid'],
)

In [9]:
# Save the product table without the row index. `index` is a boolean option,
# so pass False explicitly instead of relying on None being falsy.
amazon_product.to_csv('amazon_product.csv', index=False, encoding='utf-8')

In [5]:
def get_lastpage(url):
    """Return the highest review-page number linked from a review page.

    Parameters
    ----------
    url : str
        Address of a product-review page (the caller passes page 1).

    Returns
    -------
    int
        The largest ``pageNumber=<n>`` value found in the page's HTML, or 1
        when the page contains no such link (e.g. every review fits on a
        single page).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # A timeout keeps one stalled connection from hanging the whole scrape.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    # Search the decoded text: ``page.content`` is bytes on Python 3, and a
    # str pattern cannot be applied to a bytes object.
    page_numbers = [int(n) for n in re.findall(r'pageNumber=([0-9]+)', page.text)]
    # Take the maximum instead of a fixed position in the match list, so the
    # result does not depend on how many pagination links the page contains.
    return max(page_numbers) if page_numbers else 1

def get_reviews(url):
    """Scrape one review page into (userid, time, rating, review) tuples.

    The page at ``url`` is downloaded and parsed, the four fields are
    collected by tag and CSS class, and the resulting lists are zipped
    together positionally.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    def stripped_texts(tag, css_class):
        # Text of every matching element, with surrounding whitespace removed.
        return [node.text.strip() for node in soup.find_all(tag, class_ = css_class)]

    author_texts = stripped_texts('a', 'a-size-base a-link-normal author')
    date_texts = stripped_texts('span', 'a-size-base a-color-secondary review-date')
    rating_texts = stripped_texts('span', 'a-icon-alt')
    body_texts = stripped_texts('span', 'a-size-base review-text')

    userids = author_texts

    # Cut the leading "on " from each date string, then skip the first two
    # entries, which are not review dates.
    trimmed_dates = [text[3:] for text in date_texts]
    times = trimmed_dates[2:]

    # Ignore '|' separators and parse the leading "N.N" of each remaining label.
    # The first three and last five values are dropped as non-review ratings
    # (overall product rating, similar-product ratings on the same page).
    parsed_ratings = [float(text[:3]) for text in rating_texts if text != '|']
    ratings = parsed_ratings[3:-5]

    reviews = body_texts

    return zip(userids, times, ratings, reviews)

In [6]:
# Scrape every review page of every product and stack the results into one frame.
# NOTE(review): requests are issued back to back; `time` is imported at the top of
# the notebook but unused in the visible cells - consider pausing between requests.
review_frames = []

for product_name, product_id in products:
    base_url = ('https://www.amazon.com/' + product_name + '/product-reviews/' + product_id
                + '/ref=cm_cr_arp_d_paging_btm_1?ie=UTF8&reviewerType=all_reviews&pageNumber=')
    tot_page = get_lastpage(base_url + '1')  # total page count is read from the first page

    # Gather the (userid, time, rating, review) tuples from every page of this product.
    rows = []
    for page_number in range(1, tot_page + 1):
        rows.extend(get_reviews(base_url + '{}'.format(page_number)))

    df = pd.DataFrame(rows, columns=['userid', 'time', 'rating', 'review'])
    # Parse the scraped date strings and keep only the calendar date.
    df['time'] = pd.to_datetime(df['time']).dt.date
    df['productname'] = product_name
    df['productid'] = product_id
    review_frames.append(df)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0, and
# growing a frame inside the loop re-copies all earlier rows on every iteration.
amazon_review = pd.concat(review_frames, ignore_index=True) if review_frames else pd.DataFrame()

In [7]:
# Save the scraped reviews without the row index. `index` is a boolean option,
# so pass False explicitly instead of relying on None being falsy.
amazon_review.to_csv('amazon_review.csv', index=False, encoding='utf-8')

In [8]:
# Preview the first rows only; rendering the full scraped frame bloats the notebook.
amazon_review.head(10)

Unnamed: 0,userid,time,rating,review,productname,productid
0,Kevin V Sr,2016-10-21,4.0,Heads-up - this is a lengthy review - but when...,Beats-Solo3-Wireless-Ear-Headphones,B01M0J9OD3
1,MindSeedTV,2016-10-10,4.0,Great pair of Headphones! I really love the q...,Beats-Solo3-Wireless-Ear-Headphones,B01M0J9OD3
2,Big Dutch,2016-10-21,5.0,Disclaimer: I've never owned a pair of headpho...,Beats-Solo3-Wireless-Ear-Headphones,B01M0J9OD3
3,c0deName_ZERO,2016-11-28,4.0,Ladies & Gents this Review is best suited for ...,Beats-Solo3-Wireless-Ear-Headphones,B01M0J9OD3
4,Amazon Customer,2016-12-27,5.0,"Crazy fast charging, wireless, comfortable on ...",Beats-Solo3-Wireless-Ear-Headphones,B01M0J9OD3
5,Dan,2017-03-16,1.0,"Buyer Beware!!! Thrilled with the price, I pu...",Beats-Solo3-Wireless-Ear-Headphones,B01M0J9OD3
6,KIMBERLY N.,2016-12-30,5.0,LOVE LOVE LOVE!! These are my first pair and t...,Beats-Solo3-Wireless-Ear-Headphones,B01M0J9OD3
7,Shepherd luvr,2017-10-01,5.0,After looking at numerous headphones my wife s...,Beats-Solo3-Wireless-Ear-Headphones,B01M0J9OD3
8,Jeremy,2017-04-06,4.0,huge bass enhancement ( you can definitely tel...,Beats-Solo3-Wireless-Ear-Headphones,B01M0J9OD3
9,Mckaela Schmidt,2017-05-03,5.0,I bought these for my boyfriend as a surprise ...,Beats-Solo3-Wireless-Ear-Headphones,B01M0J9OD3
