# BeautifulSoupとSeleniumを使ってAmazonをスクレイピング

In [1]:
# Search settings: how many result pages to scan and which term to search for.
num = 3  # number of search-result pages to walk through
keyword = 'スクレイピング'  # text typed into the site's search box

In [2]:
# ライブラリのインポート
import csv
import datetime
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import numpy as np
import pandas as pd

In [3]:
# Start headless Chrome, open the Amazon top page and submit the keyword
# search so that the browser ends up on the first page of results.
driver_path = ChromeDriverManager().install()
options = Options()
options.add_argument("--headless")
service = Service(executable_path=driver_path)

browser = webdriver.Chrome(options=options, service=service)
# implicitly_wait() sets the element-lookup timeout for the whole session;
# it is not a pause, so it only needs to be called once.
browser.implicitly_wait(10)
browser.get('http://amazon.co.jp/')
search = browser.find_element(By.ID, 'twotabsearchtextbox')
search.send_keys(keyword)
search.submit()

In [4]:
# Walk through the first `num` result pages and collect the name, price,
# image URL and detail-page link of every item that shows a price.
pages = range(1, num + 1)
all_item_name = []
all_price = []
all_image = []
all_link = []

# Dummy browser-like headers for the urllib request. They are the same for
# every page, so they are built once outside the loop.
hdr = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'ja-JP,en-US;q=0.7,en-GB;q=0.3',
    'Connection': 'keep-alive',
}


def _parse_item(item):
    """Extract the fields of one search-result card.

    Args:
        item: BeautifulSoup tag of a 'puis-card-container' div.

    Returns:
        A ``(name, price, image, link)`` tuple, or ``None`` when the card
        contains no 'a-price-whole' span (such cards are skipped).

    Raises:
        ValueError: if the price text is not an integer once commas are
            removed.
    """
    price_tag = item.find('span', {'class': 'a-price-whole'})
    if price_tag is None:
        return None
    price = int(price_tag.text.replace(',', ''))

    # Prefer the second 'a-size-base-plus' span and fall back to the first;
    # use a placeholder when there is none so the result lists stay aligned.
    name_spans = item.find_all('span', {'class': 'a-size-base-plus'})
    if len(name_spans) > 1:
        name = name_spans[1].text
    elif name_spans:
        name = name_spans[0].text
    else:
        name = '[商品名なし]'

    img_tag = item.find('img', {'class': 's-image s-image-optimized-rendering'})
    src = img_tag.get('src') if img_tag is not None else None
    image = src or '[画像なし]'

    # A missing anchor or a missing href both mean "no detail page"; never
    # build a link from None.
    anchor = item.find('a', {'class': 'a-link-normal'})
    href = anchor.get('href') if anchor is not None else None
    link = f'https://amazon.co.jp{href}' if href else '[詳細ページなし]'

    return name, price, image, link


try:
    for p in pages:
        # Re-fetch the page the browser is currently on with urllib, using
        # the dummy headers, and parse that HTML.
        url = browser.current_url
        req = Request(url, headers=hdr)
        sleep(1)  # short pause before each request

        with urlopen(req) as page:
            soup = BeautifulSoup(page, 'lxml')

        for item in soup.find_all('div', {'class': 'puis-card-container'}):
            parsed = _parse_item(item)
            if parsed is None:
                continue
            name, price, image, link = parsed
            all_item_name.append(name)
            all_price.append(price)
            all_image.append(image)
            all_link.append(link)

        # Do not try to advance past the last requested page.
        if p == num:
            break
        # find_elements returns an empty list instead of raising when there
        # is no "next" link, i.e. the results ended before `num` pages.
        next_links = browser.find_elements(By.LINK_TEXT, '次へ')
        if not next_links:
            break
        next_links[0].click()
finally:
    # Always shut the browser down, even when a request or parse step fails.
    browser.quit()

In [5]:
# Write the scraped results to an Excel workbook named <keyword><YYYYMMDD>.xlsx.
import pytz

# Date stamp for the file name, taken in Japan time. The `datetime` module
# imported at the top of the file is used as-is; importing the class under
# the same name would rebind the module name for the rest of the session.
japan_tz = pytz.timezone('Asia/Tokyo')
japan_now = datetime.datetime.now(japan_tz)
today = japan_now.strftime('%Y%m%d')

# Spreadsheet row 1 holds the header, so data row i is sheet row i + 2.
# Column C is 'image_url' and column E is 'link_o' in the frame built below.
sheet_rows = range(2, len(all_image) + 2)
img_func = [f'=image(C{r})' for r in sheet_rows]
link_func = [f'=hyperlink(E{r})' for r in sheet_rows]

# Build the frame in one step; dict order fixes the column order.
df = pd.DataFrame({
    'item': all_item_name,
    'price': all_price,
    'image_url': all_image,
    'image': img_func,
    'link_o': all_link,
    'link': link_func,
})
df.to_excel(f'{keyword}{today}.xlsx', index=False)