<a href="https://colab.research.google.com/github/morittistyle/python-basic-kadai/blob/main/python_scraping_kadai004.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install selenium
from datetime import datetime
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

# Labels of the four price cells shown in the chart tooltip
# (open / high / low / close).
_PRICE_LABEL_RE = re.compile('始値|高値|安値|終値')
# A number with optional thousands separators and an optional fraction.
_PRICE_NUMBER_RE = re.compile(r'\d[\d,]*(?:\.\d+)?')


def extract_stock_data(html):
    """Parse the Highcharts tooltip in *html* into one row of price data.

    Args:
        html: Page source (``str`` or ``bytes``) containing a
            ``div.highcharts-tooltip`` element whose first ``<td>`` holds a
            date formatted as ``YYYY/MM/DD``.

    Returns:
        A list of strings: the date as ``YYYY-MM-DD`` followed by the numeric
        part of every tooltip cell labelled 始値, 高値, 安値 or 終値, in
        document order. Thousands separators are removed and the decimal
        point is kept (e.g. ``'33151.7'``).

    Raises:
        ValueError: If the tooltip or its cells are missing, or the first
            cell is not a valid date.
    """
    soup = BeautifulSoup(html, 'html.parser')
    tooltip = soup.find('div', class_='highcharts-tooltip')
    if tooltip is None:
        raise ValueError('highcharts-tooltip element not found in page source')

    cells = tooltip.find_all('td')
    if not cells:
        raise ValueError('highcharts-tooltip element contains no <td> cells')

    date = (
        datetime.strptime(cells[0].text.strip(), '%Y/%m/%d')
        .date()
        .strftime('%Y-%m-%d')
    )
    values = [date]
    for cell in cells[1:]:
        text = cell.text
        if not _PRICE_LABEL_RE.search(text):
            continue
        # Stripping every non-digit would also drop the decimal point and
        # make e.g. 33151.7 and 3315.17 indistinguishable, so extract the
        # number itself and only remove the thousands separators.
        number = _PRICE_NUMBER_RE.search(text)
        values.append(number.group().replace(',', '') if number else '')

    return values

def get_stock_values(driver, url):
    """Scrape the chart at *url* by sweeping the mouse across its plot area.

    The pointer is moved to the right edge of the second ``.highcharts-grid``
    element and then stepped left one pixel at a time. After every step the
    page source is parsed with ``extract_stock_data`` and each distinct row
    is collected.

    Args:
        driver: A Selenium WebDriver instance.
        url: Address of the page containing the Highcharts chart.

    Returns:
        A list of rows (lists of strings) in the order they were first seen
        while sweeping from right to left.

    Raises:
        RuntimeError: If the chart grid does not appear after 10 attempts.
    """
    driver.get(url)

    max_attempts = 10
    graph_xy = None
    for _ in range(max_attempts):
        grids = driver.find_elements(By.CSS_SELECTOR, '.highcharts-grid')
        # find_elements returns a (possibly short) list, never None, so check
        # the length instead of indexing blindly and raising IndexError.
        if len(grids) > 1:
            graph_xy = grids[1]
            break
        print('continue find elements')
        time.sleep(1)  # give the chart time to render before retrying
    if graph_xy is None:
        raise RuntimeError(
            f'chart grid not found after {max_attempts} attempts: {url}'
        )

    # rect values may be floats; range() and pixel offsets need an int.
    graph_width = int(graph_xy.rect['width'])

    # Move to the centre of the plot area, then to its right edge.
    ActionChains(driver).move_to_element(graph_xy).move_by_offset(
        graph_width // 2, 0
    ).perform()

    first_row = extract_stock_data(driver.page_source.encode('utf-8'))
    stock_values = [first_row]
    seen = {tuple(first_row)}

    for _ in range(graph_width - 1):
        ActionChains(driver).move_by_offset(-1, 0).perform()
        row = extract_stock_data(driver.page_source.encode('utf-8'))
        key = tuple(row)
        # Neighbouring pixels usually show the same data point; keep each
        # distinct row only once.
        if key not in seen:
            seen.add(key)
            stock_values.append(row)

    return stock_values


if __name__ == '__main__':
    # Scrape the Nikkei 225 chart for the selected period and print the
    # collected rows as comma-separated lines.
    chart_type = '6month'
    url = f'https://www.nikkei.com/markets/worldidx/chart/nk225/?type={chart_type}'

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')

    chrome_driver = webdriver.Chrome(options=chrome_options)
    start_time = time.time()

    try:
        result = get_stock_values(chrome_driver, url)
        print(f'Scraping time: {time.time() - start_time}')
    finally:
        # Always shut the browser down, even when scraping fails, so no
        # headless Chrome process is left running.
        chrome_driver.quit()

    print('日付, 始値, 高値, 安値, 終値')
    for data in result:
        print(', '.join(data))

Scraping time: 296.567494392395
日付, 始値, 高値, 安値, 終値
2023-12-28, 3347747, 3355388, 3341124, 3352963
2023-12-27, 3353297, 3375575, 3352152, 3368124
2023-12-26, 3329568, 3331226, 3318136, 3330585
2023-12-25, 3341451, 3341451, 3322157, 3325403
2023-12-22, 3325795, 3337519, 331517, 3316905
2023-12-21, 3327681, 3333784, 3309779, 3314047
2023-12-20, 3346732, 3382406, 3346732, 3367594
2023-12-19, 3277421, 3321939, 3265443, 3321939
2023-12-18, 3276923, 3279058, 3254123, 3275898
2023-12-15, 3276056, 3312233, 3273266, 3297055
2023-12-14, 330323, 3312055, 3251504, 3268625
2023-12-13, 3297347, 3310447, 3286438, 3292635
2023-12-12, 3310765, 3317213, 3280024, 328437
2023-12-11, 3266509, 3293308, 326501, 327918
2023-12-08, 3260047, 3260435, 3220538, 3230786
2023-12-07, 3316572, 3319587, 3281469, 3285831
2023-12-06, 3292892, 3345213, 3291409, 334459
2023-12-05, 3302238, 3308982, 3272668, 3277582
2023-12-04, 3331807, 3332438, 3302304, 3323127
2023-12-01, 3353744, 3355157, 3339742, 3343151
2023-11-30, 332