In [77]:
import json
import os
import re
import time
import urllib.parse
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
# Start a Chrome browser in headless mode.
options = Options()
options.add_argument('--headless')

# Earlier alternative, kept for reference: rely on the chromedriver_binary package.
#import chromedriver_binary
#chromedriver = webdriver.Chrome()

# Let webdriver_manager install a chromedriver build automatically.
# NOTE(review): the driver path is passed positionally here; newer Selenium
# releases reportedly expect a Service object instead — confirm against the
# installed Selenium version.
from webdriver_manager.chrome import ChromeDriverManager
# `driver` is a module-level object: get_comment() uses it directly and quits it.
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

# Load a page once right after start-up (presumably a smoke test — confirm).
driver.get('https://google.com')

In [79]:
# Yahoo! Finance Japan message-board URL to scrape
# (entering the securities code through Selenium could probably automate this).
url = "https://finance.yahoo.co.jp/cm/message/1009984/a5bda5ua5ha5pa5sa5af"

# Securities code; in this notebook it is only used to build the output file name.
s_code = "9984"

# Output location for the scraped comments (JSON).
file_path = "./yahoo_data/" + s_code + ".json"
# Seconds passed to time.sleep() after each scroll step in get_comment().
scroll_wait_time = 1

In [80]:
def get_comment(url):
    """Open ``url`` in the shared browser and return every comment found on it.

    The function relies on module-level state: the ``driver`` created in the
    set-up cell and the ``scroll_wait_time`` constant. It repeatedly scrolls
    to the last loaded comment (presumably the page lazy-loads older comments
    while scrolling — confirm) until the number of the last comment stops
    changing or is no longer greater than 1, then extracts all comments.

    Timestamps are printed before loading and after scrolling so the duration
    of the scroll phase can be seen.

    Args:
        url: Address of the message-board page to scrape.

    Returns:
        The list produced by ``get_list(driver, meta_info)``.

    Note:
        The shared ``driver`` is quit before this function exits, whether it
        succeeds or raises, so the set-up cell must be re-run before calling
        it again.
    """
    print(datetime.now().strftime("%Y/%m/%d %H:%M:%S"))

    try:
        driver.get(url)

        # Applies to every later element lookup on this driver.
        driver.implicitly_wait(10)

        meta_info = get_meta(driver)

        # Sentinel; assumed to be larger than any real comment number.
        last_com_number = 99999

        while last_com_number > 1:
            com_number = scroll_to_elem(driver)
            time.sleep(scroll_wait_time)

            # No progress since the previous scroll: nothing more was loaded.
            if last_com_number == com_number:
                break
            last_com_number = com_number

        print(datetime.now().strftime("%Y/%m/%d %H:%M:%S"))

        # Collect the comment data.
        return get_list(driver, meta_info)
    finally:
        # Always stop the browser so a failed scrape does not leave a
        # headless Chrome process running.
        driver.quit()

In [81]:
def get_meta(driver):
    """Extract board-level metadata from the text of the page's ``<h1>``.

    The text before the first ``-`` is taken as the security code. The title
    is then split on ``〜``: when that yields exactly two parts, a date is
    extracted from each part with ``extract_date``; otherwise the start date
    is extracted from the whole title and the end date is an empty string.

    Args:
        driver: Selenium WebDriver with the target page already loaded.

    Returns:
        A dict with the keys ``security_code``, ``start_date`` and
        ``end_date``, or ``None`` if parsing the title raised an exception
        (the failure is printed rather than silently discarded).
    """
    # find_element(By..., ...) is available on both Selenium 3 and 4, whereas
    # the find_element_by_* helpers no longer exist in current releases.
    title = driver.find_element(By.TAG_NAME, "h1").text
    try:
        # Security code
        security_code = title.split('-')[0]

        # Dates
        date_parts = title.split('〜')

        if len(date_parts) == 2:
            start_date = extract_date(date_parts[0])
            end_date = extract_date(date_parts[1])
        else:
            start_date = extract_date(title)
            end_date = ""

        return {
            "security_code": security_code,
            "start_date": start_date,
            "end_date": end_date,
        }
    except Exception as e:
        # Best effort: callers receive None, but the reason is made visible.
        print("get_meta: could not parse title {!r}: {!r}".format(title, e))
        return None

In [82]:
def get_list(driver, meta_info):
    """Build one record per element with class ``comment`` on the page.

    Each element's ``data-comment`` attribute and inner HTML are passed to
    ``get_info``; the resulting dict is then extended with ``meta_info``.

    Args:
        driver: Selenium WebDriver with the target page already loaded.
        meta_info: Dict of board-level fields to merge into every record, or
            ``None`` when no metadata is available.

    Returns:
        A list of dicts. Comments for which ``get_info`` returned ``None``
        are skipped. When ``meta_info`` is ``None`` the records are kept
        without the metadata fields instead of all being dropped.
    """
    # Treat missing metadata as "nothing to merge" rather than letting
    # dict.update(None) fail for every single comment.
    shared_meta = meta_info or {}

    comments = []
    for elem in driver.find_elements(By.CLASS_NAME, "comment"):
        tag = elem.get_attribute("innerHTML")
        comment_id = elem.get_attribute("data-comment")

        info = get_info(comment_id, tag)
        if info is None:
            # get_info could not parse this comment; skip it.
            continue

        info.update(shared_meta)
        comments.append(info)
    return comments

In [83]:
def scroll_to_elem(driver):
    """Move the pointer to the last ``comment`` element and return its number.

    Args:
        driver: Selenium WebDriver with the target page already loaded.

    Returns:
        The integer read from the ``comNum`` element inside the last
        ``comment`` element, or ``0`` when the page has no ``comment``
        elements at all.

    Raises:
        ValueError: If the ``comNum`` text contains no digits.
    """
    elems = driver.find_elements(By.CLASS_NAME, "comment")
    if not elems:
        # Nothing to scroll to; 0 is never a valid "more to load" number.
        return 0
    last_elem = elems[-1]

    # Read the comment number before moving, keeping only its digits
    # (the same normalisation get_info applies to this field).
    number_text = last_elem.find_element(By.CLASS_NAME, "comNum").text
    com_number = int(re.sub(r"\D", "", number_text))

    # Move to the last loaded comment.
    ActionChains(driver).move_to_element(last_elem).perform()

    return com_number

In [84]:
def get_info(comment_id, data):
    """Parse the inner HTML of one comment element into a flat dict.

    Args:
        comment_id: Identifier stored unchanged under ``comment_id``.
        data: HTML markup of the comment, parsed with the ``lxml`` parser.

    Returns:
        A dict with the comment number (digits only), user id and name,
        emotion label (``""`` if absent), posting time text, reply target
        (``0`` if the comment is not a reply), reply display text
        (``""`` if not a reply) and comment text. Returns ``None`` if any
        step of the extraction raises.
    """
    soup = BeautifulSoup(data, features="lxml")

    try:
        # Comment number: keep the digits only.
        number_text = re.sub("\\D", "", get_text_by_elem(soup.find(class_="comNum")))

        # Author block: holds the user link, emotion label and timestamp.
        writer = soup.find(class_="comWriter")

        # User
        author_link = writer.find("a")
        author_id = author_link["data-user"]
        author_name = get_text_by_elem(author_link)

        # Emotion label (optional)
        emotion_elem = writer.find(class_="emotionLabel")
        emotion_text = get_text_by_elem(emotion_elem) if emotion_elem else ""

        # Posting time: text of the last <span> in the author block.
        posted_at = get_text_by_elem(writer.find_all("span")[-1])

        # Reply information (optional)
        reply_target, reply_label = 0, ""
        reply_elem = soup.find(class_="comReplyTo")
        if reply_elem:
            reply_target = reply_elem.find("a")["data-parent_comment"]
            reply_label = get_text_by_elem(reply_elem)

        # Comment body
        body_text = get_text_by_elem(soup.find(class_="comText"))

        return {
            "comment_id": comment_id,
            "comment_number": number_text,
            "user_id": author_id,
            "user_name": author_name,
            "emotion": emotion_text,
            "datetime": posted_at,
            "commnet_reply_target": reply_target,
            "commnet_reply_dsp": reply_label,
            "comment_text": body_text,
        }
    except Exception:
        return None

In [85]:
def get_text_by_elem(elem):
    """Return ``elem.text`` with surrounding whitespace removed.

    Any failure while reading or stripping the text (for example ``elem``
    being ``None``) yields ``None`` instead of raising.
    """
    try:
        return elem.text.strip()
    except Exception:
        return None

In [86]:
def extract_date(s):
    """Find the first ``YYYY/M/D`` date in ``s`` and return it as ``YYYYMMDD``.

    Month and day may be written with one or two digits; both are zero-padded
    to two digits in the result.

    Args:
        s: Text to search.

    Returns:
        The date as an eight-character string, or ``None`` when ``s``
        contains no such date.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    match = re.search(r'(\d{4})/(\d{1,2})/(\d{1,2})', s)
    if match is None:
        return None

    year, month, day = match.groups()
    return year + month.zfill(2) + day.zfill(2)

In [None]:
if __name__ == '__main__':

    # Create the output directory before the (slow) scrape so the result is
    # not lost to a FileNotFoundError when the file is finally written.
    output_dir = os.path.dirname(file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Scrape the board and serialise the comments; ensure_ascii=False keeps
    # non-ASCII text readable in the file.
    result_list = get_comment(url)
    json_data = json.dumps(result_list, ensure_ascii=False)
    with open(file_path, mode='w', encoding="utf-8") as f:
        f.write(json_data)

In [None]:
# Display the scrape result. json_data is the JSON string produced by
# json.dumps() in the previous cell, so that cell must have been run first.
# NOTE(review): json_data is a str, not the list of records; pretty-printing
# result_list may have been the intent — confirm.
import pprint
pprint.pprint(json_data)

In [None]:
#end