In [1]:
from tempfile import mkdtemp
import codecs

import os
import re
import time
import datetime
import pandas as pd
import urllib.request
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.select import Select

import googleapiclient.errors
from googleapiclient.discovery import build


In [36]:
# パラメータ設定
YOUTUBE_API_KEY = "AIzaSyBJUFkD7dTbd5A3PazoBImZotveY4WPJ5Y"
SCAN_LIMIT      = 30
UPDATE          = False
DATABASE_URL    = "https://vtuber-post.com/database.html"

st = "2024-04-01T00:00:00+09:00"
ed = "2024-05-01T00:00:00+09:00"

output_folder        = "..\csv"
filename_livers_info = os.path.join(output_folder,"livers_info.csv")
filename_videos_info = os.path.join(output_folder,"videos_info.csv")

In [37]:
# UPDATE が TRUE or ファイルが無ければファイルを作成
os.makedirs(output_folder, exist_ok=True)

if UPDATE or not os.path.exists(filename_livers_info):
    with open(filename_livers_info, mode='w', encoding='utf8') as f:
        f.write(f"channelId,title,publishedAt,viewCount,subscriberCount,hiddenSubscriberCount,videoCount")

if UPDATE or not os.path.exists(filename_videos_info):
    with open(filename_videos_info, mode='w', encoding='utf8') as f:
        f.write(f"channelId,publishedAt,video_type,title,duration,viewCount,likeCount,favoriteCount,commentCount")

# UPDATE が FALSE なら、既にcsvに乗っているチャンネルIDをリストにまとめる
if not UPDATE:
    df = pd.read_csv(filename_livers_info)
    list_existChannelId = df["channelId"].tolist()
else:
    list_existChannelId = []

In [38]:
# YouTube接続の設定
youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)

In [39]:
# ブラウザの起動
options = webdriver.ChromeOptions()
options.add_argument("--window-size=1200,1000")
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver=driver, timeout=30)

In [40]:
# Vtuberデータベースを開く
driver.get(DATABASE_URL)
wait.until(EC.presence_of_all_elements_located)

<function selenium.webdriver.support.expected_conditions.presence_of_all_elements_located.<locals>._predicate(driver: Union[selenium.webdriver.remote.webdriver.WebDriver, selenium.webdriver.remote.webelement.WebElement])>

In [41]:
# 検索条件を入力

# 所属：所属なし
office = driver.find_element(By.ID, 'office')
select_office = Select(office)
select_office.select_by_index(1)

# 表示件数：100件
limit = driver.find_element(By.ID, 'limit')
select_limit = Select(limit)
select_limit.select_by_index(2)

# 検索条件を追加する
more = driver.find_element(By.XPATH, '//*[@id="cont_search_bottom_wrap"]/dl/div/span')
more.click()
time.sleep(3)

# 国籍：日本
country = driver.find_element(By.ID, 'country')
select_country = Select(country)
select_country.select_by_index(1)

# 上記の条件で絞り込む
search_submit = driver.find_element(By.ID, 'search_submit')
search_submit.click()
time.sleep(3)

In [42]:
# 今回データを集めるchannelIDを取得
list_channelId = []

numScan = 0
flag_scanEnd = False

# ページのループ
page = 0
while 1:
    page += 1

    # ページの切り替え処理
    if page!=1:
        # 適切なページボタンを探すためのループ
        i = 0
        while 1:
            i += 1

            try:
                button = driver.find_element(By.XPATH, f'//*[@id="cont"]/p[1]/a[{i}]')
            # 次のページが無ければループを抜ける
            except NoSuchElementException:
                break

            # ボタンに書かれた数字が次のページ番号ならボタンを押す
            buttonNum = button.text
            if buttonNum == str(page+1):
                button.click()
                wait = WebDriverWait(driver=driver, timeout=30)
                break

    #次のページがなければループを抜ける
    if flag_scanEnd:break

    # ページ内でchannelIDを収集するためのループ
    i = 0
    while 1:
        i += 1

        try:
            youtubeURL = driver.find_element(By.XPATH, f'//*[@id="cont"]/div/div[{i}]/p[1]/a[1]').get_attribute("href")
        except NoSuchElementException:
            break

        channelId  = youtubeURL.replace('https://vtuber-post.com/database_detail.html?id=','')

        # 既にcsvに記載済みのchannelIDで無ければ、リストに加える。(UPDATE=FALSEのとき)
        if channelId not in list_existChannelId:
            list_channelId.append(channelId)
            numScan += 1

        # SCAN数が上限に達したらループを抜ける
        if numScan >= SCAN_LIMIT:
            flag_scanEnd = True
            break

    #SCAN数が上限に達していたらループを抜ける
    if flag_scanEnd:break

In [43]:
#データの取得
for channelId in list_channelId:

    # チャンネルの概要を取得
    response_channel = youtube.channels().list(
        part = "snippet, statistics",
        id   = channelId,
        hl   = "ja_JP",
    ).execute()

    title                 = response_channel["items"][0]["snippet"]["title"]
    publishedAt           = response_channel["items"][0]["snippet"]["publishedAt"]
    viewCount             = response_channel["items"][0]["statistics"]["viewCount"]
    subscriberCount       = response_channel["items"][0]["statistics"]["subscriberCount"]
    hiddenSubscriberCount = response_channel["items"][0]["statistics"]["hiddenSubscriberCount"]
    videoCount            = response_channel["items"][0]["statistics"]["videoCount"]

    with open(filename_livers_info, mode='a', encoding='utf8') as f:
        f.write(f"\n{channelId},'{title}',{publishedAt},{viewCount},{subscriberCount},{hiddenSubscriberCount},{videoCount}")

    # チャンネルのvideoIDをすべて取得
    nextPageToken = ""
    list_videoId = []

    while 1:
        response_search = youtube.search().list(
            part        = "id",
            channelId   = channelId,
            maxResults  = 50,
            publishedAfter  = st,
            publishedBefore = ed,
            type      = "video",
            pageToken = nextPageToken
        ).execute()

        for i in range(0,response_search["pageInfo"]["resultsPerPage"]):
            list_videoId.append(response_search["items"][i]["id"]["videoId"])

        try:
            nextPageToken = response_search["nextPageToken"]
        except KeyError:
            break

    # videoIDから詳細をすべて取得
    i = 0

    for videoId in list_videoId:

        response_videos = youtube.videos().list(
            part = "snippet, statistics, contentDetails, liveStreamingDetails",
            id   = videoId,
            hl   = "ja_JP",
        ).execute()
        
        title         = response_videos["items"][0]["snippet"]["title"]
        publishedAt   = response_videos["items"][0]["snippet"]["publishedAt"]
        duration      = response_videos["items"][0]["contentDetails"]["duration"]
        viewCount     = response_videos["items"][0]["statistics"]["viewCount"]

        # 高評価の数は非公開のときもある
        try:
            likeCount = response_videos["items"][0]["statistics"]["likeCount"]
        except KeyError:
            likeCount = ""
        
        favoriteCount = response_videos["items"][0]["statistics"]["favoriteCount"]
        commentCount  = response_videos["items"][0]["statistics"]["commentCount"]

        # type取得
        if response_videos["items"][0].get('liveStreamingDetails') is not None:
            video_type = "live"
        elif re.search("#shorts", title) is not None:
            video_type = "short"
        else:
            video_type = "movie"

        # ファイル書き込み
        with open(filename_videos_info, mode='a', encoding='utf8') as f:
            f.write(f"\n{channelId},{publishedAt},{video_type},'{title}',{duration},{viewCount},{likeCount},{favoriteCount},{commentCount}")

HttpError: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/videos?part=snippet%2C+statistics%2C+contentDetails%2C+liveStreamingDetails&id=ZKi7ZgSO-LY&hl=ja_JP&key=AIzaSyBJUFkD7dTbd5A3PazoBImZotveY4WPJ5Y&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">

In [33]:
response_videos["items"][0]["statistics"]

{'viewCount': '851', 'favoriteCount': '0', 'commentCount': '5'}

In [102]:
response_videos = youtube.videos().list(
part = "snippet, statistics, liveStreamingDetails, contentDetails",
id   = "0zvDX3OrKb4",
hl   = "ja_JP",
).execute()

In [20]:
len(list_channelId)

100