# Set_API

In [10]:
from googleapiclient.discovery import build
import datetime
import time
#API_KEY =  'AIzaSyDPHaDiN5kPC23gymzo_J497dVmh5lU65E'  # polybuffer
API_KEY = 'AIzaSyArMFYxvmiHMWK6hmj4qHPAwYf_00ZoGVA'
# API_KEY = 'AIzaSyAZioI1gbrRVHsB8Dylb96npimTouclBB8'#nairen_2
# API_KEY = 'AIzaSyBBjHEU2XTcP1hn9GVEHt_f4I9jMSIFaPs'#nairen_1
# API_KEY = 'AIzaSyArMFYxvmiHMWK6hmj4qHPAwYf_00ZoGVA'#daniel
youtube = build('youtube', 'v3', developerKey=API_KEY)


### 爬取影片資料

In [11]:
def get_video_data(video_id):
    request = youtube.videos().list(
        part='statistics, snippet',
        id=video_id
    )
    response = request.execute()

    if 'items' not in response or len(response['items']) == 0:
        return None  # Return None if no data

    item = response['items'][0]
    statistics = item['statistics']
    snippet = item['snippet']

    # Extracting required data
    title = snippet['title']
    views = int(statistics.get('viewCount', 0))
    likes = int(statistics.get('likeCount', 0))
    comments = int(statistics.get('commentCount', 0))
    publish_date = snippet['publishedAt']
    publish_days = (datetime.datetime.now() - datetime.datetime.fromisoformat(publish_date[:-1])).days

    return {
        'title': title,
        'views': views,
        'likes': likes,
        'comments': comments,
        'publish_days': publish_days,
    }

### 解析video_id

In [12]:
def get_video_ids_from_channel(channel_id, max_results=500):
    video_ids = []
    next_page_token = None

    while len(video_ids) < max_results:
        # Fetching at most 50 results at a time
        request = youtube.search().list(
            part='id',
            channelId=channel_id,
            maxResults=50,
            pageToken=next_page_token
        )
        response = request.execute()

        # Extract video IDs
        for item in response['items']:
            if item['id']['kind'] == 'youtube#video':
                video_ids.append(item['id']['videoId'])

        # Check if there's another page of results
        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break

    return video_ids[:max_results]

###                                                             批量存取影片資料

In [13]:
def batch_video_data(video_ids):
    data = []
    for i in range(0, len(video_ids), 50):
        batch_ids = video_ids[i:i + 50]
        request = youtube.videos().list(
            part='statistics, snippet',
            id=','.join(batch_ids)
        )
        response = request.execute()

        for item in response['items']:
            snippet = item['snippet']
            statistics = item['statistics']
            publish_date = snippet['publishedAt']
            publish_days = (datetime.datetime.now() - datetime.datetime.fromisoformat(publish_date[:-1])).days
            views = int(statistics.get('viewCount', 0))

            # Filter out videos with 0 views
            if views > 0:
                data.append({
                    'video_id': item['id'],  # 添加 video_id
                    'title': snippet['title'],
                    'views': views,
                    'likes': int(statistics.get('likeCount', 0)),
                    'comments': int(statistics.get('commentCount', 0)),
                    'publish_days': publish_days
                })

        # 在每次处理一批视频后延时
        time.sleep(2.0)  # 延时 2 秒

    return data


**Set_Number of Video**

In [14]:
def get_channel_video_data(channel_id, max_results=500):
    video_ids = get_video_ids_from_channel(channel_id, max_results)
    video_data = batch_video_data(video_ids)
    return video_data

### 訂閱數資料

In [15]:
def get_channel_subscriber_count(channel_id):
    request = youtube.channels().list(
        part='statistics',
        id=channel_id
    )
    response = request.execute()

    if 'items' not in response or len(response['items']) == 0:
        return None

    return int(response['items'][0]['statistics'].get('subscriberCount', 0))

### 由handle來找channel資訊

In [16]:
from googleapiclient.errors import HttpError
def get_channel_id_by_handle(handle):
    try:
        request = youtube.channels().list(
            part='snippet',
            forHandle=handle
        )
        response = request.execute()

        if 'items' in response and len(response['items']) > 0:
            return response['items'][0]['id']
        print("無法找到頻道，請檢查 Handle 是否正確。")
        return None
    except HttpError as e:
        print(f"API 錯誤：{e}")
        return None


In [17]:
from tqdm import tqdm
import os
import pandas as pd


def batch_main(handles, csv_file='youtube_travel_data.csv'):
    all_video_data = []

    # 使用 tqdm 進度條
    for handle in tqdm(handles, desc="處理頻道進度", unit="頻道"):
        print(f"正在處理頻道: {handle}")
        # 獲取頻道 ID
        channel_id = get_channel_id_by_handle(handle)
        if not channel_id:
            print(f"無法找到頻道: {handle}")
            continue

        # 獲取頻道視頻數據
        video_data = get_channel_video_data(channel_id)
        if not video_data:
            print(f"未能獲取 {handle} 的任何視頻數據。")
            continue

        # 獲取訂閱數
        subscribers = get_channel_subscriber_count(channel_id)
        for item in video_data:
            item['subscribers'] = subscribers
        # 將數據追加到總數據中
        all_video_data.extend(video_data)

    # 使用 tqdm 進度條寫入數據
    print("正在保存數據到 CSV 文件...")
    with tqdm(total=len(all_video_data), desc="保存數據進度", unit="影片") as pbar:
        if not os.path.exists(csv_file):
            # 如果文件不存在，創建並寫入表頭
            df = pd.DataFrame(all_video_data)
            df.to_csv(csv_file, index=False, mode='w', header=True)
        else:
            # 如果文件存在，追加數據（避免重複）
            existing_df = pd.read_csv(csv_file)
            new_df = pd.DataFrame(all_video_data)
            combined_df = pd.concat([existing_df, new_df]).drop_duplicates(subset='video_id')
            combined_df.to_csv(csv_file, index=False, mode='w', header=True)

        pbar.update(len(all_video_data))

    print("所有數據已成功保存到 CSV 文件：")
    print(pd.read_csv(csv_file))

**Set_Channel**

In [23]:
handles = ['@JapanTravelVlogMrG']  #youtube 主頁網址
batch_main(handles)

處理頻道進度:   0%|          | 0/1 [00:00<?, ?頻道/s]

正在處理頻道: @JapanTravelVlogMrG


處理頻道進度: 100%|██████████| 1/1 [00:07<00:00,  7.66s/頻道]


正在保存數據到 CSV 文件...


保存數據進度: 100%|██████████| 125/125 [00:00<00:00, 7795.18影片/s]

所有數據已成功保存到 CSV 文件：
        video_id                                              title   views  \
0    JyRxnHT4HDY                                          第一次卸妝給大家看   70699   
1    dRSoKZM59nw                               小社畜的地位，這樣決定電最少的永遠是贏家   21630   
2    tCbSd2ceoRQ                                     札幌鮭魚卵蓋飯加到滿的出來！   24985   
3    ufnMZWBKE2E                                   佳里 8 家「在地一級棒美食」！   38122   
4    OZXwLhutpZU                           紀念一下首飛！！經過一個多月的課程終於起飛了🛫️   84847   
..           ...                                                ...     ...   
597  0GtSUxBxqEU  【歐洲旅遊】獨闖德國最大紅燈區・漢堡繩索街Reeperbahn・德國漢堡景點觀光・漢堡市集跳...  757581   
598  CL5rzys5l94  【越南旅遊】2023年越南自由行ep1・胡志明西貢的第一天・夜生活太震驚！ Ho Chi M...   61921   
599  jMdKGrKTjYc  【越南旅遊】只要60美金的會安超贊度假村・會安飯店推薦・會安酒店推薦・會安旅館推薦・2023...   14966   
600  CB7H14MdLHw  【日本旅遊】探訪日本“自殺森林”青木原樹海・真的可怕嗎？ 2023年日本山梨縣富士山週邊自由...   35118   
601  C_Ar1fT49OA  【日本旅遊】2021秋京都嵐山紅葉季｜天龍寺·寶嚴院·渡月橋·嵯峨野·保津峽賞楓 [4K V...    4618   

     likes  comments  publish_da




In [19]:
'''import json
import time 
import os
from googleapiclient.discovery import build

API_KEY = 'AIzaSyAZioI1gbrRVHsB8Dylb96npimTouclBB8'
youtube = build('youtube', 'v3', developerKey=API_KEY)

# 進度儲存檔案
PROGRESS_FILE = 'search_progress.json'

def save_progress(handles, next_page_token):
    """儲存搜尋進度到檔案"""
    progress_data = {
        "handles": sorted(list(handles)),  # 確保順序且無重複
        "nextPageToken": next_page_token
    }
    with open(PROGRESS_FILE, 'w', encoding='utf-8') as f:
        json.dump(progress_data, f, ensure_ascii=False, indent=4)

def load_progress():
    """載入搜尋進度"""
    if os.path.exists(PROGRESS_FILE):
        with open(PROGRESS_FILE, 'r', encoding='utf-8') as f:
            progress_data = json.load(f)
            return set(progress_data["handles"]), progress_data["nextPageToken"]
    return set(), None

def get_handles_from_keyword(keyword, max_results=50):
    handles, next_page_token = load_progress()

    while len(handles) < max_results:
        # 搜尋頻道
        search_request = youtube.search().list(
            part='snippet',
            q=keyword,
            type='channel',
            maxResults=50,
            pageToken=next_page_token
        )
        search_response = search_request.execute()

        for item in search_response['items']:
            channel_id = item['snippet']['channelId']

            # 在每次调用频道 API 前加入延时
            time.sleep(3.0)  # 延时 1.5 秒

            # 獲取頻道資訊以提取 handle
            channel_request = youtube.channels().list(
                part='snippet',
                id=channel_id
            )
            channel_response = channel_request.execute()

            for channel in channel_response['items']:
                custom_url = channel['snippet'].get('customUrl', None)
                title = channel['snippet']['title']
                handle = f"@{custom_url}" if custom_url else f"@{title.replace(' ', '').replace('-', '')}"
                handles.add(handle)

        # 儲存進度
        next_page_token = search_response.get('nextPageToken')
        save_progress(handles, next_page_token)

        if not next_page_token:
            break

        # 在每次分页请求之间延时
        time.sleep(3.0)  

    return sorted(list(handles))[:max_results]


# 搜尋台灣相關的頻道
keyword = "台灣"
handles = get_handles_from_keyword(keyword, max_results=10)
print("找到的 Handles:")
batch_main(handles)
'''

'import json\nimport time \nimport os\nfrom googleapiclient.discovery import build\n\nAPI_KEY = \'AIzaSyAZioI1gbrRVHsB8Dylb96npimTouclBB8\'\nyoutube = build(\'youtube\', \'v3\', developerKey=API_KEY)\n\n# 進度儲存檔案\nPROGRESS_FILE = \'search_progress.json\'\n\ndef save_progress(handles, next_page_token):\n    """儲存搜尋進度到檔案"""\n    progress_data = {\n        "handles": sorted(list(handles)),  # 確保順序且無重複\n        "nextPageToken": next_page_token\n    }\n    with open(PROGRESS_FILE, \'w\', encoding=\'utf-8\') as f:\n        json.dump(progress_data, f, ensure_ascii=False, indent=4)\n\ndef load_progress():\n    """載入搜尋進度"""\n    if os.path.exists(PROGRESS_FILE):\n        with open(PROGRESS_FILE, \'r\', encoding=\'utf-8\') as f:\n            progress_data = json.load(f)\n            return set(progress_data["handles"]), progress_data["nextPageToken"]\n    return set(), None\n\ndef get_handles_from_keyword(keyword, max_results=50):\n    handles, next_page_token = load_progress()\n\n    while len(h

# 資料處理

**提取**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
data = pd.read_csv("youtube_travel_data.csv")
features = data[['likes', 'comments', 'publish_days', 'subscribers']]
target = data['views']
weights = {'likes': 1.5, 'comments': 1.0, 'publish_days': 1.0, 'subscribers': 0.8}
for feature, weight in weights.items():
    features.loc[:, feature] = features[feature] * weight
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


  features.loc[:, feature] = features[feature] * weight
  features.loc[:, feature] = features[feature] * weight


Mean Squared Error: 243445620054.7161


**建模**

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# 建立模型
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# 預測與評估（可選，用於確認模型效果）
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"模型已訓練完成，均方誤差 (MSE): {mse}")

模型已訓練完成，均方誤差 (MSE): 243445620054.7161


In [22]:
import pandas as pd
likes = 3792
comments = 185
publish_days = 102
subscribers = 343000
input_data = pd.DataFrame([[likes, comments, publish_days, subscribers]], 
                          columns=['likes', 'comments', 'publish_days', 'subscribers'])

predicted_views = model.predict(input_data)[0]
print(f"預測的 views 為: {predicted_views}")


預測的 views 為: 191905.4
