# YouTube API를 통한 크롤링

# 나경원 류삼영 키워드



In [None]:
import logging
import os
import re
import time
from datetime import datetime

import numpy as np
import pandas as pd
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError



# Logging setup: INFO-and-above records go to both the console and a log file.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Re-running this setup (e.g. re-executing the notebook cell) must not stack
# duplicate handlers on the root logger, otherwise every record is emitted
# several times. Handlers installed by an earlier run are recognised by name
# and removed before fresh ones are attached.
for stale_handler in list(logger.handlers):
    if stale_handler.get_name() in ('youtube_scraping_console', 'youtube_scraping_file'):
        logger.removeHandler(stale_handler)
        stale_handler.close()

# Console handler
console_handler = logging.StreamHandler()
console_handler.set_name('youtube_scraping_console')
console_handler.setLevel(logging.INFO)

# File handler (append mode; explicit UTF-8 so non-ASCII messages are written
# intact regardless of the platform's default encoding)
file_handler = logging.FileHandler('youtube_scraping.log', mode='a', encoding='utf-8')
file_handler.set_name('youtube_scraping_file')
file_handler.setLevel(logging.INFO)

# Shared log record format
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
console_handler.setFormatter(formatter)
file_handler.setFormatter(formatter)

# Attach the handlers to the logger
logger.addHandler(console_handler)
logger.addHandler(file_handler)

# YouTube Data API keys.
#
# Keys are read from the YOUTUBE_API_KEYS environment variable (comma-separated)
# instead of being hard-coded: credentials committed to a source file are
# visible to everyone who can read it. Any key that was previously committed
# here should be treated as leaked and revoked.
def load_api_keys_from_env(environ=None, variable='YOUTUBE_API_KEYS'):
    """Return the API keys listed in a comma-separated environment variable.

    Args:
        environ: Mapping to read from; defaults to ``os.environ``.
        variable: Name of the variable holding the comma-separated keys.

    Returns:
        A list of non-empty keys with surrounding whitespace removed, in the
        order they were listed. Empty when the variable is unset or blank.
    """
    source = os.environ if environ is None else environ
    raw_value = source.get(variable, '')
    return [key.strip() for key in raw_value.split(',') if key.strip()]


api_keys = load_api_keys_from_env()
if not api_keys:
    logging.getLogger().warning(
        "No YouTube API keys configured; set the YOUTUBE_API_KEYS environment "
        "variable to a comma-separated list of keys."
    )

# Index into api_keys of the key currently in use.
current_key_index = 0

def get_youtube_client():
    """Build a YouTube Data API v3 client for the currently selected API key.

    The key is ``api_keys[current_key_index]``; both are module-level values
    that are only read here, so no ``global`` declaration is needed.

    Returns:
        The client object returned by ``build('youtube', 'v3', ...)``.

    Raises:
        RuntimeError: If ``api_keys`` is empty, so the caller gets a clear
            message instead of an ``IndexError``.
    """
    if not api_keys:
        raise RuntimeError("No YouTube API keys are configured.")
    return build('youtube', 'v3', developerKey=api_keys[current_key_index])

# Module-level API client shared by the request helpers; switch_api_key() rebinds it.
youtube = get_youtube_client()

def switch_api_key():
    """Advance to the next API key, wrapping around, and rebuild the shared client.

    Updates the module-level ``current_key_index`` and ``youtube``.

    Returns:
        The newly built client (the same object now bound to ``youtube``).
    """
    global current_key_index, youtube
    # divmod's remainder gives the wrapped-around position of the next key.
    _, current_key_index = divmod(current_key_index + 1, len(api_keys))
    logger.info(f"Switching to API key: {current_key_index + 1}")
    fresh_client = get_youtube_client()
    youtube = fresh_client
    return fresh_client

# Search keywords and date window.
# Both bounds are naive datetimes and are compared against the API's
# "...Z" timestamps as parsed by search_videos().
# NOTE(review): that makes the window effectively UTC, not local time — confirm
# this is intended.
search_query = "나경원 류삼영"
start_date = datetime.strptime("2024-03-01", "%Y-%m-%d")
# The upper bound is the *end* of the last day. A midnight timestamp would make
# the `published_at_date > end_date` filter discard everything published during
# 2024-04-30, although the output file name (240301_240430) implies that day is
# part of the window.
end_date = datetime.strptime("2024-04-30 23:59:59", "%Y-%m-%d %H:%M:%S")



# ASCII control characters (everything below 0x20 except tab, newline and
# carriage return). Compiled once at import time rather than on every call.
_ILLEGAL_CHARS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')


def remove_illegal_characters(text):
    """Strip ASCII control characters from ``text``.

    Tab (``\\t``), newline (``\\n``) and carriage return (``\\r``) are kept;
    every other character below 0x20 is removed.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string. Non-string values (``None``, numbers, ...) are
        returned unchanged so the function can be applied to mixed-type cells
        without raising ``TypeError``.
    """
    if not isinstance(text, str):
        return text
    return _ILLEGAL_CHARS_RE.sub('', text)



def search_videos(query, start_date, end_date):
    """Search YouTube for videos matching ``query`` published in a date window.

    Results are requested newest-first (``order='date'``) and the window is
    passed to the API via ``publishedAfter`` / ``publishedBefore`` so that no
    quota is spent paging through videos outside it. The same bounds are still
    checked client-side as a safety net.

    Args:
        query: Search keywords.
        start_date: Naive ``datetime``; videos published before it are excluded.
        end_date: Naive ``datetime``; videos published after it are excluded.

    Returns:
        A list of dicts with the keys ``video_id``, ``title``,
        ``published_at`` (the API's timestamp string) and ``url``. On an
        unrecoverable error the videos collected so far are returned.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    search_params = {
        'q': query,
        'part': 'id,snippet',
        'type': 'video',
        'order': 'date',
        'maxResults': 50,  # search.list accepts at most 50 results per page
        'publishedAfter': start_date.strftime(timestamp_format),
        'publishedBefore': end_date.strftime(timestamp_format),
    }
    # Give every configured key one chance (and at least three tries overall)
    # before giving up, so a persistent error cannot loop forever.
    max_attempts = max(len(api_keys), 3)
    failed_attempts = 0
    video_details = []

    while True:
        try:
            # The request is rebuilt from the *current* global client on every
            # iteration: a request object created earlier keeps the old API
            # key, so re-executing it after switch_api_key() would change nothing.
            response = youtube.search().list(**search_params).execute()
        except HttpError as e:
            if e.resp.status not in [403, 500, 503]:
                logger.error(f"HttpError occurred: {e}")
                break
            failed_attempts += 1
            if failed_attempts >= max_attempts:
                logger.error(f"Giving up on video search after {failed_attempts} consecutive failures: {e}")
                break
            logger.error(f"Quota exceeded or server error: {e}. Switching API key.")
            switch_api_key()
            time.sleep(1)  # brief pause before retrying the same page
            continue
        except Exception as e:
            logger.error(f"Error during video search: {e}")
            break

        failed_attempts = 0

        for item in response.get('items', []):
            video_id = item['id']['videoId']
            title = item['snippet']['title']
            published_at = item['snippet']['publishedAt']
            published_at_date = datetime.strptime(published_at, timestamp_format)

            # Date window check. Results arrive newest-first, so the first
            # video older than the window means nothing later can match.
            if published_at_date < start_date:
                return video_details
            if published_at_date > end_date:
                continue

            video_details.append({
                'video_id': video_id,
                'title': title,
                'published_at': published_at,
                'url': f'https://www.youtube.com/watch?v={video_id}'
            })

        logger.info(f"Found {len(video_details)} videos so far...")

        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break
        search_params['pageToken'] = next_page_token
        time.sleep(1)  # delay between requests

    return video_details

def get_video_statistics(video_id):
    """Fetch the view and like counts of a single video.

    On a 403/500/503 response the API key is switched and the request retried,
    but only a bounded number of times. The original recursive retry had no
    limit and would recurse until ``RecursionError`` once every key was
    exhausted.

    Args:
        video_id: The YouTube video ID.

    Returns:
        A ``(view_count, like_count)`` tuple of ints. The API delivers the
        counts as strings; they are converted so the result has one consistent
        type. ``(0, 0)`` is returned when the video is not found, a count is
        missing, or the request ultimately fails.
    """
    # One try per configured key, and at least three tries overall.
    max_attempts = max(len(api_keys), 3)

    for attempt in range(1, max_attempts + 1):
        try:
            # Uses the current global client, so a key switch takes effect here.
            response = youtube.videos().list(
                part='statistics',
                id=video_id
            ).execute()
            items = response.get('items')
            if not items:
                return 0, 0
            stats = items[0].get('statistics', {})
            return int(stats.get('viewCount', 0)), int(stats.get('likeCount', 0))
        except HttpError as e:
            if e.resp.status not in [403, 500, 503]:
                logger.error(f"HttpError occurred while retrieving statistics: {e}")
                return 0, 0
            if attempt == max_attempts:
                logger.error(f"Giving up on statistics for video {video_id} after {max_attempts} attempts: {e}")
                return 0, 0
            logger.error(f"Quota exceeded or server error: {e}. Switching API key.")
            switch_api_key()
        except Exception as e:
            logger.error(f"Error retrieving statistics for video {video_id}: {e}")
            return 0, 0

    return 0, 0

def get_video_comments(video_id):
    """Collect every top-level comment of a video together with its replies.

    Args:
        video_id: The YouTube video ID.

    Returns:
        A list of dicts with the keys ``comment_id``, ``video_id``,
        ``parent_id`` (``None`` for top-level comments), ``author``, ``text``,
        ``like_count`` and ``published_at``. Replies, fetched through
        ``get_replies``, directly follow their parent. On an unrecoverable
        error the comments collected so far are returned.
    """
    comments = []
    list_params = {
        'part': 'snippet',
        'videoId': video_id,
        'maxResults': 100,  # fetch up to 100 threads per page
        'textFormat': 'plainText',
    }
    # One try per configured key (at least three overall) for a given page, so
    # a persistent error cannot keep the loop spinning forever.
    max_attempts = max(len(api_keys), 3)
    failed_attempts = 0

    while True:
        try:
            # Rebuilt from the current global client each time: a request
            # created before switch_api_key() would still carry the old key.
            response = youtube.commentThreads().list(**list_params).execute()
        except HttpError as e:
            # A video with comments turned off also answers 403; switching
            # keys cannot help, so skip the video right away.
            if b'commentsDisabled' in (getattr(e, 'content', b'') or b''):
                logger.info(f"Comments are disabled for video {video_id}; skipping.")
                break
            if e.resp.status not in [403, 500, 503]:
                logger.error(f"HttpError occurred: {e}")
                break
            failed_attempts += 1
            if failed_attempts >= max_attempts:
                logger.error(f"Giving up on comments for video {video_id} after {failed_attempts} consecutive failures: {e}")
                break
            logger.error(f"Quota exceeded or server error: {e}. Switching API key.")
            switch_api_key()
            time.sleep(1)  # brief pause before retrying the same page
            continue
        except Exception as e:
            logger.error(f"Error retrieving comments for video {video_id}: {e}")
            break

        failed_attempts = 0

        for item in response.get('items', []):
            comment = item['snippet']['topLevelComment']['snippet']
            comments.append({
                'comment_id': item['id'],
                'video_id': video_id,
                'parent_id': None,
                'author': comment['authorDisplayName'],
                'text': comment['textDisplay'],
                'like_count': comment['likeCount'],
                'published_at': comment['publishedAt']
            })

            # Replies are fetched separately, only for threads that have any.
            if item['snippet']['totalReplyCount'] > 0:
                comments.extend(get_replies(item['id'], video_id))

        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break
        list_params['pageToken'] = next_page_token
        time.sleep(1)  # delay between requests

    return comments

def get_replies(parent_id, video_id):
    """Collect every reply to one top-level comment.

    Args:
        parent_id: ID of the top-level comment whose replies are requested.
        video_id: ID of the video the comment belongs to; copied into each row.

    Returns:
        A list of dicts with the keys ``comment_id``, ``video_id``,
        ``parent_id``, ``author``, ``text``, ``like_count`` and
        ``published_at``. On an unrecoverable error the replies collected so
        far are returned.
    """
    replies = []
    list_params = {
        'part': 'snippet',
        'parentId': parent_id,
        'maxResults': 100,  # fetch up to 100 replies per page
        'textFormat': 'plainText',
    }
    # One try per configured key (at least three overall) for a given page, so
    # a persistent error cannot keep the loop spinning forever.
    max_attempts = max(len(api_keys), 3)
    failed_attempts = 0

    while True:
        try:
            # Rebuilt from the current global client each time: a request
            # created before switch_api_key() would still carry the old key.
            response = youtube.comments().list(**list_params).execute()
        except HttpError as e:
            if e.resp.status not in [403, 500, 503]:
                logger.error(f"HttpError occurred: {e}")
                break
            failed_attempts += 1
            if failed_attempts >= max_attempts:
                logger.error(f"Giving up on replies for comment {parent_id} after {failed_attempts} consecutive failures: {e}")
                break
            logger.error(f"Quota exceeded or server error: {e}. Switching API key.")
            switch_api_key()
            time.sleep(1)  # brief pause before retrying the same page
            continue
        except Exception as e:
            logger.error(f"Error retrieving replies for comment {parent_id}: {e}")
            break

        failed_attempts = 0

        for item in response.get('items', []):
            reply = item['snippet']
            replies.append({
                'comment_id': item['id'],
                'video_id': video_id,
                'parent_id': parent_id,
                'author': reply['authorDisplayName'],
                'text': reply['textDisplay'],
                'like_count': reply['likeCount'],
                'published_at': reply['publishedAt']
            })

        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break
        list_params['pageToken'] = next_page_token
        time.sleep(1)  # delay between requests

    return replies

# Search for videos
start_time = time.time()
video_details = search_videos(search_query, start_date, end_date)
logger.info(f"Total videos found: {len(video_details)}")

# Fetch statistics and comments for every video. One output row is produced
# per comment, so a video without any retrievable comment contributes no rows.
all_data = []
total_videos = len(video_details)
for i, video in enumerate(video_details):
    video_start_time = time.time()
    video_id = video['video_id']
    try:
        view_count, like_count = get_video_statistics(video_id)
        comments = get_video_comments(video_id)
        # Free-text fields are passed through remove_illegal_characters():
        # control characters in a title or comment otherwise make the Excel
        # export fail.
        clean_title = remove_illegal_characters(video['title'])
        for comment in comments:
            all_data.append({
                'video_id': video_id,
                'title': clean_title,
                'published_at': video['published_at'],
                'view_count': view_count,
                'like_count': like_count,
                'url': video['url'],
                'comment_id': comment['comment_id'],
                'parent_id': comment['parent_id'],
                'author': remove_illegal_characters(comment['author']),
                'text': remove_illegal_characters(comment['text']),
                'comment_like_count': comment['like_count'],
                'comment_published_at': comment['published_at']
            })
    except Exception as e:
        # Stop crawling on an unexpected error; rows collected so far are kept
        # in all_data and still converted to a DataFrame below.
        logger.error(f"Error processing video {video_id}: {e}")
        break

    video_end_time = time.time()
    elapsed_time = video_end_time - video_start_time
    total_elapsed_time = video_end_time - start_time

    # Progress report
    logger.info(f"Processed video {i + 1}/{total_videos} ({(i + 1) / total_videos * 100:.2f}%) - {video_id}, comments retrieved: {len(comments)}, time elapsed for this video: {elapsed_time:.2f}s, total elapsed time: {total_elapsed_time:.2f}s")

# Convert the collected rows to a DataFrame
df = pd.DataFrame(all_data)

# Data saving helpers
def _excel_safe_copy(frame):
    """Return a copy of ``frame`` with control characters stripped from string cells."""
    cleaned = frame.copy()
    for column in cleaned.columns:
        series = cleaned[column]
        if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
            cleaned[column] = series.map(
                lambda cell: remove_illegal_characters(cell) if isinstance(cell, str) else cell
            )
    return cleaned


def _write_csv_and_excel(frame, stem):
    """Write ``frame`` to ``<stem>.csv`` and, sanitised for Excel, to ``<stem>.xlsx``."""
    frame.to_csv(f'{stem}.csv', index=False)
    _excel_safe_copy(frame).to_excel(f'{stem}.xlsx', index=False)


def save_large_dataframe(df, base_filename, max_rows_per_file=100000):
    """Save ``df`` as CSV and Excel, splitting into several files if that fails.

    The whole frame is first written to ``<base_filename>.csv`` and
    ``<base_filename>.xlsx``. If either write raises, the frame is cut into
    consecutive chunks of at most ``max_rows_per_file`` rows and each chunk is
    written to ``<base_filename>_part<N>.csv`` / ``.xlsx``. Write errors are
    logged, not raised.

    String cells are cleaned with ``remove_illegal_characters`` before the
    Excel export only. Control characters otherwise make the Excel write fail
    for the full frame and again for the part containing them.

    Args:
        df: The DataFrame to save.
        base_filename: Output path without extension.
        max_rows_per_file: Maximum number of rows per file in the fallback.

    Raises:
        ValueError: If ``max_rows_per_file`` is not positive (previously this
            surfaced as a ``ZeroDivisionError`` inside the error handler).
    """
    if max_rows_per_file <= 0:
        raise ValueError("max_rows_per_file must be a positive integer")

    try:
        _write_csv_and_excel(df, base_filename)
        logger.info(f"Data successfully saved to {base_filename}.csv and {base_filename}.xlsx")
        return
    except Exception as e:
        logger.error(f"Error saving full dataframe: {e}")

    # Ceiling division without going through floating point.
    num_parts = (len(df) + max_rows_per_file - 1) // max_rows_per_file
    logger.info(f"Splitting data into {num_parts} parts due to size limit.")

    for part in range(num_parts):
        start_row = part * max_rows_per_file
        df_part = df.iloc[start_row:start_row + max_rows_per_file]
        part_filename = f'{base_filename}_part{part + 1}'
        try:
            _write_csv_and_excel(df_part, part_filename)
            logger.info(f"Data part {part + 1} saved to {part_filename}.csv and {part_filename}.xlsx")
        except Exception as e_part:
            logger.error(f"Error saving part {part + 1} of dataframe: {e_part}")

# Attempt to save the data (CSV and Excel; the helper falls back to split files on error)
save_large_dataframe(df, 'youtube_videos_with_comments_240301_240430')

# NOTE: this is logged unconditionally. save_large_dataframe catches and logs
# write errors itself, so this line alone does not prove the save succeeded.
logger.info("데이터 저장이 완료되었습니다.")


2024-06-04 08:19:58,588 - INFO - file_cache is only supported with oauth2client<4.0.0
2024-06-04 08:19:59,276 - INFO - Found 49 videos so far...
2024-06-04 08:20:00,769 - INFO - Found 99 videos so far...
2024-06-04 08:20:02,109 - INFO - Found 149 videos so far...
2024-06-04 08:20:03,593 - INFO - Found 199 videos so far...
2024-06-04 08:20:05,082 - INFO - Found 249 videos so far...
2024-06-04 08:20:06,590 - INFO - Found 299 videos so far...
2024-06-04 08:20:07,846 - INFO - Total videos found: 300
2024-06-04 08:20:14,709 - INFO - Processed video 1/300 (0.33%) - XZGyguAz9k0, comments retrieved: 76, time elapsed for this video: 3.72s, total elapsed time: 16.06s
2024-06-04 08:20:17,112 - INFO - Processed video 2/300 (0.67%) - xWd6Ixzd3sw, comments retrieved: 11, time elapsed for this video: 2.40s, total elapsed time: 18.46s
2024-06-04 08:20:23,050 - INFO - Processed video 3/300 (1.00%) - 4MnWXHlzuBY, comments retrieved: 33, time elapsed for this video: 5.94s, total elapsed time: 24.40s
2024

KeyboardInterrupt: 

In [None]:
# Convert the rows collected so far into a DataFrame
df = pd.DataFrame(all_data)

# Save as a CSV file and as an Excel workbook
df.to_csv('youtube_videos_with_comments.csv', index=False)
df.to_excel('youtube_videos_with_comments.xlsx', index=False)

# The confirmation message names only the CSV file, although both files are written.
print("데이터가 youtube_videos_with_comments.csv 파일에 저장되었습니다.")

데이터가 youtube_videos_with_comments.csv 파일에 저장되었습니다.


In [None]:


import os
import platform
import time

def play_sound(times=10):
    """Play a short notification sound ``times`` times.

    The player depends on ``platform.system()``:

    * ``'Windows'``: ``winsound.Beep`` (1 kHz for 0.5 s), 0.1 s pause.
    * ``'Darwin'`` (macOS): ``afplay`` with the system Ping sound, 0.5 s pause.
    * anything else (assumed Linux): ``aplay`` with an ALSA sample, 0.5 s pause.

    The external players are started from an argument list, not a shell
    command string. If the player cannot be started, one message is printed
    and the function returns instead of failing on every repetition.

    Args:
        times: Number of repetitions; zero or a negative value plays nothing.
    """
    system_name = platform.system()

    if system_name == 'Windows':
        import winsound
        for _ in range(times):
            winsound.Beep(1000, 500)  # 1kHz sound for 0.5 seconds
            time.sleep(0.1)  # 0.1 second pause between beeps
        return

    import subprocess  # local import, like winsound above

    if system_name == 'Darwin':  # macOS
        command = ['afplay', '/System/Library/Sounds/Ping.aiff']
    else:  # Linux
        command = ['aplay', '/usr/share/sounds/alsa/Front_Center.wav']

    for _ in range(times):
        try:
            # check=False: a non-zero exit status of the player is ignored,
            # as it was with os.system.
            subprocess.run(command, check=False)
        except OSError as e:
            print(f"Could not run {command[0]}: {e}")
            return
        time.sleep(0.5)  # 0.5 second pause between sounds

# Play the notification sound ten times (presumably to signal that the preceding cells have finished).
play_sound(10)

---