In [554]:
from __future__ import print_function

import io
import json
import os
import sys
import time

import argparse
import lxml.html
import requests
from lxml.cssselect import CSSSelector
import datetime
 
now = datetime.datetime.now()

In [555]:
def download_comments(youtube_id, sleep=.1):
    if 'liveStreamability' in requests.get(YOUTUBE_VIDEO_URL.format(youtube_id=youtube_id)).text:
        print('Live stream detected! Not all comments may be downloaded.')
        return download_comments_new_api(youtube_id, sleep)
    return download_comments_old_api(youtube_id, sleep)

In [556]:
def download_comments_new_api(youtube_id, sleep=1):
    # Use the new youtube API to download some comments
    session = requests.Session()
    session.headers['User-Agent'] = USER_AGENT

    response = session.get(YOUTUBE_VIDEO_URL.format(youtube_id=youtube_id))
    html = response.text
    session_token = find_value(html, 'XSRF_TOKEN', 3)

    data = json.loads(find_value(html, 'window["ytInitialData"] = ', 0, '\n').rstrip(';'))
    ncd = next(search_dict(data, 'nextContinuationData'))
    continuations = [(ncd['continuation'], ncd['clickTrackingParams'])]

    while continuations:
        continuation, itct = continuations.pop()
        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL_NEW,
                                params={'action_get_comments': 1,
                                        'pbj': 1,
                                        'ctoken': continuation,
                                        'continuation': continuation,
                                        'itct': itct},
                                data={'session_token': session_token},
                                headers={'X-YouTube-Client-Name': '1',
                                         'X-YouTube-Client-Version': '2.20200207.03.01'})

        if not response:
            break
        if list(search_dict(response, 'externalErrorMessage')):
            raise RuntimeError('Error returned from server: ' + next(search_dict(response, 'externalErrorMessage')))

        # Ordering matters. The newest continuations should go first.
        continuations = [(ncd['continuation'], ncd['clickTrackingParams'])
                         for ncd in search_dict(response, 'nextContinuationData')] + continuations

        for comment in search_dict(response, 'commentRenderer'):
            yield {'cid': comment['commentId'],
                   'text': ''.join([c['text'] for c in comment['contentText']['runs']]),
                   'time': comment['publishedTimeText']['runs'][0]['text'],
                   'author': comment.get('authorText', {}).get('simpleText', '')}

        time.sleep(sleep)

In [557]:
def download_comments_old_api(youtube_id, sleep=1):
    # Use the old youtube API to download all comments (does not work for live streams)
    session = requests.Session()
    session.headers['User-Agent'] = USER_AGENT
    
    # Get Youtube page with initial comments
    response = session.get(YOUTUBE_VIDEO_URL.format(youtube_id=youtube_id))
    html = response.text

    reply_cids = extract_reply_cids(html)

    ret_cids = []
    for comment in extract_comments(html):
        ret_cids.append(comment['cid'])
        yield comment

    page_token = find_value(html, 'data-token')
    session_token = find_value(html, 'XSRF_TOKEN', 3)

    first_iteration = True

    # Get remaining comments (the same as pressing the 'Show more' button)
    while page_token:
        data = {'video_id': youtube_id,
                'session_token': session_token}

        params = {'action_load_comments': 1,
                  'order_by_time': True,
                  'filter': youtube_id}

        if first_iteration:
            params['order_menu'] = True
        else:
            data['page_token'] = page_token

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL_OLD, params, data)
        if not response:
            break

        page_token, html = response.get('page_token', None), response['html_content']
        
        reply_cids += extract_reply_cids(html)
        for comment in extract_comments(html):
            if comment['cid'] not in ret_cids:
                ret_cids.append(comment['cid'])
                yield comment

        first_iteration = False
        time.sleep(sleep)

    # Get replies (the same as pressing the 'View all X replies' link)
    for cid in reply_cids:
        data = {'comment_id': cid,
                'video_id': youtube_id,
                'can_reply': 1,
                'session_token': session_token}

        params = {'action_load_replies': 1,
                  'order_by_time': True,
                  'filter': youtube_id,
                  'tab': 'inbox'}

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL_OLD, params, data)
        if not response:
            break

        html = response['html_content']

        for comment in extract_comments(html):
            if comment['cid'] not in ret_cids:
                ret_cids.append(comment['cid'])
                yield comment
        time.sleep(sleep)

In [558]:
def extract_reply_cids(html):
    tree = lxml.html.fromstring(html)
    sel = CSSSelector('.comment-replies-header > .load-comments')
    return [i.get('data-cid') for i in sel(tree)]

In [559]:
def extract_comments(html):
    tree = lxml.html.fromstring(html)
    item_sel = CSSSelector('.comment-item')
    text_sel = CSSSelector('.comment-text-content')
    time_sel = CSSSelector('.time')
    author_sel = CSSSelector('.user-name')

    for item in item_sel(tree):
        yield {'cid': item.get('data-cid'),
               'text': text_sel(item)[0].text_content(),
               'time': time_sel(item)[0].text_content().strip(),
               'author': author_sel(item)[0].text_content()}

In [560]:
def find_value(html, key, num_chars=2, separator='"'):
    pos_begin = html.find(key) + len(key) + num_chars
    pos_end = html.find(separator, pos_begin)
    return html[pos_begin: pos_end]

In [561]:
def ajax_request(session, url, params=None, data=None, headers=None, retries=5, sleep=20):
    for _ in range(retries):
        response = session.post(url, params=params, data=data, headers=headers)
        if response.status_code == 200:
            return response.json()
        if response.status_code in [403, 413]:
            return {}
        else:
            time.sleep(sleep)

In [562]:
YOUTUBE_VIDEO_URL = 'https://www.youtube.com/watch?v={youtube_id}'
YOUTUBE_COMMENTS_AJAX_URL_OLD = 'https://www.youtube.com/comment_ajax'
YOUTUBE_COMMENTS_AJAX_URL_NEW = 'https://www.youtube.com/comment_service_ajax'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'


output = './test.csv'
def comment_main(youtube_id):    
    youtube_id = youtube_id    
    comment_list = []

    limit = 100
    count = 0
    with io.open(output, 'w', encoding='utf8') as fp:
        sys.stdout.write('Downloaded %d comment(s)\r' % count)
        sys.stdout.flush()
        for comment in download_comments(youtube_id):
            comment_json = json.dumps(comment, ensure_ascii=False)
            print(comment_json.decode('utf-8') if isinstance(comment_json, bytes) else comment_json, file=fp)
            count += 1
            comment_list.append(comment_json)
            sys.stdout.write('Downloaded %d comment(s)\r' % count)
            sys.stdout.flush()
            if limit and count >= limit:
                break
    print('\nDone!')
    return comment_list

In [316]:
result = comment_main('xOHrzDykafQ')

Downloaded 100 comment(s)
Done!


In [317]:
result

['{"cid": "Ugy0KR5xd1kFg4Zo7KV4AaABAg", "text": "현재 머슬레인 상황..\\nhttps://www.wadiz.kr/web/campaign/detail/qa/58125", "time": "4일 전", "author": "사망여우TV"}',
 '{"cid": "Ugy0KR5xd1kFg4Zo7KV4AaABAg.98rMN8o1PHD98rMcGdBOm1", "text": "참교육", "time": "4일 전", "author": "안녕."}',
 '{"cid": "Ugy0KR5xd1kFg4Zo7KV4AaABAg.98rMN8o1PHD98rOWMAhWiy", "text": "후기 평가 1.1 클라스 ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ", "time": "4일 전", "author": "Danbi 23"}',
 '{"cid": "Ugy0KR5xd1kFg4Zo7KV4AaABAg.98rMN8o1PHD98rPdlporQA", "text": "시즌포는 TEZEWA의 대한민국 공식유통사이며, 머슬레인은 TEZEWA의 설계를 바탕으로 시즌포에서 기획, 디자인, 개발한 독자적인 제품으로 한국시장에 맞춰 개발 및 업그레이드하여\xa0시즌포의 요구기준에 따라 TEZEWA의 중국 생산공장에서 생산되는 OEM제품입니다.\xa0머슬레인과 유사한 제품이 해외 사이트 혹은 직구 등의 경로로 유통될 수 있으나 소재 및 성능, 가격, 구성품, 패키지 등 상당한 차이가 있음을 공유드립니다.\\n\\n이 십새끼들은 독자적이란 말의 뜻을 못 알아처먹나?", "time": "4일 전", "author": "콩구"}',
 '{"cid": "Ugy0KR5xd1kFg4Zo7KV4AaABAg.98rMN8o1PHD98rPr_POA4e", "text": "엄청나네요.", "time": "4일 전", "author": "무서운곰"}',
 '{"cid": "Ugy0KR5xd1kFg4Zo7KV4AaABAg.98rMN8o1PHD98rPywOhTk7", "text": "싫어요는 뭐지", "ti

# 게시글

In [567]:
import requests
from bs4 import BeautifulSoup

keyword = '미르방'

req = requests.get('https://www.youtube.com/results?search_query=%EC%99%80%EB%94%94%EC%A6%88&sp=CAISBAgEEAE%253D')
html = req.text
soup = BeautifulSoup(html, 'html.parser')
my_titles = soup.select(
    'h3 > a'
    )

title = []
url = []

for idx in my_titles:
    if idx.get('href')[:7] != '/watch?':
        pass
    else:
        title.append(idx.text)
        url.append(idx.get('href'))

In [568]:
import pandas as pd

In [569]:
title_list = pd.DataFrame(url, columns = ['url'])
title_list['title'] = title

In [570]:
title_list

Unnamed: 0,url,title
0,/watch?v=sSCuDsrut0E,와디즈 광고_죠죠편
1,/watch?v=F22cAbjUEeE,조금(많이)이상한 와디즈 광고 - 대복이가 bosin탕으로
2,/watch?v=6AwdXvIn9wg,지극히 평범한 와디즈 광고
3,/watch?v=D_Z3MIp84DM,5/23 공간 와디즈 👍🏻
4,/watch?v=Gk6_BFGrM8c,와디즈도 뒤졌답니다!
5,/watch?v=f48ZYEE0wSg,[와디즈 리워드 배송] 누기패드
6,/watch?v=h9MCq-vckro,"와디즈, 디지털 마케팅,리워드,온라인 마케팅, 마소캠퍼스, 스터디파트너"
7,/watch?v=OEAProYzxu8,와디즈 마스크 대복이대왕
8,/watch?v=7CybGyhJxBI,공간 와디즈로 초대합니다!ㅣ플레이 공간 와디즈
9,/watch?v=JlgOg5f-Mzk,대복이가 출근하면서 와디즈에게 반겨줄 밈 (한글자막)


In [571]:
title.get('href')[:7] == '/watch?'

AttributeError: 'list' object has no attribute 'get'

In [516]:
my_titles = soup.select(
    'body'
    )

#video-title

In [517]:
my_titles

[<body class="visibility-logging-enabled ltr exp-invert-logo exp-kevlar-settings exp-mouseover-img exp-responsive exp-search-big-thumbs site-center-aligned site-as-giant-card guide-pinning-enabled appbar-hidden not-nirvana-dogfood delayed-frame-styles-not-in" data-spf-name="other" dir="ltr" id="body">
 <div id="early-body"></div>
 <div id="body-container"><div id="a11y-announcements-container" role="alert"><div id="a11y-announcements-message"></div></div><form action="/logout" method="POST" name="logoutForm"><input name="action_logout" type="hidden" value="1"/></form><div id="masthead-positioner"> <div id="ticker-content">
 </div>
 <div class="clearfix yt-base-gutter" id="yt-masthead-container"> <button class="skip-nav" data-target-id="main" id="a11y-skip-nav" tabindex="3">
 탐색 건너뛰기
   </button>
 <div id="yt-masthead"><div class="yt-masthead-logo-container"> <button aria-controls="appbar-guide-menu" aria-label="가이드" class="yt-uix-button yt-uix-button-size-default yt-uix-button-text yt-

In [None]:
ytd-video-renderer.style-scope:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > ytd-video-meta-block:nth-child(2) > div:nth-child(1) > div:nth-child(1) > ytd-channel-name:nth-child(1) > div:nth-child(1) > div:nth-child(1) > yt-formatted-string:nth-child(1) > a:nth-child(1)