<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Introduction" data-toc-modified-id="Introduction-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Introduction</a></span></li><li><span><a href="#Setup" data-toc-modified-id="Setup-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Setup</a></span><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Functions-and-Classes" data-toc-modified-id="Functions-and-Classes-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Functions and Classes</a></span></li><li><span><a href="#System-dependent-Configuration" data-toc-modified-id="System-dependent-Configuration-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>System-dependent Configuration</a></span></li></ul></li><li><span><a href="#Collect-Data" data-toc-modified-id="Collect-Data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Collect Data</a></span><ul class="toc-item"><li><span><a href="#Collect-Youtube--Data" data-toc-modified-id="Collect-Youtube--Data-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Collect Youtube  Data</a></span></li></ul></li><li><span><a href="#Conclusion" data-toc-modified-id="Conclusion-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Conclusion</a></span></li></ul></div>

# Introduction

This playbook has been developed by the Discovery Lab, Applied Intelligence, Accenture Federal Services. @ 2019-2020
<p> This playbook harvests metadata and data from the list of videos returned from a YouTube search. </p>

<p> <b>INPUT:</b> YouTube search term.</p>

<p> <b>OUTPUT</b> is written under data/raw in the format of YOUTUBE_SEARCH_{Scrape_DateTime}_{Search Terms}.csv</p>

# Setup


<p> The imports, function and class defintions, global variables, and system-dependent configuration are in this section. </p>

<p> The system dependent configuration should be carefully reviewed and configured for each system (e.g., Linux vs. Windows, or the path of an external program) since the playbook will most likely fail without proper configuration. </p>

## Imports

In [1]:
"""This cell imports necessary Python modules and performs initial configuration
"""

### Data libraries
import json
import pandas as pd 
import csv

### Visualization and Interaction
# import matplotlib.pyplot as plt
# plt.style.use('ggplot')

from IPython.display import set_matplotlib_formats, display, clear_output, HTML
set_matplotlib_formats('retina')

import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 
init_notebook_mode(connected=True)

import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
from ipywidgets import VBox, HBox, Button, HTML, Label

### Computation libraries 
import numpy as np
import re
import random

### Graph analysis
# import networkx as nx
# import community

### System related
# import sys
# import warnings;
# warnings.filterwarnings('ignore')

import io
import platform
from pathlib import Path
import os

# from joblib import Parallel, delayed

### Datetime libraries
from datetime import datetime
import time
from pytz import timezone

### NLP dependencies
# import spacy
# from spacy.tokenizer import Tokenizer
# nlp = spacy.load('en')
# tokenizer = Tokenizer(nlp.vocab)

# from langdetect import detect

### Scraping libraries
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup

### Machine learning libraries
# from sklearn import datasets
# from sklearn import linear_model
# from sklearn.feature_selection import f_regression, mutual_info_regression
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report

### Logging
import logging 
logging.basicConfig(level=logging.INFO)

#import spacy
# nlp = spacy.load('en')

## Parameters

In [2]:
"""This cell defines global variables and parameters used throughout the playbook
"""

# Set this to True if you want to watch Selenium scrape pages
WATCH_SCRAPE = True

# Set this to True if you want to use incognito mode
USE_INCOGNITO = True

# The data is written 
RAW_DATA_DIRECTORY = Path("../data/raw/")

# Change based on how many comments you want to scrape
max_num_scrolls = 1

# Number of videos to scrape
num_vids = 2



## Functions and Classes

In [3]:
"""This cell defines functions and classes used throughout the playbook
"""

def get_vids(search_query, driver, num_results):
    youtube_url = 'https://www.youtube.com/results?search_query='
    page = youtube_url + search_query
    try:
        driver.get(page)
        logging.info('Retrieving data from ' + page)
        time.sleep(1)
    except:
        logging.info('Error retrieving data. Try again.')

    # Clear pop up alerts
    try:
        viewpopup = driver.find_element_by_xpath(
            '//ytd-button-renderer[@id="dismiss-button"]/a/paper-button[@id="button"]')
        time.sleep(1)
        viewpopup.click()
        logging.info('Pop up found and cleared')
        time.sleep(1)
    except:
        logging.info('No pop up found :)')

    # Make soup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    html_txt = soup.prettify()

    video_wrappers_max20 = soup.find_all('div', attrs={'id': 'dismissable', 'class': 'style-scope ytd-video-renderer'})
    video_wrappers = video_wrappers_max20[:num_results]
    video_row = []

    for video in video_wrappers:
        # Find original comment
        original = video.find_all('div', attrs={'class': 'text-wrapper style-scope ytd-video-renderer'})
        video_data = {'title': None,
                      'owner': None,
                      'owner_channel': None,
                      'url': None
                      }

        # Get author name
        try:
            for b in original:
                video_data['owner'] = b.find(name="a", attrs={
                    "class": "yt-simple-endpoint style-scope yt-formatted-string"}).text.strip()
        except:
            logging.info('No owner found')

        # Get author channel link
        try:
            for d in original:
                link = d.find(name="a",
                              attrs={"class": "yt-simple-endpoint style-scope yt-formatted-string", "href": True})
                channel = "https://youtube.com" + link['href']
                video_data['owner_channel'] = channel
        except:
            logging.info('No owner channel link found')

        # Get title
        try:
            for c in original:
                video_data['title'] = c.find(name="yt-formatted-string",
                                             attrs={"class": "style-scope ytd-video-renderer"}).text.strip()
        except:
            logging.info('No title found')

        # Get vid url
        try:
            for a in original:
                title_link = a.find(name="a", attrs={"id": "video-title", "href": True})
                url = "https://youtube.com" + title_link['href']
                video_data['url'] = url
        except:
            logging.info('No url found')

        video_row.append(video_data)

    videodf = pd.DataFrame(video_row)
    videodf['vid_index'] = videodf.index + 1
    # print(videodf)
    vid_urls = videodf['url'].tolist()
    return videodf, vid_urls


def get_channel_vids(channel_list, driver, num_results):
    channel_vid_urls = []
    channel_df = []
    for page in channel_list:
        try:
            driver.get(page + '/videos')
            logging.info('Retrieving data from ' + page + '/videos')
            time.sleep(1)
        except:
            logging.info('Error retrieving data. Try again.')

        if num_results > 30:
            logging.info('scrolling down for more videos')
            driver.execute_script('window.scrollBy(0,1500)')
            time.sleep(2)
            if num_results > 60:
                logging.info('scrolling down for more videos')
                driver.execute_script('window.scrollBy(0,1500)')
                time.sleep(2)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        video_wrappers_all = soup.find_all('div',
                                           attrs={'id': 'details', 'class': 'style-scope ytd-grid-video-renderer'})
        video_wrappers = video_wrappers_all[:num_results]
        video_row = []

        if video_wrappers:
            for video in video_wrappers:
                video_data = {'title': None,
                              'url': None
                              }
                # Get title
                try:
                    video_data['title'] = video.find(name="a",
                                                     attrs={"id": "video-title",
                                                            "class": "yt-simple-endpoint style-scope ytd-grid-video-renderer"}).text.strip()
                except:
                    logging.info('No title found')

                # Get vid url
                try:
                    title_link = video.find(name="a", attrs={"id": "video-title", "href": True})
                    url = "https://youtube.com" + title_link['href']
                    video_data['url'] = url
                except:
                    logging.info('No url found')

                video_row.append(video_data)
            videodf = pd.DataFrame(video_row)
            vid_urls = videodf['url'].tolist()
            channel_vid_urls.extend(vid_urls)

        try:
            driver.get(page + '/about')
            logging.info('Retrieving data from ' + page + '/about')
            time.sleep(1)
        except:
            logging.info('Error retrieving data. Try again.')
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # html_txt = soup.prettify()

        channel_name = soup.find(name="yt-formatted-string",
                                 attrs={"id": "text", "class": "style-scope ytd-channel-name"}).text.strip()
        description = soup.find(name="yt-formatted-string", attrs={"id": "description",
                                                                   "class": "style-scope ytd-channel-about-metadata-renderer"}).text.strip()

        stats = \
            soup.find_all('div',
                          attrs={"id": "right-column", 'class': 'style-scope ytd-channel-about-metadata-renderer'})[
                0].text.strip()

        location = soup.find_all('tr', attrs={'class': 'style-scope ytd-channel-about-metadata-renderer'})[
            -1].text.strip()
        links = soup.find_all('a', attrs={"class": "yt-simple-endpoint style-scope ytd-channel-about-metadata-renderer",
                                          "href": True})
        links_url = []
        for i in links:
            links_url.append(i['href'])

        channel_dict = {'channel_name': None,
                        'description': None,
                        'stats': None,
                        'location': None
                        }

        channel_dict['channel_name'] = channel_name
        channel_dict['description'] = description
        channel_dict['stats'] = stats
        channel_dict['location'] = location
        channeldf = pd.DataFrame([channel_dict])
        for i in range(len(links_url)):
            channeldf['link_url_' + str(i)] = links_url[i]
        channel_df.append(channeldf)
    channel_info_df = pd.concat(channel_df, ignore_index=True)

    filename = "YOUTUBE_" + datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + "_CHANNEL" + ".xlsx"
    channel_info_df.to_excel(str(raw_data_dir / filename), index=False, na_rep='None', encoding='UTF-16')

    return channel_info_df, channel_vid_urls


def to_excel_file(df, raw_data_directory, name_str):
    interval = 200000
    df_length = len(df.index)
    if df_length > interval:
        print(np.ceil(df_length / interval))
        array_of_df = np.array_split(df, np.ceil(df_length / interval))
        for idx, df_i in enumerate(array_of_df):
            file_name = "YOUTUBE_" + datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + name_str + ".xlsx"
            df_i.to_excel(str(raw_data_directory / file_name), index=False, na_rep='None', encoding='UTF-16')
    else:
        file_name = "YOUTUBE_" + datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + name_str + ".xlsx"
        df.to_excel(str(raw_data_directory / file_name), index=False, na_rep='None', encoding='UTF-16')


def yt_scrape(url_list, url_shortened, raw_data_directory, driver, max_scrolls):
    scrape_time = datetime.now()
    vid_row = []
    comment_row = []
    for page in url_list:
        # metadata #############################################################
        # <editor-fold desc="Metadata">
        try:
            if url_shortened:
                page = 'https://www.youtube.com' + page
            driver.get(page)
            logging.info('Retrieving data from ' + page)
            time.sleep(1)
        except:
            logging.info('Error retrieving data. Try again.')

        # Clear pop up alerts
        try:
            viewpopup = driver.find_element_by_xpath(
                '//ytd-button-renderer[@id="dismiss-button"]/a/paper-button[@id="button"]')
            time.sleep(1)
            viewpopup.click()
            logging.info('Pop up found and cleared')
            time.sleep(1)
        except:
            logging.info('No pop up found :)')

        # Open transcript
        open_transcript = False
        try:
            time.sleep(1)
            menu = driver.find_element_by_xpath(
                '//div[@id="menu-container"]/div/ytd-menu-renderer/yt-icon-button/button[@id="button"]')
            time.sleep(1)
            menu.click()
            logging.info('Opened menu')
            try:
                opentranscript = driver.find_element_by_xpath(
                    '//ytd-menu-popup-renderer/paper-listbox/ytd-menu-service-item-renderer/paper-item[@class="style-scope ytd-menu-service-item-renderer"]')
                opentranscript.click()
                logging.info('Opened video transcript')
                open_transcript = True
                time.sleep(1)
            except:
                logging.info('No transcript found')
                time.sleep(1)
        except:
            logging.info('Cannot open menu')
            time.sleep(1)

        # Open show more description
        try:
            time.sleep(1)
            description = driver.find_element_by_xpath('//ytd-expander/paper-button[@id="more"]')
            description.click()
            logging.info('Showing more description')
            time.sleep(1)
        except:
            logging.info('Cannot show more description')

        # # Scroll down past comment header
        # driver.execute_script('window.scrollTo(0,500)')
        # time.sleep(1)
        #
        # # Scroll to comment header
        # try:
        #     commentheader = driver.find_element_by_xpath(
        #         '//paper-button[@class="dropdown-trigger style-scope yt-dropdown-menu"]')
        # except:
        #     try:
        #         logging.info('scrolling more to find comment order dropdown')
        #         driver.execute_script('window.scrollBy(0,500)')
        #         time.sleep(1)
        #         commentheader = driver.find_element_by_xpath(
        #             '//paper-button[@class="dropdown-trigger style-scope yt-dropdown-menu"]')
        #     except:
        #         logging.info('scrolling even more to find comment order dropdown')
        #         driver.execute_script('window.scrollBy(0,500)')
        #         time.sleep(1)
        #         commentheader = driver.find_element_by_xpath(
        #             '//paper-button[@class="dropdown-trigger style-scope yt-dropdown-menu"]')
        #
        #
        # action = ActionChains(driver)
        # action.move_to_element(commentheader).perform()
        # time.sleep(1)

        try:
            title = driver.find_element_by_xpath('//*[@id="container"]/h1/yt-formatted-string')
            title.click()
            time.sleep(1)
        except:
            logging.info("no title found?")

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # html_txt = soup.prettify()

        vid_metadata = {'url': "Original Comment",
                        'title': None,
                        'scrape_time': None,
                        'datetime_yt': None,
                        'datetime_adj': None,
                        'duration': None,
                        'views': None,
                        'votes': None,
                        'num_comments': None,
                        'owner': None,
                        'owner_channel': None,
                        'owner_subscribers': None,
                        'description': None,
                        'type': None,
                        'transcript': None,
                        }

        # For number formatting
        multipliers = {'K': 1000, 'M': 1000000, 'B': 1000000000}

        # Get URL
        vid_metadata['url'] = page

        # Get title
        try:
            title = soup.find(name="h1",
                              attrs={"class": "title style-scope ytd-video-primary-info-renderer"}).text.strip()
            vid_metadata['title'] = title
        except:
            logging.info('No title found')

        # Get date & set video type
        try:
            vid_metadata['type'] = 'Video'
            date = soup.find(name="div", attrs={"id": "date"}).text.strip()
            date = re.sub(u"\u2022", '', date)
            if "tream" in date:
                vid_metadata['type'] = 'Livestream'
            date = date.strip("Streamed live on ").strip("Started streaming on ").strip("Started streaming ").strip(
                "Streamed live ").strip("Premiered ")
            adj = datetime.strptime(date, '%b %d, %Y')
            adj = adj.strftime("%m/%d/%Y")
            vid_metadata['datetime_yt'] = date
            vid_metadata['datetime_adj'] = adj
        except:
            logging.info('No date found')

        # Get video duration
        try:
            duration = soup.find(name="span", attrs={"class": "ytp-time-duration"}).text.strip()
            duration_formatted = datetime.strptime(duration, '%M:%S').time()
            vid_metadata['duration'] = duration_formatted
        except:
            logging.info('duration is ' + str(duration) + ", trying H:M:S")
            try:
                duration_formatted = datetime.strptime(duration, '%H:%M:%S').time()
                vid_metadata['duration'] = duration_formatted
            except:
                logging.info('duration is ' + str(duration) + ", trying D:H:M:S")
                try:
                    duration_formatted = datetime.strptime(duration, '%D:%H:%M:%S').time()
                    vid_metadata['duration'] = duration_formatted
                except:
                    logging.info('duration is ' + str(duration) + ", unknown format")

        # Get views
        try:
            views = soup.find(name="span",
                              attrs={"class": "view-count style-scope yt-view-count-renderer"}).text.strip()
            views = views.strip(' views').strip(' watching now')
            views = int(views.replace(",", ""))
            vid_metadata['views'] = views
        except:
            logging.info('No views found')

        # Get votes
        try:
            votes = []
            for a in soup.find_all(name="ytd-toggle-button-renderer"):
                vote = a.find(name="yt-formatted-string",
                              attrs={"class": "style-scope ytd-toggle-button-renderer style-text", "aria-label": True})
                values = vote["aria-label"].strip(" dislikes")
                values = int(values.replace(",", ""))
                votes.append(values)
                vid_metadata['votes'] = votes
        except:
            logging.info('No votes found')

        # Get owner/channel
        try:
            owner = soup.find(name="ytd-channel-name", attrs={"id": "channel-name"})
            channel = soup.find(name="a",
                                attrs={"class": "yt-simple-endpoint style-scope yt-formatted-string", "href": True})
            vid_metadata['owner_channel'] = "https://youtube.com" + channel["href"]
            vid_metadata['owner'] = owner.text.replace("\n", "")
        except:
            logging.info('No owner found')

        # Get owner subscriber
        try:
            subscribers = soup.find(name="yt-formatted-string", attrs={"id": "owner-sub-count"}).text.strip()
            subscribers = subscribers.strip(" subscribers")
            if subscribers[-1].isdigit():
                formatted = int(subscribers)
            else:
                mult = multipliers[subscribers[-1]]
                formatted = int(float(subscribers[:-1]) * mult)
            vid_metadata['owner_subscribers'] = formatted
        except:
            logging.info('No owner subscriber count found')

        # Get description
        try:
            description = soup.find(name="yt-formatted-string", attrs={
                "class": "content style-scope ytd-video-secondary-info-renderer"}).text.strip()
            vid_metadata['description'] = description
        except:
            logging.info('No description found')

        # Get transcript
        try:
            if open_transcript:
                transcript = soup.find_all(name="ytd-transcript-body-renderer")[0].text.replace("\n", "")
                vid_metadata['transcript'] = transcript
            else:
                logging.info('No transcript found')
        except:
            logging.info('No transcript found')

        driver.execute_script('window.scrollBy(0,1100)')
        time.sleep(3)
        # Get number of comments
        try:
            # num = driver.find_element_by_xpath(
            #    "/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[4]/div[1]/div/ytd-comments/ytd-item-section-renderer/div[1]/ytd-comments-header-renderer/div[1]/h2/yt-formatted-string")
            # num = driver.find_element_by_xpath('//*[@id="count"]/yt-formatted-string')
            # driver.execute_script("arguments[0].scrollIntoView(false);", num)
            # time.sleep(1)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            num = soup.find(name="h2",
                            attrs={"id": "count", "class": "style-scope ytd-comments-header-renderer"}).text.strip()
            num = num.strip(' Comments')
            num = int(num.replace(",", ""))
            vid_metadata['num_comments'] = num
        except:
            try:
                description_button = driver.find_element_by_xpath('//*[@id="less"]/yt-formatted-string')
                description_button.click()
                driver.execute_script('window.scrollBy(0,-700)')
                time.sleep(2)
            except:
                logging.info("no description expansion found")
            logging.info('No comments found, scrolling up just in case')
            driver.execute_script('window.scrollBy(0,-700)')
            time.sleep(3)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            try:
                num = driver.find_element_by_xpath(
                    '/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[4]/div[1]/div/ytd-comments/ytd-item-section-renderer/div[1]/ytd-comments-header-renderer/div[1]/h2/yt-formatted-string')
                print("selenium execute script")
                driver.execute_script("arguments[0].scrollIntoView(true);", num)
                time.sleep(1)
                # driver.execute_script("arguments[0].scrollIntoView(false);", num)
                num = soup.find(name="h2",
                                attrs={"id": "count", "class": "style-scope ytd-comments-header-renderer"}).text.strip()
                # num = soup.find_all(name="yt-formatted-string", attrs={"class": "count-text style-scope ytd-comments-header-renderer"}) #.text.strip()
                num = num.strip(' Comments')
                num = int(num.replace(",", ""))
                vid_metadata['num_comments'] = num
            except:
                try:
                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                    num = soup.find(name="h2",
                                    attrs={"id": "count",
                                           "class": "style-scope ytd-comments-header-renderer"}).text.strip()
                    # num = soup.find_all(name="yt-formatted-string", attrs={"class": "count-text style-scope ytd-comments-header-renderer"}) #.text.strip()
                    # num = driver.find_element_by_xpath('//*[@id="count"]/yt-formatted-string')
                    # driver.execute_script("arguments[0].scrollIntoView(false);", num)
                    time.sleep(1)
                    num = num.strip(' Comments')
                    num = int(num.replace(",", ""))
                    vid_metadata['num_comments'] = num
                except:
                    logging.info('No comments found, scrolling down more just in case FOR REAL')
                    driver.execute_script('window.scrollBy(0,1100)')
                    time.sleep(5)
                    try:
                        logging.info("last attempt to scrape comment count")
                        soup = BeautifulSoup(driver.page_source, 'html.parser')
                        num = soup.find(name="h2",
                                        attrs={"id": "count",
                                               "class": "style-scope ytd-comments-header-renderer"}).text.strip()
                        # num = soup.find_all(name="yt-formatted-string", attrs={"class": "count-text style-scope ytd-comments-header-renderer"}) #.text.strip()
                        # num = driver.find_element_by_xpath('//*[@id="count"]/yt-formatted-string')
                        # driver.execute_script("arguments[0].scrollIntoView(false);", num)
                        time.sleep(1)
                        num = num.strip(' Comments')
                        num = int(num.replace(",", ""))
                        vid_metadata['num_comments'] = num
                    except:
                        logging.info('No comments found')

        vid_metadata['scrape_time'] = scrape_time
        vid_row.append(vid_metadata)
        # </editor-fold>

        # comments #############################################################
        # <editor-fold desc="Comments">
        print('Retrieving comments from ' + str(page))

        # Scroll to past comment header
        driver.execute_script('window.scrollTo(0,500)')
        time.sleep(1)

        # <editor-fold desc="Sort by newest first">
        # # Scroll to "Sort By" drop down
        # sortcomment = driver.find_element_by_xpath(
        #     '//paper-button[@class="dropdown-trigger style-scope yt-dropdown-menu"]')
        # action = ActionChains(driver)
        # action.move_to_element(sortcomment).perform()
        #
        # # Sort comments by newest first
        # try:
        #     sortcomment.click()
        #     time.sleep(1)
        #     newestfirst = driver.find_element_by_xpath(
        #         '//a[@class="yt-simple-endpoint style-scope yt-dropdown-menu"]/paper-item[@class="style-scope yt-dropdown-menu"]')
        #     newestfirst.click()
        #     logging.info(
        #         'Sorted comments by newest first')  # Selected sort view has CSS selector "yt-simple-endpoint style-scope yt-dropdown menu iron-selected" - Default is by Most Relevant
        #     time.sleep(1)
        # except:
        #     driver.execute_script('window.scrollTo(0,{0})').format(scroll_down * 100)
        #     logging.info('Cannot sort comments')
        #     time.sleep(1)
        # </editor-fold>

        # Scroll down page
        scroll_down = 1
        while scroll_down <= max_scrolls:  # Max scrolls defined in System-Dependent Configurations
            driver.execute_script("window.scrollTo(0,{0})".format(scroll_down * 100000))
            scroll_down += 1
            time.sleep(5)

        replies_div = driver.find_elements_by_xpath(
            '//ytd-button-renderer[@id="more-replies"]/a/paper-button[@id="button"]')
        morereplies_div = driver.find_elements_by_xpath(
            '//div[@id="expander-contents"]/div/yt-next-continuation/paper-button[@role="button"]')

        for reply in replies_div:
            try:
                driver.execute_script("arguments[0].scrollIntoView(false);", reply)
                reply.click()
                logging.info("Replies found and clicked")
                time.sleep(2)
            except:
                logging.info("Replies not found, trying again")
                try:
                    driver.execute_script("arguments[0].scrollIntoView(true);", reply)
                    reply.click()
                    logging.info("Replies found and clicked")
                    time.sleep(2)
                except:
                    logging.info("Replies not found")

        for more in morereplies_div:
            try:
                driver.execute_script("arguments[0].scrollIntoView(false);", more)
                more.click()
                logging.info("More replies found and clicked")
                time.sleep(1)
            except:
                logging.info("More replies not found")

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        comment_wrappers = soup.find_all('ytd-comment-thread-renderer',
                                         attrs={'class': 'style-scope ytd-item-section-renderer'})
        # comment_wrappers2 = soup.select('#contents > ytd-comment-thread-renderer')
        for comment in comment_wrappers:
            # Find original comment
            original = comment.find_all(name='ytd-comment-renderer', attrs={'id': 'comment'})
            comment_data = {'type': "Original Comment",
                            'scrape_time': None,
                            'datetime_yt': None,
                            'datetime_adj': None,
                            'author': None,
                            'author_channel': None,
                            'text': None,
                            'comment_votes': None,
                            'url': page
                            }
            # Get time of post
            try:
                for a in original:
                    dt = a.find(name="a",
                                attrs={"class": "yt-simple-endpoint style-scope yt-formatted-string"}).text.strip()
                    comment_data['datetime_yt'] = dt
                    comment_data['datetime_adj'] = dateparser.parse(dt.rstrip("(edited)")).strftime("%m/%d/%Y")
            except:
                logging.info('No datetime found')

            # Get author name
            try:
                for b in original:
                    comment_data['author'] = b.find(name="a", attrs={"id": "author-text"}).text.strip()
            except:
                logging.info('No author found')

            # Get author channel link
            try:
                for d in original:
                    link = d.find(name="a", attrs={"id": "author-text", "href": True})
                    channel = "https://youtube.com" + link['href']
                    comment_data['author_channel'] = channel
            except:
                logging.info('No author channel link found')

            # Get comment
            try:
                for c in original:
                    comment_data['text'] = c.find(name="yt-formatted-string",
                                                  attrs={"id": "content-text"}).text.strip()
            except:
                logging.info('No comment found')

            # Get votes
            try:
                for e in original:
                    # votes = e.find_all("span", attrs={"class": "style-scope ytd-comment-action-buttons-renderer", "id": "vote-count-middle"})
                    votes = e.find(name="span", attrs={"id": "vote-count-middle"}).text.strip()
                    comment_data['comment_votes'] = votes
            except:
                logging.info('No votes found')

            comment_data['scrape_time'] = scrape_time
            comment_df = pd.DataFrame([comment_data])

            # Find replies
            # reply = driver.find_element_by_xpath('//*[@id="loaded-replies"]/ytd-comment-renderer[1]')
            # reply = comment.find_all()
            reply = comment.find_all('ytd-comment-renderer',
                                     attrs={'class': 'style-scope ytd-comment-replies-renderer'})
            for post in reply:
                reply_data = {'type': "Reply",
                              'scrape_time': None,
                              'datetime_yt': None,
                              'datetime_adj': None,
                              'author': None,
                              'author_channel': None,
                              'text': None,
                              'comment_votes': None,
                              'url': page
                              }

                # Get time of post
                try:
                    dt = post.find(name="a", attrs={
                        "class": "yt-simple-endpoint style-scope yt-formatted-string"}).text.strip()
                    reply_data['datetime_yt'] = dt
                    reply_data['datetime_adj'] = dateparser.parse(dt.rstrip("(edited)")).strftime("%m/%d/%Y")
                except:
                    logging.info('No datetime found')

                # Get author
                try:
                    reply_data['author'] = post.find(name="a", attrs={"id": "author-text"}).text.strip()
                except:
                    logging.info('No author found')

                # Get author channel link
                try:
                    link = post.find(name="a", attrs={"id": "author-text", "href": True})
                    channel = "https://youtube.com" + link['href']
                    reply_data['author_channel'] = channel
                except:
                    logging.info('No author channel link found')

                # Get comment
                try:
                    reply_data['text'] = post.find(name="yt-formatted-string",
                                                   attrs={"id": "content-text"}).text.strip()
                except:
                    logging.info('No comment found')

                # Get votes
                try:
                    votes = post.find(name="span", attrs={"id": "vote-count-middle"}).text.strip()
                    reply_data['comment_votes'] = int(votes)
                except:
                    logging.info('No votes found')

                reply_df = pd.DataFrame([reply_data])
                reply_df['scrape_time'] = scrape_time
                

                comment_df = pd.concat([comment_df, reply_df], ignore_index=True)

            comment_row.append(comment_df)
        # </editor-fold>

    metadf = pd.DataFrame.from_dict(vid_row)
    print(metadf.head())
    df_type = '_METADATA'
    # metadf.to_excel(str(raw_data_directory / file_name), index=False, na_rep='None', encoding='UTF-16')
    to_excel_file(metadf, raw_data_directory, df_type)

    commentdf = pd.concat(comment_row, ignore_index=True)
    commentdf['comment_index'] = commentdf.index + 1
    print(commentdf.head())
    df_type = '_VIDEO'
    # commentdf.to_excel(str(raw_data_directory / file_name), index=False, na_rep='None', encoding='UTF-16')
    to_excel_file(commentdf, raw_data_directory, df_type)


def divide_list(l, n):
    # looping till length l
    for i in range(0, len(l), n):
        yield l[i:i + n]


def yt_scrape_lots_of_videos(url_list, url_shortened):
    # How many elements each list should have
    num_urls = 2000
    total_urls = len(url_list)
    x = list(divide_list(url_list, num_urls))
    for idx, url_sublist in enumerate(x):
        yt_scrape(url_sublist, url_shortened, raw_data_dir, web_driver, max_num_scrolls)
        print('num of urls is', str(total_urls), ' and finished round ', idx)


def strip_tag(list_obj):
    ls = [x.strip('/watch?v=') for x in list_obj]
    return ls

## System-dependent Configuration

In [4]:
"""This cell defines system-dependent configuration such as those different in Linux vs. Windows
"""


# System dependent configuration
os.getcwd()
PLATFORM_SYSTEM = platform.system()

if PLATFORM_SYSTEM == "Darwin":
    # EXECUTABLE_PATH = Path("../dependencies/chromedriver")
    EXECUTABLE_PATH = Path(
        "../dependencies/chromedriver")
elif PLATFORM_SYSTEM == "Windows":
    EXECUTABLE_PATH = Path("~/../dependencies")
    file = EXECUTABLE_PATH / "chromedriver.exe"
else:
    logging.critical("System not supported...")
    exit()

# Collect Data

## Collect Youtube  Data

In [5]:
"""This cell retrieves page posts and comments, for a given page.
"""

if __name__ == '__main__':
##########################################
# step 0: set up
    # Global variables and parametrs used throughout the notebook
    # Make true if you want to watch scrape

    search_term = input("Enter the YouTube search term: ")
    
    # Update when running
    raw_data_dir = Path(
        "../data/raw")

    # Create the driver
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--incognito')
    chrome_options.add_argument("--mute-audio")
    if not WATCH_SCRAPE:
        chrome_options.add_argument('--headless')
        chrome_options.add_argument("--window-size=1440, 900")

    try:
        web_driver = webdriver.Chrome(options=chrome_options, executable_path=EXECUTABLE_PATH)
        logging.info('Chrome launched.')
    except:
        logging.critical(
            'Chrome could not be launched. Check executable path and if Chromedriver supports the version of the browser.')

##########################################
# step 1: generate the list of urls for the videos that you want to scrape. we'll call this list urls
# you can either pass it directly

    # urls = [
    #     "https://www.youtube.com/watch?v=poRAut99bRk"
    #     , "https://www.youtube.com/watch?v=A9mny08uI8s"
    #     , "https://www.youtube.com/watch?v=AWyJ8vJQ6Vo"
    #     , "https://www.youtube.com/watch?v=UpIcfw7Q14w"
    #     , "https://www.youtube.com/watch?v=HgaHpLBGTRY"
    #     , "https://www.youtube.com/watch?v=PAY56ovT4Qc"
    #     , "https://www.youtube.com/watch?v=VQl2KVISh1w"
    # ]

# or read it from an excel spreadsheet 
    # smaller_urls = pd.read_excel(
    #     '/Users/jennifer.jin/OneDrive - Accenture Federal Services/Downloads/social_media/ES/filtered_df_orig_1293.xlsx')[
    #     'url'].to_list()
    # smaller_urls = strip_tag(smaller_urls)
    # bigger_urls = pd.read_excel(
    #     '/Users/jennifer.jin/OneDrive - Accenture Federal Services/Downloads/social_media/ES/filtered_df2.xlsx')[
    #     'url'].to_list()
    # urls = np.setdiff1d(bigger_urls, smaller_urls)

    # urls = pd.read_excel(
    #     '/Users/jennifer.jin/OneDrive - Accenture Federal Services/Downloads/social_media/ES/filtered_df.xlsx')[
    #     'url'].to_list()

    # urls_df = pd.read_excel(str(raw_data_dir / "YOUTUBE_2020-08-06T16-29-59_METADATA.xlsx"))
    # urls_rescrape = urls_df[(urls_df.num_comments == 'None') & (urls_df.title != 'None')]
    # urls = urls_rescrape['url'].to_list()
    # yt_scrape_lots_of_videos(urls, False)

    # or get it from the search results for a search query
    
    searchresultsdf, urls = get_vids(search_term, web_driver, num_vids)

# or get it from the uploaded videos from a list of channels 
    # channels = ["https://www.youtube.com/channel/UCMCgOm8GZkHp8zJ6l7_hIuA"
    # 'https://youtube.com/channel/UC0Wf8S7q9hDz8LbBDLVkiYw'
    # , 'https://youtube.com/channel/UCPOYW7dOo_mqPP6-HkjJ2ng'
    # , 'https://youtube.com/channel/UChGPLteUhl8SdM4ZuwZvyhQ'
    # , 'https://youtube.com/channel/UCFdKC2cpE7mkBe2VAndcjqA'
    # , 'https://youtube.com/user/Ironclaw007'
    # , 'https://youtube.com/channel/UCp3qQc5YSUN-FbN3_qhZX1Q'
    #         ]
    # channel_df, urls = get_channel_vids(channels, web_driver, num_vids)

##########################################
# step 2: scrape the actual video metadata, transcript, comments, replies, etc from the list of urls
    yt_scrape(urls, False, raw_data_dir, web_driver, max_num_scrolls)

Enter the YouTube search term: Trump


CRITICAL:root:Chrome could not be launched. Check executable path and if Chromedriver supports the version of the browser.


NameError: name 'web_driver' is not defined

# Conclusion

In [None]:
"""Add post-processing steps here
"""

# Clean up the environment
web_driver.quit()