In [1]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By 
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC

import os, time, datetime
from youtube_transcript_api import YouTubeTranscriptApi

## Show up to 20 columns when a dataframe is displayed
pd.set_option("display.max_columns", 20)

%load_ext autoreload
%autoreload 2

### Helper functions (TODO: move these into a separate .py module)

In [2]:
def get_video_link(search_word, num_scrolls):
    '''
    Runs a YouTube search in an automated Chrome browser and collects the result links.

    Input:
        search_word: search query (string); it is URL-encoded before being sent
        num_scrolls: number of times to scroll to the bottom of the results page
                     (each scroll is followed by a 1.5 second pause)
    Output: list of the non-empty href values of the elements with id "video-title"

    The browser is always closed, even when the search or the scrolling raises.
    '''
    ## Local import so this function brings its own dependency with it
    from urllib.parse import quote_plus

    ## Uses an automated Chrome browser to do a search on YouTube.com
    chromedriver = "/Applications/chromedriver" ## path to the chromedriver executable
    os.environ["webdriver.chrome.driver"] = chromedriver

    ## quote_plus turns spaces into '+' (as before) and also escapes characters
    ## such as '&' or '#' that would otherwise corrupt the query string
    youtube_search = "https://www.youtube.com/results?search_query="
    youtube_query = youtube_search + quote_plus(search_word)

    ## NOTE(review): passing the driver path positionally works on Selenium 3;
    ## newer Selenium releases expect a Service object -- confirm the installed version
    driver = webdriver.Chrome(chromedriver)
    try:
        driver.get(youtube_query)

        ## Scrolls through the video results page
        for _ in range(num_scrolls):
            driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
            time.sleep(1.5)

        ## Grabs the URLs of the videos and puts them into a list, skipping
        ## elements that carry no href
        user_data = driver.find_elements(By.XPATH, '//*[@id="video-title"]')
        hrefs = (element.get_attribute('href') for element in user_data)
        links = [href for href in hrefs if href]
    finally:
        ## Release the browser even if the window was closed or a call failed
        driver.quit()

    return links

In [3]:
def video_page_scraper(list_links):
    '''
    Visits each video page in an automated Chrome browser and scrapes its data.

    Input: a list of links
    Output: video data scraped into a dataframe, each row corresponding to a video.
            Columns: 'Video ID', 'Title', 'Upload Date', 'Duration (Minutes)', 'Views',
            'Number of Likes', 'Description', 'Transcript'.  All scraped fields are the
            raw text shown on the page; 'Transcript' holds the result of
            YouTubeTranscriptApi.get_transcript, or NaN when the transcript cannot be fetched.

    Each page element is waited for (up to 10 seconds); a timeout propagates to the caller.
    The browser is always closed, even when scraping raises.
    '''
    chromedriver = "/Applications/chromedriver" ## path to the chromedriver executable
    os.environ["webdriver.chrome.driver"] = chromedriver

    columns = ['Video ID', 'Title', 'Upload Date', 'Duration (Minutes)', 'Views', 'Number of Likes', 'Description', 'Transcript']
    watch_xpath = "//ytd-watch-flexy[@class='style-scope ytd-page-manager hide-skeleton']"

    def _video_id(browser):
        ## Returns a falsy value while the element (or its attribute) is missing so that
        ## WebDriverWait keeps polling instead of failing on an empty result list
        matches = browser.find_elements(By.XPATH, watch_xpath)
        return matches[0].get_attribute('video-id') if matches else None

    ## Rows are collected in a list and turned into a dataframe once at the end
    records = []

    driver = webdriver.Chrome(chromedriver)
    try:
        wait = WebDriverWait(driver, 10)

        def _text(by, selector):
            ## Waits for the element to be present, then returns its visible text
            return wait.until(EC.presence_of_element_located((by, selector))).text

        ## Scrape relevant video data
        for link in list_links:
            driver.get(link)
            time.sleep(2)

            v_id = wait.until(_video_id)
            v_title = _text(By.CSS_SELECTOR, "h1.title yt-formatted-string")
            v_date = _text(By.CSS_SELECTOR, "div#date yt-formatted-string")
            v_duration = _text(By.XPATH, "//span[@class='ytp-time-duration']")
            v_views = _text(By.CSS_SELECTOR, "div#count yt-view-count-renderer")
            v_likes = _text(By.CSS_SELECTOR, "div#top-level-buttons yt-formatted-string")
            v_description = _text(By.CSS_SELECTOR, "div#description yt-formatted-string")

            ## Transcripts are best-effort: a video without one still gets a row.
            ## `except Exception` lets KeyboardInterrupt stop a long scrape.
            try:
                v_transcript = YouTubeTranscriptApi.get_transcript(v_id)
            except Exception:
                v_transcript = np.nan

            values = [v_id, v_title, v_date, v_duration, v_views, v_likes, v_description, v_transcript]
            records.append(dict(zip(columns, values)))
    finally:
        ## Release the browser even if a page failed to load or an element never appeared
        driver.quit()

    return pd.DataFrame(records, columns=columns)

In [4]:
def merge_video_dataframes(df_videos_raw_str, num_chunks=15, namespace=None):
    '''
    Concatenates the numbered chunk dataframes '<prefix>_1' ... '<prefix>_<num_chunks>'.

    Input:
        df_videos_raw_str: common name prefix of the chunk dataframes, in a string format
        num_chunks: how many numbered chunks to merge (default 15)
        namespace: mapping of variable name -> dataframe to look the chunks up in;
                   defaults to the global namespace of the module defining this function
    Output: Merged dataframe (chunks stacked in numeric order, original row indices kept)

    Raises NameError when a chunk variable does not exist and ValueError when
    num_chunks is smaller than 1.
    '''
    if num_chunks < 1:
        raise ValueError('num_chunks must be at least 1')
    if namespace is None:
        namespace = globals()

    chunk_names = ['{}_{}'.format(df_videos_raw_str, i) for i in range(1, num_chunks + 1)]

    ## Plain name lookup instead of eval: the string can never be executed as code
    for chunk_name in chunk_names:
        if chunk_name not in namespace:
            raise NameError("name '{}' is not defined".format(chunk_name))

    ## One concat over all chunks instead of re-copying the growing frame per chunk
    return pd.concat([namespace[chunk_name] for chunk_name in chunk_names])

In [5]:
def df_to_csv(df_str, data_dir='../Data', namespace=None):
    '''
    Saves a dataframe, looked up by variable name, as '<data_dir>/<df_str>.csv'.

    Input:
        df_str: Name of a dataframe in a string format (also used as the file name)
        data_dir: folder to write into (default '../Data'); created if it does not exist
        namespace: mapping of variable name -> dataframe to look the name up in;
                   defaults to the global namespace of the module defining this function
    Output: CSV file of the dataframe (without the index column) saved into data_dir

    Raises NameError when no variable called df_str exists in the namespace.
    '''
    if namespace is None:
        namespace = globals()

    ## Plain name lookup instead of eval: the string can never be executed as code
    if df_str not in namespace:
        raise NameError("name '{}' is not defined".format(df_str))

    os.makedirs(data_dir, exist_ok=True)
    csv_path = os.path.join(data_dir, '{}.csv'.format(df_str))
    namespace[df_str].to_csv(csv_path, index=False)

### List the relevant educational topics in stock investing to use as search queries (make sure the topics are not too closely related, so that the search results do not overlap heavily)

In [6]:
## Search topics, in the order the cells below index them (0-5)
list_of_search_queries = [
    'fundamental investing',
    'value investing',
    'growth investing',
    'long term investing',
    'stock valuation',
    'competitive moats',
]

### Collect the video links for each search query into a list.  Most result pages run out after 35+ scrolls, so we cap the number of scrolls at 30.  Comment each call out after running it so that a re-run does not produce a different set of video links (this is necessary because YouTube search results are dynamic)

In [10]:
## Calls that have already been run stay commented out so that a re-run does not
## fetch a different set of links.
# fundamental_investing_links = get_video_link(list_of_search_queries[0], 30)
# value_investing_links = get_video_link(list_of_search_queries[1], 30)
# growth_investing_links = get_video_link(list_of_search_queries[2], 30)
# long_term_investing_links = get_video_link(list_of_search_queries[3], 30)
# stock_valuation_links = get_video_link(list_of_search_queries[4], 30)
## 30 scrolls, like every other query: the scraping cell slices this list up to
## link 300, which 3 scrolls would not come close to filling.
competitive_moats_links = get_video_link(list_of_search_queries[5], 30)


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=89.0.4389.82)


In [9]:
## Display the list of links collected for the 'competitive moats' query
competitive_moats_links

NameError: name 'competitive_moats_links' is not defined

### Run the function that scrapes data from each video page and save each resulting dataframe into its own variable.  The list of video links is cut into smaller chunks of 20 because of intermittent buffering (TODO: find a proper fix).

In [None]:
## Scrape the 'competitive moats' links in 15 consecutive chunks of 20 (links 0-299).
## Each chunk is bound to its own global name (df_videos_raw6_1 ... df_videos_raw6_15)
## as soon as it finishes, so chunks scraped before a failure remain available.
for chunk_no in range(1, 16):
    chunk_start = (chunk_no - 1) * 20
    globals()['df_videos_raw6_{}'.format(chunk_no)] = video_page_scraper(
        competitive_moats_links[chunk_start:chunk_start + 20]
    )

### Merge the smaller dataframes to create a single dataframe per search query.  

In [None]:
## Merges for the other search queries are kept commented out; only the
## 'competitive moats' chunks (prefix df_videos_raw6) are merged here.
# df_videos_raw_fundamental_investing = merge_video_dataframes('df_videos_raw1')
# df_videos_raw_value_investing = merge_video_dataframes('df_videos_raw2')
# df_videos_raw_growth_investing = merge_video_dataframes('df_videos_raw3')
# df_videos_raw_long_term_investing = merge_video_dataframes('df_videos_raw4')
# df_videos_raw_stock_valuation = merge_video_dataframes('df_videos_raw5')
df_videos_raw_competitive_moats = merge_video_dataframes(
    df_videos_raw_str='df_videos_raw6'
)

### Save the dataframes into .csv files

In [None]:
## Exports for the other search queries are kept commented out; only the merged
## 'competitive moats' dataframe is written to disk here.
# df_to_csv('df_videos_raw_fundamental_investing')
# df_to_csv('df_videos_raw_value_investing')
# df_to_csv('df_videos_raw_growth_investing')
# df_to_csv('df_videos_raw_long_term_investing')
# df_to_csv('df_videos_raw_stock_valuation')
df_to_csv(df_str='df_videos_raw_competitive_moats')