In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

import sys
import csv
import json
import time
import mysql.connector

from datetime import datetime
from datetime import timedelta

from tqdm.notebook import tqdm


## Sets parameters according to machine ##
def config(machine):
    global path
    global db_user
    global db_host
    global db_password
    
    if(machine == 'mac'):
        path = '/opt/homebrew/bin/chromedriver'
        with open('mac_creds.json') as f:
            data = json.load(f)
            db_host=data['hostW']
            db_user=data['user']
            db_password=data['password']
        
    elif(machine == 'win'):
        path = 'C:/WebDriver/chromedriver'
        db_user='root'
        db_host='localhost'
        with open('win_creds.json') as f:
            data = json.load(f)
            db_password=data['password']
        
    else:
        sys.exit(f"Machine \"{machine}\" not recognized")

class StockVid(object):
    def __init__(self, href, date, views):
        self.post_id = href
        self.date = date
        self.views = views
    
    def jsonEnc(self):
      return {'stock': self.stock, 'date': self.date, 'views': self.views}

def jsonDefEncoder(obj):
    if hasattr(obj, 'jsonEnc'):
        return obj.jsonEnc()
    else: #some default behavior
        return obj.__dict__

    
### DATABASE FUNCTIONS ###

    # returns connection object #
def connect_to_db(db_name):
    cnx = mysql.connector.connect(
    user='root',
    password='chalkHorseMountain',
    host='localhost',
    database=db_name
    )
    return cnx
    
    # returns boolean #
def table_exists(cursor, tbl_name):
    cursor.execute(f"""
        SELECT COUNT(*)
        FROM information_schema.tables
        WHERE table_schema = DATABASE()
        AND table_name = "{tbl_name}";
    """)
    
    if cursor.fetchone()[0] == 1:
        return True
    return False
     
def get_date(post):
    today = datetime.today()
    
    try:
        posted = post.find_element_by_xpath(".//*[@id='metadata-line']/span[2]").text
        
        if "just now" in posted:
            return today.strftime('%Y-%m-%d')
    except:
        ## If live, return today
        return today.strftime('%Y-%m-%d')
    
    # Example: vid_time_data = [5, "hours", "ago"]
    vid_time_data = posted.split()
    
    # Remove "Streamed"
    if vid_time_data[0] == "Streamed":
        vid_time_data = vid_time_data[1:]

    if "minute" in posted:
        return (today - timedelta(minutes=int(vid_time_data[0]))).strftime('%Y-%m-%d')
    
    if "hour" in posted:
        return (today - timedelta(hours=int(vid_time_data[0]))).strftime('%Y-%m-%d')
    
    if "day" in posted:
        return (today - timedelta(days=int(vid_time_data[0]))).strftime('%Y-%m-%d')

    if "week" in posted:
        return (today - timedelta(days=7*int(vid_time_data[0]))).strftime('%Y-%m-%d')

    if "month" in posted:
        return (today - timedelta(days=30*int(vid_time_data[0]))).strftime('%Y-%m-%d')

    if "year" in posted:
        return (today - timedelta(days=365*int(vid_time_data[0]))).strftime('%Y-%m-%d')
    
            
    raise Exception(f"Invalid post time-metadata: {vid_time_data}")
    
    
def get_views(post):
    views_data = post.find_element_by_xpath(".//*[@id='metadata-line']/span[1]").text
    views_data = views_data.split()
    views_data = views_data[0]
    
    if views_data[-1] == "K":
        num_views = int(float(views_data[:-1]) * 1000)
        
    elif views_data[-1] == "M":
        num_views = int(float(views_data[:-1]) * 1000000)
    
    elif views_data[-1] == "B":
        num_views = int(float(views_data[:-1]) * 1000000000)
        
    elif views_data == "No":
        num_views = 0
        
    elif views_data == "Scheduled":
        num_views = 0
        
    else:
        num_views = int(views_data)
    
    return num_views
    
    
class youtubeScraper:

    def get_vids(self, stock):
        
        stockTickers = {}
        stockTickers[stock] = {}
        relevantPosts = []
        
        # Use Headless browser
#         options = Options()
#         options.add_argument('--headless')
#         options.add_argument('--disable-gpu')
#         driver = webdriver.Chrome(path, options=options)
        driver = webdriver.Chrome(path)
        driver.get("https://www.youtube.com/results?search_query=" + stock)
    
        ## Begin Scrolling ##
        start = time.perf_counter()
        print("[1/3] Scrolling To Bottom", end=' ')
        while(True):
            try:
                driver.find_element_by_xpath("//*[@id='message'][text()='No more results']")
                break
            except:
                driver.action.sendKeys(PAGE_DOWN)
                continue
             
            break
            
        end = time.perf_counter()
        p1_time = divmod(int(end-start), 60)
        print("[{:02}:{:02}]".format(p1_time[0], p1_time[1]))
        
            
        ## Begin Scraping ##
        
        all_posts = driver.find_elements_by_xpath("//div[@ID='contents']/ytd-video-renderer")
        
        for i in tqdm(range(len(all_posts)), desc = '[2/3] Scraping Vids'):
            post = all_posts[i]
            
            url = post.find_element_by_xpath(".//*[@id='video-title']").get_attribute("href")
            # The last 11 chars of the url is the unique key
            post_id = url[-11:]
            num_views = get_views(post)
            post_date = get_date(post)
            
            stockTickers[stock][i] = StockVid(post_id, post_date, num_views) 
            
        driver.close()
        
        ## Begin Formatting Results ##
        if(len(stockTickers[stock]) > 0):
            for post in stockTickers[stock]:
                relevantPosts.append(stockTickers[stock][post])
        #json_object = json.dumps(relevantPosts, default=jsonDefEncoder, indent = 4)
        #print(json_object)
        
        ## Updating Database ##
        
        cnx = connect_to_db("TheSpatula")
        mycursor = cnx.cursor()
        assert mycursor
        assert table_exists(mycursor, "youtube")
        
        for x in tqdm(range(len(relevantPosts)), desc="[3/3] Updating Database"):
            post = relevantPosts[x]
            
            ## Add post, if it exists already, update post ##
            mycursor.execute(f"""
            INSERT INTO youtube (post_id, symbol, num_views, date_posted) 
            VALUES("{post.post_id}", "{stock}", {post.views}, "{post.date}")
            ON DUPLICATE KEY UPDATE num_views={post.views}
            ;""")
            
            cnx.commit()
            
        
if __name__ == "__main__":
    config("mac")
    
    stocklist = ["TSLA"]
    
    for stock in stocklist:
        youtubeScraper().get_vids(stock)

[1/3] Scrolling To Bottom 

AttributeError: 'WebDriver' object has no attribute 'sendKeys'