In [1]:
import time
import threading
import queue
import joblib
import random
import datetime
import requests
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from dateutil import parser
from bs4 import BeautifulSoup
from selenium.common.exceptions import WebDriverException
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [11]:
class Reuters_Crawler(threading.Thread):
    
    def __init__(self, queue, semaphore, query, path):
        threading.Thread.__init__(self)
        self.queue = queue
        self.semaphore = semaphore
        self.query = query
        self.path = path
        self.driver_path = r"./chromedriver.exe"
        self.next_button = '//*[@id="content"]/section[2]/div/div[1]/div[4]/div/div[4]/div[1]'
    
    def run(self):
        self.semaphore.acquire()
        self.parse()
        self.semaphore.release()
    
    def parse(self):
        """
        Parameters:
            query: str
        """
        # Open driver
        self.url = "https://www.reuters.com/search/news?blob={}&dateRange=all".format(self.query)
        self.driver = webdriver.Chrome(self.driver_path)
        self.driver.get(self.url)
        time.sleep(2)
        # Scroll down page
        self.scroll_to_bottom()
        # Parsing
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        self.driver.quit()
        news_list = soup.find_all(name="div", attrs={"class": "search-result-content"})
        news_list_generator = self.get_news_list(news_list)
        df = pd.DataFrame(list(news_list_generator), columns=["title", "date", "query", "url"])
        joblib.dump(df, self.path, compress=3)
                
    def check_exists_by_xpath(self, xpath):
        try:
            self.driver.find_element_by_xpath(xpath)
        except NoSuchElementException:
            return False
        return True

    def scroll_to_bottom(self):

        old_position = 0
        new_position = None

        while new_position != old_position:
            # Get old scroll position
            old_position = self.driver.execute_script(
                    ("return (window.pageYOffset !== undefined) ?"
                     " window.pageYOffset : (document.documentElement ||"
                     " document.body.parentNode || document.body);"))
            
            # Sleep and Scroll
            time.sleep(2 + random.random())
            self.driver.execute_script((
                    "var scrollingElement = (document.scrollingElement ||"
                    " document.body);scrollingElement.scrollTop ="
                    " scrollingElement.scrollHeight;"))
            
            button = self.driver.find_element_by_xpath(self.next_button)
            try:
                self.driver.execute_script("arguments[0].click()", button)
            except WebDriverException as error:
                print('Click failed...')
                print(error)
            # self.driver.execute_script("arguments[0].click()", button)
            time.sleep(2 + random.random())
            
            # Get new position
            new_position = self.driver.execute_script(
                    ("return (window.pageYOffset !== undefined) ?"
                     " window.pageYOffset : (document.documentElement ||"
                     " document.body.parentNode || document.body);"))
    
    def get_news_list(self, news_list):
        for i in range(len(news_list)):
            title = news_list[i].find(name="a").text
            date = news_list[i].find(name="h5", attrs={"class": "search-result-timestamp"}).text
            date = parser.parse(date, tzinfos={"EDT": "UTC-8", "EST": "UTC-8"})
            url = news_list[i].find(name="a").get("href")
            url = "https://www.reuters.com" + url
            yield [title, date, self.query, url]

In [18]:
subjects = ["Bristol-Myers Squibb", "Broadcom", "NextEra Energy", "Medtronic", "NIKE", 
            "Texas Instruments", "Oracle", "American Tower", "Linde", "Danaher"]
paper_df = pd.DataFrame()

semaphore = threading.Semaphore(5)
threads = []
my_queue = queue.Queue()

for subject in subjects:
    path = "../data/news/reuters_news_{}_v1.joblib".format(subject.lower())
    threads.append(Reuters_Crawler(my_queue, semaphore, subject, path))
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()

In [19]:
df_test = joblib.load("../data/news/reuters_news_alphabet_v1.joblib")
df_test

Unnamed: 0,title,date,query,url
0,Alphabet's legal head David Drummond to retire...,2020-01-10 13:24:00-08:00,Alphabet,https://www.reuters.com/article/idUSL4N29F3B1
1,Alphabet public market capitalization tops $1 ...,2020-01-16 16:53:00-08:00,Alphabet,https://www.reuters.com/article/idUSKBN1ZF2S2
2,Alphabet public market capitalization tops $1 ...,2020-01-16 16:15:00-08:00,Alphabet,https://www.reuters.com/article/idUSL1N29L1RU
3,Google parent Alphabet quarterly revenue beats...,2020-04-28 16:09:00-08:00,Alphabet,https://www.reuters.com/article/idUSL3N2CG4VL
4,Alphabet's Loon launches balloon internet serv...,2020-07-09 09:24:00-08:00,Alphabet,https://www.reuters.com/article/idUSKBN24921G
...,...,...,...,...
3444,US STOCKS-Futures slide amid geopolitical tens...,2019-08-13 07:22:00-08:00,Alphabet,https://www.reuters.com/article/idUSL4N2592FW
3445,Trump re-election campaign targeted by Iran-li...,2019-10-04 19:40:00-08:00,Alphabet,https://www.reuters.com/article/idUSKBN1WJ1ZM
3446,UPDATE 1-Huawei shows off 'most powerful' chip...,2019-09-06 05:11:00-08:00,Alphabet,https://www.reuters.com/article/idUSL5N25X24O
3447,UPDATE 6-Trump re-election campaign targeted b...,2019-10-04 17:28:00-08:00,Alphabet,https://www.reuters.com/article/idUSL3N26P30Z


In [None]:
def get_SP500():
    """
    There are 505 symbols due to several companies with two share classes. For example, 
    Google's parent company Alphabet has Class A (GOOGL) and Class C (GOOG) shares in the index.
    When companies are removed and added to the index the membership list may temporarily show 
    both the removed company and added company.
    """
    url = "https://www.slickcharts.com/sp500"
    res = requests.get(url, timeout=10).text
    soup = BeautifulSoup(res, "html.parser")

    tbody = soup.find(name="tbody")
    for tr in tbody.find_all(name="tr"):
        company = tr.find_all(name="td")[1].find_all(text=True)
        company = "".join(company)
        symbol = tr.find_all(name="td")[2].find_all(text=True)
        symbol = "".join(symbol)
        weight = tr.find_all(name="td")[3].find_all(text=True)
        weight = "".join(weight)
        yield [company, symbol, weight]