In [1]:
import time
import threading
import queue
import joblib
import random
import datetime
import requests
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from dateutil import parser
from bs4 import BeautifulSoup
from selenium.common.exceptions import WebDriverException
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [2]:
class Reuters_Crawler(threading.Thread):
    """
    Thread that scrapes the Reuters search results for one query with a
    Selenium-driven Chrome browser and saves them as a joblib file.

    Parameters:
        queue: queue.Queue
            Stored as ``self.queue``; no method of this class reads it.
        semaphore: threading.Semaphore
            Held for the duration of ``run`` so that only a limited number
            of crawlers (browser windows) work at the same time.
        query: str
            Search term sent to the Reuters search page.

    Example:
        RC = Reuters_Crawler(queue.Queue(), threading.Semaphore(1), "Microsoft")
        df = RC.parse_to_dataframe()
    """
    def __init__(self, queue, semaphore, query):
        threading.Thread.__init__(self)
        self.queue = queue
        self.semaphore = semaphore
        self.query = query
        self.driver_path = r"./chromedriver.exe"
        # XPath of the button that is clicked after every scroll step.
        self.next_button = '//*[@id="content"]/section[2]/div/div[1]/div[4]/div/div[4]/div[1]'

    def run(self):
        """
        Thread entry point: crawl while holding the semaphore.

        The semaphore is released even when the crawl raises, so a failing
        crawler cannot block the threads still waiting for a slot. The
        exception itself still propagates.
        """
        with self.semaphore:
            self.parse_to_dataframe()

    def parse_to_dataframe(self):
        """
        Crawl the search results for ``self.query`` and persist them.

        The result is dumped to ``./data/reuters_news_<query>_v1.joblib``
        (query lower-cased).

        Returns:
            pandas.DataFrame with the columns title, date, query and url.
        """
        # Local import keeps this block self-contained.
        from urllib.parse import quote_plus

        # Encode the query so that spaces, '&', '#', ... cannot break the URL.
        self.url = "https://www.reuters.com/search/news?blob={}&dateRange=all".format(
            quote_plus(self.query))
        # Open driver
        self.driver = webdriver.Chrome(self.driver_path)
        try:
            self.driver.get(self.url)
            time.sleep(2)
            # Scroll down page
            self.scroll_to_bottom()
            page_source = self.driver.page_source
        finally:
            # Always close the browser, otherwise a failed crawl leaves a
            # Chrome process behind.
            self.driver.quit()
        # Parsing
        soup = BeautifulSoup(page_source, "html.parser")
        news_list = soup.find_all(name="div", attrs={"class": "search-result-content"})
        df = pd.DataFrame(list(self.get_news_list(news_list)),
                          columns=["title", "date", "query", "url"])
        joblib.dump(df, "./data/reuters_news_{}_v1.joblib".format(self.query.lower()), compress=3)
        return df

    def check_exists_by_xpath(self, xpath):
        """Return True when the current page has an element matching `xpath`."""
        try:
            self.driver.find_element_by_xpath(xpath)
        except NoSuchElementException:
            return False
        return True

    def _get_scroll_position(self):
        """Return the current vertical scroll offset of the page."""
        # The fallback branch reads `.scrollTop` so that a number (not an
        # element) is returned when `window.pageYOffset` is unavailable.
        return self.driver.execute_script(
                ("return (window.pageYOffset !== undefined) ?"
                 " window.pageYOffset : (document.documentElement ||"
                 " document.body.parentNode || document.body).scrollTop;"))

    def scroll_to_bottom(self):
        """
        Scroll down and click the "next" button repeatedly until the scroll
        position no longer changes between two iterations.
        """
        old_position = 0
        new_position = None

        while new_position != old_position:
            # Get old scroll position
            old_position = self._get_scroll_position()

            # Sleep and Scroll
            time.sleep(3 + random.random())
            self.driver.execute_script((
                    "var scrollingElement = (document.scrollingElement ||"
                    " document.body);scrollingElement.scrollTop ="
                    " scrollingElement.scrollHeight;"))

            try:
                button = self.driver.find_element_by_xpath(self.next_button)
                self.driver.execute_script("arguments[0].click()", button)
            except NoSuchElementException:
                # No button on the page: nothing to click. This handler
                # must come before WebDriverException, which is its base
                # class.
                pass
            except WebDriverException as error:
                print('Click failed...')
                print(error)
            time.sleep(3 + random.random())

            # Get new position
            new_position = self._get_scroll_position()

    def get_news_list(self, news_list):
        """
        Yield one ``[title, date, query, url]`` row per search-result element.

        Parameters:
            news_list: iterable of BeautifulSoup tags, one per search result.
        """
        for news in news_list:
            link = news.find(name="a")
            date = news.find(name="h5", attrs={"class": "search-result-timestamp"}).text
            # NOTE(review): EDT and EST are both mapped to "UTC-8", while US
            # Eastern time is UTC-4 / UTC-5. Left unchanged because already
            # saved files carry the -08:00 offset - confirm before fixing.
            date = parser.parse(date, tzinfos={"EDT": "UTC-8", "EST": "UTC-8"})
            url = "https://www.reuters.com" + link.get("href")
            yield [link.text, date, self.query, url]

In [None]:
# Companies to crawl; one Reuters_Crawler thread is created per entry.
subjects = ["Microsoft", "Apple"]
paper_df = pd.DataFrame()

# The semaphore allows at most two crawlers to work at the same time.
semaphore = threading.Semaphore(2)
my_queue = queue.Queue()
threads = []

for subject in subjects:
    crawler = Reuters_Crawler(my_queue, semaphore, subject)
    threads.append(crawler)

# Start every crawler first, then wait until all of them have finished.
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()

In [91]:
# Load a saved Microsoft news frame and display it.
# NOTE(review): the file name follows the pattern Reuters_Crawler writes,
# but that class saves under ./data/ while this cell reads ../data/news/ -
# verify the path. joblib.load unpickles, so only load trusted files.
news_path = "../data/news/reuters_news_microsoft_v1.joblib"
df_test = joblib.load(news_path)
df_test

Unnamed: 0,title,date,query,url
0,Microsoft to adapt its cloud software for heal...,2020-05-19 11:00:00-08:00,Microsoft,https://www.reuters.com/article/idUSL1N2D100I
1,Microsoft to invest $1.5 bln in Italian cloud ...,2020-05-08 10:29:00-08:00,Microsoft,https://www.reuters.com/article/idUSL8N2CQ4PU
2,Microsoft to adapt its cloud software for heal...,2020-05-19 11:29:00-08:00,Microsoft,https://www.reuters.com/article/idUSKBN22V27Z
3,Microsoft to invest $1 bln in Poland - statement,2020-05-05 04:46:00-08:00,Microsoft,https://www.reuters.com/article/idUSW8N2AY00G
4,Microsoft to invest $1 billion in Polish cloud...,2020-05-05 05:19:00-08:00,Microsoft,https://www.reuters.com/article/idUSKBN22H0WP
...,...,...,...,...
905,Why Nokia didn't sell its patents to Microsoft,2013-09-03 19:05:00-08:00,Microsoft,https://www.reuters.com/article/idUSBRE9820ZZ2...
906,Microsoft needs Minecraft to boost mobile ambi...,2014-09-12 16:48:00-08:00,Microsoft,https://www.reuters.com/article/idUSKBN0H72EV2...
907,Microsoft revamps Hotmail as social-friendly O...,2012-07-31 19:36:00-08:00,Microsoft,https://www.reuters.com/article/idUSBRE86U10Z2...
908,Microsoft hires ex-FTC Google expert as lobbyist,2012-03-01 19:09:00-08:00,Microsoft,https://www.reuters.com/article/idUSTRE82100B2...


In [None]:
def get_SP500():
    """
    Yield the S&P 500 constituents listed on slickcharts.com.

    There are 505 symbols due to several companies with two share classes. For example, 
    Google's parent company Alphabet has Class A (GOOGL) and Class C (GOOG) shares in the index.
    When companies are removed and added to the index the membership list may temporarily show 
    both the removed company and added company.

    Yields:
        list: ``[company, symbol, weight]``, the text of the 2nd, 3rd and
        4th cell of each table row (all strings).

    Raises:
        requests.HTTPError: the page answered with an error status.
        ValueError: the page contains no ``<tbody>`` element.

    Because this is a generator, the request is sent (and the errors above
    are raised) when iteration starts, not when the function is called.
    """
    url = "https://www.slickcharts.com/sp500"
    response = requests.get(url, timeout=10)
    # Fail with the HTTP status instead of an opaque AttributeError further
    # down when the site rejects the request.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    tbody = soup.find(name="tbody")
    if tbody is None:
        raise ValueError("No <tbody> found at {}".format(url))

    for tr in tbody.find_all(name="tr"):
        # Look the cells up once per row instead of once per column.
        cells = tr.find_all(name="td")
        if len(cells) < 4:
            # Not a constituent row (too few columns); skip it.
            continue
        company, symbol, weight = (
            "".join(cell.find_all(text=True)) for cell in cells[1:4])
        yield [company, symbol, weight]