In [6]:
# Uncomment to install/upgrade yfinance (%pip targets the running kernel's environment):
# %pip install -U yfinance

In [7]:
import re
import os
import requests
import pandas as pd
import yfinance as yf
from bs4 import BeautifulSoup as BS
from pandas_datareader import data as pdr
from datetime import datetime, timedelta, timezone

In [8]:
class StockDataGen(object):
    """Collect news rows and daily stock prices for a list of ticker symbols.

    For every ticker in ``candidates`` the generator scrapes news rows from the
    wallmine.com screener endpoint and downloads price history via
    ``pdr.get_data_yahoo``. When ``save`` is true the results are written as
    CSV files under ``news/`` and ``stock/``.

    Args:
        candidates: Iterable of ticker symbols (e.g. ``['AAL', 'DAL']``).
            ``None`` is treated as "nothing to collect" by ``data_gen``.
        days: Size of the look-back window, in days.
        save: Whether to persist the collected frames as CSV files.
    """

    # Endpoint queried by get_news and the column layout of its result.
    _NEWS_URL = 'https://wallmine.com/screener/async'
    _NEWS_COLUMNS = ("datetime", "title", "content")
    # Seconds to wait for the news endpoint before giving up instead of
    # hanging the notebook forever.
    _REQUEST_TIMEOUT = 30

    def __init__(self, candidates, days:int=180, save:bool=True):
        self.candidates = candidates
        self.days = days
        self.save = save

        # exist_ok avoids the check-then-create race of os.path.exists().
        for folder in ('news', 'stock'):
            os.makedirs(folder, exist_ok=True)

        # yfinance hook applied before pdr.get_data_yahoo is used in
        # get_stock_price.
        yf.pdr_override()

    def daily_stock_return(self, df):
        """Return a new frame with a "Daily Return" column appended.

        The return of a row is ``(close - previous_close) / previous_close``,
        or 0 when the previous close is 0. The first row has no previous close
        and is dropped from the result.

        The input frame is not modified, so calling this method repeatedly on
        the same data is safe.

        Args:
            df: DataFrame with a "Close" column, ordered oldest to newest.

        Returns:
            A DataFrame without the first row of ``df`` and with the extra
            "Daily Return" column as the last column.
        """
        close = df["Close"]
        previous = close.shift(1)
        # Dividing by a zero close yields inf/NaN; such rows are defined as 0.
        returns = ((close - previous) / previous).where(previous != 0, 0)
        return df.assign(**{"Daily Return": returns}).iloc[1:, :]

    def get_stock_price(self, query_words):
        """Download the last ``self.days`` days of prices for one ticker.

        Args:
            query_words: Ticker symbol passed to ``pdr.get_data_yahoo``.

        Returns:
            The downloaded frame after ``daily_stock_return`` has been applied.
            When ``self.save`` is true it is also written to
            ``stock/<TICKER>_<start>_<end>.csv``.
        """
        # Take a single UTC "now" so both ends of the window share one clock;
        # mixing UTC and local time could shift the window by a day.
        now = datetime.now(tz=timezone.utc)
        start_date = (now - timedelta(self.days)).strftime("%Y-%m-%d")
        end_date = now.strftime("%Y-%m-%d")

        data = pdr.get_data_yahoo(query_words, start=start_date, end=end_date)
        data = self.daily_stock_return(data)
        ## for saving
        if self.save:
            data.to_csv(
                "stock/{}_{}_{}.csv".format(query_words.upper(), start_date.replace("-",""), end_date.replace("-",""))
            )
        return data

    def get_news(self, query):
        """Scrape the news rows listed for one ticker, page by page.

        Args:
            query: Ticker symbol, sent as both the ``s`` and ``symbols``
                query parameters.

        Returns:
            A DataFrame with the columns "datetime", "title" and "content",
            sorted ascending by "datetime". It is empty (but keeps those
            columns) when nothing was found. When ``self.save`` is true it is
            also written to ``news/<TICKER>_<YYYYMMDD>.csv``.

        Raises:
            requests.RequestException: On network failure or timeout.
            IndexError: If a matched row lacks the expected link, tooltip or
                time element.
        """
        timestamp = datetime.now().strftime("%Y%m%d")
        print("[{}] start collecting '{}'...".format(timestamp, query))

        # Compiled once: removes the leading "<small>...<br />" part of the
        # tooltip markup so only the remaining text is kept as content.
        header_pattern = re.compile('<small>.*\n<br />\n')
        records = []
        page = 1

        while True:
            # Passing params lets requests URL-encode the values.
            ## nd -> days
            params = {
                'd': 'd',
                'fo': 's',
                'nd': self.days,
                'o': 'm',
                'page': page,
                'r': 'n',
                's': query,
                'symbols': query,
            }
            response = requests.get(
                self._NEWS_URL, params=params, timeout=self._REQUEST_TIMEOUT
            )
            soup = BS(response.text, 'html5lib')
            rows = soup.find_all('tr', {'class': "js-clickable-row clickable-row"})

            print("page: {}".format(page))
            for row in rows:
                title = row.find_all('a', {'target': "_blank"})[0].getText()
                # The tooltip attribute itself holds HTML: a <time> element
                # followed by the text of the news item.
                tooltip = row.find_all('td', {'class': "js-tooltip"})[0]['title']
                tooltip_soup = BS(tooltip, 'html5lib')
                published = tooltip_soup.find_all('time', {'class': "timeago"})[0]['title']
                content = header_pattern.sub('', tooltip.strip())

                records.append(
                    {"datetime": published, "title": title, "content": content}
                )

            # Stop when the response carries no pagination list (the original
            # termination signal) or no rows at all, which guards against
            # looping forever on empty pages.
            if not rows or not soup.find_all('li', {'class': "page-item"}):
                break
            page += 1

        # Explicit columns keep the sort below valid for an empty result.
        df = pd.DataFrame(records, columns=list(self._NEWS_COLUMNS))
        # NOTE(review): this sorts the raw attribute strings; it is only
        # chronological if they are in a lexicographically sortable format
        # (e.g. ISO 8601) - confirm against the site's markup.
        df = df.sort_values(by="datetime", ascending=True)
        if self.save:
            df.to_csv("news/{}_{}.csv".format(query.upper(), timestamp), index=False)
        return df

    def data_gen(self):
        """Collect news and stock prices for every ticker in ``candidates``."""
        if self.candidates is None:
            return
        for ticker in self.candidates:
            self.get_news(ticker)
            self.get_stock_price(ticker)


In [9]:
## only stock ticker symbols are supported as candidates
# candidates = ['ZM']
candidates = ['AAL', 'DAL']
days, save = 184, True

## build the StockDataGen object, then collect news and prices per candidate
data_gen = StockDataGen(candidates, days=days, save=save)
data_gen.data_gen()


[20201203] start collecting 'AAL'...
page: 1
page: 2
page: 3
page: 4
[*********************100%***********************]  1 of 1 completed
[20201203] start collecting 'DAL'...
page: 1
page: 2
page: 3
page: 4
page: 5
[*********************100%***********************]  1 of 1 completed


In [10]:
# ## testing for one
# candidate = "ZM"
# dg = StockDataGen(None, 180, True)
# dg.get_stock_price(candidate)
# dg.get_news(candidate)

In [11]:
# Emit a completion marker once this cell is reached.
print("done!")

done!
