# Workflow:

- Get text from a given stock's FinViz --> Yahoo news page
- Compress this text with NLP summarization algorithms
- Feed this compressed data into ChatGPT for generative abractive summarization 
- For each stock, this will create an equity research report

In [109]:
# Import Parsing Tools
import requests
from bs4 import BeautifulSoup

# Import NLP Tools
import sumy
from sumy.summarizers.luhn  import LuhnSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

import openai


# Import Utils
import pandas as pd
import numpy as np
import pickle

Function to get an article's main text from a URL

In [350]:
def get_article_text(url: str) -> str:
    """ Generate a string that includes all article content. This has two ways of execution:

        1) If a Yahoo Finance "Continue Reading" gateway link is present.
        2) In the absence of a Yahoo Finance gateway.

        In either scenario, the ultimate goal of extracting all relevant content within a news article is acheived.

    Args:
        url (str):

    Returns:
        str: Article content.
    """


    # Send a GET request to the URL - this is Yahoo Finance gateway to the full article
    yahoo_gateway_response = requests.get(url, headers={'User-Agent': 'Custom'})

    # Check if the request was successful
    try:
        
        # Parse the HTML content of the page
        soup = BeautifulSoup(yahoo_gateway_response.content, 'html.parser')

        # Get full article link from <a class="link caas-button"> tag. This is a Yahoo Finance specific tag that contain the "Continue Reading" hyperlink.
        article_link = soup.find('a', class_ ='link caas-button')['href']
        
        # ------------------------------ Get Full Article Text ------------------------------

        # Send a GET request to the URL - this is the full article that news text will be extracted from
        article = requests.get(article_link, headers={'User-Agent': 'Custom'})

        # Check if the request was successful
        try:

            # Parse the HTML content of the page
            soup = BeautifulSoup(article.content, 'html.parser')

            # Get all <p> tags which contain the news article's text
            article_txt_iter = soup.find_all('p')

            # Initialize str to article's full text
            article_txt = ''

            # Iterate through <p> tags and store all information
            for txt in article_txt_iter:
                article_txt += txt.text.strip()
            
            return article_txt

        except:
            print(f'Reponse Error - Final Article: {article.status_code}')
            print(f'Problem Link: {article_link}')

    except:

        # If the first method of traveling through a Yahooo Finance gateway doesn't work, this normally means the article was directly linked by FinViz. 
        # Therefore, try to extract text directly from the Yahoo link.
        try:
            # ------------------------------ Get Full Article Text - NO Yahoo Gateway ------------------------------

            # Send a GET request to the URL - this is the full article that news text will be extracted from
            article = requests.get(url, headers={'User-Agent': 'Custom'})

            # Check if the request was successful

            # Parse the HTML content of the page
            soup = BeautifulSoup(article.content, 'html.parser')

            # Get all <p> tags which contain the news article's text
            article_txt_iter = soup.find_all('p')

            # Initialize str to article's full text
            article_txt = ''

            # Iterate through <p> tags and store all information
            for txt in article_txt_iter:
                article_txt += txt.text.strip()
            
            return article_txt
        
        except:
            print(f'Reponse Error - Yahoo Gateway: {yahoo_gateway_response.status_code}')
            print(f'Problem Link: {url}')
    
    return ''

Function to get latest news for a stock given the stock's FinViz URL

In [348]:
def get_stock_news(url: str, n_articles = 10) -> np.array:
    """ Pass a given stock's FinViz URL to this function to store all relevent news in a single string. 
        We initially leverage a FinViz URL which then takes us to a series of Yahoo Finance articles. 
        This is to circumvent Yahoo Finance's ads framework, making it easier to immediately get relevant information. 
        
        The workflow proceeds as follows:
        1) Initiate a GET request to the given URL and process the page's content with BeautifulSoup's html.parser.
        2) Iterate through all of FinViz's <div> tags that relate to news links. 
        3) For each relevant <div>, get all news links via each <a> tag's ['href'] value and store in np.array "links".
        4) Iterate through each link in "links" (np.array).
        5) For each link, Yahoo Finance has a "Continue Reading" button which contains the link to the final news article that must be parsed.
        6) Thus, we need to follow that link and extract all text from the body of the final article by iterating through each <p> tag within a particular <div>.
                

    Args:
        url (str): 
        n_articles (int, optional): Number of articles to injest per stock. Defaults to 10.

    Returns:
        _type_: _description_
    """
    # Send a GET request to the URL
    # In this case, the site is filtering on the user agent, it looks like they are blacklisting Python, setting it to almost any other value already works:
    response = requests.get(url, headers={'User-Agent': 'Custom'})
    
    # Check if the request was successful
    if response.status_code == 200:

        # ------------------------ Scrape all news links from a given stock's FinViz page ------------------------ 

        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Initialize np.array to store all news links from FinViz page
        links = np.array([])
        
        # Create an iterable object of all <div> tags that contain news links. This is a FinViz specific class. 
        div_iter = soup.find_all('div', class_ ='news-link-container')

        # Iterate through each div and append their respective news links in np.array
        for div in div_iter:
            # <a> tag indicates a link
            links_iter = div.find_all('a') 

            for a in links_iter:
                links = np.append(links, a['href'])

        # ------------------------ Store a select list of article content from scraped URLs ------------------------ 

        # Initialize stock's np.array to store news content
        stock_news = np.array([])

        # Iterate through the all links on the stock's FinViz page
        for i, article in enumerate(links):
            
            # Get article content
            tmp_article_text = get_article_text(article)
            
            # If the text has not already been added (i.e., a unique article has been parsed)
            if np.sum(np.isin(tmp_article_text, stock_news)) == 0:

                print(article)
                
                # Store text
                stock_news = np.append(stock_news, tmp_article_text)
                
                # Stop adding text if we have more than N articles recorded
                if len(stock_news) == n_articles:
                    break

        return stock_news

    else:
        return None

Function to compress and sparsely summarize article content (text) from a np.array of strings 

In [351]:
def compress_articles(news_articles: np.array, n_sentences=5) -> str:
    """ Compress and summarizes a np.array of news article content into a single string. This will be done with the extractive Luhn Summarization algorithm.
        Here, we can specify approximately how many sentences we want in the summary. If the summary is too long (according to ChatGPT3 max tokens), 
        decrease the number of sentences by 1 recursively until the token_size < max_token_size for ChatGPT3.


    Args:
        news_articles (np.array): Array of news article content (strings).
        n_sentences (int, optional): Number of sentences for the Luhn Summarization Algorithm. Defaults to 5.

    Returns:
        str: A given stock's compressed news summary.
    """

    article_summary = "" 
    
    # article_summary = np.array([])

    for article in news_articles:

        parser = PlaintextParser.from_string(article, Tokenizer("english"))

        summarizer = LuhnSummarizer()

        # Summarize using sumy Luhn
        summary = summarizer(parser.document, n_sentences)

        for sentence in summary:
            article_summary += str(sentence)
            article_summary += '\n'
                    
        #article_summary = np.append(article_summary, tmp_article_summary)
    
    # Recursively call compress_articles function until a short-enough response is generated
    if len(article_summary.strip()) > 15000:
        
        return compress_articles(news_articles, n_sentences-1)
            
    print(f'Compression Length {len(article_summary.strip())}')
    
    return article_summary

In [358]:
class portfolio_reports():
    """ Class to generate equity research reports for a given portfolio's constituents. This class will leverage the latest news articles sourced from FinViz to then formulate 
        a compressed summary of significant news. These reports will later be fed into ChatGPT3 for a generative summary and formal equity reserach report.
    """

    def __init__(self, tickers: np.array, n_articles = 10) -> None:
        """

        Args:
            tickers (np.array): Portfolio constituents.
            n_articles (int, optional): Number of articles to injest per stock. Defaults to 10.
        """
        self.tickers = tickers
        self.n_articles = n_articles
        self.portfolio_reports = self.get_portfolio_reports()

    def scrape(self, url: str) -> str:
        """ Generate a string that aggregates a news article's content. This has two ways of execution:

            1) If a Yahoo Finance "Continue Reading" gateway link is present.
            2) In the absence of a Yahoo Finance gateway.

            In either scenario, the ultimate goal of extracting all relevant content within a news article is acheived.

        Args:
            url (str):

        Returns:
            str: Article content.
        """


        # Send a GET request to the URL - this is Yahoo Finance gateway to the full article
        yahoo_gateway_response = requests.get(url, headers={'User-Agent': 'Custom'})

        # Check if the request was successful
        try:
            
            # Parse the HTML content of the page
            soup = BeautifulSoup(yahoo_gateway_response.content, 'html.parser')

            # Get full article link from <a class="link caas-button"> tag. This is a Yahoo Finance specific tag that contain the "Continue Reading" hyperlink.
            article_link = soup.find('a', class_ ='link caas-button')['href']
            
            # ------------------------------ Get Full Article Text ------------------------------

            # Send a GET request to the URL - this is the full article that news text will be extracted from
            article = requests.get(article_link, headers={'User-Agent': 'Custom'})

            # Check if the request was successful
            try:

                # Parse the HTML content of the page
                soup = BeautifulSoup(article.content, 'html.parser')

                # Get all <p> tags which contain the news article's text
                article_txt_iter = soup.find_all('p')

                # Initialize str to article's full text
                article_txt = ''

                # Iterate through <p> tags and store all information
                for txt in article_txt_iter:
                    article_txt += txt.text.strip()
                
                return article_txt

            except:
                print(f'Reponse Error - Final Article: {article.status_code}')
                print(f'Problem Link: {article_link}')

        except:

            # If the first method of traveling through a Yahooo Finance gateway doesn't work, this normally means the article was directly linked by FinViz. 
            # Therefore, try to extract text directly from the Yahoo link.
            try:
                # ------------------------------ Get Full Article Text - NO Yahoo Gateway ------------------------------

                # Send a GET request to the URL - this is the full article that news text will be extracted from
                article = requests.get(url, headers={'User-Agent': 'Custom'})

                # Check if the request was successful

                # Parse the HTML content of the page
                soup = BeautifulSoup(article.content, 'html.parser')

                # Get all <p> tags which contain the news article's text
                article_txt_iter = soup.find_all('p')

                # Initialize str to article's full text
                article_txt = ''

                # Iterate through <p> tags and store all information
                for txt in article_txt_iter:
                    article_txt += txt.text.strip()
                
                return article_txt
            
            except:
                print(f'Reponse Error - Yahoo Gateway: {yahoo_gateway_response.status_code}')
                print(f'Problem Link: {url}')
        
        return ''

    def get_stock_news(self, url: str, n_articles = 10) -> np.array:
        """ Pass a given stock's FinViz URL to this function to store all relevent news in a single string. 
            We initially leverage a FinViz URL which then takes us to a series of Yahoo Finance articles. 
            This is to circumvent Yahoo Finance's ads framework, making it easier to immediately get relevant information. 
            
            The workflow proceeds as follows:
            1) Initiate a GET request to the given URL and process the page's content with BeautifulSoup's html.parser.
            2) Iterate through all of FinViz's <div> tags that relate to news links. 
            3) For each relevant <div>, get all news links via each <a> tag's ['href'] value and store in np.array "links".
            4) Iterate through each link in "links" (np.array).
            5) For each link, Yahoo Finance has a "Continue Reading" button which contains the link to the final news article that must be parsed.
            6) Thus, we need to follow that link and extract all text from the body of the final article by iterating through each <p> tag within a particular <div>.
                    

        Args:
            url (str): 
            n_articles (int, optional): Number of articles to injest per stock. Defaults to 10.

        Returns:
            _type_: _description_
        """
        # Send a GET request to the URL
        # In this case, the site is filtering on the user agent, it looks like they are blacklisting Python, setting it to almost any other value already works:
        response = requests.get(url, headers={'User-Agent': 'Custom'})
        
        # Check if the request was successful
        if response.status_code == 200:

            # ------------------------ Scrape all news links from a given stock's FinViz page ------------------------ 

            # Parse the HTML content of the page
            soup = BeautifulSoup(response.content, 'html.parser')

            # Initialize np.array to store all news links from FinViz page
            links = np.array([])
            
            # Create an iterable object of all <div> tags that contain news links. This is a FinViz specific class. 
            div_iter = soup.find_all('div', class_ ='news-link-container')

            # Iterate through each div and append their respective news links in np.array
            for div in div_iter:
                # <a> tag indicates a link
                links_iter = div.find_all('a') 

                for a in links_iter:
                    links = np.append(links, a['href'])

            # ------------------------ Store a select list of article content from scraped URLs ------------------------ 

            # Initialize stock's np.array to store news content
            stock_news = np.array([])

            # Iterate through the all links on the stock's FinViz page
            for i, article in enumerate(links):
                
                # Get article content
                tmp_article_text = self.scrape(article)
                
                # If the text has not already been added (i.e., a unique article has been parsed)
                if np.sum(np.isin(tmp_article_text, stock_news)) == 0:

                    print(article)
                    
                    # Store text
                    stock_news = np.append(stock_news, tmp_article_text)
                    
                    # Stop adding text if we have more than N articles recorded
                    if len(stock_news) == n_articles:
                        break

            return stock_news

        else:
            return None


    def compress_articles(self, stock_news: np.array, n_sentences=5) -> str:
        """ Compress and summarizes a np.array of news article content into a single string. This will be done with the extractive Luhn Summarization algorithm.
            Here, we can specify approximately how many sentences we want in the summary. If the summary is too long (according to ChatGPT3 max tokens), 
            decrease the number of sentences by 1 recursively until the token_size < max_token_size for ChatGPT3.


        Args:
            stock_news (np.array): Array of news article content (strings).
            n_sentences (int, optional): Number of sentences for the Luhn Summarization Algorithm. Defaults to 5.

        Returns:
            str: A given stock's compressed news summary.
        """

        article_summary = "" 

        for article in stock_news:

            parser = PlaintextParser.from_string(article, Tokenizer("english"))

            summarizer = LuhnSummarizer()

            # Summarize using sumy Luhn
            summary = summarizer(parser.document, n_sentences)

            for sentence in summary:
                article_summary += str(sentence)
                article_summary += '\n'
                                
        # Recursively call compress_articles function until a short-enough response is generated
        if len(article_summary.strip()) > 15000:
            
            return compress_articles(stock_news, n_sentences-1)
        
        # Print token count of newly compressed news content
        print(f'Compression Length {len(article_summary.strip())}')

        return article_summary



    def get_portfolio_reports(self) -> dict:
        """ Iterate through portfolio holdings, get their respective "self.n_articles" most recent news articles, compress the information via extractive NLP, and store in "portfolio_reports" dict.

        Returns:
            dict: Dictionary {ticker : summary} 
        """

        portfolio_reports = {}

        for stock in self.tickers:
            
            print('=='*100)
            print(f'Getting report for {stock}:')

            prompt = f"You are a Wall Street fundamental stock portfolio manager and researcher who worked at both Citadel and Millennium, the hedge funds. You have a PhD in Mathematics and AI from MIT. Only using the following information, create a predictive stock market research report by extracting key financial/economic/business factors only from the following information to explain every factor that would impact {stock}'s stock price in the future and explain why these factors will dictate the companys price. Make sure there is an introduction and that it is a numerically listed report:"
            url = f'https://finviz.com/quote.ashx?t={stock}&p=d'

            # Store each news article content in np.array for a given stock
            stock_news = self.get_stock_news(url=url, n_articles=self.n_articles)

            # Compress and summarize all news articles into a single string
            prompt += self.compress_articles(stock_news=stock_news)
            
            portfolio_reports[stock] = prompt
        
        return portfolio_reports     

In [359]:
tickers = ['FTAI', 'AMT', 'NEE', 'TDOC', 'INTC', 'FISV', 'DAL', 'ISRG', 'GOOS', 'TXN', 'TSM', 'MHK', 'ACLS', 'EPD', 'PLYM', 'ED']
portfolio_reports = portfolio_reports(tickers).portfolio_reports

Getting report for FTAI:
https://finance.yahoo.com/news/returns-ftai-aviation-nasdaq-ftai-133926735.html
https://finance.yahoo.com/news/ftai-aviation-ltd-ftai-undervalued-121418596.html
https://finance.yahoo.com/news/buy-ftai-aviation-ltd-nasdaq-131835783.html
https://finance.yahoo.com/news/ftai-aviation-unical-aviation-acquire-113000304.html
https://finance.yahoo.com/news/ftai-aviation-ltd-nasdaq-ftai-152935171.html
https://finance.yahoo.com/news/ftai-aviation-ltd-increases-revolver-113000959.html
https://finance.yahoo.com/news/past-ftai-aviation-nasdaq-ftai-151146884.html
https://finance.yahoo.com/news/ftai-aviation-ltd-announces-closing-120000058.html
https://finance.yahoo.com/news/fortress-transportation-infrastructure-investors-llc-132100002.html
https://finance.yahoo.com/news/news-flash-analysts-just-made-104126142.html
Compression Length 11112
Getting report for AMT:
https://finance.yahoo.com/m/5776426e-979b-30eb-9b68-c05ed6b51fc1/the-best-stocks-to-invest.html
https://finance.y

In [149]:
for stock, r in reports.items():
    print('=='*100)
    print(f'{stock} Report:')
    print(r)

FTAI Report:
You are a Wall Street fundamental stock portfolio manager and researcher who worked at both Citadel and Millennium, the hedge funds. You have a PhD in Mathematics and AI from MIT. Only using the following information, create a predictive stock market research report by extracting key financial/economic/business factors only from the following information to explain every factor that would impact FTAI's stock price in the future and explain why these factors will dictate the companys price:So on that note, FTAI Aviation (NASDAQ:FTAI) looks quite promising in regards to its trends of return on capital.For those that aren't sure what ROCE is, it measures the amount of pre-tax profits a company can generate from the capital employed in its business.
Most Read from BloombergAdani’s 413-Page Hindenburg Reply Aims to Calm Before Share SaleRussia Can’t Replace the Energy Market Putin BrokeFed Set to Shrink Rate Hikes Again as Inflation SlowsUkraine Latest: Defense Minister Wants ‘

In [338]:
class PDF(FPDF):

    def __init__(self) -> None:
        super().__init__()        

    def footer(self):
        # Position at 1.5 cm from bottom
        self.set_y(-15)
        # Arial italic 8
        self.set_font('Courier', 'I', 8)
        # Text color in gray
        self.set_text_color(128)
        # Page number
        self.cell(0, 10, 'Page ' + str(self.page_no()), 0, 0, 'C')


In [347]:
from fpdf import FPDF
import datetime as dt

for stock, r in reports.items():
    with open(fr"portfolio_reports_1_31_22\{stock}.txt", "r") as file:

        # Read the contents of the txt file
        report_txt = file.read().encode('latin-1', 'replace').decode('latin-1')

        # save FPDF() class into
        # a variable pdf
        date = dt.date.today()
        title = f'TAMID Equity Research Report {date}: {stock}'
        
        pdf = PDF() # PDF(title=title)
        
        # Add a page
        pdf.add_page()

        # Use Logo for Header
        image_w = 40
        image_h = 30
        pdf.image('TAMID Miami Logo.png',x = 210/2 - image_w/2, y = 10, w = image_w, h = image_h)
        # Line break
        pdf.ln(30)

        # Courier bold 12
        pdf.set_font('Courier', 'B', 12)
        # Calculate width of title and position
        w = pdf.get_string_width(title) + 6
        pdf.set_x((210 - w) / 2)
        # Colors of frame, background and text
        pdf.set_fill_color(255, 255, 255)
        # Title
        pdf.cell(w, 9, title, 'FJ', 1)
        # Line break
        pdf.ln(10)
        
        # set style and size of font
        # that you want in the pdf
        pdf.set_font("Courier", size = 10)        
        
        # Define width height, and justification of each line
        pdf.multi_cell(190, 10, txt = report_txt, align='FJ')
                
        # save the pdf with name .pdf
        pdf.output(fr"portfolio_reports_1_31_22/{stock}_report.pdf")


In [298]:
all_reports = ''
for stock, r in reports.items():
    with open(fr"1_31_22_portfolio_reports\{stock}.txt", "r") as file:

        # Read the contents of the txt file
        report_txt = file.read()
        
        # Create a pdf for the equity research report
        pdf_file = fr'1_31_22_portfolio_reports/{stock}_report.pdf'

        report = canvas.Canvas(pdf_file, pagesize=letter)



        lines = {}
        i = 0
        j = 0

        for word in report_txt.split():
            i += 1

            if i == 6:
                j += 1
            
            try:
                lines[j] = word    
            except:
                lines[j] += word

        size = 11
        y = 790
        
        for i, line in lines.items():
            report.setFont("Helvetica", size)
            report.drawString(25, y, line)
            #y = y-size*1.2
            #size = size+0.5
            y = y - 10
            
        report.save()    

PermissionError: [Errno 13] Permission denied: '1_31_22_portfolio_reports/ACLS_report.pdf'

# Most likely omit the automated ChatGPT3 implementation due to its limited token capacity. 
If token capacity increases (potentially with a paid subscription), then an inline implementation could work smoothly.

In [107]:
final_compression = ''
for i in compressed_articles:
    final_compression += f'\n{i}'

len(final_compression.strip())

9744

Function to recursively attempt to leverage ChatGPT-3

In [108]:
import openai

openai.api_key = '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'

def askGPT(text, max_tokens=3900):

    if len(text.strip()) < 3900:
        
        try:
            reponse = openai.Completion.create(engine = 'text-davinci-003',
                                            prompt = text,
                                            n=1,
                                            max_tokens = max_tokens - len(text.strip()),
                                            temperature = 0.3
                                            )
            
            return reponse
                            
        except:
            return askGPT(text, max_tokens-200)

    else:
        return ''

Create a research report for every compressed article

In [74]:
prompt = "You are a Wall Street fundamental stock portfolio manager and researcher who worked at both Citadel and Millennium, the hedge funds. You have a PhD in Mathematics and AI from MIT. Only using the following information, create a predictive stock market research report by extracting key financial/economic/business factors only from the following information to explain every factor that would impact Royal Caribbean's stock price in the future and explain why these factors will dictate the companys price:"

research_report = ''

for article in compressed_articles:
    response = askGPT(prompt + '\n' + article) # + '\n' + 'Finish the code. Do not print the full code again, just a missing part from last answer.')
    research_report += response['choices'][0]['text']

yes
yes
yes
yes
yes


# Using GPT-2 Transformers for Equity Research Report / Article Compression

The summarization quality : time ratio is not worth summarizing articles with GPT2.
Thus, we should use a much faster compression algorithm to conduct extractive summarization, then pass the aggregated summaries to ChatGPT3 manually. 
The core advantage in passing this to ChatGPT3 is that it will act as an agent in whatever capacity you assign it (i.e., equity researcher at a hedge fund).

In [90]:
# Importing model and tokenizer
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer


def article_compression_transformers(article_text):

    # Instantiating the model and tokenizer with gpt-2
    tokenizer=GPT2Tokenizer.from_pretrained('gpt2')
    model=TFGPT2LMHeadModel.from_pretrained('gpt2')

    compressed_articles = np.array([])

    for article in article_text:

        # Encoding text to get input ids & pass them to model.generate()
        inputs=tokenizer.encode(article, return_tensors='tf')
        summary_ids=model.generate( inputs, 
                                    max_length=15000, 
                                    no_repeat_ngram_size=4,
                                    early_stopping=True)

        GPT_summary=tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        print(GPT_summary)

        compressed_articles = np.append(compressed_articles, GPT_summary)

    return compressed_articles

In [99]:
# The summarization quality : time ratio is not worth summarizing articles with GPT2.
# Thus, we should use a much faster compression algorithm to conduct extractive summarization, then pass the aggregated summaries to ChatGPT3 manually. 
# The core advantage in passing this to ChatGPT3 is that it will act as an agent in whatever capacity you assign it (i.e., equity researcher at a hedge fund).

compressed_research_report = article_compression_transformers(article_text=[x])