#### 1. Install and Import Baseline Dependencies

In [1]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

  from .autonotebook import tqdm as notebook_tqdm


#### 2. Setup Summarisation Model

In [2]:
model_name = "human-centered-summarization/financial-summarization-pegasus" # Summarisation Model Downloaded 
tokenizer = PegasusTokenizer.from_pretrained(model_name) # De Coder
model = PegasusForConditionalGeneration.from_pretrained(model_name) # Summarise code ID

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at human-centered-summarization/financial-summarization-pegasus and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### 3. Summarise a Single Article

In [3]:
url = "https://au.finance.yahoo.com/news/china-restricting-tesla-use-uncovers-a-significant-challenge-for-elon-musk-expert-161921664.html"
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p') # Find all paragraphs

In [4]:
paragraphs[0].text

'Renewed political tensions between the U.S. and China — which came to light this week as the Biden administration sat down with their Chinese counterparts for the first time to discuss a range of issues — could ensnarl vehicle maker Tesla (TSLA), which has pushed successfully into China in recent years.'

In [5]:
text = [paragraph.text for paragraph in paragraphs] # Strip the whole article to text format
words = ' '.join(text).split(' ')[:400] #First 400 words, current limit on summarisation model.
ARTICLE = ' '.join(words) # Join the 400 words back to make a shorter article

In [6]:
ARTICLE

'Renewed political tensions between the U.S. and China — which came to light this week as the Biden administration sat down with their Chinese counterparts for the first time to discuss a range of issues — could ensnarl vehicle maker Tesla (TSLA), which has pushed successfully into China in recent years. In fact, the heightened rhetoric between the two super economic superpowers may have already had blowback on Elon Musk\'s electric car company. The Chinese government is restricting the use of Tesla vehicles by military staff and employees of vital state-owned companies, The Wall Street Journal reported Friday. Chinese officials reportedly have concerns that Tesla\'s cars — outfitted with various data collecting capabilities — could serve as leakers of national security secrets. "I have been saying for months now that Tesla\'s level of integration of their business, of their research, of their sort of geo-tracking for which there is integration with Chinese maps, their development of A

In [7]:
input_ids = tokenizer.encode(ARTICLE, return_tensors='pt') # En coding aricle to tensorflow ID's
output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True) # Generate the summary
summary = tokenizer.decode(output[0], skip_special_tokens=True) # De code the summary back to text format

In [8]:
summary

'China restricting use of electric cars by military. Tesla has been successful in China in recent years'

#### 4. Building a News and Sentiment Pipeline

In [9]:
monitored_tickers = ['GME', 'TSLA', 'AAPL']

#### 4.1. Search for Stock News using Google and Yahoo Finance

In [10]:
import re

In [11]:
def search_for_stock_news_urls(ticker):
    search_url = f"https://finance.yahoo.com/quote/{ticker}/news?p={ticker}"
    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
    r = requests.get(search_url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    atags = soup.find_all('a')
    hrefs = [link['href'] for link in atags]

    # Strip out unwanted URLs
    pattern = re.compile(r'(/news/|/video/|/m/).*\.html')

    # Extract and print the matched URLs with the desired prefix
    filtered_urls = [f'https://finance.yahoo.com{url}' for url in hrefs if pattern.match(url)]
    return filtered_urls

In [12]:
search_for_stock_news_urls('TSLA')

['https://finance.yahoo.com/news/elon-musks-x-sideshow-is-subtly-hurting-tesla-110444713.html',
 'https://finance.yahoo.com/news/dow-trails-sp-500-by-most-since-2000-183654520.html',
 'https://finance.yahoo.com/m/46633595-f02d-3ed2-8dad-1b363e31c1a1/tesla-lower-as-bernstein.html']

#### 4.2. Search and Scrape Cleaned URLs

In [13]:
cleaned_urls = {ticker: search_for_stock_news_urls(ticker) for ticker in monitored_tickers}

In [14]:
def scrape_and_process(URLs):
    ARTICLES = []
    for url in URLs: 
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = [paragraph.text for paragraph in paragraphs]
        words = ' '.join(text).split(' ')[:350]
        ARTICLE = ' '.join(words)
        ARTICLES.append(ARTICLE)
    return ARTICLES

In [15]:
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

{'GME': ['On December 6th, GameStop Corporation (NYSE: GME) missed revenue expectations but topped earnings estimates with its fiscal third quarter results as it continues to face intense competition from even the e-commerce titan, Amazon.com Inc (NASDAQ: AMZN), and other mass merchants to which it\xa0is losing market share. But once investors digested the results, GameStop shares\xa0shrugged\xa0off early declines. For the quarter ended on October 28th, GameStop reported revenue dropped 9% YoY of\xa0$1.078\xa0billion, coming short of analysts’ estimate of\xa0$1.186 billion. GameStop reported a\xa0net loss of $3.1 million, or 1 cent a share, improving from last year’s comparable quarter when net loss amounted to $94.7 million, or 31 cents a share. On an adjusted-per-share basis, GameStop broke even, which is better than the 8 cent loss that\xa0FactSet\xa0estimated. But, this achievement is mainly the result of\xa0aggressive cost cuts, including European store closures and lowered sellin

#### 4.3. Summarise all Articles

In [16]:
def summarize(articles):
    summaries = []
    for article in articles:
        input_ids = tokenizer.encode(article, return_tensors='pt')
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

In [17]:
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'GME': ['Video game retailer continues to face intense competition. Company announced two corporate investment plan changes',
  'Video game retailer says cost-cutting measures are paying off.',
  'Video-game retailer reported a smaller-than-expected loss.',
  'Gemini is Google’s most advanced artificial intelligence model.',
  'Higher inflationary pressures on consumers’ spending have weighed on the company. Soft sales weighed on the company’s results'],
 'TSLA': ['Musk calls for Disney CEO to be fired. Tesla shares are up more than 20% this year',
  'The Dow is up just shy of 9% this year, while the S&P 500 is up 19%. History suggests investors are over-extended in tech stocks',
  'Shares set to fall as much as 20% on weak demand.'],
 'AAPL': ['We are aware of the issue and are working to resolve it.',
  'The Dow is up just shy of 9% this year, while the S&P 500 is up 19%. History suggests investors are over-extended in tech stocks',
  'Study points to rise in ransomware attacks as g

In [18]:
summaries['TSLA']

['Musk calls for Disney CEO to be fired. Tesla shares are up more than 20% this year',
 'The Dow is up just shy of 9% this year, while the S&P 500 is up 19%. History suggests investors are over-extended in tech stocks',
 'Shares set to fall as much as 20% on weak demand.']

#### 5. Adding Sentiment Analysis

In [19]:
from transformers import pipeline
sentiment = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [20]:
sentiment(summaries['TSLA'])

[{'label': 'NEGATIVE', 'score': 0.685401439666748},
 {'label': 'NEGATIVE', 'score': 0.9975034594535828},
 {'label': 'NEGATIVE', 'score': 0.9996259212493896}]

In [21]:
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

{'GME': [{'label': 'POSITIVE', 'score': 0.8374872207641602},
  {'label': 'NEGATIVE', 'score': 0.8295697569847107},
  {'label': 'NEGATIVE', 'score': 0.999704897403717},
  {'label': 'POSITIVE', 'score': 0.9962210655212402},
  {'label': 'NEGATIVE', 'score': 0.9859585165977478}],
 'TSLA': [{'label': 'NEGATIVE', 'score': 0.685401439666748},
  {'label': 'NEGATIVE', 'score': 0.9975034594535828},
  {'label': 'NEGATIVE', 'score': 0.9996259212493896}],
 'AAPL': [{'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9975034594535828},
  {'label': 'NEGATIVE', 'score': 0.9773145318031311},
  {'label': 'NEGATIVE', 'score': 0.9287741184234619},
  {'label': 'NEGATIVE', 'score': 0.9976041913032532}]}

In [22]:
print(summaries['AAPL'][4], scores['AAPL'][4]['label'], scores['AAPL'][4]['score'])

Carillon Clarivest Capital Appreciation Fund was most underweight in communication and consumer discretionary services. NEGATIVE 0.9976041913032532


#### 6. Exporting Results to CSV

In [23]:
summaries

{'GME': ['Video game retailer continues to face intense competition. Company announced two corporate investment plan changes',
  'Video game retailer says cost-cutting measures are paying off.',
  'Video-game retailer reported a smaller-than-expected loss.',
  'Gemini is Google’s most advanced artificial intelligence model.',
  'Higher inflationary pressures on consumers’ spending have weighed on the company. Soft sales weighed on the company’s results'],
 'TSLA': ['Musk calls for Disney CEO to be fired. Tesla shares are up more than 20% this year',
  'The Dow is up just shy of 9% this year, while the S&P 500 is up 19%. History suggests investors are over-extended in tech stocks',
  'Shares set to fall as much as 20% on weak demand.'],
 'AAPL': ['We are aware of the issue and are working to resolve it.',
  'The Dow is up just shy of 9% this year, while the S&P 500 is up 19%. History suggests investors are over-extended in tech stocks',
  'Study points to rise in ransomware attacks as g

In [24]:
scores

{'GME': [{'label': 'POSITIVE', 'score': 0.8374872207641602},
  {'label': 'NEGATIVE', 'score': 0.8295697569847107},
  {'label': 'NEGATIVE', 'score': 0.999704897403717},
  {'label': 'POSITIVE', 'score': 0.9962210655212402},
  {'label': 'NEGATIVE', 'score': 0.9859585165977478}],
 'TSLA': [{'label': 'NEGATIVE', 'score': 0.685401439666748},
  {'label': 'NEGATIVE', 'score': 0.9975034594535828},
  {'label': 'NEGATIVE', 'score': 0.9996259212493896}],
 'AAPL': [{'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9975034594535828},
  {'label': 'NEGATIVE', 'score': 0.9773145318031311},
  {'label': 'NEGATIVE', 'score': 0.9287741184234619},
  {'label': 'NEGATIVE', 'score': 0.9976041913032532}]}

In [25]:
cleaned_url

NameError: name 'cleaned_url' is not defined

In [None]:
def create_output_array(summaries, scores, urls):
    output = []
    for ticker in monitored_tickers:
        for counter in range(len(summaries[ticker])):
            output_this = [
                ticker,
                summaries[ticker][counter],
                scores[ticker][counter]['label'],
                scores[ticker][counter]['score'],
                urls[ticker][counter]

            ]
            output.append(output_this)
    return output

In [None]:
final_output = create_output_array(summaries, scores, cleaned_urls)

In [None]:
final_output.insert(0, ['Ticker', 'Summary', 'Label', 'Confidence', 'URL'])

In [None]:
import csv
with open('assetsummaries.csv', mode='w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting = csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)

In [26]:
import os

cache_directory = os.path.expanduser("~/.cache/huggingface")
cache_directory = os.path.abspath(cache_directory)

print("Hugging Face Transformers Cache Directory:", cache_directory)


Hugging Face Transformers Cache Directory: C:\Users\User\.cache\huggingface
