# 1. Install and Import Baseline Dependencies

In [1]:
!pip install transformers
!pip install sentencepiece



In [2]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

# 2. Setup Summarization Model

In [5]:
tokenizer = PegasusTokenizer.from_pretrained("human-centered-summarization/financial-summarization-pegasus")
model = PegasusForConditionalGeneration.from_pretrained("human-centered-summarization/financial-summarization-pegasus")

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("human-centered-summarization/financial-summarization-pegasus")

model = AutoModelForSeq2SeqLM.from_pretrained("human-centered-summarization/financial-summarization-pegasus")

# 3. Summarize a Single Article

In [28]:
url = "https://www.financialexpress.com/market/elon-musk-sells-tesla-shares-worth-4-billion-says-no-more-tesla-stock-sale-planned/2507392/"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [29]:
paragraphs[0].text

'\nReference #18.6a822c31.1651311964.9415d68\n'

In [30]:
text = [paragraph.text for paragraph in paragraphs]
words = ' '.join(text).split(' ')[:400]
ARTICLE = ' '.join(words)

In [31]:
ARTICLE

'\nReference #18.6a822c31.1651311964.9415d68\n'

In [32]:
input_ids = tokenizer.encode(ARTICLE, return_tensors='pt')
output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [34]:
summary

'U.S. Department of Defense website: www.dec.army.mil.'

# 4. Building a News and Sentiment Pipeline

In [35]:
monitored_tickers = ['GME', 'TSLA', 'BTC']

## 4.1. Search for Stock News using Google and Yahoo Finance

In [36]:
def search_for_stock_news_urls(ticker):
    search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(ticker)
    r = requests.get(search_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    atags = soup.find_all('a')
    hrefs = [link['href'] for link in atags]
    return hrefs 

In [37]:
raw_urls = {ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}
raw_urls

{'GME': ['/?sa=X&ved=0ahUKEwj668nNwLv3AhUJpJUCHWj1B_EQOwgC',
  '/?output=search&ie=UTF-8&tbm=nws&sa=X&ved=0ahUKEwj668nNwLv3AhUJpJUCHWj1B_EQPAgE',
  '/search?q=yahoo+finance+GME&tbm=nws&ie=UTF-8&gbv=1&sei=iAVtYvqHMInI1sQP6OqfiA8',
  '/search?q=yahoo+finance+GME&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwj668nNwLv3AhUJpJUCHWj1B_EQ_AUIBygA',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwj668nNwLv3AhUJpJUCHWj1B_EQ_AUICSgC',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwj668nNwLv3AhUJpJUCHWj1B_EQ_AUICigD',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwj668nNwLv3AhUJpJUCHWj1B_EQ_AUICygE',
  'https://maps.google.com/maps?q=yahoo+finance+GME&um=1&ie=UTF-8&sa=X&ved=0ahUKEwj668nNwLv3AhUJpJUCHWj1B_EQ_AUIDCgF',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwj668nNwLv3AhUJpJUCHWj1B_EQ_AUIDSgG',
  '/advanced_search',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&s

In [38]:
raw_urls['GME']

['/?sa=X&ved=0ahUKEwj668nNwLv3AhUJpJUCHWj1B_EQOwgC',
 '/?output=search&ie=UTF-8&tbm=nws&sa=X&ved=0ahUKEwj668nNwLv3AhUJpJUCHWj1B_EQPAgE',
 '/search?q=yahoo+finance+GME&tbm=nws&ie=UTF-8&gbv=1&sei=iAVtYvqHMInI1sQP6OqfiA8',
 '/search?q=yahoo+finance+GME&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwj668nNwLv3AhUJpJUCHWj1B_EQ_AUIBygA',
 '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwj668nNwLv3AhUJpJUCHWj1B_EQ_AUICSgC',
 '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwj668nNwLv3AhUJpJUCHWj1B_EQ_AUICigD',
 '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwj668nNwLv3AhUJpJUCHWj1B_EQ_AUICygE',
 'https://maps.google.com/maps?q=yahoo+finance+GME&um=1&ie=UTF-8&sa=X&ved=0ahUKEwj668nNwLv3AhUJpJUCHWj1B_EQ_AUIDCgF',
 '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwj668nNwLv3AhUJpJUCHWj1B_EQ_AUIDSgG',
 '/advanced_search',
 '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&sa=X&ved=0ahUKEwj66

## 4.2. Strip out unwanted URLs

In [39]:
import re

In [40]:
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

In [41]:
def strip_unwanted_urls(urls, exclude_list):
    val = []
    for url in urls: 
        if 'https://' in url and not any(exclude_word in url for exclude_word in exclude_list):
            res = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
            val.append(res)
    return list(set(val))

In [42]:
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
cleaned_urls

{'GME': ['https://finance.yahoo.com/news/gamestop-gme-stock-sinks-market-220010614.html',
  'https://finance.yahoo.com/video/taxes-2022-know-reporting-meme-161025450.html',
  'https://finance.yahoo.com/news/taxes-2022-reporting-meme-stock-gains-and-losses-144648811.html',
  'https://finance.yahoo.com/news/bear-day-gamestop-gme-110011956.html',
  'https://finance.yahoo.com/news/nvidia-gamestop-highlighted-zacks-bull-125212967.html',
  'https://finance.yahoo.com/news/u-orthopaedic-partners-announces-leadership-174300930.html',
  'https://finance.yahoo.com/news/zacks-market-edge-highlights-gamestop-100610638.html',
  'https://finance.yahoo.com/video/bed-bath-beyond-earnings-going-155741973.html',
  'https://finance.yahoo.com/news/the-meme-stock-craze-is-pretty-bro-ey-sallie-krawcheck-175923007.html',
  'https://investorplace.com/2022/04/gme-stock-gamestop-requires-a-tactical-approach-moving-forward/'],
 'TSLA': ['https://finance.yahoo.com/longtime-tesla-rival-henrik-fisker-223000049.html'

## 4.3. Search and Scrape Cleaned URLs

In [43]:
def scrape_and_process(URLs):
    ARTICLES = []
    for url in URLs: 
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = [paragraph.text for paragraph in paragraphs]
        words = ' '.join(text).split(' ')[:350]
        ARTICLE = ' '.join(words)
        ARTICLES.append(ARTICLE)
    return ARTICLES

In [44]:
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

{'GME': ["GameStop (GME) closed at $129.31 in the latest trading session, marking a -0.41% move from the prior day. This change lagged the S&P 500's 2.48% gain on the day. Elsewhere, the Dow gained 1.85%, while the tech-heavy Nasdaq lost 0.09%. Heading into today, shares of the video game retailer had lost 22.18% over the past month, lagging the Retail-Wholesale sector's loss of 8.14% and the S&P 500's loss of 7.81% in that time. Investors will be hoping for strength from GameStop as it approaches its next earnings release. The company is expected to report EPS of -$1.37, down 204.44% from the prior-year quarter. Meanwhile, the Zacks Consensus Estimate for revenue is projecting net sales of $1.35 billion, up 5.95% from the year-ago period. Looking at the full year, our Zacks Consensus Estimates suggest analysts are expecting earnings of -$4.93 per share and revenue of $6.38 billion. These totals would mark changes of -8.11% and +6.22%, respectively, from last year. It is also important

In [45]:
articles['TSLA'][2]

'When Tesla (TSLA) reports earnings on Wednesday, all eyes won\'t be on whether Elon Musk comments on his bid for Twitter on the earnings call (or if he is even on the call) but rather how big of a hit the company will experience due to a Shanghai factory production shutdown. "With Berlin and Austin key factories now online and producing Model Y\'s in a quickly ramping pace, the main question for tomorrow is just how bad the China production issues are and what that means for deliveries in 2Q and the rest of the year," said Wedbush analyst Dan Ives. Ives thinks the hit to Tesla could be sizable. "We estimate that roughly 50k units are now reduced for the June quarter for starters given the last three weeks of shutdown and depending on how aggressively Tesla can ramp back production could be impacted further over the next month. Musk & Co. are in a tough spot, as there are so many variables around 2Q China production that will certainly weigh on guidance for the rest of the year and thu

## 4.4. Summarise all Articles

In [46]:
def summarize(articles):
    summaries = []
    for article in articles:
        input_ids = tokenizer.encode(article, return_tensors='pt')
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

In [48]:
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'GME': ['Video game retailer has lost 22.18% over the past month.',
  'We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.',
  'I am not suggesting you short GME, but rather advocating profit-pulling.',
  'NVDA, GME, MOS, and Andersons are among stocks featured by Equity Research.',
  'Kevin Navas named Chief Operating Officer, Rhonda Gibby named Chief Human Resources Officer.',
  'Stock Strategist, Bryan Hayes, joins the show to discuss speculative stocks.',
  'We are aware of the issue and are working to resolve it.',
  'Men were more likely to trade in meme stock craze last year. Trading in viral stocks last January was much more common among men',
  'GME stock is 400% higher than in 2015 when the business was stronger.'],
 'TSLA': ['Social media platform no longer exists for Fisker. Rivals have long been adversaries',
  'Electric-car maker is poised to erase losses for the year. Still, Tesla is more expensive than 

In [49]:
summaries['BTC']

['We are aware of the issue and are working to resolve it.',
 'Institutional interest in Bitcoin is rife. Tech giants like Tesla, Block, and MicroStrategy are investing in crypto',
 'Bill would allow citizens, banks, and legal entities to use cryptos. Latin American country has a reputation as a tax haven',
 'We are aware of the issue and are working to resolve it.',
 'Production and Mining Operations Update for the first quarter ended March 31, 2022.',
 'Lolli lets people earn up to 12% back in bitcoin on purchases. Co-founder and CEO of Bitcoin rewards company Lolli',
 'Kwon’s stablecoin is one of the biggest Bitcoin backers. But crypto critics say it’s too good to be true',
 'New batch of Bitmain Antminer S19j Pro mining machines to be installed at Company’s Washington facility. Additional 4,350 machines to be delivered in batches each month through the end of the year',
 'Largest cryptocurrency has been trading above $40,400 this week. Buyers interest at 10-month low on major excha

# 5. Adding Sentiment Analysis

In [56]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("siebert/sentiment-roberta-large-english")

model = AutoModelForSequenceClassification.from_pretrained("siebert/sentiment-roberta-large-english")

Downloading:   0%|          | 0.00/256 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/687 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

In [58]:
from transformers import pipeline
sentiment = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [59]:
sentiment(summaries['BTC'])

[{'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'POSITIVE', 'score': 0.987434446811676},
 {'label': 'NEGATIVE', 'score': 0.8728227019309998},
 {'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'NEGATIVE', 'score': 0.6467002034187317},
 {'label': 'NEGATIVE', 'score': 0.943663477897644},
 {'label': 'POSITIVE', 'score': 0.9637777805328369},
 {'label': 'NEGATIVE', 'score': 0.9783515334129333},
 {'label': 'NEGATIVE', 'score': 0.9817093014717102},
 {'label': 'NEGATIVE', 'score': 0.50108802318573}]

In [60]:
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

{'GME': [{'label': 'NEGATIVE', 'score': 0.9996449947357178},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.689195990562439},
  {'label': 'POSITIVE', 'score': 0.9907901287078857},
  {'label': 'POSITIVE', 'score': 0.988284707069397},
  {'label': 'POSITIVE', 'score': 0.9957115650177002},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9899381399154663},
  {'label': 'POSITIVE', 'score': 0.9967945218086243}],
 'TSLA': [{'label': 'NEGATIVE', 'score': 0.9990410208702087},
  {'label': 'NEGATIVE', 'score': 0.978550910949707},
  {'label': 'NEGATIVE', 'score': 0.939998209476471},
  {'label': 'NEGATIVE', 'score': 0.9701682329177856},
  {'label': 'NEGATIVE', 'score': 0.9992271661758423},
  {'label': 'NEGATIVE', 'score': 0.9938125610351562},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9458198547363281},
  {'label': 'POS

In [61]:
print(summaries['GME'][3], scores['GME'][3]['label'], scores['GME'][3]['score'])

I am not suggesting you short GME, but rather advocating profit-pulling. NEGATIVE 0.689195990562439


In [62]:
scores['BTC'][0]['score']

0.9979088306427002

# 6. Exporting Results to CSV

In [63]:
summaries

{'GME': ['Video game retailer has lost 22.18% over the past month.',
  'We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.',
  'I am not suggesting you short GME, but rather advocating profit-pulling.',
  'NVDA, GME, MOS, and Andersons are among stocks featured by Equity Research.',
  'Kevin Navas named Chief Operating Officer, Rhonda Gibby named Chief Human Resources Officer.',
  'Stock Strategist, Bryan Hayes, joins the show to discuss speculative stocks.',
  'We are aware of the issue and are working to resolve it.',
  'Men were more likely to trade in meme stock craze last year. Trading in viral stocks last January was much more common among men',
  'GME stock is 400% higher than in 2015 when the business was stronger.'],
 'TSLA': ['Social media platform no longer exists for Fisker. Rivals have long been adversaries',
  'Electric-car maker is poised to erase losses for the year. Still, Tesla is more expensive than 

In [64]:
scores

{'GME': [{'label': 'NEGATIVE', 'score': 0.9996449947357178},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.689195990562439},
  {'label': 'POSITIVE', 'score': 0.9907901287078857},
  {'label': 'POSITIVE', 'score': 0.988284707069397},
  {'label': 'POSITIVE', 'score': 0.9957115650177002},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9899381399154663},
  {'label': 'POSITIVE', 'score': 0.9967945218086243}],
 'TSLA': [{'label': 'NEGATIVE', 'score': 0.9990410208702087},
  {'label': 'NEGATIVE', 'score': 0.978550910949707},
  {'label': 'NEGATIVE', 'score': 0.939998209476471},
  {'label': 'NEGATIVE', 'score': 0.9701682329177856},
  {'label': 'NEGATIVE', 'score': 0.9992271661758423},
  {'label': 'NEGATIVE', 'score': 0.9938125610351562},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9458198547363281},
  {'label': 'POS

In [65]:
cleaned_urls

{'GME': ['https://finance.yahoo.com/news/gamestop-gme-stock-sinks-market-220010614.html',
  'https://finance.yahoo.com/video/taxes-2022-know-reporting-meme-161025450.html',
  'https://finance.yahoo.com/news/taxes-2022-reporting-meme-stock-gains-and-losses-144648811.html',
  'https://finance.yahoo.com/news/bear-day-gamestop-gme-110011956.html',
  'https://finance.yahoo.com/news/nvidia-gamestop-highlighted-zacks-bull-125212967.html',
  'https://finance.yahoo.com/news/u-orthopaedic-partners-announces-leadership-174300930.html',
  'https://finance.yahoo.com/news/zacks-market-edge-highlights-gamestop-100610638.html',
  'https://finance.yahoo.com/video/bed-bath-beyond-earnings-going-155741973.html',
  'https://finance.yahoo.com/news/the-meme-stock-craze-is-pretty-bro-ey-sallie-krawcheck-175923007.html',
  'https://investorplace.com/2022/04/gme-stock-gamestop-requires-a-tactical-approach-moving-forward/'],
 'TSLA': ['https://finance.yahoo.com/longtime-tesla-rival-henrik-fisker-223000049.html'

In [66]:
range(len(summaries['GME']))

range(0, 10)

In [67]:
summaries['GME'][3]

'I am not suggesting you short GME, but rather advocating profit-pulling.'

In [68]:
def create_output_array(summaries, scores, urls):
    output = []
    for ticker in monitored_tickers:
        for counter in range(len(summaries[ticker])):
            output_this = [
                ticker,
                summaries[ticker][counter],
                scores[ticker][counter]['label'],
                scores[ticker][counter]['score'],
                urls[ticker][counter]
            ]
            output.append(output_this)
    return output

In [69]:
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

[['GME',
  'Video game retailer has lost 22.18% over the past month.',
  'NEGATIVE',
  0.9996449947357178,
  'https://finance.yahoo.com/news/gamestop-gme-stock-sinks-market-220010614.html'],
 ['GME',
  'We are aware of the issue and are working to resolve it.',
  'POSITIVE',
  0.9979088306427002,
  'https://finance.yahoo.com/video/taxes-2022-know-reporting-meme-161025450.html'],
 ['GME',
  'We are aware of the issue and are working to resolve it.',
  'POSITIVE',
  0.9979088306427002,
  'https://finance.yahoo.com/news/taxes-2022-reporting-meme-stock-gains-and-losses-144648811.html'],
 ['GME',
  'I am not suggesting you short GME, but rather advocating profit-pulling.',
  'NEGATIVE',
  0.689195990562439,
  'https://finance.yahoo.com/news/bear-day-gamestop-gme-110011956.html'],
 ['GME',
  'NVDA, GME, MOS, and Andersons are among stocks featured by Equity Research.',
  'POSITIVE',
  0.9907901287078857,
  'https://finance.yahoo.com/news/nvidia-gamestop-highlighted-zacks-bull-125212967.html'

In [70]:
final_output.insert(0, ['Ticker', 'Summary', 'Label', 'Confidence', 'URL'])

In [71]:
final_output

[['Ticker', 'Summary', 'Label', 'Confidence', 'URL'],
 ['GME',
  'Video game retailer has lost 22.18% over the past month.',
  'NEGATIVE',
  0.9996449947357178,
  'https://finance.yahoo.com/news/gamestop-gme-stock-sinks-market-220010614.html'],
 ['GME',
  'We are aware of the issue and are working to resolve it.',
  'POSITIVE',
  0.9979088306427002,
  'https://finance.yahoo.com/video/taxes-2022-know-reporting-meme-161025450.html'],
 ['GME',
  'We are aware of the issue and are working to resolve it.',
  'POSITIVE',
  0.9979088306427002,
  'https://finance.yahoo.com/news/taxes-2022-reporting-meme-stock-gains-and-losses-144648811.html'],
 ['GME',
  'I am not suggesting you short GME, but rather advocating profit-pulling.',
  'NEGATIVE',
  0.689195990562439,
  'https://finance.yahoo.com/news/bear-day-gamestop-gme-110011956.html'],
 ['GME',
  'NVDA, GME, MOS, and Andersons are among stocks featured by Equity Research.',
  'POSITIVE',
  0.9907901287078857,
  'https://finance.yahoo.com/news/

In [72]:
import csv
with open('assetsummaries.csv', mode='w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)