In [4]:
!pip install transformers



In [7]:
pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 1.2 MB/s eta 0:00:01
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96
Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install torch==1.9.1+cpu torchvision==0.10.1+cpu torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.9.1+cpu
  Downloading https://download.pytorch.org/whl/cpu/torch-1.9.1%2Bcpu-cp38-cp38-linux_x86_64.whl (175.4 MB)
[K     |████████████████████████████████| 175.4 MB 23 kB/s  eta 0:00:01    |████▋                           | 25.5 MB 3.2 MB/s eta 0:00:47     |██████████████▊                 | 80.9 MB 2.9 MB/s eta 0:00:34     |███████████████████████████▌    | 150.6 MB 5.5 MB/s eta 0:00:05
[?25hCollecting torchvision==0.10.1+cpu
  Downloading https://download.pytorch.org/whl/cpu/torchvision-0.10.1%2Bcpu-cp38-cp38-linux_x86_64.whl (15.7 MB)
[K     |████████████████████████████████| 15.7 MB 2.7 MB/s eta 0:00:01    |████████████████▏               | 7.9 MB 2.6 MB/s eta 0:00:04
[?25hCollecting torchaudio==0.9.1
  Downloading torchaudio-0.9.1-cp38-cp38-manylinux1_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 1.2 MB/s eta 0:00:01
Installing collected packages: torch, torchvision

# 1. Setup Sumarization Model

In [1]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

2021-10-01 00:54:29.353391: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-01 00:54:29.353407: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

# 2. Testing summarize a single article
https://finance.yahoo.com/news/bitcoin-price-prediction-bulls-eye-114742508.html

In [20]:
url = "https://finance.yahoo.com/news/bitcoin-price-prediction-bulls-eye-114742508.html"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [22]:
paragraphs[1].text

'After a bullish day for Bitcoin and the majors on Wednesday, it’s been a broadly bullish Wednesday morning.'

In [37]:
text = [paragraph.text for paragraph in paragraphs]
words = ' '.join(text).split(' ')[:350]
ARTICLE = ' '.join(words)

In [38]:
ARTICLE

' After a bullish day for Bitcoin and the majors on Wednesday, it’s been a broadly bullish Wednesday morning. At the time of writing, Bitcoin, BTC to USD, was up by 3.75% to $43,086.1. A mixed start to the day saw Bitcoin fall to an early morning low $41,432.0 before making a move. Steering clear of the first major support level at $40,687, Bitcoin rallied to a mid-morning current day high $43,860. Bitcoin broke through the first major resistance level at $42,489 and the 38.2% FIB of $41,592. The early rally saw Bitcoin also break through the second major resistance level at $43,449 before revising sub-$43,000 levels. It has been a mixed morning for the broader crypto market. At the time of writing, Polkadot was down by 1.61% to buck the morning trend. It’s been a bullish morning for the rest of the majors, however. Chainlink (+3.92%), Ethereum (+3.81%), and Litecoin (+4.11%) set the morning pace. Binance Coin (+1.39%), Bitcoin Cash SV (+2.04%), Cardano’s ADA (+0.84%), Crypto.com Coin 

In [39]:
input_ids = tokenizer.encode(ARTICLE, return_tensors='pt')

In [43]:
output = model.generate(
    input_ids, 
    max_length=55, 
    num_beams=5, 
    early_stopping=True
)

In [44]:
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [45]:
summary

'Largest cryptocurrency has rallied to a mid-morning high. Chainlink, Ethereum, and Litecoin set the morning pace'

# 3. Build Pipeline

In [47]:
monitored_tickers = ['GME', 'TSLA', 'BTC']

## 3.1. Search for news using Google & Yahoo Finance

In [46]:
def search_for_stock_news_urls(ticker):
    search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(ticker)
    r = requests.get(search_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    atags = soup.find_all('a')
    hrefs = [link['href'] for link in atags]
    return hrefs

In [48]:
raw_urls = {ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}
raw_urls

{'GME': ['/?sa=X&ved=0ahUKEwia2_fAqafzAhUHwosBHelaDHAQOwgC',
  '/?output=search&ie=UTF-8&tbm=nws&sa=X&ved=0ahUKEwia2_fAqafzAhUHwosBHelaDHAQPAgE',
  '/search?q=yahoo+finance+GME&tbm=nws&ie=UTF-8&gbv=1&sei=ewFWYdqxG4eEr7wP6bWxgAc',
  '/search?q=yahoo+finance+GME&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwia2_fAqafzAhUHwosBHelaDHAQ_AUIBygA',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwia2_fAqafzAhUHwosBHelaDHAQ_AUICSgC',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwia2_fAqafzAhUHwosBHelaDHAQ_AUICigD',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwia2_fAqafzAhUHwosBHelaDHAQ_AUICygE',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwia2_fAqafzAhUHwosBHelaDHAQ_AUIDCgF',
  '/advanced_search',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=nws&source=lnt&tbs=lr:lang_1vi&lr=lang_vi&sa=X&ved=0ahUKEwia2_fAqafzAhUHwosBHelaDHAQpwUIDg',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=nws&sour

In [49]:
raw_urls['BTC']

['/?sa=X&ved=0ahUKEwjC6L7BqafzAhUsIqYKHSmxBo0QOwgC',
 '/?output=search&ie=UTF-8&tbm=nws&sa=X&ved=0ahUKEwjC6L7BqafzAhUsIqYKHSmxBo0QPAgE',
 '/search?q=yahoo+finance+BTC&tbm=nws&ie=UTF-8&gbv=1&sei=fAFWYcK6JazEmAWp4proCA',
 '/search?q=yahoo+finance+BTC&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwjC6L7BqafzAhUsIqYKHSmxBo0Q_AUIBygA',
 '/search?q=yahoo+finance+BTC&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwjC6L7BqafzAhUsIqYKHSmxBo0Q_AUICSgC',
 '/search?q=yahoo+finance+BTC&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwjC6L7BqafzAhUsIqYKHSmxBo0Q_AUICigD',
 '/search?q=yahoo+finance+BTC&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwjC6L7BqafzAhUsIqYKHSmxBo0Q_AUICygE',
 '/search?q=yahoo+finance+BTC&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwjC6L7BqafzAhUsIqYKHSmxBo0Q_AUIDCgF',
 '/advanced_search',
 '/search?q=yahoo+finance+BTC&ie=UTF-8&tbm=nws&source=lnt&tbs=lr:lang_1vi&lr=lang_vi&sa=X&ved=0ahUKEwjC6L7BqafzAhUsIqYKHSmxBo0QpwUIDg',
 '/search?q=yahoo+finance+BTC&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&sa

## 3.2. Remove unwanted URLS

In [50]:
import re

In [51]:
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

In [52]:
def strip_unwanted_urls(urls, exclude_list):
    val = []
    for url in urls: 
        if 'https://' in url and not any(exclude_word in url for exclude_word in exclude_list):
            res = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
            val.append(res)
    return list(set(val))

In [53]:
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
cleaned_urls

{'GME': ['https://finance.yahoo.com/news/game-stop-stock-is-enduring-an-awful-september-180120497.html',
  'https://finance.yahoo.com/news/significant-role-of-retail-investing-here-to-stay-suzanne-shank-164155648.html',
  'https://finance.yahoo.com/news/15-best-short-squeeze-stocks-140610450.html',
  'https://finance.yahoo.com/news/why-rf-capital-management-continues-164430403.html',
  'https://finance.yahoo.com/news/gamestop-gme-hire-500-staff-135001043.html',
  'https://finance.yahoo.com/news/gamestop-nyse-gme-earnings-120000520.html',
  'https://ca.finance.yahoo.com/news/amc-gme-stocks-citadel-drama-123112651.html',
  'https://finance.yahoo.com/news/gamestop-reports-financial-results-q2-200500919.html',
  'https://finance.yahoo.com/news/disheartening-gamestop-gme-earnings-r-212009089.html',
  'https://finance.yahoo.com/news/jim-simons-quant-hedge-fund-171007244.html'],
 'TSLA': ['https://finance.yahoo.com/news/tesla-loses-china-fraud-case-134417684.html',
  'https://finance.yahoo.co

## 3.3. Search and Scrape Cleaned URLS

In [54]:
def scrape_and_process(URLs):
    ARTICLES = []
    for url in URLs: 
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = [paragraph.text for paragraph in paragraphs]
        words = ' '.join(text).split(' ')[:350]
        ARTICLE = ' '.join(words)
        ARTICLES.append(ARTICLE)
    return ARTICLES

In [55]:
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

{'GME': [' Not even speculation of a partnership with fellow meme stock trader favorite AMC Entertainment has been enough to arrest the selling pressure in shares of one-time 2021 market darling GameStop (GME).\xa0 Shares of the gaming retailer turned self-described tech company have dropped 12% so far in September, far worse than the 2% respective drops in the S&P 500 and Nasdaq Composite. GameStop\'s stock is also among the laggards of the popular meme stock group this month — AMC\'s stock has gained 8.5% and Clover Health has tacked on nearly 5%. BlackBerry\'s stock is down 8% on the month.\xa0 GameStop shares are down 46% since hitting a record high on Jan. 27. The pullback in GameStop stock — at least as far as September goes — coincides with several factors.\xa0 First, the stock appears to be receiving less love on the WallStreetBets Reddit platform that made it an icon of retail investors to kick off the year.\xa0 The stock has seen plunging popularity among Redditors since mid-

In [57]:
articles['BTC'][3]

'Following a huge surge in cryptocurrency prices this year, crypto is going through a consolidation period, Galaxy Digital Holdings Ltd (Pink: BRPHF) CEO Mike Novogratz said Wednesday on CNBC\'s "Squawk Box." Levels To Watch: $40,000 is an important level for Bitcoin (CRYPTO: BTC) to hold, Novogratz said, adding that if it falls below $40,000, $38,000 will be the next level to watch. For Ethereum (CRYPTO: ETH), the $2,800 level "seems to be an important level to hold." Novogratz noted the two most popular cryptocurrencies have traded within those ranges during the consolidation period. Related Link: Why Bitcoin Price Action Below K Could Be \'Fast And Volatile\' What\'s Next: Novogratz told CNBC he is still anticipating a surge in cryptocurrency prices before the end of the year. "My guess is it\'s not until halfway to a third of the way through the fourth quarter that you see the next surge," he said. The asset classes that perform the best throughout the year tend to be the ones that

## 3.4. Summarise all Articles

In [58]:
def summarize(articles):
    summaries = []
    for article in articles:
        input_ids = tokenizer.encode(article, return_tensors='pt')
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

In [59]:
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'GME': ['Shares of the self-described tech company are down 12% so far in September. One could be punishing meme trader crowd for lack of clarity',
  "‘Retail factor is here to stay,' says Siebert Williams Shank. Off-exchange trading made up 47.2% of equity volume in January 2021",
  'Hedge funds have lost billions of dollars in the short squeeze battle. Retail investors are slowly transforming market dynamics',
  'RF Capital Management published its second-quarter 2021 investor letter. GME returned 977.81% since the beginning of the year',
  'Plans to hire nearly 500 employees at new customer service center in Pembroke Pines, FL. Company has been undertaking radical digital transformation efforts',
  'Shares dip more than 10% in after-hours trading despite strong report. Management says they’re investing in long-term growth initiatives',
  'Market maker Citadel Securities made a series of posts on Twitter. Claim that CEO Ken Griffin had never spoken to Vlad Tenev',
  'Second quarter 

In [60]:
summaries['BTC']

['We are aware of the issue and are working to resolve it.',
 'Anthony Pompliano sees China crackdown as positive for U.S. economy. Chainalysis says East Asia already saw a decline in crypto trading volume',
 'We are aware of the issue and are working to resolve it.',
 'Galaxy Digital CEO expects more chop in crypto. Novogratz still expects strong fourth quarter for crypto',
 "ETC is Canada's first multicurrency ETF. Ether and bitcoin make up around 65 per cent of the cryptocurrency market",
 'Bitcoin Latinum is a greener, faster, and more secure version of Bitcoin.',
 'Shares of Marathon Digital, Riot Blockchain and Bit Digital are trading lower.',
 'We are aware of the issue and are working to resolve it.',
 'Early repayment of loans payable which are repayable in Bitcoin on September 29, 2021.',
 'Chainlink, Bitcoin Cash SV lead crypto market cap higher through early hours.']

# 5. Adding Sentiment Analysis

In [61]:
from transformers import pipeline
sentiment = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [62]:
sentiment(summaries['BTC'])

[{'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'NEGATIVE', 'score': 0.9259711503982544},
 {'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'NEGATIVE', 'score': 0.9979314804077148},
 {'label': 'POSITIVE', 'score': 0.986530601978302},
 {'label': 'POSITIVE', 'score': 0.99866783618927},
 {'label': 'NEGATIVE', 'score': 0.9996521472930908},
 {'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'NEGATIVE', 'score': 0.7991209030151367},
 {'label': 'NEGATIVE', 'score': 0.9874210357666016}]

In [64]:
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

{'GME': [{'label': 'NEGATIVE', 'score': 0.9997784495353699},
  {'label': 'POSITIVE', 'score': 0.9101589918136597},
  {'label': 'POSITIVE', 'score': 0.9760345220565796},
  {'label': 'NEGATIVE', 'score': 0.720937967300415},
  {'label': 'POSITIVE', 'score': 0.9902737736701965},
  {'label': 'NEGATIVE', 'score': 0.9926254153251648},
  {'label': 'NEGATIVE', 'score': 0.9960700273513794},
  {'label': 'NEGATIVE', 'score': 0.9525899291038513},
  {'label': 'POSITIVE', 'score': 0.9984837174415588},
  {'label': 'POSITIVE', 'score': 0.9994992017745972}],
 'TSLA': [{'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9874534010887146},
  {'label': 'POSITIVE', 'score': 0.9991242289543152},
  {'label': 'NEGATIVE', 'score': 0.995489776134491},
  {'label': 'NEGATIVE', 'score': 0.9994449019432068},
  {'label': 'POSITIVE', 'score': 0.9931678175926208},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9844943881034851},
  {'label': 'N

In [65]:
print(summaries['BTC'][3], scores['BTC'][3]['label'], scores['BTC'][3]['score'])

Galaxy Digital CEO expects more chop in crypto. Novogratz still expects strong fourth quarter for crypto NEGATIVE 0.9979314804077148


# 6. Export Results to CSV

In [66]:
def create_output_array(summaries, scores, urls):
    output = []
    for ticker in monitored_tickers:
        for counter in range(len(summaries[ticker])):
            output_this = [
                ticker,
                summaries[ticker][counter],
                scores[ticker][counter]['label'],
                scores[ticker][counter]['score'],
                urls[ticker][counter]
            ]
            output.append(output_this)
    return output

In [67]:
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

[['GME',
  'Shares of the self-described tech company are down 12% so far in September. One could be punishing meme trader crowd for lack of clarity',
  'NEGATIVE',
  0.9997784495353699,
  'https://finance.yahoo.com/news/game-stop-stock-is-enduring-an-awful-september-180120497.html'],
 ['GME',
  "‘Retail factor is here to stay,' says Siebert Williams Shank. Off-exchange trading made up 47.2% of equity volume in January 2021",
  'POSITIVE',
  0.9101589918136597,
  'https://finance.yahoo.com/news/significant-role-of-retail-investing-here-to-stay-suzanne-shank-164155648.html'],
 ['GME',
  'Hedge funds have lost billions of dollars in the short squeeze battle. Retail investors are slowly transforming market dynamics',
  'POSITIVE',
  0.9760345220565796,
  'https://finance.yahoo.com/news/15-best-short-squeeze-stocks-140610450.html'],
 ['GME',
  'RF Capital Management published its second-quarter 2021 investor letter. GME returned 977.81% since the beginning of the year',
  'NEGATIVE',
  0.7

In [68]:
final_output.insert(0, ['Ticker', 'Summary', 'Label', 'Confidence', 'URL'])

In [69]:
final_output

[['Ticker', 'Summary', 'Label', 'Confidence', 'URL'],
 ['GME',
  'Shares of the self-described tech company are down 12% so far in September. One could be punishing meme trader crowd for lack of clarity',
  'NEGATIVE',
  0.9997784495353699,
  'https://finance.yahoo.com/news/game-stop-stock-is-enduring-an-awful-september-180120497.html'],
 ['GME',
  "‘Retail factor is here to stay,' says Siebert Williams Shank. Off-exchange trading made up 47.2% of equity volume in January 2021",
  'POSITIVE',
  0.9101589918136597,
  'https://finance.yahoo.com/news/significant-role-of-retail-investing-here-to-stay-suzanne-shank-164155648.html'],
 ['GME',
  'Hedge funds have lost billions of dollars in the short squeeze battle. Retail investors are slowly transforming market dynamics',
  'POSITIVE',
  0.9760345220565796,
  'https://finance.yahoo.com/news/15-best-short-squeeze-stocks-140610450.html'],
 ['GME',
  'RF Capital Management published its second-quarter 2021 investor letter. GME returned 977.81%

In [70]:
import csv
with open('assetsummaries.csv', mode='w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)

In [71]:
import pandas as pd

In [73]:
df = pd.DataFrame(final_output)

In [75]:
df.head()

Unnamed: 0,0,1,2,3,4
0,Ticker,Summary,Label,Confidence,URL
1,GME,Shares of the self-described tech company are ...,NEGATIVE,0.999778,https://finance.yahoo.com/news/game-stop-stock...
2,GME,"‘Retail factor is here to stay,' says Siebert ...",POSITIVE,0.910159,https://finance.yahoo.com/news/significant-rol...
3,GME,Hedge funds have lost billions of dollars in t...,POSITIVE,0.976035,https://finance.yahoo.com/news/15-best-short-s...
4,GME,RF Capital Management published its second-qua...,NEGATIVE,0.720938,https://finance.yahoo.com/news/why-rf-capital-...


In [87]:
df[2].value_counts()

NEGATIVE    15
POSITIVE    15
Label        1
Name: 2, dtype: int64