In [1]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

## Load the summarization model

In [2]:
# Load the Pegasus tokenizer and seq2seq model from the same checkpoint id.
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer, model = (
    PegasusTokenizer.from_pretrained(model_name),
    PegasusForConditionalGeneration.from_pretrained(model_name),
)

Downloading:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

## Summarize one article

In [4]:
# Download one article page and collect all of its <p> elements.
url = "https://finance.yahoo.com/news/palantir-could-dominate-data-field-105336199.html"
# Without a timeout a stalled connection would block the kernel indefinitely.
r = requests.get(url, timeout=10)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [7]:
# Show the scraped <p> elements to check what the page returned.
paragraphs

[<p>Palantir <strong>(</strong><a class="link rapid-noclick-resp" data-ylk="slk:PLTR" href="https://www.tipranks.com/stocks/pltr/forecast" rel="nofollow noopener" target="_blank"><strong>PLTR</strong></a><strong>) </strong>is a leading data analytics and artificial intelligence software company with a mission of ensuring that the West achieves dominance in the exploding world of data and machine learning.</p>,
 <p>The company's primary customer for its first two decades of operation has been the United States Government, through its Gotham platform. Now the company is investing heavily in growing its Foundry platform, which services medium and large sized companies. It is also growing its Gotham platform to include a greater piece of the United States Government’s spending as well as defense contractors and allied governments. (See <a class="link rapid-noclick-resp" data-ylk="slk:Palantir stock analysis" href="https://www.tipranks.com/stocks/pltr/stock-analysis" rel="nofollow noopener"

In [29]:
# Reduce each <p> element to its text, then keep only the first 400
# space-separated tokens of the joined text.
text = [p.text for p in paragraphs]
words = " ".join(text).split(" ")[:400]
ARTICLE = " ".join(words)

In [30]:
# Show the truncated article text that is passed to the tokenizer next.
ARTICLE

"Palantir (PLTR) is a leading data analytics and artificial intelligence software company with a mission of ensuring that the West achieves dominance in the exploding world of data and machine learning. The company's primary customer for its first two decades of operation has been the United States Government, through its Gotham platform. Now the company is investing heavily in growing its Foundry platform, which services medium and large sized companies. It is also growing its Gotham platform to include a greater piece of the United States Government’s spending as well as defense contractors and allied governments. (See Palantir stock analysis on TipRanks) Growing Rapidly The main value for PLTR currently comes from its world-class data analytics and artificial intelligence platforms. These enable government agencies and corporations to manage, extract significant value from, and optimize large troves of data. They also provide a very rapid and user-friendly data integration and set-u

In [31]:
# Tokenize the article, generate with 5-beam search, and decode the first
# returned sequence.
# truncation=True clips the input to the tokenizer's configured maximum length.
# The 400-word cut used to build ARTICLE counts words, not tokens, so it does
# not by itself guarantee that the encoded input fits the model.
input_ids = tokenizer.encode(ARTICLE, return_tensors='pt', truncation=True)
output = model.generate(input_ids, max_length=80, num_beams=5, early_stopping=True)
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [32]:
# Show the generated summary for the single article.
summary

'Company is investing heavily in its Foundry platform. Management projects that the company will grow by at least 30% per year'

## Make news and sentiment pipeline

In [34]:
# Tickers that the search / scrape / summarize / score cells iterate over.
monitored_tickers = [
    "PLTR",
    "BB",
    "SNPS",
]

## Search for news with Google and Yahoo Finance

In [36]:
def search_stock_news_urls(ticker):
    """Return the href of every link on a Google News search page for a ticker.

    The search query is "yahoo finance <ticker>". The result is unfiltered:
    it contains every anchor's ``href`` in page order, including relative
    links back into the search site, so callers are expected to clean it.

    Args:
        ticker: Stock symbol substituted into the search query, e.g. 'PLTR'.

    Returns:
        list[str]: the ``href`` values of all anchors that have one.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
    """
    search_url = "https://www.google.com/search?q=yahoo+finance+{}&rlz=1C5CHFA_enUS831US831&source=lnms&tbm=nws&sa=X&ved=2ahUKEwj4zJvp25fxAhVCLn0KHXZEBcEQ_AUoAXoECAEQAw&biw=1200&bih=688&dpr=2".format(ticker)
    # Without a timeout a stalled connection would block the kernel indefinitely.
    r = requests.get(search_url, timeout=10)
    soup = BeautifulSoup(r.text, 'html.parser')
    # href=True keeps only anchors that actually carry an href attribute;
    # indexing link['href'] on an anchor without one raises KeyError.
    atags = soup.find_all('a', href=True)
    hrefs = [link['href'] for link in atags]
    return hrefs

In [37]:
# Build the search URL for PLTR by hand (same template as in
# search_stock_news_urls) so the formatted result can be inspected.
search_url = "https://www.google.com/search?q=yahoo+finance+{}&rlz=1C5CHFA_enUS831US831&source=lnms&tbm=nws&sa=X&ved=2ahUKEwj4zJvp25fxAhVCLn0KHXZEBcEQ_AUoAXoECAEQAw&biw=1200&bih=688&dpr=2".format('PLTR')

In [38]:
# Show the formatted search URL.
search_url

'https://www.google.com/search?q=yahoo+finance+PLTR&rlz=1C5CHFA_enUS831US831&source=lnms&tbm=nws&sa=X&ved=2ahUKEwj4zJvp25fxAhVCLn0KHXZEBcEQ_AUoAXoECAEQAw&biw=1200&bih=688&dpr=2'

In [39]:
# Try the search helper on a single ticker; the returned list is still
# unfiltered and includes the search page's own links.
search_stock_news_urls('PLTR')

['/?sa=X&ved=0ahUKEwjC4Pal3pfxAhWSJTQIHc85AbIQOwgC',
 '/search?q=yahoo+finance+PLTR&tbm=nws&sa=X&rlz=1C5CHFA_enUS831US831&biw=1200&bih=688&ie=UTF-8&gbv=1&sei=nZ3HYIKeA5LL0PEPz_OEkAs',
 '/search?q=yahoo+finance+PLTR&rlz=1C5CHFA_enUS831US831&biw=1200&bih=688&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwjC4Pal3pfxAhWSJTQIHc85AbIQ_AUIBSgA',
 '/search?q=yahoo+finance+PLTR&rlz=1C5CHFA_enUS831US831&biw=1200&bih=688&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwjC4Pal3pfxAhWSJTQIHc85AbIQ_AUIBygC',
 '/search?q=yahoo+finance+PLTR&rlz=1C5CHFA_enUS831US831&biw=1200&bih=688&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwjC4Pal3pfxAhWSJTQIHc85AbIQ_AUICCgD',
 '/search?q=yahoo+finance+PLTR&rlz=1C5CHFA_enUS831US831&biw=1200&bih=688&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwjC4Pal3pfxAhWSJTQIHc85AbIQ_AUICSgE',
 'https://maps.google.com/maps?q=yahoo+finance+PLTR&rlz=1C5CHFA_enUS831US831&biw=1200&bih=688&dpr=2&um=1&ie=UTF-8&sa=X&ved=0ahUKEwjC4Pal3pfxAhWSJTQIHc85AbIQ_AUICigF',
 '/search?q=yahoo+finance+PLTR&rlz=

In [41]:
# Collect the unfiltered search-result links for every monitored ticker.
# Built as a dict comprehension, like the other per-ticker dicts in this
# notebook, so the cell neither leaks a loop variable into the notebook
# namespace nor leaves a half-filled dict behind if one search fails.
raw_urls = {ticker: search_stock_news_urls(ticker) for ticker in monitored_tickers}

In [42]:
# Show the raw, unfiltered links collected per ticker.
raw_urls

{'PLTR': ['/?sa=X&ved=0ahUKEwiOs5uD35fxAhWvJzQIHcn0Co4QOwgC',
  '/search?q=yahoo+finance+PLTR&tbm=nws&sa=X&rlz=1C5CHFA_enUS831US831&biw=1200&bih=688&ie=UTF-8&gbv=1&sei=YJ7HYI6DKq_P0PEPyemr8Ag',
  '/search?q=yahoo+finance+PLTR&rlz=1C5CHFA_enUS831US831&biw=1200&bih=688&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwiOs5uD35fxAhWvJzQIHcn0Co4Q_AUIBSgA',
  '/search?q=yahoo+finance+PLTR&rlz=1C5CHFA_enUS831US831&biw=1200&bih=688&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwiOs5uD35fxAhWvJzQIHcn0Co4Q_AUIBygC',
  '/search?q=yahoo+finance+PLTR&rlz=1C5CHFA_enUS831US831&biw=1200&bih=688&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwiOs5uD35fxAhWvJzQIHcn0Co4Q_AUICCgD',
  '/search?q=yahoo+finance+PLTR&rlz=1C5CHFA_enUS831US831&biw=1200&bih=688&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwiOs5uD35fxAhWvJzQIHcn0Co4Q_AUICSgE',
  'https://maps.google.com/maps?q=yahoo+finance+PLTR&rlz=1C5CHFA_enUS831US831&biw=1200&bih=688&dpr=2&um=1&ie=UTF-8&sa=X&ved=0ahUKEwiOs5uD35fxAhWvJzQIHcn0Co4Q_AUICigF',
  '/search?q=yahoo+f

## Clean the links

In [46]:
import re

In [48]:
# Any link containing one of these substrings is dropped by strip_bad_urls.
exclude_list = [
    "maps",
    "policies",
    "preferences",
    "accounts",
    "support",
]

In [51]:
def strip_bad_urls(urls, exclude_list):
    """Extract clean, unique article URLs from raw search-result hrefs.

    A raw href is kept only if it contains ``'https://'`` and none of the
    substrings in ``exclude_list``. From each kept href the first
    ``http(s)://...`` run of non-whitespace characters is extracted and cut
    at the first ``'&'``, which drops any query parameters appended after
    the embedded link.

    Args:
        urls: Iterable of raw href strings.
        exclude_list: Substrings that disqualify an href when present.

    Returns:
        list[str]: Unique cleaned URLs in first-seen order.
    """
    url_pattern = re.compile(r'(https?://\S+)')
    # A dict is used as an ordered set: it de-duplicates while keeping
    # first-seen order, so the result is the same on every run (set
    # iteration order for strings can differ between interpreter sessions).
    unique = {}
    for url in urls:
        if 'https://' not in url:
            continue
        if any(exclude_word in url for exclude_word in exclude_list):
            continue
        match = url_pattern.search(url)
        if match is None:
            # e.g. a bare 'https://' with nothing after it: nothing to extract.
            continue
        unique[match.group(1).split('&')[0]] = None
    return list(unique)

In [52]:
# Try the cleaner on the PLTR links only.
strip_bad_urls(raw_urls['PLTR'], exclude_list)

['https://finance.yahoo.com/news/palantir-space-force-expand-partnership-105900988.html',
 'https://finance.yahoo.com/news/george-soros-firm-finally-exits-230142049.html',
 'https://finance.yahoo.com/news/palantir-technologies-strikes-32-5m-120110104.html',
 'https://finance.yahoo.com/news/palantir-strengthens-ties-u-space-075736424.html',
 'https://finance.yahoo.com/news/10-best-saas-stocks-buy-170819461.html',
 'https://finance.yahoo.com/news/palantir-technologies-wins-united-states-170328779.html',
 'https://finance.yahoo.com/news/palantir-awarded-111m-contract-mission-105900419.html',
 'https://finance.yahoo.com/news/palantir-vs-splunk-data-analytics-164334452.html',
 'https://finance.yahoo.com/news/palantir-could-dominate-data-field-105336199.html',
 'https://finance.yahoo.com/news/better-meme-stock-wendys-palantir-132440594.html']

In [53]:
# Filter and de-duplicate the raw links for every monitored ticker.
cleaned_urls = {
    symbol: strip_bad_urls(raw_urls[symbol], exclude_list)
    for symbol in monitored_tickers
}

In [54]:
# Show the cleaned article URLs per ticker.
cleaned_urls

{'PLTR': ['https://finance.yahoo.com/news/palantir-space-force-expand-partnership-105900988.html',
  'https://finance.yahoo.com/news/george-soros-firm-finally-exits-230142049.html',
  'https://finance.yahoo.com/news/palantir-technologies-strikes-32-5m-120110104.html',
  'https://finance.yahoo.com/news/palantir-strengthens-ties-u-space-075736424.html',
  'https://finance.yahoo.com/news/10-best-saas-stocks-buy-170819461.html',
  'https://finance.yahoo.com/news/palantir-technologies-wins-united-states-170328779.html',
  'https://finance.yahoo.com/news/palantir-awarded-111m-contract-mission-105900419.html',
  'https://finance.yahoo.com/news/palantir-vs-splunk-data-analytics-164334452.html',
  'https://finance.yahoo.com/news/palantir-could-dominate-data-field-105336199.html',
  'https://finance.yahoo.com/news/better-meme-stock-wendys-palantir-132440594.html'],
 'BB': ['https://uk.finance.yahoo.com/news/blackberry-bb-share-price-explode-144613189.html',
  'https://finance.yahoo.com/news/1-me

## Scrape from URLs

In [78]:
def scrape_and_process(URLs, max_words=300):
    """Download each URL and return the leading words of its paragraph text.

    For every page, the text of all ``<p>`` elements is joined with single
    spaces, split on single spaces, and cut to the first ``max_words``
    tokens.

    Args:
        URLs: Iterable of page URLs to download.
        max_words: Maximum number of space-separated tokens kept per page.
            Defaults to 300.

    Returns:
        list[str]: One text per URL, in the same order as ``URLs``. Pages are
        never skipped, so the result stays index-aligned with the input,
        which create_output_array relies on when it pairs rows by position.

    Raises:
        requests.exceptions.RequestException: if a request fails or does not
            complete within the timeout.
    """
    ARTICLES = []
    for url in URLs:
        # Without a timeout a single stalled page would block the whole run.
        r = requests.get(url, timeout=10)
        soup = BeautifulSoup(r.text, 'html.parser')
        page_text = ' '.join(paragraph.text for paragraph in soup.find_all('p'))
        ARTICLES.append(' '.join(page_text.split(' ')[:max_words]))
    return ARTICLES

In [79]:
# Scrape the truncated article text for every cleaned URL of every ticker.
articles = {
    symbol: scrape_and_process(cleaned_urls[symbol])
    for symbol in monitored_tickers
}

In [80]:
# Show the scraped article texts per ticker.
articles

{'PLTR': ['Palantir Technologies Inc. (NYSE: PLTR) confirmed today it will support the United States Space Force and United States Air Force by providing its software to the critical missions of the Department of the Air Force (DAF), Space and Missile Systems Center’s Cross-Mission Ground & Communications Enterprise (SMC/ECX), and NORAD-NORTHCOM. This is a firm-fixed-price award totaling $32.5 million. Palantir will deploy and maintain Palantir as Data-as-a-Service (DaaS) platform to support SMC/ECX’s Space Command and Control program element, including operational users at both the National Space Defense Center and the Combined Space Operations Center. Department of the Air Force’s Project Brown Heron will use Palantir to provide the U.S. Air Force senior leadership with a continuously improving operational readiness analytics platform by integrating disparate data sources from across the Service. This will improve the DAF’s readiness in a variety of mission-critical areas, cutting ac

In [81]:
# Look at a single scraped PLTR article (index 2).
articles['PLTR'][2]

'Palantir Technologies Inc (NYSE: PLTR) offered its software to the critical missions of the Department of the Air Force (DAF), Space and Missile Systems Center’s Cross-Mission Ground & Communications Enterprise (SMC/ECX), and NORAD-NORTHCOM. The aggregate firm-fixed-price award was worth $32.5 million. Palantir as Data-as-a-Service (DaaS) platform will support SMC/ECX’s Space Command and Control program element at both the National Space Defense Center and the Combined Space Operations Center. Department of the Air Force’s Project Brown Heron will utilize Palantir’s operational readiness analytics platform. Additionally, Palantir will support NORAD-NORTHCOM’s Joint All Domain Command and Control (JADC2) transformation, ingesting, and modeling high-scale data. Price action: PLTR shares traded higher by 1.35% at $21.03 in the premarket session on the last check Monday. See more from Benzinga Click here for options trades from Benzinga Palantir Stock Falls After Beating Q1 Revenue, Dip I

## Summarize the articles

In [82]:
def summarize(articles):
    """Generate one short summary per article text.

    Uses the notebook-level ``tokenizer`` and ``model``. Each article is
    encoded, summarized with 5-beam search capped at 55 generated tokens, and
    the first returned sequence is decoded without special tokens.

    Args:
        articles: Iterable of article strings.

    Returns:
        list[str]: Summaries in the same order as ``articles``.
    """
    summaries = []
    for article in articles:
        # truncation=True clips the input to the tokenizer's configured
        # maximum length; a word-count cut on the text does not guarantee
        # that the encoded input fits the model.
        input_ids = tokenizer.encode(article, return_tensors='pt', truncation=True)
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

In [123]:
# Summarize every scraped article, keyed by ticker, then display the result.
summaries = {
    symbol: summarize(articles[symbol])
    for symbol in monitored_tickers
}
summaries

{'PLTR': ['Firm-fixed-price award totaling $32.5 million. Palantir to support SMC/ECX’s Space Command and Control program',
  "The firm's top stock buys for the quarter were Amazon.com, Palantir Technologies.",
  'Firm-fixed-price award was worth $32.5 million. Palantir to support Air Force’s Project Brown Heron, NORAD-NORTHCOM',
  'Software company will provide software for advanced critical missions. Palantir reported 49% revenue growth year-over-year in Q1',
  'Big lenders are pouring billions into software companies for long-term returns.',
  'The contract includes a base year and an additional one-year extension option. Shares of PLTR have jumped 141.6% over the past year',
  'The contract is valued at a total of $111 million, inclusive of options. Palantir’s platform has been used by USSOCOM in real-time mission operations',
  'Palantir Technologies and Splunk are in the top two stocks in their sectors.',
  'Palantir is a leading data analytics and artificial intelligence softwar

## Sentiment analysis

In [84]:
from transformers import pipeline
# Build a sentiment-analysis pipeline used to label each summary.
# NOTE(review): no model id is passed here, so which checkpoint gets loaded is
# decided by the library — consider pinning one for reproducible scores.
sentiment = pipeline('sentiment-analysis')

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

In [86]:
# Run the sentiment pipeline over each ticker's list of summaries.
scores = {
    symbol: sentiment(summaries[symbol])
    for symbol in monitored_tickers
}

In [124]:
# Show the label/score dicts produced for each summary.
scores

{'PLTR': [{'label': 'NEGATIVE', 'score': 0.9099695682525635},
  {'label': 'NEGATIVE', 'score': 0.9482470750808716},
  {'label': 'POSITIVE', 'score': 0.9685001969337463},
  {'label': 'POSITIVE', 'score': 0.9863005876541138},
  {'label': 'NEGATIVE', 'score': 0.9965274930000305},
  {'label': 'NEGATIVE', 'score': 0.9613101482391357},
  {'label': 'POSITIVE', 'score': 0.9307973980903625},
  {'label': 'POSITIVE', 'score': 0.997609555721283},
  {'label': 'POSITIVE', 'score': 0.999591052532196},
  {'label': 'POSITIVE', 'score': 0.999779999256134}],
 'BB': [{'label': 'NEGATIVE', 'score': 0.9924120306968689},
  {'label': 'NEGATIVE', 'score': 0.993604838848114},
  {'label': 'NEGATIVE', 'score': 0.9263762831687927},
  {'label': 'POSITIVE', 'score': 0.9932146668434143},
  {'label': 'NEGATIVE', 'score': 0.997467577457428},
  {'label': 'NEGATIVE', 'score': 0.9942097067832947},
  {'label': 'NEGATIVE', 'score': 0.9977350234985352},
  {'label': 'NEGATIVE', 'score': 0.985684335231781},
  {'label': 'POSITI

In [92]:
# Spot-check one SNPS summary next to its predicted label and score.
print(
    summaries['SNPS'][7],
    scores['SNPS'][7]['label'],
    scores['SNPS'][7]['score'],
)

Estimates for next quarter and full year have moved considerably higher. POSITIVE 0.9749597907066345


## Export

In [125]:
# Re-display the summaries before assembling the export rows.
summaries

{'PLTR': ['Firm-fixed-price award totaling $32.5 million. Palantir to support SMC/ECX’s Space Command and Control program',
  "The firm's top stock buys for the quarter were Amazon.com, Palantir Technologies.",
  'Firm-fixed-price award was worth $32.5 million. Palantir to support Air Force’s Project Brown Heron, NORAD-NORTHCOM',
  'Software company will provide software for advanced critical missions. Palantir reported 49% revenue growth year-over-year in Q1',
  'Big lenders are pouring billions into software companies for long-term returns.',
  'The contract includes a base year and an additional one-year extension option. Shares of PLTR have jumped 141.6% over the past year',
  'The contract is valued at a total of $111 million, inclusive of options. Palantir’s platform has been used by USSOCOM in real-time mission operations',
  'Palantir Technologies and Splunk are in the top two stocks in their sectors.',
  'Palantir is a leading data analytics and artificial intelligence softwar

In [94]:
# Re-display the sentiment scores before assembling the export rows.
scores

{'PLTR': [{'label': 'NEGATIVE', 'score': 0.9099695682525635},
  {'label': 'NEGATIVE', 'score': 0.9482470750808716},
  {'label': 'POSITIVE', 'score': 0.9685001969337463},
  {'label': 'POSITIVE', 'score': 0.9863005876541138},
  {'label': 'NEGATIVE', 'score': 0.9965274930000305},
  {'label': 'NEGATIVE', 'score': 0.9613101482391357},
  {'label': 'POSITIVE', 'score': 0.9307973980903625},
  {'label': 'POSITIVE', 'score': 0.997609555721283},
  {'label': 'POSITIVE', 'score': 0.999591052532196},
  {'label': 'POSITIVE', 'score': 0.999779999256134}],
 'BB': [{'label': 'NEGATIVE', 'score': 0.9924120306968689},
  {'label': 'NEGATIVE', 'score': 0.993604838848114},
  {'label': 'NEGATIVE', 'score': 0.9263762831687927},
  {'label': 'POSITIVE', 'score': 0.9932146668434143},
  {'label': 'NEGATIVE', 'score': 0.997467577457428},
  {'label': 'NEGATIVE', 'score': 0.9942097067832947},
  {'label': 'NEGATIVE', 'score': 0.9977350234985352},
  {'label': 'NEGATIVE', 'score': 0.985684335231781},
  {'label': 'POSITI

In [95]:
# Re-display the cleaned URLs before assembling the export rows.
cleaned_urls

{'PLTR': ['https://finance.yahoo.com/news/palantir-space-force-expand-partnership-105900988.html',
  'https://finance.yahoo.com/news/george-soros-firm-finally-exits-230142049.html',
  'https://finance.yahoo.com/news/palantir-technologies-strikes-32-5m-120110104.html',
  'https://finance.yahoo.com/news/palantir-strengthens-ties-u-space-075736424.html',
  'https://finance.yahoo.com/news/10-best-saas-stocks-buy-170819461.html',
  'https://finance.yahoo.com/news/palantir-technologies-wins-united-states-170328779.html',
  'https://finance.yahoo.com/news/palantir-awarded-111m-contract-mission-105900419.html',
  'https://finance.yahoo.com/news/palantir-vs-splunk-data-analytics-164334452.html',
  'https://finance.yahoo.com/news/palantir-could-dominate-data-field-105336199.html',
  'https://finance.yahoo.com/news/better-meme-stock-wendys-palantir-132440594.html'],
 'BB': ['https://uk.finance.yahoo.com/news/blackberry-bb-share-price-explode-144613189.html',
  'https://finance.yahoo.com/news/1-me

In [128]:
def create_output_array(summaries, scores, urls):
    """Flatten the per-ticker results into one row per article.

    The three inputs are paired by position: entry ``i`` of
    ``summaries[ticker]`` is combined with entry ``i`` of ``scores[ticker]``
    and ``urls[ticker]``.

    Args:
        summaries: Dict mapping ticker -> list of summary strings.
        scores: Dict mapping ticker -> list of dicts with ``'label'`` and
            ``'score'`` keys.
        urls: Dict mapping ticker -> list of article URLs.

    Returns:
        list[list]: Rows of ``[ticker, summary, label, score, url]``, grouped
        by ticker in the key order of ``summaries``.

    Raises:
        KeyError: if ``scores`` or ``urls`` lacks a ticker found in
            ``summaries``.
        IndexError: if ``scores[ticker]`` or ``urls[ticker]`` is shorter than
            ``summaries[ticker]``.
    """
    output = []
    # Walk the tickers actually present in the `summaries` argument rather
    # than the notebook-level `monitored_tickers`, so the function depends
    # only on what it is given.
    for ticker, ticker_summaries in summaries.items():
        for counter, summary_text in enumerate(ticker_summaries):
            output.append([
                ticker,
                summary_text,
                scores[ticker][counter]['label'],
                scores[ticker][counter]['score'],
                urls[ticker][counter],
            ])
    return output

In [129]:
# Build the flat list of export rows (one per summarized article).
final_output = create_output_array(summaries, scores, cleaned_urls)

In [130]:
# Prepend the CSV header row. The guard makes the cell idempotent:
# re-running it no longer stacks a second header on top of the first.
if final_output[:1] != [['Ticker', 'Summary', 'Label', 'Confidence', 'URL']]:
    final_output.insert(0, ['Ticker', 'Summary', 'Label', 'Confidence', 'URL'])

In [131]:
# Show the rows, header first, that will be written to the CSV file.
final_output

[['Ticker', 'Summary', 'Label', 'Confidence', 'URL'],
 ['PLTR',
  'Firm-fixed-price award totaling $32.5 million. Palantir to support SMC/ECX’s Space Command and Control program',
  'NEGATIVE',
  0.9099695682525635,
  'https://finance.yahoo.com/news/palantir-space-force-expand-partnership-105900988.html'],
 ['PLTR',
  "The firm's top stock buys for the quarter were Amazon.com, Palantir Technologies.",
  'NEGATIVE',
  0.9482470750808716,
  'https://finance.yahoo.com/news/george-soros-firm-finally-exits-230142049.html'],
 ['PLTR',
  'Firm-fixed-price award was worth $32.5 million. Palantir to support Air Force’s Project Brown Heron, NORAD-NORTHCOM',
  'POSITIVE',
  0.9685001969337463,
  'https://finance.yahoo.com/news/palantir-technologies-strikes-32-5m-120110104.html'],
 ['PLTR',
  'Software company will provide software for advanced critical missions. Palantir reported 49% revenue growth year-over-year in Q1',
  'POSITIVE',
  0.9863005876541138,
  'https://finance.yahoo.com/news/palant

In [132]:
import csv
# Write the header and data rows to a CSV file in the working directory.
# The encoding is set explicitly because the summaries contain non-ASCII
# characters (e.g. the typographic apostrophe); without it the bytes written,
# and whether the write succeeds at all, depend on the platform's default
# locale encoding. newline='' hands line-ending control to the csv module.
with open('ticketsummaries.csv', mode='w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)