<a href="https://colab.research.google.com/github/mreskandarinasab/Automate-Stocks-and-Crypto-News-Research-Preprocessing-Text-Summarization-Sentiment-Analysis-by-/blob/main/Stock_and_Crypto_News_ScrapingSummarizationSentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Install and Import Baseline Dependencies

In [None]:
!pip install sentencepiece
!pip install transformers

In [None]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests


# 2. Setup Summarization Model

In [None]:
# Hugging Face Hub ID of the Pegasus checkpoint used for summarization below.
model_name = "human-centered-summarization/financial-summarization-pegasus"
# The tokenizer and the model are both loaded from that same checkpoint name.
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

# 3. Summarize a Single Article

In [None]:
# Download one Yahoo Finance article and parse its HTML.
url = "https://finance.yahoo.com/news/apple-treasurer-keeper-200-billion-210149390.html"
# NOTE(review): the response status is not checked, so an error page would be
# parsed exactly like an article.
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
# Collect every <p> element; later cells join their text into the article body.
paragraphs = soup.find_all('p')

In [None]:
paragraphs[1].text

'(Bloomberg) -- Apple Inc.’s corporate treasurer and keeper of its nearly $200 billion money pile has retired from the iPhone maker after about 35 years, according to people with knowledge of the matter.'

In [None]:
# Pull the text out of each <p> tag, glue the pieces together with spaces, and
# keep only the first 400 space-separated tokens as the article to summarize.
text = [tag.text for tag in paragraphs]
words = ' '.join(text).split(' ')
words = words[:400]
ARTICLE = ' '.join(words)

In [None]:
ARTICLE

' (Bloomberg) -- Apple Inc.’s corporate treasurer and keeper of its nearly $200 billion money pile has retired from the iPhone maker after about 35 years, according to people with knowledge of the matter. Most Read from Bloomberg Wall Street Titans Warn of the Next Big Risks for Investors The Unstoppable Appeal of Highway Expansion The Country That Makes Breakfast for the World Is Plagued by Fire, Frost and Drought HSBC Bets Big on China as Pressure Mounts in London An Unapologetic Old Boys’ Network Is Costing Australia Billions Gary Wipfler stepped down in recent weeks, said the people, who asked not to be identified because the move hasn’t been announced. He oversaw the iPhone maker’s cash balance, investments and capital-return programs, and he was once a fixture of Apple’s quarterly earnings calls. The 62-year-old executive reported to Chief Financial Officer Luca Maestri until his retirement. An Apple spokesman declined to comment. Wipfler’s retirement caps a career that spanned s

In [None]:
# Tokenize the article into PyTorch tensors. truncation=True clips the input to
# the tokenizer's configured maximum length, so an over-long article is cut
# rather than passed to the model at full size.
input_ids = tokenizer(ARTICLE, return_tensors="pt", truncation=True).input_ids
# Beam search with 5 beams; the generated summary is capped at 55 tokens.
output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
# Decode the first (and only) generated sequence, dropping special tokens.
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
summary

'Gary Wipfler oversaw the iPhone maker’s cash, investments. He was once a fixture on quarterly earnings calls'

# 4. Building a News and Sentiment Pipeline

In [None]:
# Ticker symbols to process; the per-ticker dicts built in later cells are keyed by these.
monitored_tickers = ['GME', 'TSLA', 'BTC']

# 4.1. Search for Stock News using Google and Yahoo Finance

In [None]:
def search_for_stock_news_urls(ticker, timeout=10):
    """Return the raw link targets from a Google News search for a ticker.

    The search query is "yahoo finance <ticker>" on Google's news tab. The
    result page is parsed and the ``href`` of every anchor that has one is
    returned unfiltered; cleaning is left to the caller.

    Args:
        ticker: Ticker symbol inserted into the search query.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises a timeout error. Without a timeout the call can block
            indefinitely.

    Returns:
        list[str]: The ``href`` values in page order (may contain relative
        links and duplicates).
    """
    search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(ticker)
    r = requests.get(search_url, timeout=timeout)
    soup = BeautifulSoup(r.text, 'html.parser')
    # href=True keeps only anchors that actually carry an href attribute;
    # indexing link['href'] on an anchor without one raises KeyError.
    atags = soup.find_all('a', href=True)
    return [link['href'] for link in atags]

In [None]:
# Run the news search once per monitored ticker; maps ticker -> list of raw hrefs.
raw_urls = {ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}
raw_urls

In [None]:
raw_urls['GME']

# 4.2. Strip out unwanted URLs

In [None]:
import re

In [None]:
# Substrings that mark a link as unwanted; strip_unwanted_urls drops any URL containing one.
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

In [None]:
def strip_unwanted_urls(urls, exclude_list):
    """Extract clean, unique URLs from raw link targets.

    A link is kept only if it contains ``https://`` and none of the words in
    ``exclude_list``. From each kept link, the first ``http(s)://...`` run of
    non-whitespace characters is taken and everything from the first ``&``
    onward is cut off (this removes the tracking parameters of redirect links).

    Args:
        urls: Iterable of raw href strings.
        exclude_list: Substrings; a link containing any of them is discarded.

    Returns:
        list[str]: Unique cleaned URLs in order of first appearance, so the
        result is the same on every run for the same input.
    """
    val = []
    seen = set()
    for url in urls:
        if 'https://' not in url:
            continue
        if any(exclude_word in url for exclude_word in exclude_list):
            continue
        # A bare "https://" (nothing, or whitespace, after the scheme) does not
        # match the pattern; skip it instead of failing on an empty result.
        match = re.search(r'(https?://\S+)', url)
        if match is None:
            continue
        res = match.group(1).split('&')[0]
        if res not in seen:
            seen.add(res)
            val.append(res)
    return val

In [None]:
# Filter each ticker's raw links; maps ticker -> list of cleaned, unique URLs.
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
cleaned_urls

{'BTC': ['https://finance.yahoo.com/news/crypto-stocks-jump-bitcoin-other-100330525.html',
  'https://finance.yahoo.com/news/bitcoin-posts-biggest-increase-since-131031923.html',
  'https://finance.yahoo.com/news/why-bitcoin-related-ethereum-related-130031845.html',
  'https://finance.yahoo.com/news/crypto-daily-movers-shakers-october-002227591.html',
  'https://finance.yahoo.com/news/bitfarms-provides-bitcoin-production-mining-110000820.html',
  'https://finance.yahoo.com/news/marathon-riot-blockchain-surge-bitcoin-123802408.html',
  'https://finance.yahoo.com/news/central-bank-report-highlights-risk-in-the-race-to-digital-currencies-114759021.html',
  'https://finance.yahoo.com/news/britcoin-millionaires-mt-gox-case-japan-153624083-230116218.html',
  'https://finance.yahoo.com/news/bitcoin-rises-5-2-43-221833094.html',
  'https://finance.yahoo.com/news/bitcoin-ethereum-rise-venezuela-launches-digital-currency-081104851.html'],
 'GME': ['https://finance.yahoo.com/news/gamestop-reports

# 4.3. Search and Scrape Cleaned URLs

In [None]:
def scrape_and_process(URLs, max_words=350, timeout=10):
    """Download each URL and return the leading words of its paragraph text.

    For every URL the page is fetched, the text of all ``<p>`` elements is
    joined with spaces, and the first ``max_words`` space-separated tokens are
    kept.

    Args:
        URLs: Iterable of page URLs to fetch.
        max_words: Maximum number of space-separated tokens kept per page.
            Defaults to 350, the limit this notebook has always used.
        timeout: Seconds to wait for each HTTP response before ``requests``
            raises a timeout error. Without a timeout a single unresponsive
            host can block the whole loop.

    Returns:
        list[str]: One text per URL, in the same order as ``URLs``.
    """
    ARTICLES = []
    for url in URLs:
        r = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(r.text, 'html.parser')
        text = [paragraph.text for paragraph in soup.find_all('p')]
        # Splitting on a single space (not on any whitespace) is intentional:
        # it keeps the original tokenization of the joined text.
        words = ' '.join(text).split(' ')[:max_words]
        ARTICLES.append(' '.join(words))
    return ARTICLES

In [None]:
# Scrape every cleaned URL; maps ticker -> list of article texts (one per URL, same order).
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

In [None]:
articles['TSLA'][2]

' (Bloomberg) -- Cathie Wood sold a near $270 million stake in Tesla Inc. as the bond selloff hit rate-sensitive technology stocks to spur outflows from her growth-focused funds. Most Read from Bloomberg The Country That Makes Breakfast for the World Is Plagued by Fire, Frost and Drought The Unstoppable Appeal of Highway Expansion HSBC Bets Big on China as Pressure Mounts in London How Los Angeles Became the City of Dingbats Why the Gaza Strip May Be the City of the Future Wood’s Ark Investment Management offloaded more than 340,000 Tesla shares across three exchange-traded funds on Tuesday, according to the firm’s daily trading update. Some 11% of the famous ARK Innovation ETF (ticker ARKK) is still betting on Elon Musk’s company, according to data compiled by Bloomberg. The firm tends to trim the stake when it rises above 10%. Tesla has generally outperformed in the global rout hitting rate-sensitive investing styles, while ARKK posted one of its worst sessions in months on Tuesday. 

# 4.4. Summarize All Articles

In [None]:
def summarize(articles):
    """Generate one summary per article with the notebook's Pegasus model.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        articles: Iterable of article texts.

    Returns:
        list[str]: Decoded summaries, one per article, in input order.
    """
    summaries = []
    for article in articles:
        # truncation=True clips the encoded article to the tokenizer's
        # configured maximum length, so an over-long article is cut rather
        # than passed to the model at full size.
        input_ids = tokenizer.encode(article, return_tensors='pt', truncation=True)
        # Beam search with 5 beams; each summary is capped at 55 tokens.
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        summaries.append(tokenizer.decode(output[0], skip_special_tokens=True))
    return summaries

In [None]:
# Summarize every scraped article; maps ticker -> list of summaries (same order as the articles).
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

In [None]:
summaries['BTC']

['Bitcoin jumps 12% on Friday, nears $48,000-mark. Moderna, Novavax tumble as healthcare stocks drag',
 'We are aware of the issue and are working to resolve it.',
 'We are aware of the issue and are working to resolve it.',
 'We are aware of the issue and are working to resolve it.',
 'Bitfarms mines 1,050 BTC in third quarter, up 38% over second quarter. Deposited 2,312 BTC into custody through September 30, representing 96% of 2021 production',
 'We are aware of the issue and are working to resolve it.',
 'We are aware of the issue and are working to resolve it.',
 'Creditors have been waiting almost a decade for payouts. Bitcoin has soared in value more than 100 times since 2014',
 'We are aware of the issue and are working to resolve it.',
 'Venezuela launches digital bolivar. Fed chair says stablecoins are ‘outside regulatory perimeter’']

# 5. Adding Sentiment Analysis

In [None]:
from transformers import pipeline
# Name the checkpoint explicitly instead of relying on the library's implicit
# default for the task, so the labels and scores do not change if that default
# changes between transformers releases. This is the checkpoint transformers
# documents as the default for 'sentiment-analysis'.
sentiment = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

In [None]:
sentiment(summaries['BTC'])

  cpuset_checked))


[{'label': 'NEGATIVE', 'score': 0.9995713829994202},
 {'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'NEGATIVE', 'score': 0.9866390228271484},
 {'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'POSITIVE', 'score': 0.9985235333442688},
 {'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'NEGATIVE', 'score': 0.9325519800186157}]

In [None]:
# Score every summary; maps ticker -> list of dicts with 'label' and 'score' keys,
# in the same order as that ticker's summaries.
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

In [None]:
print(summaries['GME'][3], scores['GME'][3]['label'], scores['GME'][3]['score'])

Market maker Citadel Securities made a series of posts on Twitter. Claim that CEO Ken Griffin had never spoken to Vlad Tenev NEGATIVE 0.9960700273513794


In [None]:
scores['BTC'][0]['score']

0.9995713829994202

# 6. Exporting Results to CSV

In [None]:
summaries

In [None]:
scores

In [None]:
cleaned_urls

In [None]:
range(len(summaries['GME']))

range(0, 10)

In [None]:
summaries['GME'][3]

'Market maker Citadel Securities made a series of posts on Twitter. Claim that CEO Ken Griffin had never spoken to Vlad Tenev'

In [None]:
def create_output_array(summaries, scores, urls):
    """Flatten the per-ticker results into rows ready for CSV export.

    The tickers are taken from the keys of ``summaries``, so the function
    depends only on its arguments and not on any notebook-level variable.

    Args:
        summaries: dict mapping ticker -> list of summary strings.
        scores: dict mapping ticker -> list of dicts with ``'label'`` and
            ``'score'`` keys, aligned by position with ``summaries[ticker]``.
        urls: dict mapping ticker -> list of source URLs, aligned by position
            with ``summaries[ticker]``.

    Returns:
        list[list]: One ``[ticker, summary, label, score, url]`` row per
        summary, grouped by ticker in the iteration order of ``summaries``.

    Raises:
        KeyError: If a ticker in ``summaries`` is missing from ``scores`` or ``urls``.
        IndexError: If ``scores[ticker]`` or ``urls[ticker]`` is shorter than
            ``summaries[ticker]``.
    """
    output = []
    for ticker, ticker_summaries in summaries.items():
        ticker_scores = scores[ticker]
        ticker_urls = urls[ticker]
        for counter, summary in enumerate(ticker_summaries):
            score = ticker_scores[counter]
            output.append([
                ticker,
                summary,
                score['label'],
                score['score'],
                ticker_urls[counter],
            ])
    return output

In [None]:
# Build the flat list of [ticker, summary, label, score, url] rows for export.
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

In [None]:
# Prepend the CSV header row. The guard keeps this cell idempotent: without it,
# every re-run of the cell inserts one more header row into final_output.
header_row = ['Ticker', 'Summary', 'Label', 'Confidence', 'URL']
if not final_output or final_output[0] != header_row:
    final_output.insert(0, header_row)

In [None]:
final_output

[['Ticker', 'Summary', 'Label', 'Confidence', 'URL'],
 ['GME',
  'Second quarter net sales of $1.183 billion compared to $942 million in the prior year. Cash and restricted cash of $1.78 billion ended the period',
  'NEGATIVE',
  0.9525899291038513,
  'https://finance.yahoo.com/news/gamestop-reports-financial-results-q2-200500919.html'],
 ['GME',
  "‘Retail factor is here to stay,' says Siebert Williams Shank. Off-exchange trading made up 47.2% of equity volume in January 2021",
  'POSITIVE',
  0.9101589918136597,
  'https://finance.yahoo.com/news/significant-role-of-retail-investing-here-to-stay-suzanne-shank-164155648.html'],
 ['GME',
  'The ‘Apes’ have become a rallying cry for retail traders. AMC Entertainment, GME are among top 10 most-shorted stocks',
  'NEGATIVE',
  0.9831770062446594,
  'https://finance.yahoo.com/news/matt-kohrs-rise-amc-gme-153039161.html'],
 ['GME',
  'Market maker Citadel Securities made a series of posts on Twitter. Claim that CEO Ken Griffin had never spok

In [None]:
import csv
# Write the rows (header first) to assetsummaries.csv in the working directory.
# The encoding is set explicitly because the summaries contain non-ASCII
# characters (e.g. curly quotes); the platform default encoding may not be able
# to represent them. newline='' lets the csv module control line endings.
with open('assetsummaries.csv', mode='w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)