# Scrape news and Analyse sentiments
This notebook shows an example of scraping news articles about specific publicly traded companies and using our pre-deployed sentiment-analysis model server to predict each author's sentiment towards those companies.

In [None]:
# nuclio: ignore
# if the nuclio-jupyter package is not installed run !pip install nuclio-jupyter
import nuclio 

In [None]:
# Pass the V3IO access key, user name and API address from the notebook
# environment into the function's environment.
# NOTE(review): `-c` presumably restricts the setting to the function config
# (not the local kernel) — confirm against the nuclio-jupyter documentation.
%nuclio env -c V3IO_ACCESS_KEY=${V3IO_ACCESS_KEY}
%nuclio env -c V3IO_USERNAME=${V3IO_USERNAME}
%nuclio env -c V3IO_API=${V3IO_API}

In [None]:
%%nuclio cmd 
pip install beautifulsoup4
pip install pandas
pip install v3io_frames

In [None]:
# Base container image used when building the function.
%nuclio config spec.build.baseImage = "python:3.6-jessie"

In [18]:
import json
import os
import re
from datetime import datetime
from unicodedata import normalize
from urllib.request import Request, urlopen

import pandas as pd
import requests
import v3io_frames as v3f
from bs4 import BeautifulSoup as bs

# Base URL of the sentiment model server. Replace it with the endpoint printed
# at the end of 00-deploy-sentiment-model.ipynb.
ENDPOINT = 'http://192.168.224.185:32181/'

# Ticker symbol -> slug used in the investing.com equities URL.
sym_to_url = {
    'GOOGL': 'google-inc',
    'MSFT': 'microsoft-corp',
    'AMZN': 'amazon-com-inc',
    'AAPL': 'apple-computer-inc',
}

# Frames client bound to the 'bigdata' container; the handler uses it to write
# records to the stream.
client = v3f.Client('framesd:8081', container='bigdata')

def get_stock_news_page(stock_string):
    """Download and parse the investing.com news listing page of one stock.

    Args:
        stock_string: URL slug of the stock (for example ``'google-inc'``); it
            is inserted into ``https://www.investing.com/equities/<slug>-news``.

    Returns:
        A BeautifulSoup document built from the response body with the
        ``html.parser`` backend.

    Raises:
        urllib.error.URLError: if the page cannot be fetched (``HTTPError``,
            a subclass, for HTTP error statuses).
    """
    url = 'https://www.investing.com/equities/' + stock_string + '-news'
    # An explicit browser-like User-Agent replaces urllib's default one.
    request = Request(url, headers={"User-Agent": "Mozilla/5.0"})
    # The context manager closes the HTTP response even if read() raises.
    with urlopen(request) as response:
        content = response.read()
    return bs(content, 'html.parser')

def get_internal_article_links(page):
    """Collect absolute URLs of the articles listed on a stock news page.

    Args:
        page: parsed listing page exposing ``find_all`` (a BeautifulSoup
            document in this notebook).

    Returns:
        A list of article URLs, each prefixed with ``https://www.investing.com``.

    Raises:
        IndexError: if the page has fewer than two ``div.mediumTitle1`` elements.
    """
    title_blocks = page.find_all('div', attrs={'class': 'mediumTitle1'})
    # The second matching block is the one that is scanned for articles.
    news_block = title_blocks[1]

    links = []
    for item in news_block.find_all('article', attrs={'class': 'js-article-item articleItem'}):
        # The first anchor inside each article item carries the relative link.
        anchor = item.find('a')
        links.append('https://www.investing.com' + anchor.attrs['href'])
    return links

def get_article_page(article_link):
    """Download and parse a single article page.

    Args:
        article_link: absolute URL of the article.

    Returns:
        A BeautifulSoup document built from the response body with the
        ``html.parser`` backend.

    Raises:
        urllib.error.URLError: if the page cannot be fetched (``HTTPError``,
            a subclass, for HTTP error statuses).
    """
    # An explicit browser-like User-Agent replaces urllib's default one.
    request = Request(article_link, headers={"User-Agent": "Mozilla/5.0"})
    # The context manager closes the HTTP response even if read() raises.
    with urlopen(request) as response:
        content = response.read()
    return bs(content, 'html.parser')

def clean_paragraph(paragraph):
    """Strip link/ticker annotations and flatten whitespace in one paragraph.

    The substitutions are applied in order:
      1. remove every "(http" prefix together with the non-whitespace
         characters that follow it;
      2. remove parenthesised UPPERCASE:UPPERCASE tokens, e.g. "(NASDAQ:AAPL)";
      3. replace each whitespace character and each apostrophe with one space
         (runs are not collapsed).
    The result is then Unicode-normalized with the NFKD form.

    Args:
        paragraph: raw paragraph text.

    Returns:
        The cleaned, NFKD-normalized text.
    """
    substitutions = (
        (r'\(http\S+', ''),
        (r'\([A-Z]+:[A-Z]+\)', ''),
        (r'[\n\t\s\']', ' '),
    )
    cleaned = paragraph
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return normalize('NFKD', cleaned)

def extract_text(article_page):
    """Extract the cleaned body text of an article page.

    Args:
        article_page: parsed article page exposing ``find`` (a BeautifulSoup
            document in this notebook).

    Returns:
        The cleaned paragraphs joined with newline characters, one paragraph
        per line.

    Raises:
        AttributeError: if the page has no ``div`` with class
            ``WYSIWYG articlePage`` (``find`` then returns ``None``).
    """
    body_div = article_page.find('div', attrs={'class': 'WYSIWYG articlePage'})
    cleaned_paragraphs = []
    # The last <p> element is deliberately left out
    # (presumably trailing boilerplate — TODO confirm).
    for paragraph_tag in body_div.find_all('p')[:-1]:
        cleaned_paragraphs.append(clean_paragraph(paragraph_tag.get_text()))
    return '\n'.join(cleaned_paragraphs)

def get_publish_time(article_page):
    """Read the publication date of an article from its metadata.

    Args:
        article_page: parsed article page exposing ``find`` (a BeautifulSoup
            document in this notebook).

    Returns:
        The date as a string in ``YYYY-MM-DD HH:MM:SS`` form; the time part is
        always midnight because only a date is parsed.

    Raises:
        ValueError: if the ``content`` value is not in ``%Y-%m-%d`` format.
    """
    meta_tag = article_page.find('meta', attrs={'itemprop': 'datePublished'})
    # Only the first value of the tag's `content` attribute is used.
    raw_date = meta_tag.get_attribute_list('content')[0]
    parsed = datetime.strptime(raw_date, '%Y-%m-%d')
    return str(parsed)

def get_score(paragraph_scores):
    """Average the paragraph scores after shifting each one down by 1.

    Args:
        paragraph_scores: sequence of numeric scores, one per paragraph
            (presumably class indices 0/1/2, so the result lies in [-1, 1] —
            TODO confirm against the model server).

    Returns:
        The mean of ``score - 1`` over all paragraphs.

    Raises:
        ZeroDivisionError: if ``paragraph_scores`` is empty.
    """
    shifted_total = 0
    for score in paragraph_scores:
        shifted_total += score - 1
    return shifted_total / len(paragraph_scores)

def get_article_scores(articles, endpoint):
    """Score every article with the sentiment model server.

    Each article is split on newline characters and the resulting paragraphs
    are sent as the ``instances`` of one predict request; the per-paragraph
    predictions in the response are reduced to a single value by ``get_score``.

    Args:
        articles: list of article texts whose paragraphs are newline-separated.
        endpoint: base URL of the model server, with or without a trailing
            slash.

    Returns:
        A list with one score per article, in the order of ``articles``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    # Strip trailing slashes so a base URL such as 'http://host:port/' does not
    # produce a '//' in the request path.
    predict_url = endpoint.rstrip('/') + '/bert_classifier_v1/predict'

    scores = []
    for i, article in enumerate(articles):
        print(f'getting score for article {i + 1}\\{len(articles)}')
        event_data = {'instances': article.split('\n')}
        # NOTE(review): the payload is serialized with json.dumps and then
        # encoded again by the `json=` argument, so the body is a JSON string
        # rather than a JSON object. Kept as is because the deployed server
        # evidently accepts it — confirm before simplifying to json=event_data.
        resp = requests.put(predict_url, json=json.dumps(event_data))
        # Fail with a clear HTTP error instead of trying to parse an error body.
        resp.raise_for_status()
        scores.append(get_score(json.loads(resp.text)))
    return scores
    
def handler(context, handler):
    """Scrape news for every configured symbol, score it and stream the results.

    For each entry of ``sym_to_url`` the listing page is fetched, every linked
    article is downloaded, its text is extracted and scored, and its publish
    time is read. Only after all symbols have been processed is one record per
    article written to the ``stock_stream`` stream through ``client``.

    Args:
        context: first argument supplied at invocation; not used in the body.
        handler: second argument supplied at invocation (the local-activation
            cell passes an ``Event``); not used in the body. Inside this
            function the name shadows the function itself.

    Returns:
        None.
    """
    records = []

    for sym, url_string in sym_to_url.items():
        listing_page = get_stock_news_page(url_string)
        article_links = get_internal_article_links(listing_page)
        article_pages = [get_article_page(link) for link in article_links]
        articles = [extract_text(page) for page in article_pages]
        # Scores are requested before the publish times are parsed.
        article_sentiments = get_article_scores(articles, ENDPOINT)
        publish_times = [get_publish_time(page) for page in article_pages]

        rows = zip(articles, publish_times, article_links, article_sentiments)
        for content, published, link, sentiment in rows:
            records.append({
                'content': content,
                'time': published,
                'symbol': sym,
                'link': link,
                'sentiment': sentiment
            })

    # Nothing is written until every symbol has been scraped and scored.
    for record in records:
        client.execute('stream', 'stock_stream', 'put', args={'data': json.dumps(record)})
    

In [None]:
#nuclio: end-code

### Local Activation

In [15]:
# Build an empty nuclio Event to pass to the handler when running it locally.
from nuclio_sdk import Event

event = Event()

In [16]:
# Run the handler once inside the notebook kernel. This performs the real
# scraping, model-server requests and stream writes.
# NOTE(review): `context` is not assigned in any visible cell — presumably it
# is provided by the nuclio-jupyter environment; confirm.
handler(context, event)

getting score for article 1\6
getting score for article 2\6
getting score for article 3\6
getting score for article 4\6
getting score for article 5\6
getting score for article 6\6
getting score for article 1\8
getting score for article 2\8
getting score for article 3\8
getting score for article 4\8
getting score for article 5\8
getting score for article 6\8
getting score for article 7\8
getting score for article 8\8
getting score for article 1\7
getting score for article 2\7
getting score for article 3\7
getting score for article 4\7
getting score for article 5\7
getting score for article 6\7
getting score for article 7\7
getting score for article 1\6
getting score for article 2\6
getting score for article 3\6
getting score for article 4\6
getting score for article 5\6
getting score for article 6\6


### Deploy as Serverless Function

In [21]:
# Deploy the notebook code as the nuclio function "read-news" (-n).
# NOTE(review): `-p stocks` presumably selects the target project — confirm
# against the nuclio-jupyter documentation.
%nuclio deploy -p stocks -n read-news

[nuclio] 2020-08-06 09:22:23,007 (info) Build complete
[nuclio] 2020-08-06 09:22:29,080 (info) Function deploy complete
[nuclio] 2020-08-06 09:22:29,087 done updating read-news, function address: 192.168.224.185:32355
%nuclio: function deployed
