# Scrape news and Analyse sentiments
This notebook shows how to scrape news articles about specific publicly traded companies and send them to our pre-deployed sentiment-analysis model server, which predicts each author's sentiment towards the company in question.

In [1]:
# If the nuclio-jupyter package is not installed, run: !pip install nuclio-jupyter
import nuclio 

## Environment

In [2]:
# Pass the V3IO access key, username and API address on to the function's environment.
# NOTE(review): `-c` presumably applies these to the function configuration only
# (not to the local kernel) — confirm against the nuclio-jupyter documentation.
%nuclio env -c V3IO_ACCESS_KEY=${V3IO_ACCESS_KEY}
%nuclio env -c V3IO_USERNAME=${V3IO_USERNAME}
%nuclio env -c V3IO_API=${V3IO_API}

In [3]:
%%nuclio cmd -c
pip install beautifulsoup4
pip install pandas
pip install v3io_frames

In [4]:
# Base image used when building the function container.
%nuclio config spec.build.baseImage = "python:3.6-jessie"

%nuclio: setting spec.build.baseImage to 'python:3.6-jessie'


## Function

In [5]:
# nuclio: start-code

In [6]:
from bs4 import BeautifulSoup as bs
from urllib.request import Request, urlopen
import requests
import pandas as pd
import v3io_frames as v3f
from unicodedata import normalize
from datetime import datetime
import re
import os
import json

In [7]:
def get_stock_news_page(stock_string):
    """Download and parse the investing.com news page of a stock.

    Args:
        stock_string: URL slug of the stock; it is inserted between
            'https://www.investing.com/equities/' and '-news'.

    Returns:
        The page parsed with BeautifulSoup's 'html.parser'.

    Raises:
        Whatever `urlopen` raises on network or HTTP failures.
    """
    request = Request('https://www.investing.com/equities/' + stock_string + '-news', headers={"User-Agent": "Mozilla/5.0"})
    # Use the response as a context manager so the connection is closed
    # deterministically instead of waiting for garbage collection.
    with urlopen(request) as response:
        content = response.read()
    return bs(content, 'html.parser')

def get_internal_article_links(page):
    """Collect absolute links to the articles listed on a stock news page.

    The second 'mediumTitle1' div on the page is taken as the news container;
    every 'js-article-item articleItem' article inside it contributes the href
    of its first anchor, prefixed with the investing.com host.

    Args:
        page: parsed news page (as returned by get_stock_news_page).

    Returns:
        List of absolute article URLs, in page order.
    """
    base_url = 'https://www.investing.com'
    title_blocks = page.find_all('div', attrs={'class': 'mediumTitle1'})
    news_block = title_blocks[1]  # index 1: the second matching div holds the articles

    links = []
    for item in news_block.find_all('article', attrs={'class': 'js-article-item articleItem'}):
        anchor = item.find('a')
        links.append(base_url + anchor.attrs['href'])
    return links

def get_article_page(article_link):
    """Download and parse a single article page.

    Args:
        article_link: absolute URL of the article.

    Returns:
        The page parsed with BeautifulSoup's 'html.parser'.

    Raises:
        Whatever `urlopen` raises on network or HTTP failures.
    """
    request = Request(article_link, headers={"User-Agent": "Mozilla/5.0"})
    # Use the response as a context manager so the connection is closed
    # deterministically instead of waiting for garbage collection.
    with urlopen(request) as response:
        content = response.read()
    return bs(content, 'html.parser')

def clean_paragraph(paragraph):
    """Strip link/ticker annotations and normalise whitespace in a paragraph.

    Applied in order:
      1. remove any '(http...' run up to the next whitespace,
      2. remove '(UPPER:UPPER)' annotations,
      3. replace every whitespace character and every apostrophe with one
         space each (runs are not collapsed),
    then return the NFKD-normalised result.
    """
    substitutions = (
        (r'\(http\S+', ''),
        (r'\([A-Z]+:[A-Z]+\)', ''),
        (r'[\n\t\s\']', ' '),
    )
    cleaned = paragraph
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return normalize('NFKD', cleaned)

def extract_text(article_page):
    """Return the cleaned body text of an article page.

    Looks up the 'WYSIWYG articlePage' div, cleans every <p> inside it except
    the last one with clean_paragraph, and joins the results with newlines.
    """
    body = article_page.find('div', attrs={'class': 'WYSIWYG articlePage'})
    cleaned_paragraphs = []
    # The final paragraph is deliberately left out, as in the original slice [:-1].
    for paragraph in body.find_all('p')[:-1]:
        cleaned_paragraphs.append(clean_paragraph(paragraph.get_text()))
    return '\n'.join(cleaned_paragraphs)

def get_publish_time(article_page):
    """Return the article timestamp as a 'YYYY-MM-DD HH:MM:SS' string.

    The value is read from the 'content' attribute of the page's
    <meta itemprop="dateModified"> tag, parsed with the format
    '%Y-%m-%d %H:%M:%S' and converted back to a string.

    Raises:
        ValueError: if the attribute does not match the expected format.
    """
    meta_tag = article_page.find('meta', attrs={'itemprop': 'dateModified'})
    raw_timestamp = meta_tag.get_attribute_list('content')[0]
    parsed = datetime.strptime(raw_timestamp, '%Y-%m-%d %H:%M:%S')
    return str(parsed)

def get_score(paragraph_scores):
    """Average the per-paragraph scores after shifting each one down by 1.

    NOTE(review): presumably the model emits class labels 0/1/2, so the result
    lies in [-1, 1] — confirm against the serving model.

    Raises:
        ZeroDivisionError: if paragraph_scores is empty.
    """
    shifted_total = 0
    for score in paragraph_scores:
        shifted_total += score - 1
    return shifted_total / len(paragraph_scores)

def get_article_scores(articles, endpoint, logger=None):
    """Score each article with the sentiment model server.

    Every article is split on newlines, sent as the 'instances' of a PUT
    request to '<endpoint>/bert_classifier_v1/predict', and the returned
    per-paragraph scores are reduced to a single value with get_score.

    Args:
        articles: list of article texts (paragraphs separated by '\\n').
        endpoint: base URL of the model server.
        logger: optional logger exposing `.info`. When omitted, the logger of
            a module-level `context` is used if one exists; otherwise a
            standard-library logger is used.

    Returns:
        List with one score per article, in input order.

    Raises:
        requests.HTTPError: if the model server answers with an error status.
    """
    import logging  # local import keeps this block self-contained

    if logger is None:
        # The original read a module-level `context` unconditionally, which
        # raises NameError wherever no such global exists. Look it up safely
        # and fall back to stdlib logging.
        ctx = globals().get('context')
        logger = ctx.logger if ctx is not None else logging.getLogger(__name__)

    predict_url = endpoint + '/bert_classifier_v1/predict'
    scores = []
    for i, article in enumerate(articles):
        logger.info(f'getting score for article {i + 1}\\{len(articles)}')
        event_data = {'instances': article.split('\n')}
        # NOTE(review): json.dumps combined with `json=` encodes the payload
        # twice; kept as-is because the serving endpoint may rely on it — confirm.
        resp = requests.put(predict_url, json=json.dumps(event_data))
        # Fail with a clear HTTP error instead of an obscure parsing error.
        resp.raise_for_status()
        scores.append(get_score(json.loads(resp.text)))
    return scores

In [8]:
def init_context(context):
    """Attach the V3IO client, target tables and scraping config to `context`.

    Sets on `context`: `v3c`, `stocks_stream`, `stocks_tsdb`,
    `sentiment_model_endpoint` and `sym_to_url`, and creates the stream and
    TSDB table through the client (with `if_exists=1`).
    Environment variables read, each with a default: V3IO_CONTAINER,
    STOCKS_STREAM, STOCKS_SENTIMENT_TSDB_TABLE, SENTIMENT_MODEL_ENDPOINT.
    """
    # V3IO frames client
    context.v3c = v3f.Client('framesd:8081', container=os.getenv('V3IO_CONTAINER', 'bigdata'))

    # Stream that receives one record per scraped article
    context.stocks_stream = os.getenv('STOCKS_STREAM', 'stocks/stocks_stream')
    context.v3c.create(backend='stream', table=context.stocks_stream, if_exists=1)

    # TSDB table that receives the sentiment values
    context.stocks_tsdb = os.getenv('STOCKS_SENTIMENT_TSDB_TABLE', 'stocks/stocks_sentiment_tsdb')
    context.v3c.create(backend='tsdb', table=context.stocks_tsdb, rate='1/s', if_exists=1)

    # Supply the endpoint provided at the end of execution of 00-deploy-sentiment-model.ipynb.
    context.sentiment_model_endpoint = os.getenv(
        'SENTIMENT_MODEL_ENDPOINT',
        'http://nuclio-stocks-sentiment-analysis-serving:8080',
    )

    # Ticker symbol -> investing.com URL slug
    context.sym_to_url = {
        'GOOGL': 'google-inc',
        'MSFT': 'microsoft-corp',
        'AMZN': 'amazon-com-inc',
        'AAPL': 'apple-computer-inc',
    }

In [9]:
def handler(context, handler):
    """Scrape news for every configured symbol, score it and persist the results.

    For each (symbol, URL slug) pair in `context.sym_to_url` the news page and
    its linked articles are downloaded, the article texts are scored by the
    sentiment model, and one JSON record per article is put on the stocks
    stream. After all symbols are processed, the sentiments are written to the
    TSDB table, indexed by (time, symbol).

    Args:
        context: object prepared by init_context (needs `logger`, `v3c`,
            `sym_to_url`, `stocks_stream`, `stocks_tsdb`,
            `sentiment_model_endpoint`).
        handler: second positional argument supplied by the caller (the
            notebook passes an Event); it is not used in the body. The name is
            kept for interface compatibility even though it shadows this
            function's own name.
    """
    # Columns of the TSDB frame. They are appended together, once per article,
    # so they stay aligned. (The original also accumulated article contents
    # and links that were never read; those were removed.)
    syms = []
    times = []
    sentiments = []

    for sym, url_string in context.sym_to_url.items():
        context.logger.info(f'Getting news about {sym}')
        news_page = get_stock_news_page(url_string)
        article_links = get_internal_article_links(news_page)
        article_pages = [get_article_page(link) for link in article_links]
        articles = [extract_text(article_page) for article_page in article_pages]
        curr_sentiments = get_article_scores(articles, context.sentiment_model_endpoint)
        curr_times = [get_publish_time(article_page) for article_page in article_pages]

        for article, link, sentiment, time in zip(articles, article_links, curr_sentiments, curr_times):
            record = {
                'content': article,
                'time': time,
                'symbol': sym,
                'link': link,
                'sentiment': sentiment
            }
            context.v3c.execute('stream', context.stocks_stream, 'put', args={'data': json.dumps(record)})

            syms.append(sym)
            times.append(time)
            sentiments.append(sentiment)

    if sentiments:
        df = pd.DataFrame.from_dict({'sentiment': sentiments,
                                     'time': times,
                                     'symbol': syms})
        df = df.set_index(['time', 'symbol'])
        # Times were collected as strings; convert the first index level to datetimes.
        df.index = df.index.set_levels([pd.to_datetime(df.index.levels[0]), df.index.levels[1]])
        df = df.sort_index(level=0, axis=0)
        context.logger.debug_with('writing data to TSDB', df=df)
        context.v3c.write(backend='tsdb', table=context.stocks_tsdb, dfs=df)

In [10]:
# nuclio: end-code

## Test locally

In [11]:
# Prepare the local context (client, stream, TSDB table, endpoint, symbol map).
# NOTE(review): `context` is not defined in any cell of this notebook —
# presumably injected by `import nuclio`; confirm.
init_context(context)

In [12]:
# Build a default Event to pass as the handler's second argument.
from nuclio import Event
event = Event()

In [13]:
# Run one full scrape -> score -> write cycle locally.
# This needs network access and a reachable sentiment model endpoint.
handler(context, event)

Python> 2020-09-23 13:11:11,674 [info] Getting news about GOOGL
Python> 2020-09-23 13:11:24,444 [info] getting score for article 1\8
Python> 2020-09-23 13:11:24,604 [info] getting score for article 2\8
Python> 2020-09-23 13:11:24,762 [info] getting score for article 3\8
Python> 2020-09-23 13:11:25,004 [info] getting score for article 4\8
Python> 2020-09-23 13:11:25,104 [info] getting score for article 5\8
Python> 2020-09-23 13:11:25,221 [info] getting score for article 6\8
Python> 2020-09-23 13:11:25,340 [info] getting score for article 7\8
Python> 2020-09-23 13:11:25,533 [info] getting score for article 8\8
Python> 2020-09-23 13:11:25,697 [info] Getting news about MSFT
Python> 2020-09-23 13:11:31,048 [info] getting score for article 1\5
Python> 2020-09-23 13:11:31,469 [info] getting score for article 2\5
Python> 2020-09-23 13:11:31,630 [info] getting score for article 3\5
Python> 2020-09-23 13:11:31,945 [info] getting score for article 4\5
Python> 2020-09-23 13:11:32,335 [info] gettin

## Deploy to cluster

In [14]:
from mlrun import code_to_function

# Environment variables for the deployed function; init_context reads them with os.getenv.
environment_variables = {'V3IO_CONTAINER': 'bigdata',
                         'STOCKS_STREAM': 'stocks/stocks_stream',
                         'SENTIMENT_MODEL_ENDPOINT': 'http://nuclio-stocks-sentiment-analysis-serving:8080'}

# Convert the notebook code into a nuclio function whose entry point is `handler`.
fn = code_to_function('read-news',
                      kind='nuclio',
                      handler='handler')
# Invoke the function periodically through a cron trigger (interval '300s').
fn.add_trigger('cron', nuclio.triggers.CronTrigger(interval='300s'))
# Set the env vars BEFORE exporting, so the spec is saved after they are
# attached to the function (the original exported first, then set them).
fn.set_envs(environment_variables)
fn.export('04-read-news.yaml')

> 2020-09-23 13:11:48,414 [info] function spec saved to path: 04-read-news.yaml


<mlrun.runtimes.function.RemoteRuntime at 0x7f73c49341d0>

In [15]:
# Build and deploy the function under the 'stocks' project; the cell output shows the function address.
fn.deploy(project='stocks')

> 2020-09-23 13:11:49,335 [info] deploy started
[nuclio] 2020-09-23 13:11:56,468 (info) Build complete
[nuclio] 2020-09-23 13:12:02,530 (info) Function deploy complete
[nuclio] 2020-09-23 13:12:02,539 done creating stocks-read-news, function address: 3.12.231.36:30957


'http://3.12.231.36:30957'