# Scrape news and Analyse sentiments
This notebook scrapes news articles about specific publicly traded companies and sends them to a pre-deployed sentiment-analysis model server, which predicts each author's sentiment towards the company the article is about.

In [147]:
# if the nuclio-jupyter package is not installed run !pip install nuclio-jupyter
import nuclio 
import mlrun

## Environment

In [148]:
%nuclio env -c V3IO_ACCESS_KEY=${V3IO_ACCESS_KEY}
%nuclio env -c V3IO_USERNAME=${V3IO_USERNAME}
%nuclio env -c V3IO_API=${V3IO_API}

In [149]:
%%nuclio cmd -c
pip install beautifulsoup4
pip install pandas
pip install v3io_frames

In [150]:
%%nuclio config 
kind = "nuclio"
spec.build.baseImage = "mlrun/ml-models"

%nuclio: setting kind to 'nuclio'
%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'


## Function

In [151]:
# nuclio: start-code

In [152]:
from bs4 import BeautifulSoup as bs
from urllib.request import Request, urlopen
import requests
import pandas as pd
import v3io_frames as v3f
from unicodedata import normalize
from datetime import datetime
import re
import os
import mlrun.feature_store as fs
import mlrun

In [153]:
def get_stock_news_page(stock_string):
    """Download and parse the investing.com news page of one equity.

    :param stock_string: URL slug of the equity; it is inserted between
                         ``/equities/`` and ``-news`` in the page address.
    :returns: the page parsed by BeautifulSoup with the ``html.parser`` backend.
    :raises urllib.error.URLError: propagated from ``urlopen`` when the page
                                   cannot be fetched.
    """
    url = 'https://www.investing.com/equities/' + stock_string + '-news'
    # An explicit browser-like User-Agent is sent
    # (presumably the site rejects urllib's default one -- TODO confirm).
    request = Request(url, headers={"User-Agent": "Mozilla/5.0"})
    # The context manager closes the HTTP response (and its socket) as soon as
    # the body is read, instead of leaving that to garbage collection.
    with urlopen(request) as response:
        content = response.read()
    return bs(content, 'html.parser')

def get_internal_article_links(page):
    """Collect absolute links to the articles listed on a parsed news page.

    The articles are looked up inside the second ``div`` of class
    ``mediumTitle1`` found on the page.

    :param page: parsed news page (as returned by ``get_stock_news_page``).
    :returns: list of article URLs, each prefixed with the investing.com host.
    :raises IndexError: if the page holds fewer than two ``mediumTitle1`` divs.
    """
    site_root = 'https://www.investing.com'
    title_sections = page.find_all('div', attrs={'class': 'mediumTitle1'})
    news_section = title_sections[1]
    article_tags = news_section.find_all('article', attrs={'class': 'js-article-item articleItem'})

    links = []
    for article_tag in article_tags:
        anchor = article_tag.find('a')
        links.append(site_root + anchor.attrs['href'])
    return links

def get_article_page(article_link):
    """Download and parse a single article page.

    :param article_link: absolute URL of the article.
    :returns: the page parsed by BeautifulSoup with the ``html.parser`` backend.
    :raises urllib.error.URLError: propagated from ``urlopen`` when the page
                                   cannot be fetched.
    """
    # An explicit browser-like User-Agent is sent
    # (presumably the site rejects urllib's default one -- TODO confirm).
    request = Request(article_link, headers={"User-Agent": "Mozilla/5.0"})
    # The context manager closes the HTTP response (and its socket) as soon as
    # the body is read, instead of leaving that to garbage collection.
    with urlopen(request) as response:
        content = response.read()
    return bs(content, 'html.parser')

def clean_paragraph(paragraph):
    """Strip markup noise from one paragraph of article text.

    The substitutions are applied in order:
      1. drop parenthesised links (``(http...`` up to the next whitespace),
      2. drop parenthesised ``(EXCHANGE:TICKER)`` style tags (upper-case only),
      3. turn every whitespace character and apostrophe into a single space.
    The result is then NFKD-normalized.

    :param paragraph: raw paragraph text.
    :returns: the cleaned, NFKD-normalized text.
    """
    substitutions = (
        (r'\(http\S+', ''),
        (r'\([A-Z]+:[A-Z]+\)', ''),
        (r'[\n\t\s\']', ' '),
    )
    cleaned = paragraph
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return normalize('NFKD', cleaned)

def extract_text(article_page):
    """Extract the cleaned body text of a parsed article page.

    Paragraphs are read from the ``div`` of class ``WYSIWYG articlePage``;
    the last ``<p>`` element is skipped (presumably a non-article footer --
    TODO confirm) and the rest are cleaned with ``clean_paragraph``.

    :param article_page: parsed article page (as returned by ``get_article_page``).
    :returns: the cleaned paragraphs joined by newline characters.
    """
    body = article_page.find('div', attrs={'class': 'WYSIWYG articlePage'})
    cleaned_paragraphs = []
    for paragraph_tag in body.find_all('p')[:-1]:
        cleaned_paragraphs.append(clean_paragraph(paragraph_tag.get_text()))
    return '\n'.join(cleaned_paragraphs)

import json
def get_publish_time(article):
    """Read the ``dateModified`` timestamp from an article's JSON-LD metadata.

    :param article: parsed article page containing a
                    ``<script type="application/ld+json">`` element.
    :returns: the timestamp formatted as ``%Y-%m-%d %H:%M:%S``.
    :raises ValueError: if ``dateModified`` does not match that format.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    script_tag = article.find('script', {"type": "application/ld+json"})
    metadata = json.loads(str(script_tag.contents[0]))
    # Parsing and re-formatting validates the value and normalizes its padding.
    modified_at = datetime.strptime(metadata["dateModified"], timestamp_format)
    return modified_at.strftime(timestamp_format)

def get_score(paragraph_scores):
    return sum([score - 1 for score in paragraph_scores['outputs']]) / len(paragraph_scores)

def get_article_scores(context, articles, endpoint):
    """Score every article by sending its paragraphs to the model server.

    :param context: function context; only ``context.logger`` is used.
    :param articles: list of article texts whose paragraphs are separated by
                     newline characters.
    :param endpoint: URL of the sentiment model server.
    :returns: list with one ``get_score`` result per article, in input order.
    """
    total = len(articles)
    scores = []
    for position, article in enumerate(articles, start=1):
        context.logger.info(f'getting score for article {position}\\{total}')
        # NOTE(review): the payload is serialized with json.dumps and then
        # passed through ``json=``, so it is JSON-encoded twice -- confirm the
        # model server expects a JSON string rather than a JSON object.
        payload = json.dumps({'inputs': article.split('\n')})
        response = requests.put(endpoint, json=payload)
        scores.append(get_score(json.loads(response.text)))
    return scores

def construct_dataframe(sentiments, items,times):
    """Build the per-symbol sentiment frame that is ingested to the feature set.

    :param sentiments: one sentiment value per symbol.
    :param items: iterable of indexable entries whose first element is the
                  ticker symbol (e.g. ``dict.items()`` of symbol -> URL slug).
    :param times: one timestamp per symbol.
    :returns: DataFrame with columns ``symbol``, ``sentiment`` and
              ``last_reaction``.
    """
    symbols = []
    for entry in items:
        symbols.append(entry[0])
    columns = {
        "symbol": symbols,
        "sentiment": sentiments,
        "last_reaction": times,
    }
    return pd.DataFrame(columns)

In [154]:
def init_context(context):
    """Prepare everything ``handler`` reads from the function context.

    Sets the MLRun project, declares and seeds the ``news`` feature set,
    creates the v3io frames client together with the stream and TSDB tables,
    and stores the model endpoint, the symbol -> URL-slug map and the KV table
    path on ``context``.

    Environment variables read: ``PROJECT_NAME``, ``V3IO_USERNAME``,
    ``V3IO_FRAMESD``, ``TOKEN``, ``V3IO_CONTAINER``, ``STOCKS_STREAM``,
    ``STOCKS_TSDB_TABLE``, ``SENTIMENT_MODEL_ENDPOINT`` and ``STOCKS_KV``.

    :param context: function context; attributes are attached to it in place.
    :raises TypeError: if ``V3IO_USERNAME`` is unset, because the default
                       values are built by string concatenation with it.
    """
    def user_path(suffix):
        # Looked up on every call, i.e. exactly where each default is built.
        return os.getenv('V3IO_USERNAME') + suffix

    context.logger.info("init news reader context")
    context.PROJECT_NAME = os.getenv('PROJECT_NAME', 'stocks-' + os.getenv('V3IO_USERNAME'))
    mlrun.set_environment(project = context.PROJECT_NAME)

    # Declare the feature set, keyed by ticker symbol
    context.stock_feature_set = fs.FeatureSet("news", entities=[fs.Entity("symbol")])

    # Add min/max aggregation of the sentiment column
    context.stock_feature_set.add_aggregation("sentiments", "sentiment", ["min", "max"], ["1h"], "10m")

    # Initialize the feature set with dummy data that will be overwritten later on
    news_dummy = pd.DataFrame({
        "symbol": ['GOOGL', 'MSFT', 'AMZN', 'AAPL', 'INTC'],
        "sentiment": [0] * 5,
        "last_reaction": [0] * 5,
    })
    fs.ingest(context.stock_feature_set, news_dummy, infer_options=fs.InferOptions.default())

    # v3io frames client used for the stream, TSDB and KV writes
    context.v3c = v3f.Client(
        os.getenv('V3IO_FRAMESD', 'framesd:8081'),
        container=os.getenv('V3IO_CONTAINER', 'users'),
        token=os.getenv('TOKEN', ''),
    )

    context.stocks_stream = os.getenv('STOCKS_STREAM', user_path('/stocks/stocks_stream'))
    context.v3c.create(backend='stream', table=context.stocks_stream, if_exists=1)

    context.stocks_tsdb = os.getenv('STOCKS_TSDB_TABLE', user_path('/stocks/stocks_tsdb'))
    context.v3c.create(backend='tsdb', table=context.stocks_tsdb, rate='1/s', if_exists=1)

    # The default '' should be replaced by the model endpoint
    context.sentiment_model_endpoint = os.getenv('SENTIMENT_MODEL_ENDPOINT', '')
    context.logger.info(f"set sentiment_model_endpoint {context.sentiment_model_endpoint}")

    # Ticker symbol -> investing.com URL slug
    context.sym_to_url = {
        'GOOGL': 'google-inc',
        'MSFT': 'microsoft-corp',
        'AMZN': 'amazon-com-inc',
        'AAPL': 'apple-computer-inc',
        'INTC': 'intel-corp',
    }
    context.stocks_kv = os.getenv('STOCKS_KV', user_path('/stocks/stocks_kv'))
    context.logger.info('end init context')

In [203]:
def handler(context, event):
    """Scrape, score and persist the news of every configured symbol.

    For each entry of ``context.sym_to_url`` the news page and its articles
    are downloaded, every article is scored by the sentiment model server and
    the results are written to:
      * the stocks stream (one record per article),
      * the stocks KV table (sentiment and time of the last listed article),
      * the ``news`` feature set (one row per symbol that had articles),
      * the stocks TSDB table (all article sentiments, indexed by time/symbol).

    A symbol for which no article was found is skipped, so it never picks up
    the values collected for a previously processed symbol.

    :param context: function context prepared by ``init_context``.
    :param event: triggering event; not used.
    :returns: None
    """
    context.logger.info(f'Getting news about {context.sym_to_url}')
    syms = []
    times = []
    sentiments = []
    # Per-symbol "last article" values; only symbols that actually produced
    # articles are recorded so the three lists always stay aligned.
    scored_items = []
    last_ticker_sentiment = []
    last_ticker_time = []
    for sym, url_string in context.sym_to_url.items():
        context.logger.info(f'Getting news about {sym}')
        news_page = get_stock_news_page(url_string)
        article_links = get_internal_article_links(news_page)
        article_pages = [get_article_page(link) for link in article_links]
        articles = [extract_text(article_page) for article_page in article_pages]
        curr_sentiments = get_article_scores(context, articles, context.sentiment_model_endpoint)
        curr_times = [get_publish_time(article_page) for article_page in article_pages]

        if not curr_sentiments:
            # Indexing the cumulative lists with [-1] here would reuse the
            # previous symbol's values (or fail for the first symbol).
            context.logger.info(f'No articles found for {sym}, skipping')
            continue

        sentiments += curr_sentiments
        times += curr_times
        for article, link, sentiment, published in zip(articles, article_links, curr_sentiments, curr_times):
            record = {
                'content': article,
                'time': published,
                'symbol': sym,
                'link': link,
                'sentiment': sentiment
            }
            context.v3c.execute('stream', context.stocks_stream, 'put', args={'data': json.dumps(record)})
            syms.append(sym)

        # Values of the last article listed for this symbol.
        latest_sentiment = curr_sentiments[-1]
        latest_time = curr_times[-1]
        context.v3c.execute('kv', context.stocks_kv, command='update', args={'key': sym,
                                                                             'expression': f"SET sentiment='{latest_sentiment}';last_reaction='{latest_time}'"})
        scored_items.append((sym, url_string))
        last_ticker_sentiment.append(latest_sentiment)
        last_ticker_time.append(latest_time)

    if not sentiments:
        context.logger.info('No news articles were collected, nothing to ingest')
        return

    stock_sent = construct_dataframe(last_ticker_sentiment, scored_items, last_ticker_time)
    context.logger.info('Ingesting new information to feature store')
    fs.ingest(context.stock_feature_set, stock_sent, infer_options=fs.InferOptions.default())

    df = pd.DataFrame.from_dict({'sentiment': sentiments,
                                 'time': times,
                                 'symbol': syms})
    df = df.set_index(['time', 'symbol'])
    # The time level holds strings; convert it to datetimes before writing.
    df.index = df.index.set_levels([pd.to_datetime(df.index.levels[0]), df.index.levels[1]])
    df = df.sort_index(level=0, axis=0)
    context.v3c.write(backend='tsdb', table=context.stocks_tsdb, dfs=df)

In [198]:
# def handler(context,event):
#     update_news(context,event)
#     return 'done'

In [199]:
# nuclio: end-code

## Test locally

In [200]:
init_context(context)

Python> 2021-04-05 15:23:06,750 [info] init news reader context
Python> 2021-04-05 15:23:06,757 [info] set sentiment_model_endpoint http://default-tenant.app.dev8.lab.iguazeng.com:32556
Python> 2021-04-05 15:23:06,758 [info] end init context


In [201]:
from nuclio import Event
event = Event()

In [202]:
handler(context, event)

Python> 2021-04-05 15:23:07,950 [info] Getting news about {'GOOGL': 'google-inc', 'MSFT': 'microsoft-corp', 'AMZN': 'amazon-com-inc', 'AAPL': 'apple-computer-inc', 'INTC': 'intel-corp'}
Python> 2021-04-05 15:23:07,951 [info] Getting news about GOOGL
Python> 2021-04-05 15:23:21,318 [info] getting score for article 1\5
Python> 2021-04-05 15:23:21,947 [info] getting score for article 2\5
Python> 2021-04-05 15:23:22,533 [info] getting score for article 3\5
Python> 2021-04-05 15:23:22,738 [info] getting score for article 4\5
Python> 2021-04-05 15:23:23,448 [info] getting score for article 5\5
Python> 2021-04-05 15:23:23,862 [info] Getting news about MSFT
Python> 2021-04-05 15:23:40,439 [info] getting score for article 1\7
Python> 2021-04-05 15:23:41,293 [info] getting score for article 2\7
Python> 2021-04-05 15:23:41,642 [info] getting score for article 3\7
Python> 2021-04-05 15:23:42,000 [info] getting score for article 4\7
Python> 2021-04-05 15:23:42,489 [info] getting score for article 5

## Deploy to cluster

In [123]:
# Convert the code between the "nuclio: start-code" / "nuclio: end-code"
# markers into a deployable function, export its spec, then configure this
# specific deployment.
from mlrun import code_to_function
# NOTE: `os` is not imported in this cell; it comes from the import cell of
# the function code above, so that cell must have been executed first.
project_name = "stocks-" + os.getenv('V3IO_USERNAME')
# Export bare function
# (the YAML is written before the trigger and environment below are added,
# so the exported spec does not carry the deployment-specific settings)
fn = code_to_function('read-news',
                      handler='handler')
fn.export('02-read-news.yaml')

# Set parameters for current deployment
# Cron trigger configured with '10s' (presumably a 10-second interval --
# TODO confirm; each invocation scrapes every configured symbol).
fn.add_trigger('cron', nuclio.triggers.CronTrigger('10s'))
# Environment read by init_context. SENTIMENT_MODEL_ENDPOINT is a
# cluster-specific address and must point at your own model server.
fn.set_envs({'V3IO_CONTAINER': 'users',
             'STOCKS_STREAM': os.getenv('V3IO_USERNAME') + '/stocks/stocks_stream',
             'STOCKS_TSDB_TABLE': os.getenv('V3IO_USERNAME') + '/stocks/stocks_tsdb',
             'SENTIMENT_MODEL_ENDPOINT': 'http://default-tenant.app.dev8.lab.iguazeng.com:31772',
             'PROJECT_NAME' : project_name})
# Limit the function to a single replica.
fn.spec.max_replicas = 1

> 2021-03-25 11:32:58,659 [info] function spec saved to path: 02-read-news.yaml


In [124]:
addr = fn.deploy(project=project_name)

> 2021-03-25 11:32:58,666 [info] Starting remote function deploy
2021-03-25 11:32:58  (info) Deploying function
2021-03-25 11:32:58  (info) Building
2021-03-25 11:32:58  (info) Staging files and preparing base images
2021-03-25 11:32:58  (info) Building processor image
2021-03-25 11:33:00  (info) Build complete
2021-03-25 11:33:06  (info) Function deploy complete
> 2021-03-25 11:33:06,437 [info] function deployed, address=default-tenant.app.dev8.lab.iguazeng.com:30566


In [125]:
!curl {addr}