# Scrape news and Analyse sentiments
This notebook shows how to scrape news articles about specific publicly traded companies and send them to our pre-deployed sentiment-analysis model server, which predicts each author's sentiment towards the company.

In [17]:
# if the nuclio-jupyter package is not installed run !pip install nuclio-jupyter
import nuclio 
import mlrun

## Environment

In [18]:
%nuclio env -c V3IO_ACCESS_KEY=${V3IO_ACCESS_KEY}
%nuclio env -c V3IO_USERNAME=${V3IO_USERNAME}
%nuclio env -c V3IO_API=${V3IO_API}

In [19]:
%%nuclio cmd -c
pip install beautifulsoup4
pip install pandas
pip install v3io_frames

In [20]:
%%nuclio config 
kind = "nuclio"
spec.build.baseImage = "mlrun/ml-models"

%nuclio: setting kind to 'nuclio'
%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'


## Function

In [21]:
# nuclio: start-code

In [22]:
from bs4 import BeautifulSoup as bs
from urllib.request import Request, urlopen
import requests
import pandas as pd
import v3io_frames as v3f
from unicodedata import normalize
from datetime import datetime
import re
import os
import mlrun.feature_store as fs
import mlrun

In [23]:
def get_stock_news_page(stock_string):
    """Download and parse the investing.com news page of one stock.

    :param stock_string: URL slug of the stock (e.g. ``'google-inc'``); it is
                         inserted into
                         ``https://www.investing.com/equities/<slug>-news``.
    :returns: the page parsed by BeautifulSoup with ``html.parser``.
    """
    # A browser-like User-Agent is sent explicitly
    # (presumably the site rejects urllib's default agent - confirm).
    request = Request('https://www.investing.com/equities/' + stock_string + '-news', headers={"User-Agent": "Mozilla/5.0"})
    # Use the response as a context manager so the connection is released
    # deterministically, including when read() raises part-way through.
    with urlopen(request) as response:
        content = response.read()
    return bs(content, 'html.parser')

def get_internal_article_links(page):
    """Collect absolute URLs of the articles listed on a stock news page.

    Takes the second ``div`` with class ``mediumTitle1`` on the page and, for
    every ``article`` element with class ``js-article-item articleItem``
    inside it, returns the ``href`` of its first ``a`` tag prefixed with the
    investing.com host.

    :param page: parsed news page exposing BeautifulSoup's ``find_all``.
    :returns: list of absolute article URLs, in page order.
    :raises IndexError: if the page has fewer than two ``mediumTitle1`` divs.
    """
    title_sections = page.find_all('div', attrs={'class': 'mediumTitle1'})
    news_section = title_sections[1]  # index 1 -> the second matching div
    links = []
    for item in news_section.find_all('article', attrs={'class': 'js-article-item articleItem'}):
        anchor = item.find('a')
        links.append('https://www.investing.com' + anchor.attrs['href'])
    return links

def get_article_page(article_link):
    """Download and parse a single article page.

    :param article_link: absolute URL of the article.
    :returns: the page parsed by BeautifulSoup with ``html.parser``.
    """
    # A browser-like User-Agent is sent explicitly
    # (presumably the site rejects urllib's default agent - confirm).
    request = Request(article_link, headers={"User-Agent": "Mozilla/5.0"})
    # Use the response as a context manager so the connection is released
    # deterministically, including when read() raises part-way through.
    with urlopen(request) as response:
        content = response.read()
    return bs(content, 'html.parser')

def clean_paragraph(paragraph):
    """Strip link/ticker annotations and normalise whitespace in a paragraph.

    Applied in order:
      1. remove every ``(http...`` run up to the next whitespace,
      2. remove ``(UPPER:UPPER)`` groups such as ``(NASDAQ:AAPL)``,
      3. replace each whitespace character or apostrophe with one space,
    then return the result in Unicode NFKD form.

    :param paragraph: raw paragraph text.
    :returns: the cleaned paragraph string.
    """
    substitutions = (
        (r'\(http\S+', ''),
        (r'\([A-Z]+:[A-Z]+\)', ''),
        (r'[\n\t\s\']', ' '),
    )
    cleaned = paragraph
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return normalize('NFKD', cleaned)

def extract_text(article_page):
    """Return the cleaned body text of an article page.

    Finds the ``div`` with class ``WYSIWYG articlePage``, cleans every ``p``
    inside it except the last one with ``clean_paragraph`` and joins the
    results with newlines.

    :param article_page: parsed article page exposing BeautifulSoup's ``find``.
    :returns: the article text, one cleaned paragraph per line.
    """
    body = article_page.find('div', attrs={'class': 'WYSIWYG articlePage'})
    cleaned = []
    # The [:-1] slice deliberately leaves out the final <p> element.
    for paragraph in body.find_all('p')[:-1]:
        cleaned.append(clean_paragraph(paragraph.get_text()))
    return '\n'.join(cleaned)

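# NOTE: the commented-out version below no longer matches the website's structure.
# It is kept for reference only; the get_publish_time() defined after it is the one in use.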
# def get_publish_time(article_page):
#     details = article_page.find('meta', attrs={'itemprop': 'dateModified'})
#     publish_date = details.get_attribute_list('content')[0]
#     return str(datetime.strptime(publish_date, '%Y-%m-%d %H:%M:%S'))

import json
def get_publish_time(article):
    """Read the article's ``dateModified`` from its JSON-LD metadata.

    The first ``script`` tag of type ``application/ld+json`` is decoded as
    JSON and its ``dateModified`` value is parsed and re-formatted with
    ``%Y-%m-%d %H:%M:%S``, so the result is always zero-padded.

    :param article: parsed article page exposing BeautifulSoup's ``find``.
    :returns: the modification time as a ``'YYYY-MM-DD HH:MM:SS'`` string.
    :raises ValueError: if ``dateModified`` does not match the format.
    """
    time_format = '%Y-%m-%d %H:%M:%S'
    ld_json_tag = article.find('script', {"type" : "application/ld+json"})
    metadata = json.loads(str(ld_json_tag.contents[0]))
    modified_at = datetime.strptime(metadata["dateModified"], time_format)
    return modified_at.strftime(time_format)

def get_score(paragraph_scores):
    """Average the per-paragraph predictions of one article.

    Each prediction is shifted by -1 before averaging (presumably mapping
    class labels 0/1/2 to -1/0/+1 - confirm against the model).

    The mean is taken over the number of predictions in ``'outputs'``.
    Dividing by ``len(paragraph_scores)`` would use the number of keys in
    the response dict, which has nothing to do with the paragraph count.

    :param paragraph_scores: decoded model response; its ``'outputs'`` entry
                             holds one numeric prediction per paragraph.
    :returns: mean of ``prediction - 1`` over all predictions, or ``0.0``
              when ``'outputs'`` is empty.
    :raises KeyError: if the response has no ``'outputs'`` entry.
    """
    outputs = paragraph_scores['outputs']
    if not outputs:
        # Nothing was scored: report a neutral score instead of dividing by zero.
        return 0.0
    return sum(score - 1 for score in outputs) / len(outputs)

def get_article_scores(context, articles, endpoint):
    """Score every article by sending its paragraphs to the model endpoint.

    Each article is split on newlines, sent with an HTTP PUT as
    ``{'inputs': [...paragraphs...]}`` and the decoded response is reduced to
    a single number by ``get_score``.

    :param context: object whose ``logger.info`` is used for progress logs.
    :param articles: list of article texts, one paragraph per line.
    :param endpoint: URL of the sentiment model server.
    :returns: list with one score per article, in input order.
    """
    total = len(articles)
    scores = []
    for position, article in enumerate(articles, start=1):
        context.logger.info(f'getting score for article {position}\\{total}')
        payload = json.dumps({'inputs': article.split('\n')})
        # NOTE(review): ``payload`` is already a JSON string and is passed via
        # ``json=``, which looks like it encodes the body a second time.
        # Confirm the server expects that before changing it.
        response = requests.put(endpoint, json=payload)
        scores.append(get_score(json.loads(response.text)))
    return scores

def construct_dataframe(sentiments, items):
    """Build a two-column frame pairing each symbol with its sentiment.

    :param sentiments: sequence of sentiment values, one per item.
    :param items: iterable of sequences whose first element is the ticker
                  symbol (e.g. the ``items()`` of a symbol-to-URL dict).
    :returns: ``pandas.DataFrame`` with columns ``symbol`` and ``sentiment``.
    """
    symbols = []
    for item in items:
        symbols.append(item[0])
    return pd.DataFrame({"symbol": symbols, "sentiment": sentiments})

In [24]:
def init_context(context):
    """Prepare the nuclio context with every handle the handler relies on.

    Sets the following attributes on ``context``: ``PROJECT_NAME``,
    ``stock_feature_set``, ``v3c``, ``stocks_stream``, ``stocks_tsdb``,
    ``sentiment_model_endpoint``, ``sym_to_url`` and ``stocks_kv``.
    It also sets the mlrun project and issues ``create`` calls for the
    stream and the TSDB table.

    Most values can be overridden through environment variables; the
    defaults are the literals below.

    NOTE: the username-based defaults are built eagerly, so
    ``V3IO_USERNAME`` must be set even when the overriding variable is
    present (``str + None`` raises ``TypeError``).

    :param context: nuclio function context (needs ``logger``).
    """
    context.logger.info("init news reader context")

    default_project = 'stocks-test-' + os.getenv('V3IO_USERNAME')
    context.PROJECT_NAME = os.getenv('PROJECT_NAME', default_project)
    mlrun.set_environment(project=context.PROJECT_NAME)

    # Feature set "news", keyed by symbol, with min/max aggregations on "sentiment".
    context.stock_feature_set = fs.FeatureSet("news", entities=[fs.Entity("symbol")])
    context.stock_feature_set.add_aggregation("sentiments","sentiment",["min","max"],["1h"],"10m")

    # v3io frames client used for the stream, KV and TSDB calls.
    context.v3c = v3f.Client(
        os.getenv('V3IO_FRAMESD', 'framesd:8081'),
        container=os.getenv('V3IO_CONTAINER', 'users'),
        token=os.getenv('TOKEN', ''),
    )

    default_stream = os.getenv('V3IO_USERNAME') + '/stocks/stocks_stream'
    context.stocks_stream = os.getenv('STOCKS_STREAM', default_stream)
    context.v3c.create(backend='stream', table=context.stocks_stream, if_exists=1)

    default_tsdb = os.getenv('V3IO_USERNAME') + '/stocks/stocks_tsdb'
    context.stocks_tsdb = os.getenv('STOCKS_TSDB_TABLE', default_tsdb)
    context.v3c.create(backend='tsdb', table=context.stocks_tsdb, rate='1/s', if_exists=1)

    # NOTE(review): the fallback URL looks specific to one development
    # environment; deployments are expected to set SENTIMENT_MODEL_ENDPOINT.
    context.sentiment_model_endpoint = os.getenv(
        'SENTIMENT_MODEL_ENDPOINT',
        'http://stocks-test-dani-sentiment-analysis-serving-stocks-test-dani.default-tenant.app.dev8.lab.iguazeng.com',
    )
    context.logger.info(f"set sentiment_model_endpoint {context.sentiment_model_endpoint}")

    # Ticker symbol -> slug used in the investing.com news URL.
    context.sym_to_url = {
        'GOOGL': 'google-inc',
        'MSFT': 'microsoft-corp',
        'AMZN': 'amazon-com-inc',
        'AAPL': 'apple-computer-inc',
    }
    context.stocks_kv = os.getenv('STOCKS_KV', 'stocks/stocks_kv')
    context.logger.info('end init context')

In [28]:
def handler(context, event):
    """Scrape, score and publish the latest news for every tracked symbol.

    For each symbol in ``context.sym_to_url`` the handler:
      1. downloads the news page and the linked article pages,
      2. scores every article through ``context.sentiment_model_endpoint``,
      3. writes one record per article to the stream,
      4. updates the symbol's ``sentiment`` attribute in the KV table with
         the score of the last article in the list.

    Afterwards the per-symbol scores are ingested into the feature set and
    all article scores are written to the TSDB table.

    A symbol that yields no scored articles is skipped. Reading the last
    element of the global score list for such a symbol would publish another
    symbol's score under its name, or raise ``IndexError`` if it came first.

    :param context: nuclio context prepared by ``init_context``.
    :param event: triggering event; not read.
    :returns: ``None``.
    """
    context.logger.info(f'Getting news about {context.sym_to_url}')

    syms = []
    times = []
    sentiments = []
    # (symbol, url) pairs that produced at least one score, kept aligned
    # with last_ticker_sentiment so the feature-store frame has equal columns.
    scored_items = []
    last_ticker_sentiment = []
    for sym, url_string in context.sym_to_url.items():
        context.logger.info(f'Getting news about {sym}')
        news_page = get_stock_news_page(url_string)
        article_links = get_internal_article_links(news_page)
        article_pages = [get_article_page(link) for link in article_links]
        articles = [extract_text(article_page) for article_page in article_pages]
        curr_sentiments = get_article_scores(context, articles, context.sentiment_model_endpoint)
        curr_times = [get_publish_time(article_page) for article_page in article_pages]

        if not curr_sentiments:
            # Nothing to publish for this symbol; never fall back to the
            # score of a previously processed symbol.
            context.logger.info(f'No articles scored for {sym}, skipping')
            continue

        sentiments += curr_sentiments
        times += curr_times
        for article, link, sentiment, publish_time in zip(articles, article_links, curr_sentiments, curr_times):
            record = {
                'content': article,
                'time': publish_time,
                'symbol': sym,
                'link': link,
                'sentiment': sentiment
            }
            context.v3c.execute('stream', context.stocks_stream, 'put', args={'data': json.dumps(record)})
            syms.append(sym)

        # Score of the last article scraped for THIS symbol.
        latest_sentiment = curr_sentiments[-1]
        context.v3c.execute('kv', context.stocks_kv, command='update', args={'key': sym,
                                                                             'expression': f'SET sentiment={latest_sentiment}'})
        last_ticker_sentiment.append(latest_sentiment)
        scored_items.append((sym, url_string))

    if last_ticker_sentiment:
        stock_sent = construct_dataframe(last_ticker_sentiment, scored_items)
        context.logger.info('Ingesting new information to feature store')
        fs.ingest(context.stock_feature_set, stock_sent, infer_options=fs.InferOptions.default())

    if len(sentiments) > 0:
        df = pd.DataFrame.from_dict({'sentiment': sentiments,
                                     'time': times,
                                     'symbol': syms})
        df = df.set_index(['time', 'symbol'])
        # Convert the time level from strings to datetimes before sorting.
        df.index = df.index.set_levels([pd.to_datetime(df.index.levels[0]), df.index.levels[1]])
        df = df.sort_index(level=0, axis=0)
        context.v3c.write(backend='tsdb', table=context.stocks_tsdb, dfs=df)

In [26]:
# nuclio: end-code

## Test locally

In [29]:
# Run the initialisation hook once in the notebook kernel before calling the handler.
# NOTE(review): `context` is not defined in any visible cell - presumably it is
# provided by the nuclio-jupyter extension; confirm on a fresh kernel.
init_context(context)

Python> 2021-03-22 09:28:57,133 [info] init news reader context
Python> 2021-03-22 09:28:57,139 [info] set sentiment_model_endpoint http://stocks-test-dani-sentiment-analysis-serving-stocks-test-dani.default-tenant.app.dev8.lab.iguazeng.com
Python> 2021-03-22 09:28:57,140 [info] end init context


In [30]:
from nuclio import Event
# An empty event is enough here: handler() accepts the event but never reads it.
event = Event()

In [31]:
# Runs one full cycle locally: live HTTP requests to the news site and the model
# endpoint, plus writes to the stream, KV table, feature store and TSDB.
handler(context, event)

Python> 2021-03-22 09:28:57,644 [info] Getting news about {'GOOGL': 'google-inc', 'MSFT': 'microsoft-corp', 'AMZN': 'amazon-com-inc', 'AAPL': 'apple-computer-inc'}
Python> 2021-03-22 09:28:57,645 [info] Getting news about GOOGL
Python> 2021-03-22 09:29:21,325 [info] getting score for article 1\10
Python> 2021-03-22 09:29:21,624 [info] getting score for article 2\10
Python> 2021-03-22 09:29:22,828 [info] getting score for article 3\10
Python> 2021-03-22 09:29:23,088 [info] getting score for article 4\10
Python> 2021-03-22 09:29:23,515 [info] getting score for article 5\10
Python> 2021-03-22 09:29:23,644 [info] getting score for article 6\10
Python> 2021-03-22 09:29:23,913 [info] getting score for article 7\10
Python> 2021-03-22 09:29:25,478 [info] getting score for article 8\10
Python> 2021-03-22 09:29:26,098 [info] getting score for article 9\10
Python> 2021-03-22 09:29:26,256 [info] getting score for article 10\10
Python> 2021-03-22 09:29:26,614 [info] Getting news about MSFT
Python> 

## Deploy to cluster

In [14]:
from mlrun import code_to_function

# Export bare function
# The spec is exported before any trigger or environment variable is added,
# so the YAML file carries none of the deployment-specific settings below.
fn = code_to_function('read-news',
                      handler='handler')
fn.export('02-read-news.yaml')

# Set parameters for current deployment
# Cron trigger created with '10s': each invocation runs handler(), which
# re-scrapes every tracked symbol.
fn.add_trigger('cron', nuclio.triggers.CronTrigger('10s'))
# These variables override the defaults that init_context() reads with os.getenv.
# NOTE(review): PROJECT_NAME is hard-coded to a specific project - confirm it
# is the intended one for this deployment.
fn.set_envs({'V3IO_CONTAINER': 'users',
             'STOCKS_STREAM': os.getenv('V3IO_USERNAME') + '/stocks/stocks_stream',
             'STOCKS_TSDB_TABLE': os.getenv('V3IO_USERNAME') + '/stocks/stocks_tsdb',
             'SENTIMENT_MODEL_ENDPOINT': 'http://nuclio-stocks-sentiment-analysis-serving:8080',
             'PROJECT_NAME' : "stocks-avia"})
# At most one replica is allowed for this function.
fn.spec.max_replicas = 1

> 2020-10-14 07:00:08,593 [info] function spec saved to path: 02-read-news.yaml


In [15]:
# NOTE(review): the function is deployed under project 'stocks', while the
# PROJECT_NAME environment variable set for it is "stocks-avia" - confirm the
# two are meant to differ.
fn.deploy(project='stocks')

> 2020-10-14 07:00:08,600 [info] deploy started
[nuclio] 2020-10-14 07:01:08,879 (info) Build complete
[nuclio] 2020-10-14 07:01:17,040 (info) Function deploy complete
[nuclio] 2020-10-14 07:01:17,048 done updating stocks-read-news, function address: 192.168.224.209:30776


'http://192.168.224.209:30776'