In [1]:
import os

from IPython.display import display, Markdown
from pprint import pprint
import pandas as pd
from fuzzywuzzy import fuzz

from src.scraping import extract_cbc_article_info
from src.nlp import nlp_analysis
from src.urls import cbc_urls

In [2]:
# Show the article URLs imported from src.urls that the cells below iterate over.
cbc_urls

['https://www.cbc.ca/news/business/starbucks-greener-cup-1.5063861',
 'https://www.cbc.ca/news/business/rogers-media-magazines-1.5064054',
 'https://www.cbc.ca/news/business/budget-cmhc-home-buyers-1.5063204',
 'https://www.cbc.ca/news/business/eu-regulators-fine-google-online-ads-1.5063806',
 'https://www.cbc.ca/news/canada/nova-scotia/air-canada-max-8s-grounded-july-1-1.5062354',
 'https://www.cbc.ca/news/business/shoppers-drug-mart-superstore-self-checkout-loblaw-1.5056800',
 'https://www.cbc.ca/news/business/volkswagen-charged-with-defrauding-investors-1.5058925',
 'https://www.cbc.ca/news/technology/facebook-instagram-outage-cause-1.5056807']

### Scrape CBC Website for Articles and Perform NLP

In [None]:
# Build one record per CBC URL: the positional id, then the scraped fields,
# then the NLP fields (later keys win on a name clash, in that order).
articles = []
for article_id, article_url in enumerate(cbc_urls):
    scraped_fields = extract_cbc_article_info(article_url)
    nlp_fields = nlp_analysis(scraped_fields)

    record = {'article_id': article_id}
    record.update(scraped_fields)
    record.update(nlp_fields)
    articles.append(record)


In [None]:
# Pretty-print the full list of per-article records for inspection.
pprint(articles)

### Correlate Articles with Investment Advisor (IA) Holdings

In [None]:
# Load the advisor holdings table from the CSV under data/internal.
holdings_csv_path = os.path.join(
    'data', 'internal', 'investment_advisor_holdings.csv')
investment_advisor_holdings = pd.read_csv(holdings_csv_path)

In [None]:
# Display the holdings table loaded from the CSV.
investment_advisor_holdings

In [None]:
# Minimum fuzz.token_set_ratio score for an organization mentioned in an
# article to count as a match for one of an advisor's held securities.
ORG_MATCH_THRESHOLD = 80

# For every investment advisor, collect the ids of the articles that mention
# at least one organization fuzzily matching one of the advisor's securities.
IA_articles = []

investment_advisors = investment_advisor_holdings['investment_advisor'].unique()
for ia in investment_advisors:
    ia_holdings = investment_advisor_holdings[
        investment_advisor_holdings['investment_advisor'] == ia]
    # Invariant for this advisor: compute the distinct security names once
    # instead of once per organization of every article.
    security_names = ia_holdings['security_name'].unique()

    articles_matched = []
    for article in articles:
        # The first element of each 'organizations' entry is used as the name.
        organizations = [orgs[0] for orgs in article['organizations']]

        # any() stops at the first (organization, security) pair that reaches
        # the threshold, so each article id is recorded at most once per advisor.
        if any(
            fuzz.token_set_ratio(org, security_name) >= ORG_MATCH_THRESHOLD
            for org in organizations
            for security_name in security_names
        ):
            articles_matched.append(article['article_id'])

    IA_articles.append({
        'investment_advisor': ia,
        'matched_articles': articles_matched
    })

In [None]:
# Show each advisor together with the ids of the articles matched to their holdings.
IA_articles

In [None]:
# Render a Markdown digest per advisor: a heading, the advisor's holdings,
# and then the title, summary, organizations and link of each matched article.
for advisor_entry in IA_articles:
    advisor = advisor_entry['investment_advisor']
    advisor_rows = investment_advisor_holdings[
        investment_advisor_holdings['investment_advisor'] == advisor]

    display(Markdown(f"# Daily recommended articles for {advisor}"))
    holdings_text = ", ".join(advisor_rows["security_name"].tolist())
    display(Markdown(f'**Holdings**: *{holdings_text}*'))

    recommended_ids = advisor_entry['matched_articles']
    for article in articles:
        # Skip articles that were not matched to this advisor.
        if article['article_id'] not in recommended_ids:
            continue

        # Each 'organizations' entry unpacks into (name, rank); only names are shown.
        org_names = [name for name, _rank in article['organizations']]
        display(Markdown(f"### {article['title']}"))
        summary_text = ' '.join(article['pagerank_summary'])
        display(Markdown(f"**Summary**: {summary_text}"))
        organizations_text = ', '.join(org_names)
        display(Markdown(f"**Organizations**: {organizations_text}"))
        display(Markdown(f"**Read more**: {article['url']}"))
  