In [None]:
import os

from IPython.display import display, Markdown
import pandas as pd
from fuzzywuzzy import fuzz
import spacy
from pprint import pprint

from src.scraping import extract_cnbc_article_info
from src.nlp import nlp_analysis
from src.urls import article_urls

In [None]:
# Show the article URLs imported from src.urls; the scraping cell below iterates over them.
article_urls

### Scrape CNBC Website for Articles

In [None]:
# Scrape every URL, run the NLP analysis on the scraped result, and collect one
# merged record per article.
articles = []
for article_id, article_url in enumerate(article_urls):
    article_text = extract_cnbc_article_info(article_url)
    article_nlp = nlp_analysis(article_text)

    # Seed the record with the positional id, then layer the scraped fields and
    # the NLP fields on top; on a key clash the later source wins.
    record = {'article_id': article_id}
    record.update(article_text)
    record.update(article_nlp)
    articles.append(record)


In [None]:
# Pretty-print every article record built above (scraped fields merged with NLP results).
pprint(articles)

### Correlate with Investment Advisor (IA) Holdings

In [None]:
# Load the internal investment-advisor holdings table from a path relative to
# the notebook's working directory.
holdings_csv_path = os.path.join(
    'data', 'internal', 'investment_advisor_holdings.csv')
investment_advisor_holdings = pd.read_csv(holdings_csv_path)

In [None]:
# Display the holdings table loaded from data/internal/investment_advisor_holdings.csv.
investment_advisor_holdings

In [None]:
# Match articles to investment advisors: an article is recommended to an
# advisor when any organization found in the article fuzzily matches the name
# of any security the advisor holds.

# Similarity cut-off for fuzz.token_set_ratio; a pair counts as a match only
# when its score is strictly greater than this value.
ORG_MATCH_THRESHOLD = 80

IA_articles = []

investment_advisors = investment_advisor_holdings['investment_advisor'].unique()
for ia in investment_advisors:
    ia_holdings = investment_advisor_holdings[
        investment_advisor_holdings['investment_advisor'] == ia]
    # Loop-invariant for this advisor: compute the distinct security names once
    # instead of once per (article, organization) pair.
    security_names = ia_holdings['security_name'].unique()

    articles_matched = []
    for article in articles:
        # Assumes each entry of article['organizations'] is a sequence whose
        # first element is the organization name -- TODO confirm against src.nlp.
        organizations = [orgs[0] for orgs in article['organizations']]

        # any() stops at the first pair above the threshold, so each article
        # is recorded at most once per advisor.
        if any(
            fuzz.token_set_ratio(org, security_name) > ORG_MATCH_THRESHOLD
            for org in organizations
            for security_name in security_names
        ):
            articles_matched.append(article['article_id'])

    IA_articles.append({
        'investment_advisor': ia,
        'matched_articles': articles_matched
    })

In [None]:
# Inspect the match results: one dict per advisor holding the ids of its matched articles.
IA_articles

In [None]:
# Render a Markdown digest per advisor: a heading, then the title, summary and
# link of every article whose id appears in that advisor's match list.
for ia_article in IA_articles:
    advisor_name = ia_article['investment_advisor']
    display(Markdown(f"# Daily recommended articles for {advisor_name}"))

    matched_articles = ia_article['matched_articles']
    for article in articles:
        # Guard clause: skip articles that were not matched to this advisor.
        if article['article_id'] not in matched_articles:
            continue

        title = article['title']
        display(Markdown(f"### {title}"))

        # pagerank_summary is joined with single spaces into one line of text.
        summary_text = ' '.join(article['pagerank_summary'])
        display(Markdown(f"Summary: {summary_text}"))

        link = article['url']
        display(Markdown(f"Read more: {link}"))
  