<a href="https://www.kaggle.com/code/joshuaokolo/nlp-bi-web-scraper?scriptVersionId=104037536" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
import pandas as pd

from io import StringIO  # local to this cell: lets pandas read the markup as a file-like object

# Let's pick a company ticker symbol (AMZN for Amazon)
company_ticker = 'AMZN'
# Add the ticker symbol to the "finviz" quote page url
url = "http://finviz.com/quote.ashx?t=" + company_ticker.lower()

# Most websites block requests that come without a User-Agent header,
# so send one that looks like a typical browser
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

# open the request and read the body; the `with` block closes the connection afterwards
with urlopen(req) as response:
    webpage = response.read()

# make a soup using BeautifulSoup from webpage
html = soup(webpage, "html.parser")

# Extract the table with 'class' = 'fullview-news-outer' from the html and create a dataframe from it.
# Wrapping the markup in StringIO avoids passing a literal HTML string, which newer pandas deprecates.
news = pd.read_html(StringIO(str(html)), attrs={'class': 'fullview-news-outer'})[0]

# extract the link of each news item from the "a" tags with 'class' = 'tab-link-news'
links = [a['href'] for a in html.find_all('a', class_="tab-link-news")]

# Clean up our news dataframe
news.columns = ['Date', 'News_Headline']
news['Article_Link'] = links
news.head()

In [None]:
import re

# Capture whatever follows a leading "Mon-DD-YY " date; rows without that prefix yield ''
_time_after_date = re.compile(r'[a-zA-Z]{1,9}-\d{1,2}-\d{1,2}\s(.+)')


def _time_part(cell):
    return ''.join(_time_after_date.findall(cell))


news['time'] = news['Date'].map(_time_part)

# rows where nothing was captured keep the raw "Date" cell as their time
_no_time_found = news['time'] == ''
news.loc[_no_time_found, 'time'] = news['Date']

news

In [None]:
import numpy as np

# Pull the leading "Mon-DD-YY" part out of every "Date" cell; cells without it yield ''
news['date'] = news['Date'].apply(lambda x: ''.join(re.findall(r'([a-zA-Z]{1,9}-\d{1,2}-\d{1,2})\s.+', x)))

# change empty cells to NaN type in the new "date" column
news.loc[news['date'] == '', 'date'] = np.nan

# Forward-fill so each row without its own date takes the date of the closest dated row above it.
# Only the "date" column is filled (the other columns are left untouched), and .ffill()
# replaces the deprecated fillna(method='ffill').
news['date'] = news['date'].ffill()

news

In [None]:
# combine "date" & "time" columns and convert to datetime type
news['datetime'] = pd.to_datetime(news['date'] + ' ' + news['time'])

# clean up the dataframe: drop the helper columns, order chronologically, renumber the rows
news.drop(['Date', 'time', 'date'], axis=1, inplace=True)
news.sort_values('datetime', inplace=True)
news.reset_index(drop=True, inplace=True)
# the remaining columns are, in order: headline, article link, datetime
news.columns = ['news_headline', 'url', 'datetime']

# display the result (the dataframe is bound to the lowercase name `news`)
news

In [None]:
from newsapi.newsapi_client import NewsApiClient

import os

company_ticker = 'AMZN'
search_date = '2022-04-01'

# NOTE(review): an API key is committed in this notebook. Prefer supplying it through the
# NEWSAPI_KEY environment variable; the literal is kept only as a fallback so the cell
# keeps working unchanged.
newsapi_key = os.environ.get('NEWSAPI_KEY', '3a2d0a55066041dc81e3acfbd665fc6e')
newsapi = NewsApiClient(api_key=newsapi_key)

# extract "articles", which will be a dictionary
articles = newsapi.get_everything(q=company_ticker,
                                  from_param=search_date,
                                  language="en",
                                  sort_by="publishedAt",
                                  page_size=100)

# we want to get the "articles" key from our "articles" dictionary
df_newsapi = pd.DataFrame(articles['articles'])
df_newsapi.head()

In [None]:
# Tidy df_newsapi: discard the unused columns, rename the timestamp and title
# columns, and reduce each "source" dict to its 'name' entry
df_newsapi.drop(columns=['author', 'urlToImage'], inplace=True)
df_newsapi.rename(
    columns={'publishedAt': 'datetime', 'title': 'news_headline'},
    inplace=True,
)
df_newsapi['source'] = df_newsapi['source'].map(lambda src: src['name'])
df_newsapi.head()

In [None]:
from GoogleNews import GoogleNews
from newspaper import Config
import re

company_ticker = 'AMZN'
search_date = '2022-04-02'

# Bind df_google up front so it exists even when the lookup below fails
df_google = pd.DataFrame()

# GoogleNews sometimes returns an empty result, so the lookup is wrapped in a try/except block
try:
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'
    # NOTE(review): this Config object is built but never passed to GoogleNews in this cell
    config = Config()
    config.browser_user_agent = user_agent
    config.request_timeout = 10

    # change the format of the date string from YYYY-MM-DD to MM/DD/YYYY so that it works with GoogleNews
    start_date = re.sub(r'(\d{4})-(\d{1,2})-(\d{1,2})', '\\2/\\3/\\1', search_date)

    # Extract News with Google News ---> gives only 10 results per request
    googlenews = GoogleNews(start=start_date)
    googlenews.search(company_ticker)

    # store the results of the first result page
    result1 = googlenews.result()
    df_google1 = pd.DataFrame(result1)

    # store the results of the 2nd result page
    googlenews.clear()
    googlenews.getpage(2)
    result2 = googlenews.result()
    df_google2 = pd.DataFrame(result2)

    df_google = pd.concat([df_google1, df_google2])

    # do some cleaning of the df_google DF
    if df_google.shape[0] != 0:
        df_google.drop(['img', 'date'], axis=1, inplace=True)
        # positional rename: assumes exactly five columns remain, in this order
        df_google.columns = ['news_headline', 'source', 'datetime', 'description', 'url']
    display(df_google.head())
except Exception as err:
    # still best-effort, but report the problem instead of hiding it
    print('GoogleNews lookup failed: {!r}'.format(err))

In [None]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
import pandas as pd
import re
import numpy as np
from newsapi.newsapi_client import NewsApiClient

def _get_newsapi_news(company_ticker, search_date):
    """Query NewsAPI for English articles matching the ticker, starting at search_date."""
    import os

    # NOTE(review): an API key is committed in this notebook. Prefer the NEWSAPI_KEY
    # environment variable; the literal is kept only as a backward-compatible fallback.
    api_key = os.environ.get('NEWSAPI_KEY', '3a2d0a55066041dc81e3acfbd665fc6e')
    newsapi = NewsApiClient(api_key=api_key)
    articles = newsapi.get_everything(q=company_ticker,
                                      from_param=search_date,
                                      language="en",
                                      sort_by="publishedAt",
                                      page_size=100)
    df_newsapi = pd.DataFrame(articles['articles'])
    if df_newsapi.empty:
        # no articles: there are no columns to clean
        return df_newsapi

    df_newsapi.drop(['author', 'urlToImage'], axis=1, inplace=True)
    df_newsapi.rename({'publishedAt': 'datetime', 'title': 'news_headline'}, axis=1, inplace=True)
    # each "source" cell is a dict; keep only its 'name' entry
    df_newsapi['source'] = df_newsapi['source'].map(lambda x: x['name'])
    return df_newsapi


def _get_finviz_news(company_ticker):
    """Scrape the finviz quote page news table into news_headline / url / datetime columns."""
    from io import StringIO

    url = "http://finviz.com/quote.ashx?t=" + company_ticker.lower()
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # the `with` block closes the connection once the body has been read
    with urlopen(req) as response:
        webpage = response.read()
    html = soup(webpage, "html.parser")
    # StringIO avoids handing pandas a literal HTML string, which newer pandas deprecates
    news = pd.read_html(StringIO(str(html)), attrs={'class': 'fullview-news-outer'})[0]

    links = [a['href'] for a in html.find_all('a', class_="tab-link-news")]
    news.columns = ['Date', 'News_Headline']
    news['Article_Link'] = links

    # "Date" cells either start with a "Mon-DD-YY " prefix followed by the time, or hold the time only
    news['time'] = news['Date'].apply(lambda x: ''.join(re.findall(r'[a-zA-Z]{1,9}-\d{1,2}-\d{1,2}\s(.+)', x)))
    news.loc[news['time'] == '', 'time'] = news['Date']
    news['date'] = news['Date'].apply(lambda x: ''.join(re.findall(r'([a-zA-Z]{1,9}-\d{1,2}-\d{1,2})\s.+', x)))
    news.loc[news['date'] == '', 'date'] = np.nan
    # rows without their own date take the date of the closest dated row above them
    news['date'] = news['date'].ffill()

    # convert to datetime type and keep only the final three columns
    news['datetime'] = pd.to_datetime(news['date'] + ' ' + news['time'])
    news.drop(['Date', 'time', 'date'], axis=1, inplace=True)
    news.sort_values('datetime', inplace=True)
    news.reset_index(drop=True, inplace=True)
    news.columns = ['news_headline', 'url', 'datetime']
    return news


def _get_google_news(company_ticker, search_date):
    """Fetch two Google News result pages for the ticker; return an empty frame on failure."""
    from GoogleNews import GoogleNews

    # change the format of the date string from YYYY-MM-DD to MM/DD/YYYY so that it works with GoogleNews
    start_date = re.sub(r'(\d{4})-(\d{1,2})-(\d{1,2})', '\\2/\\3/\\1', search_date)

    # GoogleNews sometimes returns nothing usable, so this source is best-effort
    try:
        # gives only 10 results per request, hence the two pages
        googlenews = GoogleNews(start=start_date)
        googlenews.search(company_ticker)
        first_page = pd.DataFrame(googlenews.result())

        googlenews.clear()
        googlenews.getpage(2)
        second_page = pd.DataFrame(googlenews.result())

        df_google = pd.concat([first_page, second_page])
        if df_google.shape[0] != 0:
            df_google.drop(['img', 'date'], axis=1, inplace=True)
            # positional rename: assumes exactly five columns remain, in this order
            df_google.columns = ['news_headline', 'source', 'datetime', 'description', 'url']
        return df_google
    except Exception as err:
        # report the failure and continue with the other sources
        print('GoogleNews lookup failed: {!r}'.format(err))
        return pd.DataFrame()


def get_news(company_ticker, search_date):
    """Collect headlines about a ticker from NewsAPI, finviz and Google News.

    Args:
        company_ticker: Ticker symbol used as the query for all three sources, e.g. 'AMZN'.
        search_date: Day of interest as a 'YYYY-MM-DD' string.

    Returns:
        A DataFrame indexed by ``datetime`` that holds only the rows dated
        ``search_date``, sorted by time, with ``source`` set to the host name
        taken from each row's ``url``.
    """
    df_newsapi = _get_newsapi_news(company_ticker, search_date)
    df_finviz = _get_finviz_news(company_ticker)
    df_google = _get_google_news(company_ticker, search_date)

    ## Add the 3 DFs together
    df_news = pd.concat([df_newsapi, df_finviz, df_google], ignore_index=True)
    df_news['datetime'] = pd.to_datetime(df_news['datetime'], format='%Y-%m-%d %H:%M:%S')
    df_news.set_index('datetime', inplace=True)

    # only returning the rows that match our search_date; .copy() so the column
    # assignment below writes to an independent frame rather than a filtered view
    df_news = df_news[df_news.index.to_period('D') == search_date].copy()
    df_news.sort_index(inplace=True)

    # Get a clean source column from the urls: the host name without an optional "www." prefix
    df_news['source'] = df_news['url'].map(
        lambda x: ''.join(re.findall(r"https?://(?:www\.)?([A-Za-z_0-9.-]+).*", x)))

    return df_news

In [None]:
# Gather the headlines for AMZN on 2022-04-01 from all sources handled by get_news
df_news = get_news('AMZN', '2022-04-01')

# (rows, columns) of the combined dataframe
df_news.shape

>>> (68, 5)

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
# download these 3 when run for the first time
nltk.download('vader_lexicon')
nltk.download('movie_reviews')
nltk.download('punkt')


def nltk_vader_score(text):
    """Return the VADER "compound" sentiment score of ``text``.

    The compound score is the normalized score (from -1 to 1) taken from the
    'compound' entry of ``SentimentIntensityAnalyzer.polarity_scores``.
    """
    sentiment_analyzer = SIA()
    return sentiment_analyzer.polarity_scores(text)['compound']


# score every headline
df_news['sentiment_score_vader'] = df_news['news_headline'].map(nltk_vader_score)
df_news.head()

In [None]:
import plotly.express as px
# Histogram of the VADER compound scores, coloured by news source
score_histogram = px.histogram(df_news, x='sentiment_score_vader', color='source')
fig = score_histogram.update_xaxes(categoryorder="total descending")

# horizontal legend placed above the plot area
legend_layout = dict(
    orientation="h",
    yanchor="top",
    y=1.23,
    xanchor="center",
    x=0.48,
)
fig.update_layout(
    xaxis_title='Sentiment Score (Compound from -1 to 1)',
    yaxis_title='Count',
    font=dict(size=16),
    bargap=0.025,
    width=790,
    height=520,
    legend=legend_layout,
)
fig.show('notebook')

In [None]:
def sentiment_type(text):
    """Label ``text`` as 'negative', 'positive' or 'neutral'.

    The label comes from comparing the 'neg' and 'pos' entries of VADER's
    polarity scores: the larger one wins, and a tie is 'neutral'.
    """
    scores = SIA().polarity_scores(text)
    neg = scores['neg']
    pos = scores['pos']

    if neg > pos:
        return 'negative'
    if pos > neg:
        return 'positive'
    return 'neutral'


# label every headline
df_news['sentiment_type'] = df_news['news_headline'].map(sentiment_type)

In [None]:
# Percentage share of each sentiment label. Labels and values are taken from the same
# value_counts() result so every slice is paired with its own label (value_counts()
# is ordered by frequency while unique() is ordered by first appearance, so mixing
# the two can mislabel slices).
sentiment_share = df_news['sentiment_type'].value_counts(normalize=True) * 100
sentiment_share_df = sentiment_share.rename_axis('sentiment_type').reset_index(name='share')

fig = px.pie(sentiment_share_df,
             values='share',
             names='sentiment_type',
             color='sentiment_type',
             hole=0.35,
             color_discrete_map={
                 'neutral': 'silver',
                 'positive': 'mediumspringgreen',
                 'negative': 'orangered'
             })
fig.update_traces(textposition='inside', textinfo='percent+label', textfont_size=22, hoverinfo='label+value',
                  texttemplate="%{label}<br>%{value:.0f}%")
fig.update_layout(font=dict(size=16),
                  width=810,
                  height=520)
fig.show('notebook')

In [None]:
from wordcloud import WordCloud, STOPWORDS

def word_cloud(text):
    """Draw a word cloud built from an iterable of strings.

    The strings are joined with single spaces, rendered with ``WordCloud``
    (stop words removed) and shown in a 20x10 figure with the axes hidden.
    """
    # NOTE(review): `plt` is not imported in any cell visible here (presumably
    # matplotlib.pyplot) — confirm it is bound before this function is called.
    stopwords = set(STOPWORDS)
    all_words = ' '.join(text)
    cloud = WordCloud(
        background_color='white',  # black
        width=1600,
        height=800,
        stopwords=stopwords,
        min_font_size=20,
        max_font_size=150).generate(all_words)
    fig, ax = plt.subplots(figsize=(20, 10),
                           facecolor='w')  # facecolor='k' for black frame
    plt.imshow(cloud, interpolation='bilinear')
    ax.axis("off")
    fig.tight_layout(pad=0)
    plt.show()


print('Wordcloud for ' + company_ticker)
# NOTE(review): the 'news_headline_tokens' column is not created in any cell visible here — confirm where it is built.
word_cloud(df_news['news_headline_tokens'].values)

In [None]:
# Parse `html_text` with the lxml parser and keep a handle on its <body> tag.
# NOTE(review): neither `BeautifulSoup` nor `html_text` is bound at module level in the cells
# visible here (bs4 is imported there only under the alias `soup`) — confirm where they come from.
# NOTE(review): this assignment rebinds `soup`, the name earlier cells use for the BeautifulSoup class.
soup = BeautifulSoup(html_text, 'lxml')
tag = soup.body

In [None]:
def get_article_text(Article_Link):
    """Download a web page and return its longer text passages as one string.

    Args:
        Article_Link: URL of the page to fetch.

    Returns:
        The strings found inside the page's <body> that contain more than 15
        whitespace-separated words, joined by single spaces. An empty string
        is returned when the page has no <body>.
    """
    import requests
    from bs4 import BeautifulSoup

    # browser-like headers for the GET request
    header = {
        "User-Agent":
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }

    # a timeout keeps one unresponsive site from hanging the whole run
    html = requests.get(Article_Link, headers=header, timeout=10).content
    # name the parser explicitly so the result does not depend on which parsers are installed
    page = BeautifulSoup(html, "html.parser")

    # Get the whole body tag; some responses have none
    body = page.body
    if body is None:
        return ''

    # keep only strings with more than 15 words
    return ' '.join(string for string in body.strings if len(string.split()) > 15)

In [None]:
# Fetch the text of every linked article
df_news['news_text'] = df_news['url'].map(get_article_text)

# Remove every character that is not a space, a letter, or a digit
_not_space_or_alnum = re.compile('[^ a-zA-Z0-9]')
df_news['news_text'] = df_news['news_text'].map(
    lambda article: _not_space_or_alnum.sub('', article))

In [None]:
def keyword_extractor(text):
    """Return the event categories mentioned in ``text`` as a comma-separated string.

    Each category in the table below is reported once when any of its listed
    phrases is found by flashtext. Categories are sorted so the same set of
    events always yields the same string; an empty string means nothing matched.
    """
    from flashtext import KeywordProcessor
    kwp = KeywordProcessor()

    # category label -> phrases that count as a mention of it
    keyword_dict = {
        'new product': ['new product', 'new products'],
        'M&A': ['merger', 'acquisition'],
        'stock split/buyback': ['buyback', 'split'],
        'workforce change': ['hire', 'hiring', 'firing', 'lay off', 'laid off']
    }

    kwp.add_keywords_from_dict(keyword_dict)

    # set() removes repeated categories; sorted() fixes their order, which a bare set does not guarantee
    return ', '.join(sorted(set(kwp.extract_keywords(text))))

In [None]:
# Tag every article text with the event keywords that keyword_extractor finds in it
df_news['event_keywords'] = df_news['news_text'].map(keyword_extractor)

In [None]:
# Count of articles per event-keyword string, coloured by sentiment label;
# rows without any event keyword are left out
sentiment_palette = {
    'neutral': 'silver',
    'positive': 'mediumspringgreen',
    'negative': 'orangered'
}
rows_with_events = df_news[df_news['event_keywords'] != '']

fig = px.histogram(
    rows_with_events,
    x='event_keywords',
    color='sentiment_type',
    color_discrete_map=sentiment_palette,
)
fig.update_xaxes(categoryorder="total descending")

# horizontal legend placed above the plot area
legend_layout = dict(
    orientation="h",
    yanchor="top",
    y=1.16,
    xanchor="center",
    x=0.5,
)
fig.update_layout(
    yaxis_title='Count',
    xaxis_title='',
    width=810,
    height=620,
    font=dict(size=16),
    legend=legend_layout,
)
fig.update_xaxes(tickangle=-45)

**References**:

https://towardsdatascience.com/the-best-python-sentiment-analysis-package-1-huge-common-mistake-d6da9ad6cdeb

https://pythoninvest.com/long-read/sentiment-analysis-of-financial-news

https://www.kaggle.com/mmmarchetti/sentiment-analysis-on-financial-news

https://medium.datadriveninvestor.com/scraping-live-stock-fundamental-ratios-news-and-more-with-python-a716329e0493

https://omdena.com/blog/business-intelligence-tool-for-financial/

https://pypi.org/project/finpie/

https://tradewithpython.com/news-sentiment-analysis-using-python