In [78]:
import requests
from bs4 import BeautifulSoup as bs
import json
from string import punctuation
from get_all_tickers import get_tickers as gt
import time
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import cufflinks as cf
import chart_studio.plotly as py
import plotly.express as px
import plotly.io as pio

from plotly.offline import download_plotlyjs, init_notebook_mode
from plotly.offline import plot, iplot

# Dark theme for matplotlib and Plotly figures, then initialise Plotly's
# offline notebook mode.
plt.style.use('dark_background')
pio.templates.default = 'plotly_dark'
init_notebook_mode(connected=True)

## Scraping Yahoo Finance

In [76]:
# How many of the biggest tickers (as reported by get_all_tickers) to analyse
N_TICKERS = 50
tickers = gt.get_biggest_n_tickers(N_TICKERS)

def get_data(ticker):
    """Scrape the Yahoo Finance quote page for one ticker.

    Parameters
    ----------
    ticker : str
        Symbol interpolated into the quote URL.

    Returns
    -------
    dict
        Keys ``ticker``, ``closing``, ``opening``, ``days_range``, ``volume``
        and ``fair_value``; every value is the raw text scraped from the page
        (no numeric conversion happens here).

    Raises
    ------
    requests.RequestException
        If the request times out, fails, or returns an HTTP error status.
    ValueError
        If the expected elements are not present in the returned page.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:82.0) Gecko/20100101 Firefox/82.0'}

    url = f'https://finance.yahoo.com/quote/{ticker}/'
    # Without a timeout a single stalled connection would hang the whole scrape.
    r = requests.get(url, headers=headers, timeout=30)
    # Stop on HTTP errors instead of trying to parse an error page.
    r.raise_for_status()
    soup = bs(r.text, 'html.parser')

    title = soup.find('h1', {'class': 'D(ib) Fz(18px)'})
    stock_stats = soup.find_all('td', {'class': 'Ta(end) Fw(600) Lh(14px)'})
    fair_value = soup.find('div', {'class': 'Fw(b) Fl(end)--m Fz(s) C($primaryColor'})

    # The selectors are tied to Yahoo's generated class names; report a layout
    # mismatch explicitly rather than failing with AttributeError/IndexError.
    if title is None or fair_value is None or len(stock_stats) < 7:
        raise ValueError(f'Unexpected Yahoo Finance page layout for {ticker!r} ({url})')

    stocks = {
        # The heading ends with the symbol in parentheses; keep only the symbol.
        'ticker': title.text.split('(')[-1].strip(punctuation),
        # Values are picked by position in the summary table cells.
        'closing': stock_stats[0].text,
        'opening': stock_stats[1].text,
        'days_range': stock_stats[4].text,
        'volume': stock_stats[6].text,
        'fair_value': fair_value.text,
    }
    return stocks

# Collect one record per ticker, pausing two seconds between requests.
stock_data = []
for idx, ticker in enumerate(tickers):
    record = get_data(ticker)
    stock_data.append(record)
    time.sleep(2)
    print(f'Finished with number {idx} -- {ticker}')

Finished with number 0 -- AAPL
Finished with number 1 -- MSFT
Finished with number 2 -- AMZN
Finished with number 3 -- GOOG
Finished with number 4 -- GOOGL
Finished with number 5 -- FB
Finished with number 6 -- BABA
Finished with number 7 -- TSM
Finished with number 8 -- V
Finished with number 9 -- WMT
Finished with number 10 -- JNJ
Finished with number 11 -- TSLA
Finished with number 12 -- PG
Finished with number 13 -- JPM
Finished with number 14 -- UNH
Finished with number 15 -- MA
Finished with number 16 -- NVDA
Finished with number 17 -- HD
Finished with number 18 -- VZ
Finished with number 19 -- DIS
Finished with number 20 -- BAC
Finished with number 21 -- KO
Finished with number 22 -- CRM
Finished with number 23 -- ADBE
Finished with number 24 -- CMCSA
Finished with number 25 -- PYPL
Finished with number 26 -- PFE
Finished with number 27 -- NFLX
Finished with number 28 -- T
Finished with number 29 -- MRK
Finished with number 30 -- NKE
Finished with number 31 -- PEP
Finished with 

In [77]:
# Persist the scraped records as JSON
with open('11-15-yahoo-stock.json', mode='w') as json_file:
    json_file.write(json.dumps(stock_data))

In [89]:
# One row per scraped ticker; the columns are the keys returned by get_data
yahoo = pd.DataFrame(data=stock_data)
# Optional export:
# yahoo.to_csv('11-15-yahoo-stock.csv', index=False)

## Scraping Tiingo

In [83]:
# Tiingo pages are fetched through a rendering service running locally in
# Docker (NOTE(review): the /render.html endpoint with url/wait parameters
# looks like Splash -- confirm).
local = 'http://localhost:8050/render.html'

# One DataFrame of news items per ticker
news_list = []

for idx, ticker in enumerate(tickers):
    url = f'https://www.tiingo.com/{ticker}/overview'
    # Bound the wait so a stuck render cannot stall the loop indefinitely,
    # and stop on HTTP errors instead of parsing an error response.
    r = requests.get(local, params={'url': url, 'wait': 2}, timeout=60)
    r.raise_for_status()
    soup = bs(r.content, 'html.parser')

    # News article headlines
    headlines = soup.find_all('div', class_='headline')
    headline_list = [headline.find('a').text for headline in headlines]

    # News article content that is previewed
    articles = soup.find_all('div', class_='lede')
    article_list = [article.text for article in articles]

    # Each 'date-source' div holds "<date>|<source>": scan the page once and
    # split each entry once, then fan out into the two columns.
    date_sources = [item.text.split('|') for item in soup.find_all('div', class_='date-source')]
    date_list = [parts[0] for parts in date_sources]
    source_list = [parts[1] for parts in date_sources]

    news = {
        'ticker': ticker,
        'headline': headline_list,
        'article': article_list,
        'date': date_list,
        'source': source_list,
    }

    # pandas raises ValueError here if the four lists differ in length
    temp_df = pd.DataFrame(news)
    news_list.append(temp_df)

    time.sleep(2)
    print(f'Finished number {idx} -- {ticker}')

Finished number 0 -- AAPL
Finished number 1 -- MSFT
Finished number 2 -- AMZN
Finished number 3 -- GOOG
Finished number 4 -- GOOGL
Finished number 5 -- FB
Finished number 6 -- BABA
Finished number 7 -- TSM
Finished number 8 -- V
Finished number 9 -- WMT
Finished number 10 -- JNJ
Finished number 11 -- TSLA
Finished number 12 -- PG
Finished number 13 -- JPM
Finished number 14 -- UNH
Finished number 15 -- MA
Finished number 16 -- NVDA
Finished number 17 -- HD
Finished number 18 -- VZ
Finished number 19 -- DIS
Finished number 20 -- BAC
Finished number 21 -- KO
Finished number 22 -- CRM
Finished number 23 -- ADBE
Finished number 24 -- CMCSA
Finished number 25 -- PYPL
Finished number 26 -- PFE
Finished number 27 -- NFLX
Finished number 28 -- T
Finished number 29 -- MRK
Finished number 30 -- NKE
Finished number 31 -- PEP
Finished number 32 -- ABT
Finished number 33 -- TM
Finished number 34 -- NVS
Finished number 35 -- TMO
Finished number 36 -- INTC
Finished number 37 -- PDD
Finished number 38

In [195]:
# Stack the per-ticker frames into one table with a fresh 0..n-1 index, and
# rename 'date' to 'date_time' so the name 'date' is free for a parsed column.
news_df = (
    pd.concat(news_list, ignore_index=True)
    .rename(columns={'date': 'date_time'})
)
# Optional export:
# news_df.to_csv('11-15-tingo-dirty.csv', index=False, header=news_df.columns.values)

In [196]:
# The scraped timestamps carry no year, so prefix the scrape year before parsing.
# %I (12-hour clock) is required for %p to take effect: with %H, strptime
# ignores the AM/PM marker and every PM timestamp is read as AM.
# NOTE(review): assumes the feed uses a 12-hour clock -- confirm.
news_df['datetime'] = pd.to_datetime(
    '2020-' + news_df['date_time'].str.strip(),
    format='%Y-%b-%d %I:%M:%p',
)
# Monday = 0 ... Sunday = 6
news_df['dayofweek'] = news_df['datetime'].dt.dayofweek
# Midnight-normalised datetime64 column, so it can be compared with date strings
news_df['date'] = news_df['datetime'].dt.normalize()
news_df['time'] = news_df['datetime'].dt.time
# The raw text column is no longer needed once parsed
news_df = news_df.drop(columns='date_time')

# news_df.to_csv('11-15-tingo-stock.csv', index=False, header=news_df.columns.values)
news_df.head()

Unnamed: 0,ticker,headline,article,source,datetime,dayofweek,date,time
0,AAPL,PlayStation 5 Might Outsell New Xbox Series B...,Sony Corporation’s (NYSE: SNE) PlayStation 5 a...,benzinga.com,2020-11-16 05:00:00,0,2020-11-16,05:00:00
1,AAPL,iPhone 12 Pro and Pro Max nearly beat all DxO...,Although it was a pioneer in the smartphone ph...,slashgear.com,2020-11-16 04:36:00,0,2020-11-16,04:36:00
2,AAPL,Samsung Smart Monitor is also a smart TV with...,Computer monitors are a dime a dozen these day...,slashgear.com,2020-11-16 04:03:00,0,2020-11-16,04:03:00
3,AAPL,The most powerful CPU of 2020 may be coming s...,AMD’s new EPYC 7763 CPU pushes x86 boundaries ...,techradar.com,2020-11-16 04:00:00,0,2020-11-16,04:00:00
4,AAPL,"iPhone 12 group SMS disappear, iPhone 12 mini...",No product can claim to be perfect the day it ...,slashgear.com,2020-11-16 03:37:00,0,2020-11-16,03:37:00


In [209]:
# Attach the per-ticker Yahoo figures to every news row (inner join on ticker),
# then convert 'volume' from comma-grouped text to int and make 'fair_value'
# a categorical column.
combined_df = news_df.merge(yahoo, on='ticker').assign(
    volume=lambda frame: frame['volume'].replace(r',', '', regex=True).astype(int),
    fair_value=lambda frame: frame['fair_value'].astype('category'),
)

# combined_df.to_csv('11-15-combined-stock.csv', index=False, header=combined_df.columns.values)
combined_df.head()

Unnamed: 0,ticker,headline,article,source,datetime,dayofweek,date,time,closing,opening,days_range,volume,fair_value
0,AAPL,PlayStation 5 Might Outsell New Xbox Series B...,Sony Corporation’s (NYSE: SNE) PlayStation 5 a...,benzinga.com,2020-11-16 05:00:00,0,2020-11-16,05:00:00,119.21,119.44,117.87 - 119.67,81688586,Overvalued
1,AAPL,iPhone 12 Pro and Pro Max nearly beat all DxO...,Although it was a pioneer in the smartphone ph...,slashgear.com,2020-11-16 04:36:00,0,2020-11-16,04:36:00,119.21,119.44,117.87 - 119.67,81688586,Overvalued
2,AAPL,Samsung Smart Monitor is also a smart TV with...,Computer monitors are a dime a dozen these day...,slashgear.com,2020-11-16 04:03:00,0,2020-11-16,04:03:00,119.21,119.44,117.87 - 119.67,81688586,Overvalued
3,AAPL,The most powerful CPU of 2020 may be coming s...,AMD’s new EPYC 7763 CPU pushes x86 boundaries ...,techradar.com,2020-11-16 04:00:00,0,2020-11-16,04:00:00,119.21,119.44,117.87 - 119.67,81688586,Overvalued
4,AAPL,"iPhone 12 group SMS disappear, iPhone 12 mini...",No product can claim to be perfect the day it ...,slashgear.com,2020-11-16 03:37:00,0,2020-11-16,03:37:00,119.21,119.44,117.87 - 119.67,81688586,Overvalued


## Sentiment Analysis using NLTK

In [210]:
# VADER analyzer from NLTK
analyzer = SentimentIntensityAnalyzer()
# Compound polarity score for every headline, in row order
analyzer_scores = list(
    map(lambda text: analyzer.polarity_scores(text)['compound'], combined_df['headline'])
)
# Single-column frame holding the scores
analyzer_df = pd.DataFrame(analyzer_scores, columns=['compound'])
# Place the scores next to the news rows (aligned on the index)
analyzer_news = pd.concat([combined_df, analyzer_df], axis=1)
# Keep only rows with a non-zero compound score, i.e. where the analyzer
# produced a signal, and renumber the index
has_signal = analyzer_news['compound'] != 0
analyzer_news = analyzer_news[has_signal].reset_index(drop=True)
analyzer_news.head()

Unnamed: 0,ticker,headline,article,source,datetime,dayofweek,date,time,closing,opening,days_range,volume,fair_value,compound
0,AAPL,Samsung Smart Monitor is also a smart TV with...,Computer monitors are a dime a dozen these day...,slashgear.com,2020-11-16 04:03:00,0,2020-11-16,04:03:00,119.21,119.44,117.87 - 119.67,81688586,Overvalued,0.7964
1,AAPL,The most powerful CPU of 2020 may be coming s...,AMD’s new EPYC 7763 CPU pushes x86 boundaries ...,techradar.com,2020-11-16 04:00:00,0,2020-11-16,04:00:00,119.21,119.44,117.87 - 119.67,81688586,Overvalued,0.2609
2,AAPL,"iPhone 12 group SMS disappear, iPhone 12 mini...",No product can claim to be perfect the day it ...,slashgear.com,2020-11-16 03:37:00,0,2020-11-16,03:37:00,119.21,119.44,117.87 - 119.67,81688586,Overvalued,-0.2263
3,AAPL,iShares Core U.S. Growth ETF: Technology Is T...,The fund has outperformed the S&P 500 by a sig...,seekingalpha.com,2020-11-16 02:48:00,0,2020-11-16,02:48:00,119.21,119.44,117.87 - 119.67,81688586,Overvalued,0.3818
4,AAPL,iPhone 12 mini iFixit teardown reveals what s...,Apple has definitely come a long way since the...,slashgear.com,2020-11-16 02:21:00,0,2020-11-16,02:21:00,119.21,119.44,117.87 - 119.67,81688586,Overvalued,0.3612


## NLTK - `dayofweek` using Plotly

In [212]:
# Mean compound score per (ticker, weekday). The score column is selected
# before aggregating so mean() never touches the text/time columns, which
# raises TypeError on pandas >= 2.0.
mean_day = analyzer_news.groupby(['ticker', 'dayofweek'])['compound'].mean()
# Move the weekday level into columns, then transpose:
# rows = day of week, columns = ticker
mean_day = mean_day.unstack().T
# Plotly colors
sunset = px.colors.sequential.Agsunset

px.bar(mean_day, barmode='group', color_discrete_sequence=sunset,
      title='Average Week Day Sentiment of Stock Headlines',
      labels={'value': 'Sentiment', 'dayofweek': 'Day of Week'})

## NLTK - `date` using Plotly

In [232]:
# Keep only rows dated after 2020-10-31
analyzer_filtered = analyzer_news[analyzer_news['date'] > '2020-10-31']
# Mean compound score per (ticker, date). The score column is selected before
# aggregating so mean() never touches the text/time columns, which raises
# TypeError on pandas >= 2.0.
mean_date = analyzer_filtered.groupby(['ticker', 'date'])['compound'].mean()
# Move the date level into columns, then transpose:
# rows = date, columns = ticker
mean_date = mean_date.unstack().T

sunset = px.colors.sequential.Sunsetdark

px.bar(mean_date, barmode='relative', color_discrete_sequence=sunset,
      title='Average Sentiment of Stock Headlines',
      labels={'value': 'Sentiment', 'date': 'Date'})

## Sentiment Analysis using TextBlob

In [221]:
# Wrap each article preview in a TextBlob
testimonial = list(map(TextBlob, combined_df['article']))
# Pull the sentiment (polarity, subjectivity) out of every blob
blob = [text_blob.sentiment for text_blob in testimonial]
# The sentiment tuples become the 'polarity' and 'subjectivity' columns
blob_df = pd.DataFrame(blob)
# Place the sentiment columns next to the combined news/stock rows
blob_news = pd.concat([combined_df, blob_df], axis=1)
# Keep only rows with non-zero polarity, i.e. where TextBlob produced a
# signal, and renumber the index
has_polarity = blob_news['polarity'] != 0
blob_news = blob_news[has_polarity].reset_index(drop=True)
blob_news.head()

Unnamed: 0,ticker,headline,article,source,datetime,dayofweek,date,time,closing,opening,days_range,volume,fair_value,polarity,subjectivity
0,AAPL,iPhone 12 Pro and Pro Max nearly beat all DxO...,Although it was a pioneer in the smartphone ph...,slashgear.com,2020-11-16 04:36:00,0,2020-11-16,04:36:00,119.21,119.44,117.87 - 119.67,81688586,Overvalued,-0.1,0.1
1,AAPL,Samsung Smart Monitor is also a smart TV with...,Computer monitors are a dime a dozen these day...,slashgear.com,2020-11-16 04:03:00,0,2020-11-16,04:03:00,119.21,119.44,117.87 - 119.67,81688586,Overvalued,0.6,1.0
2,AAPL,The most powerful CPU of 2020 may be coming s...,AMD’s new EPYC 7763 CPU pushes x86 boundaries ...,techradar.com,2020-11-16 04:00:00,0,2020-11-16,04:00:00,119.21,119.44,117.87 - 119.67,81688586,Overvalued,0.068182,0.477273
3,AAPL,"iPhone 12 group SMS disappear, iPhone 12 mini...",No product can claim to be perfect the day it ...,slashgear.com,2020-11-16 03:37:00,0,2020-11-16,03:37:00,119.21,119.44,117.87 - 119.67,81688586,Overvalued,0.45,0.883333
4,AAPL,Vaccine Powers REITs To Historic Rebound,U.S. equity markets climbed to record-highs th...,seekingalpha.com,2020-11-16 03:31:00,0,2020-11-16,03:31:00,119.21,119.44,117.87 - 119.67,81688586,Overvalued,-0.007576,0.598485


### Polarity based on `dayofweek` using Plotly

In [222]:
# blob_news.to_csv('11-15-tingo-blob.csv', header=blob_news.columns.values, index=False)

In [256]:
# Mean polarity per (ticker, weekday). The score column is selected before
# aggregating so mean() never touches the text/time columns, which raises
# TypeError on pandas >= 2.0.
mean_day = blob_news.groupby(['ticker', 'dayofweek'])['polarity'].mean()
# Move the weekday level into columns, then transpose:
# rows = day of week, columns = ticker
mean_day = mean_day.unstack().T
# Plotly colors
agg = px.colors.sequential.Aggrnyl

px.bar(mean_day, barmode='group', color_discrete_sequence=agg,
      title='Average Week Day Polarity of Stock Headlines',
      labels={'value': 'Polarity', 'dayofweek': 'Day of Week'})

### Polarity based on `date` using Plotly

In [255]:
# Keep rows dated strictly between 2020-10-31 and 2020-11-16
blob_filtered = blob_news[(blob_news['date'] > '2020-10-31') & (blob_news['date'] < '2020-11-16')]
# Mean polarity per (ticker, date). The score column is selected before
# aggregating so mean() never touches the text/time columns, which raises
# TypeError on pandas >= 2.0.
mean_date = blob_filtered.groupby(['ticker', 'date'])['polarity'].mean()
# Move the date level into columns, then transpose:
# rows = date, columns = ticker
mean_date = mean_date.unstack().T

rainbow = px.colors.sequential.Rainbow

px.bar(mean_date, barmode='relative', color_discrete_sequence=rainbow,
      title='Average Polarity of Stock Headlines',
      labels={'value': 'Polarity', 'date': 'Date'})

### Subjectivity based on `dayofweek` using Plotly

In [254]:
# Mean subjectivity per (ticker, weekday). The score column is selected before
# aggregating so mean() never touches the text/time columns, which raises
# TypeError on pandas >= 2.0.
mean_day = blob_news.groupby(['ticker', 'dayofweek'])['subjectivity'].mean()
# Move the weekday level into columns, then transpose:
# rows = day of week, columns = ticker
mean_day = mean_day.unstack().T
# Plotly colors
purp = px.colors.sequential.Purp

# Title names the plotted metric (it previously said "Polarity")
px.bar(mean_day, barmode='group', color_discrete_sequence=purp,
      title='Average Week Day Subjectivity of Stock Headlines',
      labels={'value': 'Subjectivity', 'dayofweek': 'Day of Week'})

### Subjectivity based on `date` using Plotly

In [253]:
# Keep rows dated strictly between 2020-10-31 and 2020-11-16
blob_filtered = blob_news[(blob_news['date'] > '2020-10-31') & (blob_news['date'] < '2020-11-16')]
# Mean subjectivity per (ticker, date). The score column is selected before
# aggregating so mean() never touches the text/time columns, which raises
# TypeError on pandas >= 2.0.
mean_date = blob_filtered.groupby(['ticker', 'date'])['subjectivity'].mean()
# Move the date level into columns, then transpose:
# rows = date, columns = ticker
mean_date = mean_date.unstack().T

sunset = px.colors.sequential.Sunset

# Title names the plotted metric (it previously said "Polarity")
px.bar(mean_date, barmode='relative', color_discrete_sequence=sunset,
      title='Average Subjectivity of Stock Headlines',
      labels={'value': 'Subjectivity', 'date': 'Date'})

In [252]:
# Mean polarity per (ticker, date) over all dates (no date filter here). The
# score column is selected before aggregating so mean() never touches the
# text/time columns, which raises TypeError on pandas >= 2.0.
mean_day = blob_news.groupby(['ticker', 'date'])['polarity'].mean()
# Move the date level into columns, then transpose:
# rows = date, columns = ticker (one box per ticker)
mean_day = mean_day.unstack().T

px.box(mean_day, title='Distribution of the Polarity of Stock Headlines',
      labels={'value': 'Polarity'})