# News Headlines Sentiment

Use the news api to pull the latest news articles for bitcoin and ethereum and create a DataFrame of sentiment scores for each coin. 

Use descriptive statistics to answer the following questions:
1. Which coin had the highest mean positive score?
2. Which coin had the highest negative score?
3. Which coin had the highest positive score?

In [17]:
# Initial imports
import os
import pandas as pd
from dotenv import load_dotenv
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
from pathlib import Path
load_dotenv()
# %matplotlib inline

True

In [18]:
# Read your api key environment variable
news_api = "2513143cbaaf40a3ac124571b6c48157"
# type(news_api)


In [19]:
# Create a newsapi client
from newsapi import NewsApiClient
newsapi = NewsApiClient(api_key=news_api)

In [20]:
# Fetch the Bitcoin news articles
bitcoin_headlines = newsapi.get_everything(
    q="bitcoin", language="en",page_size=100, sort_by="relevancy"
)

# Print total articles
print(f"Total articles about Bitcoin: {bitcoin_headlines['totalResults']}")

# Show sample article
bitcoin_headlines["articles"][0]

Total articles about Bitcoin: 4726


{'source': {'id': 'wired', 'name': 'Wired'},
 'author': 'Timothy B. Lee, Ars Technica',
 'title': 'An Engineer Gets 9 Years for Stealing $10M From Microsoft',
 'description': 'The defendant tried—and failed—to use bitcoin to cover his tracks.',
 'url': 'https://www.wired.com/story/an-engineer-gets-9-years-for-stealing-dollar10m-from-microsoft/',
 'urlToImage': 'https://media.wired.com/photos/5fac6afb446b4639b3d5b8d8/191:100/w_1280,c_limit/Security-Microsoft-1229426260.jpg',
 'publishedAt': '2020-11-12T14:00:00Z',
 'content': "A former Microsoft software engineer from Ukraine has been sentenced to nine years in prison for stealing more than $10 million in store credit from Microsoft's online store. From 2016 to 2018, Volod… [+3307 chars]"}

In [21]:
bitcoin_df = pd.DataFrame.from_dict(bitcoin_headlines['articles'])
bitcoin_df.head()

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content
0,"{'id': 'wired', 'name': 'Wired'}","Timothy B. Lee, Ars Technica",An Engineer Gets 9 Years for Stealing $10M Fro...,The defendant tried—and failed—to use bitcoin ...,https://www.wired.com/story/an-engineer-gets-9...,https://media.wired.com/photos/5fac6afb446b463...,2020-11-12T14:00:00Z,A former Microsoft software engineer from Ukra...
1,"{'id': None, 'name': 'Lifehacker.com'}","Mike Winters on Two Cents, shared by Mike Wint...",Is the New Visa Bitcoin Rewards Card Worth It?,Visa has partnered with cryptocurrency startup...,https://twocents.lifehacker.com/is-the-new-vis...,https://i.kinja-img.com/gawker-media/image/upl...,2020-12-03T22:00:00Z,Visa has partnered with cryptocurrency startup...
2,"{'id': 'engadget', 'name': 'Engadget'}",Karissa Bell,"PayPal now lets all US users buy, sell and hol...",PayPal is bringing its newly-announced support...,https://www.engadget.com/paypal-opens-cryptocu...,https://o.aolcdn.com/images/dims?resize=1200%2...,2020-11-12T21:05:41Z,PayPal is bringing its newly-announced support...
3,"{'id': 'mashable', 'name': 'Mashable'}",Stan Schroeder,"Bitcoin is flirting with $20,000 again. How hi...","In November 2017, after an absolutely massive,...",https://mashable.com/article/bitcoin-20000/,https://mondrian.mashable.com/2020%252F11%252F...,2020-11-20T20:02:17Z,"In November 2017, after an absolutely massive,..."
4,"{'id': 'engadget', 'name': 'Engadget'}",Jon Fingas,You can now spend China's digital currency at ...,China’s official digital currency is now usabl...,https://www.engadget.com/jd-com-supports-china...,https://o.aolcdn.com/images/dims?resize=1200%2...,2020-12-06T22:37:18Z,"Unlike ‘conventional’ cryptocurrencies, a cent..."


In [23]:
file_path = Path("../Data/")
bitcoin_df.to_csv(file_path, index=False, encoding='utf-8-sig')

In [24]:
file_path = Path("../Data/")
bitcoin_df.to_pickle(file_path)

In [25]:
# Fetch the Ethereum news articles
ethereum_headlines = newsapi.get_everything(
    q="ethereum and Ethereum", language="en",page_size=100, sort_by="relevancy"
)

# Print total articles
print(f"Total articles about Ethereum: {ethereum_headlines['totalResults']}")

# Show sample article
ethereum_headlines["articles"][0]

Total articles about Ethereum: 1279


{'source': {'id': 'engadget', 'name': 'Engadget'},
 'author': 'Karissa Bell',
 'title': 'PayPal now lets all US users buy, sell and hold cryptocurrency',
 'description': 'PayPal is bringing its newly-announced support for cryptocurrency to all US accounts. It first announced plans to open cryptocurrency trading to US-based users in October, but until now it was only available to a small subset of PayPal account holders. That’s…',
 'url': 'https://www.engadget.com/paypal-opens-cryptocurrency-all-us-accounts-210541778.html',
 'urlToImage': 'https://o.aolcdn.com/images/dims?resize=1200%2C630&crop=1200%2C630%2C0%2C0&quality=95&image_uri=https%3A%2F%2Fs.yimg.com%2Fos%2Fcreatr-uploaded-images%2F2020-11%2F4e1af080-2528-11eb-bbdf-193e571d142a&client=amp-blogside-v2&signature=22ad23dde6ee3e667a8067a4c39e1962659cec92',
 'publishedAt': '2020-11-12T21:05:41Z',
 'content': 'PayPal is bringing its newly-announced support for cryptocurrency to all US accounts. It first announced plans to open cryptoc

In [26]:
# Create the Bitcoin sentiment scores DataFrame
bitcoin_sentiments = []

for article in bitcoin_headlines["articles"]:
    try:
        
        text = article["content"]
        sentiment = analyzer.polarity_scores(text)
        compound = sentiment["compound"]
        neg = sentiment["neg"]
        neu = sentiment["neu"]
        pos = sentiment["pos"]
        
        bitcoin_sentiments.append({
            "Compound": compound,
            "Negative": neg,
            "Neutral": neu,
            "Positive": pos,
            "Text": text,
 
        })
        
    except AttributeError:
        pass
    
# Create DataFrame
bitcoin_sentiments_df = pd.DataFrame(bitcoin_sentiments)

# Reorder DataFrame columns
cols = ["Compound", "Negative", "Neutral", "Positive", "Text"]
bitcoin_sentiments_df = bitcoin_sentiments_df[cols]

bitcoin_sentiments_df.head()

Unnamed: 0,Compound,Negative,Neutral,Positive,Text
0,-0.6705,0.199,0.737,0.064,A former Microsoft software engineer from Ukra...
1,0.6369,0.0,0.838,0.162,Visa has partnered with cryptocurrency startup...
2,0.2144,0.0,0.947,0.053,PayPal is bringing its newly-announced support...
3,0.2023,0.0,0.95,0.05,"In November 2017, after an absolutely massive,..."
4,0.0,0.0,1.0,0.0,"Unlike ‘conventional’ cryptocurrencies, a cent..."


In [27]:
file_path = Path("../Data/")
bitcoin_sentiments_df.to_csv(file_path, index=False, encoding='utf-8-sig')

In [28]:
file_path = Path("../Data/")
bitcoin_sentiments_df.to_pickle(file_path)

In [29]:
# Create the ethereum sentiment scores DataFrame
ethereum_sentiments = []

for article in ethereum_headlines["articles"]:
    try:
        
        text = article["content"]
        sentiment = analyzer.polarity_scores(text)
        compound = sentiment["compound"]
        neg = sentiment["neg"]
        neu = sentiment["neu"]
        pos = sentiment["pos"]
        
        ethereum_sentiments.append({
            "Compound": compound,
            "Negative": neg,
            "Neutral": neu,
            "Positive": pos,
            "Text": text,
 
        })
        
    except AttributeError:
        pass
    
# Create DataFrame
ethereum_sentiments_df = pd.DataFrame(ethereum_sentiments)

# Reorder DataFrame columns
cols = ["Compound", "Negative", "Neutral", "Positive", "Text"]
ethereum_sentiments_df = ethereum_sentiments_df[cols]

ethereum_sentiments_df.head()

Unnamed: 0,Compound,Negative,Neutral,Positive,Text
0,0.2144,0.0,0.947,0.053,PayPal is bringing its newly-announced support...
1,0.0,0.0,1.0,0.0,FILE PHOTO: Representation of the Ethereum vir...
2,0.0,0.0,1.0,0.0,FILE PHOTO: Representation of the Ethereum vir...
3,0.4215,0.0,0.912,0.088,LONDON (Reuters) - Digital currencies Ethereum...
4,0.8779,0.0,0.682,0.318,"PayPal has launched the Generosity Network, a ..."


In [30]:
# Describe the Bitcoin Sentiment
bitcoin_sentiments_df.describe()

Unnamed: 0,Compound,Negative,Neutral,Positive
count,98.0,98.0,98.0,98.0
mean,0.151223,0.020847,0.922173,0.05699
std,0.338619,0.053264,0.083086,0.065976
min,-0.9468,0.0,0.637,0.0
25%,0.0,0.0,0.858,0.0
50%,0.0,0.0,0.948,0.05
75%,0.4166,0.0,1.0,0.12
max,0.8779,0.363,1.0,0.318


In [31]:
# Describe the Ethereum Sentiment
ethereum_sentiments_df.describe()

Unnamed: 0,Compound,Negative,Neutral,Positive
count,99.0,99.0,99.0,99.0
mean,0.210089,0.021768,0.904606,0.073626
std,0.365659,0.044677,0.087777,0.077356
min,-0.6705,0.0,0.653,0.0
25%,0.0,0.0,0.849,0.0
50%,0.1779,0.0,0.912,0.07
75%,0.50225,0.0,1.0,0.1295
max,0.8834,0.196,1.0,0.347


### Questions:

Q: Which coin had the highest mean positive score?

A: 

Q: Which coin had the highest compound score?

A: 

Q. Which coin had the highest positive score?

A: 

In [None]:
Answer A :Bitcoin has the higher mean positive score compared to ethereum, 0.068 vs. 0.068.
Answer B :Based on the descriptive statistics, Ethereum has the highest compound score that topped at 0.90 vs. the maximum compound score of 0.79 from Ethereum.
However, Bitcoin has compound scores higher than Ethereum on its average, minimum, 25th, 50th and 75th percentiles.     

Answer C : Ethereum has the highest positive score with its maximum at 0.21 rather than the 0.14 received by Bitcoin.
Sentiments towards Ethereum are more volatile as the standard deviations are higher than those for Bitcoin on scores for compound, negative, neutral and positive texts.

---

# Tokenizer

In this section, you will use NLTK and Python to tokenize the text for each coin. Be sure to:
1. Lowercase each word
2. Remove Punctuation
3. Remove Stopwords

In [50]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re

In [51]:
# Code to download wordnet corpora
import nltk
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mischelle.massey\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [52]:
def tokenizer(text):
    """Tokenizes text."""
# Remove the punctuation
regex = re.compile("[^a-zA-Z ]")
clean_text = regex.sub('', text)
    
# Create a list of the words
sentence = sent_tokenize(clean_text)
words = []
for sent in sentence:
    word = word_tokenize(sent)
    words = words + word    
        
# Convert the words to lowercase
words_lc = [word.lower() for word in words]  


# Remove the stop words
sw = set(stopwords.words('english'))
words_sw = [word for word in words_lc if word not in sw]



In [53]:
print(words_sw)

['coindesk', 'snagged', 'frontrow', 'seat', 'seminal', 'event', 'crypto', 'industry', 'ticket', 'price', 'ethas', 'media', 'outlet', 'covering', 'nascent', 'technologies', 'believe', 'sometimes', 'chronicle', 'chars']


In [54]:
# Expand the default stopwords list if necessary
sw_addon = {"''", 'and','...','such', 'thus',"n't", "-", '""','_','upon','for','shall','just','onto'}

In [56]:
# Complete the tokenizer function
def tokenizer(Text):
    """Tokenizes text."""
    
    # Create a list of the words
    # Remove the punctuation
    regex = re.compile("[^a-zA-Z ]")
    re_clean = regex.sub('', Text)
    words = word_tokenize(re_clean)
    
    # Lemmatize Words into root words
    lem = [lemmatizer.lemmatize(word) for word in words]
    
    # Convert the words to lowercase
    # Remove the stop words
    tokens = [word.lower() for word in lem if word.lower() not in sw]
#    tokens = [', '.join(output)] left to ngrams function
    
    return tokens


In [57]:
# Create a new tokens column for bitcoin
bitcoin_sentiments_df["Tokens"] = bitcoin_sentiments_df['Text'].apply(tokenizer)
bitcoin_sentiments_df.head()

Unnamed: 0,Compound,Negative,Neutral,Positive,Text,Tokens
0,-0.6705,0.199,0.737,0.064,A former Microsoft software engineer from Ukra...,"[former, microsoft, software, engineer, ukrain..."
1,0.6369,0.0,0.838,0.162,Visa has partnered with cryptocurrency startup...,"[visahas, partnered, cryptocurrency, startup, ..."
2,0.2144,0.0,0.947,0.053,PayPal is bringing its newly-announced support...,"[paypal, bringing, newlyannounced, support, cr..."
3,0.2023,0.0,0.95,0.05,"In November 2017, after an absolutely massive,...","[november, absolutely, massive, twomonth, rall..."
4,0.0,0.0,1.0,0.0,"Unlike ‘conventional’ cryptocurrencies, a cent...","[unlike, conventional, cryptocurrencies, centr..."


In [58]:
file_path = Path("../Data/")
bitcoin_sentiments_df.to_pickle(file_path)

In [59]:
file_path = Path("../Data/")
bitcoin_df.to_csv(file_path, index=False, encoding='utf-8-sig')

In [60]:
# Create a new tokens column for ethereum
ethereum_sentiments_df["Tokens"] = ethereum_sentiments_df['Text'].apply(tokenizer)
ethereum_sentiments_df.head()

Unnamed: 0,Compound,Negative,Neutral,Positive,Text,Tokens
0,0.2144,0.0,0.947,0.053,PayPal is bringing its newly-announced support...,"[paypal, bringing, newlyannounced, support, cr..."
1,0.0,0.0,1.0,0.0,FILE PHOTO: Representation of the Ethereum vir...,"[file, photo, representation, ethereum, virtua..."
2,0.0,0.0,1.0,0.0,FILE PHOTO: Representation of the Ethereum vir...,"[file, photo, representation, ethereum, virtua..."
3,0.4215,0.0,0.912,0.088,LONDON (Reuters) - Digital currencies Ethereum...,"[london, reuters, digital, currency, ethereum,..."
4,0.8779,0.0,0.682,0.318,"PayPal has launched the Generosity Network, a ...","[paypal, ha, launched, generosity, network, pl..."


In [61]:
file_path = Path("../Data/")
ethereum_sentiments_df.to_csv(file_path, index=False, encoding='utf-8-sig')

In [62]:
file_path = Path("../Data/")
ethereum_sentiments_df.to_pickle(file_path)

---

# NGrams and Frequency Analysis

In this section you will look at the ngrams and word frequency for each coin. 

1. Use NLTK to produce the n-grams for N = 2. 
2. List the top 10 words for each coin. 

In [63]:
from collections import Counter
from nltk import ngrams

from nltk.corpus import reuters, stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import pandas as pd
from collections import Counter

In [64]:
# Define a bigram_counter function
def bigram_counter(tokens): 
    # Combine all articles in corpus into one large string
    big_string = ' '.join(tokens)
    processed = tokenizer(big_string) 
    bigrams = ngrams(processed, n=2)
    top_10 = dict(Counter(bigrams).most_common(10))
    return pd.DataFrame(list(top_10.items()), columns=['Bigram', 'Count'])

In [65]:
# Generate the Bitcoin N-grams where N=2
tokens_bitcoin = bitcoin_sentiments_df.Tokens.astype('str')

In [66]:
print(f'Top 10 Bitcoin Bigrams')
bigram_counter(tokens_bitcoin)

Top 10 Bitcoin Bigrams


Unnamed: 0,Bigram,Count
0,"(virtual, currency)",31
1,"(reuters, bitcoin)",26
2,"(representation, virtual)",23
3,"(currency, bitcoin)",23
4,"(illustration, taken)",23
5,"(char, reuters)",21
6,"(photo, representation)",20
7,"(bitcoin, seen)",19
8,"(reuters, stafffile)",13
9,"(stafffile, photo)",13


In [67]:
# Generate the Ethereum N-grams where N=2
tokens_ethereum = ethereum_sentiments_df.Tokens.astype('str')

In [68]:
print(f'Top 10 Ethereum Bigrams')
bigram_counter(tokens_ethereum)

Top 10 Ethereum Bigrams


Unnamed: 0,Bigram,Count
0,"(virtual, currency)",10
1,"(char, bitcoin)",10
2,"(photo, representation)",8
3,"(york, reuters)",7
4,"(representation, virtual)",7
5,"(currency, bitcoin)",7
6,"(illustration, taken)",7
7,"(reuters, bitcoin)",7
8,"(char, ethereum)",7
9,"(char, new)",6


In [69]:
# Use the token_count function to generate the top 10 words from each coin
def token_count(tokens, N=10):
    """Returns the top N tokens from the frequency count"""
    return Counter(tokens).most_common(N)

In [70]:
# Get the top 10 words for Bitcoin
print(f'Top 10 Bitcoin Words')
token_count(tokens_bitcoin, N=10)

Top 10 Bitcoin Words


[("['reuters', 'stafffile', 'photo', 'representations', 'virtual', 'currency', 'bitcoin', 'placed', 'us', 'dollar', 'banknote', 'illustration', 'taken', 'may', 'reutersdado', 'ruvicillustrationlondon', 'reuters', 'char']",
  3),
 ("['london', 'reuters', 'inflation', 'hedge', 'currency', 'another', 'niche', 'asset', 'take', 'punt', 'bitcoin', 'headline', 'rekindling', 'debate', 'true', 'role', 'driver', 'behind', 'char']",
  3),
 ("['file', 'photo', 'representation', 'virtual', 'currency', 'bitcoin', 'seen', 'front', 'stock', 'graph', 'illustration', 'taken', 'november', 'reutersdado', 'ruvicreuters', 'bitcoin', 'still', 'volatile', 'char']",
  3),
 ("['new', 'york', 'reuters', 'bitcoin', 'investor', 'include', 'top', 'hedge', 'fund', 'money', 'manager', 'betting', 'virtual', 'currency', 'could', 'quintuple', 'high', 'yearfile', 'photo', 'b', 'char']",
  3),
 ("['shanghaihong', 'kong', 'reuters', 'price', 'bitcoin', 'soar', 'chinese', 'cryptocurrency', 'asset', 'manager', 'looking', 'ex

In [71]:
# Get the top 10 words for Ethereum
print(f'Top 10 Ethereum Words')
token_count(tokens_ethereum, N=10)

Top 10 Ethereum Words


[("['file', 'photo', 'representation', 'ethereum', 'virtual', 'currency', 'standing', 'pc', 'motherboard', 'seen', 'illustration', 'picture', 'february', 'reutersdado', 'ruvicillustrationlondon', 'reuters', 'char']",
  2),
 ("['new', 'york', 'reuters', 'institutional', 'investor', 'pumped', 'million', 'cryptocurrency', 'fund', 'product', 'week', 'ended', 'dec', 'second', 'highest', 'record', 'pushing', 'sector', 'asset', 'manag', 'char']",
  2),
 ("['reuters', 'stafffile', 'photo', 'representation', 'virtual', 'currency', 'bitcoin', 'seen', 'front', 'stock', 'graph', 'illustration', 'taken', 'november', 'reutersdado', 'ruvicillustrationfile', 'photo', 'char']",
  2),
 ("['reuters', 'staffa', 'representation', 'virtual', 'currency', 'bitcoin', 'seen', 'illustration', 'taken', 'november', 'reutersdado', 'ruvicillustrationnew', 'york', 'reuters', 'bitcoin', 'btcbtsp', 'rose', 'n', 'char']",
  2),
 ("['london', 'reuters', 'bitcoin', 'passed', 'tuesday', 'touch', 'highest', 'level', 'year',

# Word Clouds

In this section, you will generate word clouds for each coin to summarize the news for each coin

In [72]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = [20.0, 10.0]
from nltk.corpus import stopwords
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [73]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mischelle.massey\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [79]:
# Generate the Bitcoin word cloud
corpus = bitcoin_headlines['Articles']

def process_text(doc):
    sw = set(stopwords.words('english'))
    regex = re.compile("[^a-zA-Z ]")
    re_clean = regex.sub('', doc)
    words = word_tokenize(re_clean)
    lem = [lemmatizer.lemmatize(word) for word in words]
    output = [word.lower() for word in lem if word.lower() not in sw]
    return ' '.join(output)

big_string = ' '.join(corpus)
input_text = process_text(big_string)

wc = WordCloud().generate(input_text)
plt.imshow(wc)

KeyError: 'Articles'

In [None]:
# Generate the Ethereum word cloud
corpus = ethereum_articles["description"]

big_string = ' '.join(corpus)
input_text = process_text(big_string)

wc = WordCloud().generate(input_text)
plt.imshow(wc)

# Named Entity Recognition

In this section, you will build a named entity recognition model for both coins and visualize the tags using SpaCy.

In [None]:
import spacy
from spacy import displacy

In [None]:
# Optional - download a language model for SpaCy
!python -m spacy download en_core_web_sm

In [None]:
# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

## Bitcoin NER

In [None]:
# Concatenate all of the bitcoin text together
article = ethereum_articles["description"].str.cat()
article.head()

In [None]:
# Run the NER processor on all of the text
doc = nlp(article)

# Add a title to the document
doc.user_data['title'] = 'Bitcoin NER'

In [None]:
# Render the visualization
bitcoin_entities = displacy.render(doc, style='ent', jupyter=True)

In [None]:
# List all Entities
for ent in doc_bitcoin.ents:
    print(ent.text, ent.label_)

---

## Ethereum NER

In [None]:
# Concatenate all of the bitcoin text together
text_strings_ethereum = ethereum_sentiments_df.Text.astype('str')
text_ethereum = ' '.join(text_strings_ethereum)
text_ethereum[:500]

In [None]:
# Run the NER processor on all of the text
doc_ethereum = nlp(text_ethereum)

# Add a title to the document
doc_ethereum.user_data['title'] = 'Ethereum NER'

In [None]:
# Render the visualization
displacy.render(doc_ethereum, style='ent', jupyter=True)

In [None]:
# List all Entities
for ent in doc_ethereum.ents:
    print(ent.text, ent.label_)