In [1]:
import os
import pandas as pd
from newsapi import NewsApiClient
%matplotlib inline

In [2]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# News Headlines Sentiment

Use the news api to pull the latest news articles for bitcoin and ethereum and create a DataFrame of sentiment scores for each coin. 

Use descriptive statistics to answer the following questions:
1. Which coin had the highest mean positive score?
2. Which coin had the highest negative score?
3. Which coin had the highest positive score?

In [3]:
# Read your api key environment variable
api_key = os.getenv("news_api")

In [4]:
# Create a newsapi client
newsapi = NewsApiClient(api_key=api_key)


In [5]:
# Fetch the Bitcoin news articles
bitcoin_news=newsapi.get_everything(q='Bitcoin',language='en',page_size=100)
bitcoin_news['articles'][0]

{'source': {'id': 'wired', 'name': 'Wired'},
 'author': 'Laura Mallonee',
 'title': 'Inside the Icelandic Facility Where Bitcoin Is Mined',
 'description': "Cryptocurrency mining now uses more of the Nordic island nation's electricity than its homes.",
 'url': 'https://www.wired.com/story/iceland-bitcoin-mining-gallery/',
 'urlToImage': 'https://media.wired.com/photos/5dbc37a4c955950008b26751/191:100/w_1280,c_limit/photo_barnard_explosions_4.jpg',
 'publishedAt': '2019-11-03T15:00:00Z',
 'content': 'Less than two miles from Icelands Reykjavik airport sits a nondescript metal building as monolithic and drab as a commercial poultry barn. Theres a deafening racket inside, too, but it doesnt come from clucking chickens. Instead, tens of thousands of whirring… [+3426 chars]'}

In [6]:
# Fetch the Ethereum news articles
ethereum_news=newsapi.get_everything(q='Ethereum',language='en',page_size=100)
ethereum_news['articles'][0]

{'source': {'id': 'mashable', 'name': 'Mashable'},
 'author': 'Miller Kern',
 'title': 'Debunking Blockchain once and for all',
 'description': "TL;DR: The in-depth Complete Blockchain and Ethereum Programmer bundle is on sale for just $24 with the code 20LEARN20. When it comes to Bitcoin, you get the gist: Cryptocurrency is on the rise right now, it's a worthy investment, and it when it comes to top …",
 'url': 'https://mashable.com/shopping/oct-20-blockchain-and-ethereum-programmer-online-courses/',
 'urlToImage': 'https://mondrian.mashable.com/2019%252F10%252F20%252F20%252Fd451f6dc63634e7ebc1a6884f0de511f.0108b.jpg%252F1200x630.jpg?signature=wXr1QUIZY8rWVXkvW6qxTgQk24E=',
 'publishedAt': '2019-10-20T09:00:00Z',
 'content': "TL;DR: The in-depth Complete Blockchain and Ethereum Programmer bundle is on sale for just $24 with the code 20LEARN20.\r\nWhen it comes to Bitcoin, you get the gist: Cryptocurrency is on the rise right now, it's a worthy investment, and it when it comes to top… 

In [8]:
def generate_sentiments(articles):
    sentiments=[]
    for article in articles:
        try:
            text = article["content"]
            date = article["publishedAt"][:10]
            sentiment = analyzer.polarity_scores(text)
            compound = sentiment["compound"]
            pos = sentiment["pos"]
            neu = sentiment["neu"]
            neg = sentiment["neg"]

            sentiments.append({
                "Text": text,
                "Date": date,
                "Compound": compound,
                "Positive": pos,
                "Negative": neg,
                "Neutral": neu

            })
        except AttributeError:
            pass
                
    return sentiments

In [61]:
# Create the Bitcoin sentiment scores DataFrame
bitcoin_sentiments=pd.DataFrame(generate_sentiments(bitcoin_news['articles']))
cols = ['Compound','Negative','Neutral','Positive','Text']
bitcoin_sentiments = bitcoin_sentiments[cols]

bitcoin_sentiments.head()


Unnamed: 0,Compound,Negative,Neutral,Positive,Text
0,-0.1531,0.039,0.961,0.0,Less than two miles from Icelands Reykjavik ai...
1,0.2263,0.0,0.96,0.04,At least that's the idea. While it's not as in...
2,0.0,0.0,1.0,0.0,High school students in France may be among th...
3,0.4404,0.0,0.941,0.059,TL;DR: The in-depth Complete Blockchain and Et...
4,0.507,0.0,0.931,0.069,Our robot colleague Satoshi Nakaboto writes ab...


In [11]:
# Create the ethereum sentiment scores DataFrame
ethereum_sentiments=pd.DataFrame(generate_sentiments(ethereum_news['articles']))
cols = ['Compound','Negative','Neutral','Positive','Text']
ethereum_sentiments = ethereum_sentiments[cols]

ethereum_sentiments.head()

Unnamed: 0,Compound,Negative,Neutral,Positive,Text
0,0.4404,0.0,0.941,0.059,TL;DR: The in-depth Complete Blockchain and Et...
1,0.6249,0.0,0.9,0.1,Disrupt Berlin is right around the corner. And...
2,0.2263,0.0,0.96,0.04,At least that's the idea. While it's not as in...
3,0.4601,0.0,0.915,0.085,"he plaBy now, you should almost certainly know..."
4,-0.3089,0.055,0.945,0.0,Blockchain has been picking up the pace since ...


In [12]:
# Describe the Bitcoin Sentiment
bitcoin_sentiments.describe()

Unnamed: 0,Compound,Negative,Neutral,Positive
count,97.0,97.0,97.0,97.0
mean,0.145687,0.039804,0.899031,0.061186
std,0.43985,0.054928,0.075462,0.053192
min,-0.8316,0.0,0.64,0.0
25%,-0.1027,0.0,0.857,0.0
50%,0.1779,0.0,0.926,0.066
75%,0.507,0.068,0.945,0.087
max,0.9141,0.271,1.0,0.249


In [13]:
# Describe the Ethereum Sentiment
ethereum_sentiments.describe()

Unnamed: 0,Compound,Negative,Neutral,Positive
count,96.0,96.0,96.0,96.0
mean,0.178518,0.026906,0.916385,0.056698
std,0.410332,0.048792,0.072458,0.055863
min,-0.8779,0.0,0.551,0.0
25%,0.0,0.0,0.8895,0.0
50%,0.25,0.0,0.93,0.057
75%,0.5021,0.04525,0.958,0.0985
max,0.8221,0.287,1.0,0.239


### Questions:

Q: Which coin had the highest mean positive score?

A: 

Q: Which coin had the highest compound score?

A: 

Q. Which coin had the highest positive score?

A: 

---

# Tokenizer

In this section, you will use NLTK and Python to tokenize the text for each coin. Be sure to:
1. Lowercase each word
2. Remove Punctuation
3. Remove Stopwords

In [14]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re

In [None]:
# Expand the default stopwords list if necessary

In [59]:
# Complete the tokenizer function
def tokenizer(text):
    regex = re.compile("[^a-zA-Z ]")
    lemmatizer = WordNetLemmatizer()
    sentence_clean=regex.sub('', text)
    words=word_tokenize(sentence_clean)
    lem = [lemmatizer.lemmatize(word) for word in words]
    sw = set(stopwords.words('english'))
    sw_addon = {'(',')',',','.','[',']',':',':'}
    output = [word.lower() for word in lem if word.lower() not in sw.union(sw_addon)]
    return output


In [62]:
# Create a new tokens column for bitcoin
bitcoin_sentiments['Tokens']=[tokenizer(r) for r in bitcoin_sentiments['Text']]
bitcoin_sentiments

Unnamed: 0,Compound,Negative,Neutral,Positive,Text,Tokens
0,-0.1531,0.039,0.961,0.000,Less than two miles from Icelands Reykjavik ai...,"[less, two, mile, icelands, reykjavik, airport..."
1,0.2263,0.000,0.960,0.040,At least that's the idea. While it's not as in...,"[least, thats, idea, intensive, mining, bitcoi..."
2,0.0000,0.000,1.000,0.000,High school students in France may be among th...,"[high, school, student, france, may, among, fi..."
3,0.4404,0.000,0.941,0.059,TL;DR: The in-depth Complete Blockchain and Et...,"[tldr, indepth, complete, blockchain, ethereum..."
4,0.5070,0.000,0.931,0.069,Our robot colleague Satoshi Nakaboto writes ab...,"[robot, colleague, satoshi, nakaboto, writes, ..."
...,...,...,...,...,...,...
92,0.0000,0.000,1.000,0.000,Bitcoin prices have quietly been in meltdown m...,"[bitcoin, price, quietly, meltdown, mode, rece..."
93,0.1531,0.037,0.911,0.051,Illustration by Stephen Shankland/CNET\r\nAfte...,"[illustration, stephen, shanklandcnetafter, fo..."
94,0.2023,0.047,0.887,0.067,After seeing its IPO dreams slashed earlier th...,"[seeing, ipo, dream, slashed, earlier, year, b..."
95,-0.7140,0.184,0.736,0.080,"I hate to be the bearer of bad news, but victi...","[hate, bearer, bad, news, victim, lost, money,..."


In [63]:
# Create a new tokens column for ethereum
ethereum_sentiments['Tokens']=[tokenizer(r) for r in ethereum_sentiments['Text']]
ethereum_sentiments

Unnamed: 0,Compound,Negative,Neutral,Positive,Text,Tokens
0,0.4404,0.000,0.941,0.059,TL;DR: The in-depth Complete Blockchain and Et...,"[tldr, indepth, complete, blockchain, ethereum..."
1,0.6249,0.000,0.900,0.100,Disrupt Berlin is right around the corner. And...,"[disrupt, berlin, right, around, corner, plent..."
2,0.2263,0.000,0.960,0.040,At least that's the idea. While it's not as in...,"[least, thats, idea, intensive, mining, bitcoi..."
3,0.4601,0.000,0.915,0.085,"he plaBy now, you should almost certainly know...","[plaby, almost, certainly, know, ethereum, eth..."
4,-0.3089,0.055,0.945,0.000,Blockchain has been picking up the pace since ...,"[blockchain, ha, picking, pace, since, incepti..."
...,...,...,...,...,...,...
91,-0.5423,0.106,0.894,0.000,A Spanish financial watchdog has issued a warn...,"[spanish, financial, watchdog, ha, issued, war..."
92,0.3818,0.000,0.936,0.064,Noelle Acheson is a veteran of company analysi...,"[noelle, acheson, veteran, company, analysis, ..."
93,0.8221,0.000,0.798,0.202,"Ethereum co-founder and founder of Consensys, ...","[ethereum, cofounder, founder, consensys, jose..."
94,0.1027,0.000,0.968,0.032,Ethereum and Bitcoin (BTC) did not have to com...,"[ethereum, bitcoin, btc, comply, regulation, a..."


---

# NGrams and Frequency Analysis

In this section you will look at the ngrams and word frequency for each coin. 

1. Use NLTK to produce the n-grams for N = 2. 
2. List the top 10 words for each coin. 

In [64]:
from collections import Counter
from nltk import ngrams

In [73]:
# Generate the Bitcoin N-grams where N=2
bitcoin_bigram_counts = [dict(Counter(ngrams(r, n=2))) for r in bitcoin_sentiments.Tokens]
bitcoin_bigram_counts[0]

{('less', 'two'): 1,
 ('two', 'mile'): 1,
 ('mile', 'icelands'): 1,
 ('icelands', 'reykjavik'): 1,
 ('reykjavik', 'airport'): 1,
 ('airport', 'sits'): 1,
 ('sits', 'nondescript'): 1,
 ('nondescript', 'metal'): 1,
 ('metal', 'building'): 1,
 ('building', 'monolithic'): 1,
 ('monolithic', 'drab'): 1,
 ('drab', 'commercial'): 1,
 ('commercial', 'poultry'): 1,
 ('poultry', 'barn'): 1,
 ('barn', 'theres'): 1,
 ('theres', 'deafening'): 1,
 ('deafening', 'racket'): 1,
 ('racket', 'inside'): 1,
 ('inside', 'doesnt'): 1,
 ('doesnt', 'come'): 1,
 ('come', 'clucking'): 1,
 ('clucking', 'chicken'): 1,
 ('chicken', 'instead'): 1,
 ('instead', 'ten'): 1,
 ('ten', 'thousand'): 1,
 ('thousand', 'whirring'): 1,
 ('whirring', 'char'): 1}

In [74]:
# Generate the Ethereum N-grams where N=2
ethereum_bigram_counts = [dict(Counter(ngrams(r, n=2))) for r in ethereum_sentiments.Tokens]
ethereum_bigram_counts[0]

{('tldr', 'indepth'): 1,
 ('indepth', 'complete'): 1,
 ('complete', 'blockchain'): 1,
 ('blockchain', 'ethereum'): 1,
 ('ethereum', 'programmer'): 1,
 ('programmer', 'bundle'): 1,
 ('bundle', 'sale'): 1,
 ('sale', 'code'): 1,
 ('code', 'learnwhen'): 1,
 ('learnwhen', 'come'): 1,
 ('come', 'bitcoin'): 1,
 ('bitcoin', 'get'): 1,
 ('get', 'gist'): 1,
 ('gist', 'cryptocurrency'): 1,
 ('cryptocurrency', 'rise'): 1,
 ('rise', 'right'): 1,
 ('right', 'worthy'): 1,
 ('worthy', 'investment'): 1,
 ('investment', 'come'): 1,
 ('come', 'top'): 1,
 ('top', 'char'): 1}

In [81]:
# Use the token_count function to generate the top 10 words from each coin
def token_count(tokens, N=10):
    bigrams = ngrams(tokens, n=2)
    top_10 = dict(Counter(bigrams).most_common(N))
    return pd.DataFrame(list(top_10.items()), columns=['bigram', 'count'])
   

In [82]:
# Get the top 10 words for Bitcoin
bitcoin_tokens=[' '.join(r) for r in bitcoin_sentiments.Tokens]
bitcoin_df=token_count(bitcoin_tokens)
bitcoin_df

Unnamed: 0,bigram,count
0,(less two mile icelands reykjavik airport sits...,1
1,(least thats idea intensive mining bitcoin run...,1
2,(high school student france may among first pe...,1
3,(tldr indepth complete blockchain ethereum pro...,1
4,(robot colleague satoshi nakaboto writes bitco...,1
5,(robot colleague satoshi nakaboto writes bitco...,1
6,(robot colleague satoshi nakaboto writes bitco...,1
7,(robot colleague satoshi nakaboto writes bitco...,1
8,(twitter square ceo jack dorsey ha attended bi...,1
9,(robot colleague satoshi nakaboto writes bitco...,1


In [None]:
# Get the top 10 words for Ethereum

# Word Clouds

In this section, you will generate word clouds for each coin to summarize the news for each coin

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = [20.0, 10.0]

In [None]:
# Generate the Bitcoin word cloud

In [None]:
# Generate the Ethereum word cloud

# Named Entity Recognition

In this section, you will build a named entity recognition model for both coins and visualize the tags using SpaCy.

In [None]:
import spacy
from spacy import displacy

In [None]:
# Optional - download a language model for SpaCy
# !python -m spacy download en_core_web_sm

In [None]:
# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

## Bitcoin NER

In [None]:
# Concatenate all of the bitcoin text together

In [None]:
# Run the NER processor on all of the text

# Add a title to the document

In [None]:
# Render the visualization

In [None]:
# List all Entities

---

## Ethereum NER

In [None]:
# Concatenate all of the bitcoin text together

In [None]:
# Run the NER processor on all of the text

# Add a title to the document

In [None]:
# Render the visualization

In [None]:
# List all Entities