## 1. Sentiment Analysis

Use the [newsapi](https://newsapi.org/) to pull the latest news articles for Bitcoin and Litecoin and create a DataFrame of sentiment scores for each coin.

In [1]:
# Initial imports
import os
import pandas as pd
from dotenv import load_dotenv
import nltk as nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

%matplotlib inline

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mrnagleJR97/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Read your api key environment variable
load_dotenv()
api_key = os.getenv("news_api")

In [3]:
# Create a newsapi client
from newsapi import NewsApiClient

In [4]:
newsapi = NewsApiClient(api_key=api_key)

In [5]:
# Fetch the Bitcoin news articles
btc_headlines = newsapi.get_everything(
    q="bitcoin",
    language="en",
    sort_by="relevancy"
)

In [6]:
# Fetch the Litecoin news articles
lite_headlines = newsapi.get_everything(
    q="litecoin",
    language="en",
    sort_by="relevancy"
)

In [7]:
# Create the Bitcoin sentiment scores DataFrame
sentiments = []

for articles in btc_headlines["articles"]:
    try:
        text = articles["content"]
        results = analyzer.polarity_scores(text)
        compound = results["compound"]
        pos = results["pos"]
        neu = results["neu"]
        neg = results["neg"]
        
        sentiments.append({
            "Compound": compound,
            "Positive": pos,
            "Negative": neg,
            "Neutral": neu,
            "text": text,
        })
    except AttributeError:
        pass
    
btc = pd.DataFrame(sentiments)
btc.head()

Unnamed: 0,Compound,Negative,Neutral,Positive,text
0,0.4215,0.0,0.902,0.098,PayPal will now allow users outside the U.S. t...
1,0.1779,0.0,0.948,0.052,A recently-installed Bitcoin ATM.\r\n\n \n\n A...
2,0.128,0.0,0.954,0.046,The government of El Salvador purchased at lea...
3,0.6187,0.0,0.847,0.153,Retailers are increasingly accepting cryptocur...
4,0.6908,0.0,0.839,0.161,"PayPal is bringing the ability to buy, hold an..."


In [8]:
# Create the Litecoin sentiment scores DataFrame
sentiments = []

for articles in lite_headlines["articles"]:
    try:
        text = articles["content"]
        results = analyzer.polarity_scores(text)
        compound = results["compound"]
        pos = results["pos"]
        neu = results["neu"]
        neg = results["neg"]
        
        sentiments.append({
            "Compound": compound,
            "Positive": pos,
            "Negative": neg,
            "Neutral": neu,
            "text": text,
        })
    except AttributeError:
        pass
    
lite = pd.DataFrame(sentiments)
lite.head()

Unnamed: 0,Compound,Negative,Neutral,Positive,text
0,0.2732,0.09,0.758,0.152,An apparently fake press release said the reta...
1,0.4215,0.0,0.902,0.098,PayPal will now allow users outside the U.S. t...
2,0.6908,0.0,0.839,0.161,"PayPal is bringing the ability to buy, hold an..."
3,0.3365,0.0,0.936,0.064,"Editor's Note: With so much market volatility,..."
4,0.1546,0.123,0.7,0.177,"""Don't worry about people stealing your ideas...."


In [9]:
# Describe the Bitcoin Sentiment
btc.describe()


Unnamed: 0,Compound,Negative,Neutral,Positive
count,20.0,20.0,20.0,20.0
mean,0.154425,0.02145,0.9167,0.06185
std,0.307322,0.037859,0.049952,0.046084
min,-0.5719,0.0,0.833,0.0
25%,0.0,0.0,0.89775,0.044
50%,0.128,0.0,0.9075,0.0505
75%,0.4068,0.043,0.951,0.09425
max,0.6908,0.115,1.0,0.161


In [10]:
# Describe the Litecoin Sentiment
lite.describe()


Unnamed: 0,Compound,Negative,Neutral,Positive
count,20.0,20.0,20.0,20.0
mean,0.068515,0.0426,0.89265,0.06475
std,0.416944,0.064033,0.083114,0.065225
min,-0.7351,0.0,0.7,0.0
25%,-0.056575,0.0,0.842,0.0
50%,0.19045,0.0,0.9035,0.0625
75%,0.35775,0.09675,0.94075,0.10775
max,0.6908,0.181,1.0,0.177


---

## 2. Natural Language Processing

1. Lowercase each word.
2. Remove Punctuation.
3. Remove Stopwords.

In [11]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re

In [12]:
# Instantiate the lemmatizer
wnl = WordNetLemmatizer()

# Create a list of stopwords
stop = stopwords.words("english")

# Expand the default stopwords list if necessary
stop.append("u")
stop.append("it'")
stop.append("'s")
stop.append("n't")
stop.append('...')
stop.append("\`")
stop.append('``')
stop.append('char')
stop.append("''")
stop = set(stop)

In [13]:
# Complete the tokenizer function
def tokenizer(text):
    """Tokenizes text."""
    
    # Create a tokenized list of the words
    words = word_tokenize(text)
    
    
    # Convert the words to lowercase
    words = list(filter(lambda w: w.lower(), words))
    
    
    # Remove the punctuation
    words = list(filter(lambda t: t not in punctuation, words))
    
    
    # Remove the stop words
    words = list(filter(lambda t: t.lower() not in stop, words))
    
    # Lemmatize words into root words
    tokens = [wnl.lemmatize(word) for word in words]
   
    return tokens


In [14]:
# Create a new tokens column for Bitcoin
btc["tokens"] = btc.text.apply(tokenizer)
btc.head()

Unnamed: 0,Compound,Negative,Neutral,Positive,text,tokens
0,0.4215,0.0,0.902,0.098,PayPal will now allow users outside the U.S. t...,"[PayPal, allow, user, outside, U.S., buy, hold..."
1,0.1779,0.0,0.948,0.052,A recently-installed Bitcoin ATM.\r\n\n \n\n A...,"[recently-installed, Bitcoin, ATM, today, Bitc..."
2,0.128,0.0,0.954,0.046,The government of El Salvador purchased at lea...,"[government, El, Salvador, purchased, least, 2..."
3,0.6187,0.0,0.847,0.153,Retailers are increasingly accepting cryptocur...,"[Retailers, increasingly, accepting, cryptocur..."
4,0.6908,0.0,0.839,0.161,"PayPal is bringing the ability to buy, hold an...","[PayPal, bringing, ability, buy, hold, sell, c..."


In [15]:
# Create a new tokens column for Litecoin
lite["tokens"] = lite.text.apply(tokenizer)
lite.head()

Unnamed: 0,Compound,Negative,Neutral,Positive,text,tokens
0,0.2732,0.09,0.758,0.152,An apparently fake press release said the reta...,"[apparently, fake, press, release, said, retai..."
1,0.4215,0.0,0.902,0.098,PayPal will now allow users outside the U.S. t...,"[PayPal, allow, user, outside, U.S., buy, hold..."
2,0.6908,0.0,0.839,0.161,"PayPal is bringing the ability to buy, hold an...","[PayPal, bringing, ability, buy, hold, sell, c..."
3,0.3365,0.0,0.936,0.064,"Editor's Note: With so much market volatility,...","[Editor, Note, much, market, volatility, stay,..."
4,0.1546,0.123,0.7,0.177,"""Don't worry about people stealing your ideas....","[worry, people, stealing, idea, idea, good, 'l..."


---

### NGrams and Frequency Analysis

In this section you will look at the ngrams and word frequency for each coin. 

1. Use NLTK to produce the n-grams for N = 2. 
2. List the top 10 words for each coin. 

In [16]:
from collections import Counter
from nltk import ngrams

In [17]:
# Generate the Bitcoin N-grams where N=2
N = 2
grams = ngrams(tokenizer(btc.text.str.cat()), N)
Counter(grams).most_common(20)

[(('El', 'Salvador'), 7),
 (('buy', 'hold'), 3),
 (('hold', 'sell'), 3),
 (('Central', 'American'), 3),
 (('American', 'country'), 3),
 (('became', 'first'), 3),
 (('char', 'PayPal'), 3),
 (('PayPal', 'allow'), 2),
 (('allow', 'user'), 2),
 (('first', 'time'), 2),
 (('allow', 'customer'), 2),
 (('sell', 'cryptocurrencies'), 2),
 (('New', 'York'), 2),
 (('Photo/File', 'Photo/File'), 2),
 (('char', 'T-Mobile'), 2),
 (('char', 'Posted'), 2),
 (('Posted', 'El'), 2),
 (('El', 'Zonte'), 2),
 (('Zonte', 'El'), 2),
 (('Salvador', 'home'), 2)]

In [18]:
# Generate the Ethereum N-grams where N=2
N = 2
grams = ngrams(tokenizer(lite.text.str.cat()), N)
Counter(grams).most_common(20)

[(('press', 'release'), 5),
 (('char', 'PayPal'), 4),
 (('buy', 'hold'), 3),
 (('hold', 'sell'), 3),
 (('first', 'time'), 3),
 (('illustration', 'taken'), 3),
 (('2021', 'REUTERS/Dado'), 3),
 (('fake', 'press'), 2),
 (('PayPal', 'allow'), 2),
 (('allow', 'user'), 2),
 (('allow', 'customer'), 2),
 (('sell', 'cryptocurrencies'), 2),
 (('social', 'medium'), 2),
 (('apparently', 'fake'), 1),
 (('release', 'said'), 1),
 (('said', 'retailer'), 1),
 (('retailer', 'would'), 1),
 (('would', 'begin'), 1),
 (('begin', 'accepting'), 1),
 (('accepting', 'cryptocurrency'), 1)]

In [19]:
# Function token_count generates the top 10 words for a given coin
def token_count(tokens, N=3):
    """Returns the top N tokens from the frequency count"""
    return Counter(tokens).most_common(N)

In [20]:
# Use token_count to get the top 10 words for Bitcoin
all_tokens = tokenizer(btc.text.str.cat())
token_count(all_tokens, 10)

[('char', 18),
 ('cryptocurrency', 10),
 ('El', 10),
 ('bitcoin', 9),
 ('first', 8),
 ('Salvador', 8),
 ('Bitcoin', 6),
 ('country', 6),
 ('PayPal', 5),
 ('allow', 4)]

In [21]:
# Use token_count to get the top 10 words for Ethereum
all_tokens = tokenizer(lite.text.str.cat())
token_count(all_tokens, 10)

[('char', 18),
 ('release', 7),
 ('cryptocurrency', 7),
 ('PayPal', 7),
 ('…', 6),
 ('Bitcoin', 6),
 ('fake', 5),
 ('press', 5),
 ('said', 5),
 ('Walmart', 5)]

---

### Word Clouds

In this section, you will generate word clouds for each coin to summarize the news for each coin

In [22]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = [20.0, 10.0]

ModuleNotFoundError: No module named 'wordcloud'

In [None]:
# Generate the Bitcoin word cloud
def wordcloud(text, title=""):
    df_cloud = WordCloud(width=500, colormap='RdYlBu').generate(text)
    plt.imshow(df_cloud)
    plt.axis("off")
    fontdict = {"fontsize": 48, "fontweight" : "bold"}
    plt.title(title, fontdict=fontdict)
    plt.show()
                

In [None]:
wordcloud(btc.text.str.cat(), title="Bitcoin Word Cloud")

In [None]:
# Generate the Ethereum word cloud
wordcloud(lite.text.str.cat(), title="Litecoin Word Cloud")

---
## 3. Named Entity Recognition

In this section, you will build a named entity recognition model for both Bitcoin and Litecoin, then visualize the tags using SpaCy.

In [None]:
import spacy
from spacy import displacy

In [None]:
# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

---
### Bitcoin NER

In [None]:
# Concatenate all of the Bitcoin text together
all_btc_text = btc.text.str.cat()
all_btc_text

In [None]:
# Run the NER processor on all of the text
doc = nlp(all_btc_text)

# Add a title to the document
doc.user_data["title"] = "Bitcoin NER"

In [None]:
# Render the visualization
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# List all Entities
for ent in doc.ents:
    print(ent.text, ent.label_)

---

### Litecoin NER

In [None]:
# Concatenate all of the Ethereum text together
all_lite_text = lite.text.str.cat()
all_lite_text

In [None]:
# Run the NER processor on all of the text
doc = nlp(all_lite_text)

# Add a title to the document
doc.user_data["title"] = "Litecoin NER"

In [None]:
# Render the visualization
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# List all Entities
for ent in doc.ents:
    print(ent.text, ent.label_)

---