# Import Libraries and Dependencies

In [None]:
import os
import pandas as pd
%matplotlib inline
from dotenv import load_dotenv
from newsapi.newsapi_client import NewsApiClient
import nltk

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download/Update the VADER Lexicon
# NOTE(review): presumably the analyzer below loads this lexicon, so the
# download has to run before it is constructed — confirm.
nltk.download("vader_lexicon")

# Module-level analyzer; get_sentiment_scores calls analyzer.polarity_scores on it.
analyzer = SentimentIntensityAnalyzer()

# News Headlines Sentiment

### This section of the workbook analyzes the sentiment of the news headlines for Bitcoin and Ethereum.

In [None]:
# Read the API key from a .env file.
# The file location can be overridden with the DOTENV_PATH environment variable
# so the notebook is not tied to a single machine; the original path remains
# the default for backward compatibility.
dotenv_path = os.getenv("DOTENV_PATH", 'C:/Users/markf/Desktop/Fintech SMU/API_Keys/.env')
load_dotenv(dotenv_path)

# Set News API Key
api_key = os.getenv("NEWS_API")

# Fail here with a clear message rather than with an opaque error on the
# first News API request further down the notebook.
if not api_key:
    raise RuntimeError(
        "NEWS_API is not set; checked the environment and the .env file at "
        f"{dotenv_path!r}"
    )


In [None]:
# Create a newsapi client
# Module-level client authenticated with api_key; get_articles_df uses it
# for its get_everything request.
newsapi = NewsApiClient(api_key=api_key)


In [None]:
# Fetch up to 20 news articles for a topic (sorted by relevancy) and transform
# the information into a DataFrame of title, description, text and date
def get_articles_df(topic):
    """Fetch news articles about ``topic`` and return them as a DataFrame.

    Queries the module-level ``newsapi`` client for up to 20 English-language
    articles sorted by relevancy.

    Parameters
    ----------
    topic : str
        Search query passed to ``newsapi.get_everything``.

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns ``title``, ``description``,
        ``text`` (the article's ``content`` field) and ``date`` (the first
        10 characters of ``publishedAt``). Fields missing from the response
        are stored as ``None`` so callers can filter them with ``dropna()``.
        The columns are present even when no article is returned.
    """
    news = newsapi.get_everything(q=topic, language="en", page_size=20, sort_by='relevancy')

    articles = []
    for article in news.get('articles') or []:
        # Skip malformed entries explicitly instead of swallowing exceptions.
        if not isinstance(article, dict):
            continue

        published = article.get("publishedAt")
        articles.append({
            "title": article.get("title"),
            "description": article.get("description"),
            "text": article.get("content"),
            # Slicing None would raise TypeError, so guard the missing case.
            "date": published[:10] if published else None,
        })

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(articles, columns=["title", "description", "text", "date"])

In [None]:
# Fetch the Bitcoin news articles
# Drop articles with missing fields, then renumber the index 0..n-1:
# get_sentiment_scores joins its scores back on the index, so the gaps
# that dropna() leaves behind would otherwise misalign rows and scores.
bitcoin_df = get_articles_df('bitcoin').dropna().reset_index(drop=True)
bitcoin_df.head()

In [None]:
# Fetch the Ethereum news articles
# Drop articles with missing fields, then renumber the index 0..n-1:
# get_sentiment_scores joins its scores back on the index, so the gaps
# that dropna() leaves behind would otherwise misalign rows and scores.
ethereum_df = get_articles_df('ethereum').dropna().reset_index(drop=True)
ethereum_df.head()

In [None]:
# Sentiment calculation based on compound score
def get_sentiment(score):
    """
    Map a compound score to a sentiment label.

    Returns 1 when the score is at least 0.05 (positive), -1 when it is at
    most -0.05 (negative), and 0 (neutral) for anything else.
    """
    if score >= 0.05:
        return 1
    if score <= -0.05:
        return -1
    return 0

In [None]:
def get_sentiment_scores(df):
    """Append VADER sentiment scores for the ``text`` column to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column. Its index may have gaps (for example
        after ``dropna()``).

    Returns
    -------
    pandas.DataFrame
        ``df`` joined with the columns ``Compound``, ``Positive``,
        ``Neutral``, ``Negative`` (from ``analyzer.polarity_scores``) and
        ``Sentiment`` (from ``get_sentiment`` applied to the compound score).
        Rows whose ``text`` is not a string get NaN in all five columns.
    """
    score_columns = ["Compound", "Positive", "Neutral", "Negative", "Sentiment"]
    missing = float("nan")

    rows = []
    for text in df["text"]:
        if not isinstance(text, str):
            # Keep a placeholder so every score row still lines up with its
            # article instead of shifting the rows that follow.
            rows.append(dict.fromkeys(score_columns, missing))
            continue

        scores = analyzer.polarity_scores(text)
        rows.append({
            "Compound": scores["compound"],
            "Positive": scores["pos"],
            "Neutral": scores["neu"],
            "Negative": scores["neg"],
            "Sentiment": get_sentiment(scores["compound"]),
        })

    # Build the scores on df's own index: join() aligns on index labels, so a
    # fresh 0..n-1 index would pair scores with the wrong articles whenever
    # df's index has gaps.
    sentiment_df = pd.DataFrame(rows, columns=score_columns, index=df.index)

    return df.join(sentiment_df)


In [None]:
# Score the Bitcoin articles, then keep only the text and the score columns
# by dropping the unnecessary date/description/title columns
bitcoin_sentiment_df = (
    get_sentiment_scores(bitcoin_df)
    .drop(columns=['date', 'description', 'title'])
)

In [None]:
# View final dataframe
# Preview of the first rows of the Bitcoin text + sentiment score frame.
bitcoin_sentiment_df.head()

In [None]:
# Create the ethereum sentiment scores DataFrame
# get_sentiment_scores returns ethereum_df joined with the Compound, Positive,
# Neutral, Negative and Sentiment columns computed from each article's text.
ethereum_sentiment_df = get_sentiment_scores(ethereum_df)

In [None]:
# Drop unnecessary columns so only the text and the sentiment scores remain.
# Note: this rebinds ethereum_sentiment_df, so re-running the cell without
# re-running the one above raises because the columns are already gone.
ethereum_sentiment_df = ethereum_sentiment_df.drop(columns = ['date', 'description','title'])

In [None]:
# View scores
# head(100) shows up to the first 100 rows; get_articles_df requests a
# page size of 20, so this presumably displays every Ethereum row.
ethereum_sentiment_df.head(100)

In [None]:
# Describe the Bitcoin Sentiment
# Summary statistics of the Bitcoin score columns, used for the
# comparison written up in the Analysis section below.
bitcoin_sentiment_df.describe()

In [None]:
# Describe the Ethereum Sentiment
# Summary statistics of the Ethereum score columns, used for the
# comparison written up in the Analysis section below.
ethereum_sentiment_df.describe()

### Analysis:

Q: Which coin had the highest mean positive score?

A: Ethereum

Q: Which coin had the highest compound score?

A: Ethereum

Q: Which coin had the highest positive score?

A: Ethereum

---

# Tokenizer

This section uses NLTK and Python to tokenize the text for each coin.

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re

In [None]:
# Instantiate the lemmatizer
# Note: tokenizer() creates its own local WordNetLemmatizer, so this
# module-level instance is not the one used inside that function.
lemmatizer = WordNetLemmatizer()

In [None]:
# Complete the tokenizer function
def tokenizer(text):
    """Return the lemmatized, lowercased, stop-word-free word tokens of ``text``.

    Steps:
      1. Remove every character that is neither an ASCII letter nor whitespace.
      2. Split the cleaned text into words with ``word_tokenize``.
      3. Lowercase each word and drop English stop words.
      4. Lemmatize the remaining words with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    lemmatizer = WordNetLemmatizer()

    # Build the stop-word set once per call rather than once per word.
    sw = set(stopwords.words('english'))

    # Keep all whitespace (not only spaces): stripping newlines or tabs
    # would fuse the words on either side of them into a single token.
    regex = re.compile(r"[^a-zA-Z\s]")
    re_clean = regex.sub('', text)

    words = word_tokenize(re_clean)
    lowered = (word.lower() for word in words)
    return [lemmatizer.lemmatize(word) for word in lowered if word not in sw]


In [None]:
# Create a new tokens column for bitcoin
# Each cell of 'tokens' holds the list returned by tokenizer() for that row's text.
bitcoin_sentiment_df['tokens'] = bitcoin_sentiment_df['text'].apply(tokenizer)


In [None]:
# Preview the Bitcoin frame, now including the tokens column
bitcoin_sentiment_df.head()

In [None]:
# Create a new tokens column for ethereum
# Each cell of 'tokens' holds the list returned by tokenizer() for that row's text.
ethereum_sentiment_df['tokens'] = ethereum_sentiment_df['text'].apply(tokenizer)
# Preview the Ethereum frame, now including the tokens column
ethereum_sentiment_df.head()

---

# NGrams and Frequency Analysis

This section looks at the ngrams and word frequency for each coin. 

1. N-grams for N = 2. 
2. List the top 10 words for each coin. 

In [None]:
from collections import Counter
from nltk import ngrams

In [None]:
# Flatten the per-article token lists into a single list of tokens per coin
bitcoin_tokens = [
    token
    for article_tokens in bitcoin_sentiment_df["tokens"]
    for token in article_tokens
]
ethereum_tokens = [
    token
    for article_tokens in ethereum_sentiment_df["tokens"]
    for token in article_tokens
]

In [None]:
# Count the most frequent n-grams in a token list (top 20 bigrams by default)
def ngram_counts(corpus, n=2, top=20):
    """Return the most common n-grams of ``corpus`` as a DataFrame.

    Parameters
    ----------
    corpus : iterable of str
        Tokens, in document order.
    n : int, optional
        Size of each n-gram. Defaults to 2 (bigrams).
    top : int, optional
        Number of most frequent n-grams to keep. Defaults to 20.

    Returns
    -------
    pandas.DataFrame
        Columns ``ngram`` (a tuple of tokens) and ``count``, ordered from
        most to least frequent.
    """
    counts = Counter(ngrams(corpus, n=n))
    # most_common() already yields (ngram, count) pairs in descending order.
    return pd.DataFrame(counts.most_common(top), columns=['ngram', 'count'])

In [None]:
# Generate the Bitcoin N-grams where N=2
# Displays the most frequent bigrams across all Bitcoin tokens.
ngram_counts(bitcoin_tokens)

In [None]:
# Generate the Ethereum N-grams where N=2
# Displays the most frequent bigrams across all Ethereum tokens.
ngram_counts(ethereum_tokens)

In [None]:
# Use the token_count function to generate the top 10 words from each coin
def token_count(tokens, N=10):
    """Build a frequency table of the N most common tokens.

    The result is a DataFrame with a 'word' column and a 'count' column,
    ordered from most to least frequent.
    """
    most_frequent = Counter(tokens).most_common(N)
    return pd.DataFrame(most_frequent, columns=['word', 'count'])

In [None]:
# Get the top 10 words for Bitcoin
# Displays a word/count table built from the flattened Bitcoin tokens.
token_count(bitcoin_tokens, 10)

In [None]:
# Get the top 10 words for Ethereum
# Displays a word/count table built from the flattened Ethereum tokens.
token_count(ethereum_tokens, 10)

# Word Clouds



In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib as mpl

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases removed the old names, so use whichever name this
# installation provides instead of failing on newer versions.
for style_name in ('seaborn-whitegrid', 'seaborn-v0_8-whitegrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# Default figure size (in inches) for the word clouds below
mpl.rcParams['figure.figsize'] = [20.0, 10.0]

In [None]:
# Build the Bitcoin word cloud from the space-joined tokens and draw it
# without axes under a large bold title
title_font = dict(fontsize=50, fontweight="bold")
wordcloud = WordCloud(colormap="RdYlBu").generate(
    " ".join(bitcoin_tokens)
)
plt.imshow(wordcloud)
plt.title("Bitcoin Word Cloud", fontdict=title_font)
plt.axis("off")
plt.show()

In [None]:
# Build the Ethereum word cloud from the space-joined tokens and draw it
# without axes under a large bold title
title_font = dict(fontsize=50, fontweight="bold")
wordcloud = WordCloud(colormap="RdYlBu").generate(
    " ".join(ethereum_tokens)
)
plt.imshow(wordcloud)
plt.title("Ethereum Word Cloud", fontdict=title_font)
plt.axis("off")
plt.show()

# Named Entity Recognition

This section builds a named entity recognition model for both coins and visualizes the tags using SpaCy.

In [None]:
import spacy
from spacy import displacy

In [None]:
# Optional - download a language model for SpaCy
# !python -m spacy download en_core_web_sm

In [None]:
# Load the spaCy model
# Requires the 'en_core_web_sm' model to be installed (see the optional
# download command in the cell above).
nlp = spacy.load('en_core_web_sm')

## Bitcoin NER

In [None]:
# Concatenate all of the bitcoin text together
# Uses the raw article text from bitcoin_df (not the tokens), joined with spaces.
bitcoin_text = " ".join(bitcoin_df.text.to_list())
# Display the full concatenated string
bitcoin_text

In [None]:
# Run the NER processor on all of the text
bitcoin_doc = nlp(bitcoin_text)

# Add a title to the document
# NOTE(review): presumably displaCy shows this as the heading of the
# rendered visualization — confirm.
bitcoin_doc.user_data["title"] = "Bitcoin NER"


In [None]:
# Render the visualization
# style='ent' selects displaCy's entity view for the Bitcoin document.
displacy.render(bitcoin_doc, style='ent')

In [None]:
# List all Entities
# Prints one line per entity found in the Bitcoin text: its text, then its label.
for ent in bitcoin_doc.ents:
    print(ent.text, ent.label_)

---

## Ethereum NER

In [None]:
# Concatenate all of the ethereum text together
# Uses the raw article text from ethereum_df (not the tokens), joined with spaces.
ethereum_text = " ".join(ethereum_df.text.to_list())
# Display the full concatenated string
ethereum_text

In [None]:
# Run the NER processor on all of the text
ethereum_doc = nlp(ethereum_text)

# Add a title to the document
# NOTE(review): presumably displaCy shows this as the heading of the
# rendered visualization — confirm.
ethereum_doc.user_data["title"] = "Ethereum NER"


In [None]:
# Render the visualization
# style='ent' selects displaCy's entity view for the Ethereum document.
displacy.render(ethereum_doc, style='ent')

In [None]:
# List all Entities
# Prints one line per entity found in the Ethereum text: its text, then its label.
for ent in ethereum_doc.ents:
    print(ent.text, ent.label_)