In [1]:
import os

# Move the working directory one level up before anything is loaded by
# relative path (".env" below).
# NOTE(review): this is not idempotent — every re-run of this cell climbs one
# more directory level. Presumably the notebook lives one level below the
# project root; confirm and consider guarding the call.
os.chdir("../")

from dotenv import load_dotenv

# Enable automatic reloading of imported modules (mode 2).
%load_ext autoreload
%autoreload 2

# Load environment variables from the ".env" file in the new working directory.
load_dotenv(".env")

True

In [2]:
import black
import jupyter_black

# Turn on Black auto-formatting for executed cells: non-Lab frontend,
# 79-column lines, Python 3.10 syntax, INFO-level logging.
jupyter_black.load(
    target_version=black.TargetVersion.PY310,
    line_length=79,
    verbosity="INFO",
    lab=False,
)

<IPython.core.display.Javascript object>

In [3]:
from datetime import datetime
import http.client, urllib.parse
import pytz
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import constants

%load_ext autoreload
%autoreload 2

# Single foreground colour shared by all figure text, axis labels and ticks.
TEXT_COLOR = "#313131"

# Colour cycle for plotted lines (Plotly colors).
LINE_COLORS = [
    "#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#FFA15A",
    "#19D3F3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52",
]

# Global seaborn/matplotlib theme. The rc overrides are grouped by the value
# they share; the keys are independent, so their order does not matter.
sns.set(
    style="darkgrid",
    rc={
        "figure.figsize": (6, 4),
        "figure.dpi": 100,
        "font.family": "Microsoft Sans Serif",
        # White figure and legend backgrounds.
        **dict.fromkeys(("figure.facecolor", "legend.facecolor"), "w"),
        # Every text element uses TEXT_COLOR.
        **dict.fromkeys(
            ("text.color", "axes.labelcolor", "xtick.color", "ytick.color"),
            TEXT_COLOR,
        ),
    },
)

# Make LINE_COLORS the default colour cycle.
sns.set_palette(sns.color_palette(LINE_COLORS))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load news data

In [4]:
from src.dataloaders import MediaStackNewsScraper

In [55]:
# MediaStack scraper constructed with the single keyword "bitcoin".
scraper = MediaStackNewsScraper(keywords=["bitcoin"])

In [56]:
# Bounds later passed to `scraper.load_data`, converted from strings by the
# scraper's own helper.
# NOTE(review): the strings carry no timezone — presumably interpreted as UTC;
# confirm in `str_to_timestamp`.
start = scraper.str_to_timestamp("2023-05-01 00:00:00")
end = scraper.str_to_timestamp("2023-05-02 00:00:00")

In [78]:
# Fetch the articles for one specific date.
# NOTE(review): `data` is reassigned by the `load_data` cell further down.
data = scraper.get_data_for_date("2023-05-03")

In [78]:
# Fetch the articles for the following date.
# NOTE(review): `data2` is not referenced again in the visible part of the
# notebook — confirm whether this cell is still needed.
data2 = scraper.get_data_for_date("2023-05-04")

In [76]:
# Keep a second reference to the current `data` object before `data` is
# reassigned below. This binds the same object; no copy is made.
backup = data

In [82]:
# Load the articles for the start/end bounds defined above; this replaces the
# single-day result previously held in `data`.
# NOTE(review): the rendered output below contains timestamps up to
# 2023-05-02 23:59:11, i.e. later than `end` (2023-05-02 00:00:00) — confirm
# how `load_data` treats the end bound.
data = scraper.load_data(start, end)

In [85]:
# Tabulate the scraped articles.
# Assumes `data` is a list of per-article records (one row each) — TODO confirm.
df = pd.DataFrame(data)

In [88]:
# Parse the publication timestamps into pandas datetimes, in place of the
# original column.
df["published_at"] = df["published_at"].pipe(pd.to_datetime)

0    2023-05-01 19:00:18+00:00
1    2023-05-01 04:50:09+00:00
2    2023-05-01 01:40:17+00:00
3    2023-05-01 20:33:17+00:00
4    2023-05-01 07:00:10+00:00
                ...           
76   2023-05-02 12:04:02+00:00
77   2023-05-02 00:14:13+00:00
78   2023-05-02 08:00:57+00:00
79   2023-05-02 11:44:23+00:00
80   2023-05-02 23:59:11+00:00
Name: published_at, Length: 81, dtype: datetime64[ns, UTC]

# Load pretrained classifier

In [109]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hugging Face checkpoint identifier; per its name, a DistilRoBERTa model
# fine-tuned for financial-news sentiment analysis.
model_name = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
# Tokenizer and sequence-classification model are loaded from the same
# checkpoint. Both are read as module-level globals by `classify_text`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [110]:
from collections.abc import Iterable

In [111]:
def classify_text(texts, batch_size=32):
    """Score the sentiment of one or more texts as a single number each.

    Every text is run through the module-level ``tokenizer`` and ``model``.
    The three class probabilities are then collapsed into one score by
    weighting them with ``(0, 0.5, 1)`` and summing.

    Args:
        texts: A single string, or an iterable of strings (list, tuple,
            pandas Series, generator, ...). A non-iterable value is treated
            as a single item.
        batch_size: Maximum number of texts sent through the model at once.
            Must be at least 1.

    Returns:
        A 1-D NumPy float array with one score per input text, in input
        order. Empty input yields an empty array.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    if isinstance(texts, str) or not isinstance(texts, Iterable):
        texts = [texts]
    else:
        # Materialise the input: generators have no len(), and slicing a
        # pandas Series yields a Series, which the tokenizer does not accept.
        texts = list(texts)

    if not texts:
        return np.empty(0, dtype=np.float32)

    # Imported here so that the trivial empty-input path above does not need it.
    import torch

    # NOTE(review): assumes the model's label order is
    # (negative, neutral, positive) — confirm against model.config.id2label.
    class_weights = np.array((0.0, 0.5, 1.0), dtype=np.float32)

    scores = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start : start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            logits = model(**inputs).logits
        batch_probs = logits.softmax(dim=-1).cpu().numpy()
        scores.append((batch_probs * class_weights).sum(axis=1))
    # One concatenation at the end instead of re-copying on every batch.
    return np.concatenate(scores)

In [13]:
# Score all descriptions through `classify_text` in batches. Calling it once
# per row via `.apply` ran the model on one text at a time and then had to
# unwrap a one-element array.
df["description_sentiment"] = classify_text(df["description"].tolist())

In [14]:
# Score all titles through `classify_text` in batches. Calling it once per row
# via `.apply` ran the model on one text at a time and then had to unwrap a
# one-element array.
df["title_sentiment"] = classify_text(df["title"].tolist())

In [15]:
# Overall score per article: arithmetic mean of title and description scores.
df["sentiment"] = df["title_sentiment"].add(df["description_sentiment"]).div(2)

In [17]:
# Show a bounded preview rather than the whole frame.
df.head(10)

Unnamed: 0,author,title,description,url,source,image,category,language,country,published_at,description_sentiment,title_sentiment,sentiment
0,Cointelegraph,Just Bitcoin or diversify? 5 cryptocurrencies ...,Just Bitcoin or diversify? 5 cryptocurrencies ...,https://www.investing.com/news/cryptocurrency-...,Investing.com | Stock Market Quotes &amp; Fina...,,business,en,us,2023-05-01T19:00:18+00:00,0.499998,0.499998,0.499998
1,Akshay Chinchalkar,Bitcoin Sags After Its Longest Streak Of Month...,Bitcoin Sags After Its Longest Streak Of Month...,https://www.bqprime.com/crypto/bitcoin-btc-sag...,Bloomberg | Latest And Live Business,,business,en,us,2023-05-01T04:50:09+00:00,0.002911,0.002911,0.002911
2,Cointelegraph,"Visa stablecoin plan, debt’s ceiling effect on...","Visa stablecoin plan, debt’s ceiling effect on...",https://www.investing.com/news/cryptocurrency-...,Investing.com | Stock Market Quotes &amp; Fina...,https://d1-invdn-com.investing.com/content/pic...,business,en,us,2023-05-01T01:40:17+00:00,0.49985,0.49985,0.49985
3,,MicroStrategy Q1 earnings beat as bitcoin impa...,MicroStrategy Q1 earnings beat as bitcoin impa...,https://seekingalpha.com/news/3962983-microstr...,Seeking Alpha,,business,en,us,2023-05-01T20:33:17+00:00,0.999648,0.999648,0.999648
4,GlobeNewswire,Bitfarms Regains Compliance with Nasdaq Contin...,This news release constitutes a “designated ne...,https://financialpost.com/globe-newswire/bitfa...,Financial Post | Canada Business News,,business,en,us,2023-05-01T09:00:55+00:00,0.499985,0.96823,0.734107
5,Cointelegraph,Jack Dorsey’s nano Bitcoin mining chip heads t...,Jack Dorsey’s nano Bitcoin mining chip heads t...,https://www.investing.com/news/cryptocurrency-...,Investing.com | Stock Market Quotes &amp; Fina...,https://i-invdn-com.investing.com/news/459eb40...,business,en,us,2023-05-01T07:00:10+00:00,0.49999,0.49999,0.49999
6,Business Wire,Cathedra Bitcoin Announces Fourth Quarter and ...,"TORONTO &#8212; (Block Height: 787,600) – Cath...",https://financialpost.com/pmn/press-releases-p...,Financial Post | Canada Business News,,business,en,us,2023-05-01T11:01:15+00:00,0.999756,0.500019,0.749887
7,Cointelegraph,"Visa stablecoin plan, debt ceiling’s effect on...","Visa stablecoin plan, debt ceiling’s effect on...",https://www.investing.com/news/cryptocurrency-...,Investing.com | Stock Market Quotes &amp; Fina...,https://d1-invdn-com.investing.com/content/pic...,business,en,us,2023-05-01T07:40:26+00:00,0.499919,0.499919,0.499919
8,Bitcoin News Editor,MicroStrategy Spent $179 Million on Bitcoin La...,"MicroStrategy, a software intelligence firm, b...",https://forextv.com/bitcoin-news/microstrategy...,forextv,,general,en,us,2023-05-01T22:29:20+00:00,0.023199,0.500283,0.261741
9,Bloomberg,Bitcoin’s rise comes to a halt,Bitcoin slipped back Monday after climbing for...,https://mybroadband.co.za/news/cryptocurrency/...,mybroadband,,general,en,za,2023-05-01T05:09:23+00:00,0.002475,0.035262,0.018868


In [16]:
# Print every headline under its rounded combined score, followed by a rule.
# The leading "\n" argument reproduces the two blank lines between entries.
for i, row in df[["title", "sentiment"]].iterrows():
    print(
        "\n",
        f"Sentiment: {round(row['sentiment'], 2)}",
        row["title"],
        "= " * 30,
        sep="\n",
    )



Sentiment: 0.5
Just Bitcoin or diversify? 5 cryptocurrencies to watch in the next few days
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 


Sentiment: 0.0
Bitcoin Sags After Its Longest Streak Of Monthly Gains Since 2021
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 


Sentiment: 0.5
Visa stablecoin plan, debt’s ceiling effect on Bitcoin price: Hodler’s Digest, April 23-29
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 


Sentiment: 1.0
MicroStrategy Q1 earnings beat as bitcoin impairment eases
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 


Sentiment: 0.73
Bitfarms Regains Compliance with Nasdaq Continued Listing Requirements
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 


Sentiment: 0.5
Jack Dorsey’s nano Bitcoin mining chip heads to prototype
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 


Sentiment: 0.75
Cathedra Bitcoin Announces Fourth Quarter and Full-Year 2022 Financial Results
= = = = = = =

---

# pygooglenews

In [102]:
from pygooglenews import GoogleNews

In [103]:
from datetime import datetime
from time import mktime
import pytz


def parse_struct_time(struct_time_obj):
    """Format a UTC ``time.struct_time`` as ``"YYYY-MM-DD HH:MM:SS"``.

    The feed's ``published_parsed`` values are already expressed in UTC, so
    their fields are used as they are. Converting through ``time.mktime``
    would interpret them as *local* time and shift the result by the
    machine's UTC offset.

    Args:
        struct_time_obj: A ``time.struct_time`` (or equivalent tuple) whose
            first six fields are year, month, day, hour, minute, second in UTC.

    Returns:
        The same instant as a ``"%Y-%m-%d %H:%M:%S"`` string in UTC.
    """
    year, month, day, hour, minute, second = struct_time_obj[:6]
    # struct_time permits leap seconds (60, 61), which datetime rejects.
    second = min(second, 59)
    dt = datetime(year, month, day, hour, minute, second)
    return dt.strftime("%Y-%m-%d %H:%M:%S")

In [104]:
# Google News client configured for English results in the US edition.
gn = GoogleNews(lang="en", country="US")

In [105]:
# Search Google News for "bitcoin" inside a fixed date window.
# (A relative window, e.g. when="1h", was the alternative tried here.)
result = gn.search(query="bitcoin", from_="2023-01-01", to_="2023-01-02")

https://news.google.com/rss/search?q=bitcoin+after%3A2023-01-01+before%3A2023-01-02&ceid=US:en&hl=en&gl=US


In [106]:
# Column-oriented view of the search result: headline and formatted
# publication time for every feed entry, in feed order.
data = dict(
    title=[item["title"] for item in result["entries"]],
    time=[
        parse_struct_time(item["published_parsed"])
        for item in result["entries"]
    ],
)

In [107]:
# Tabulate the Google News headlines and times.
# NOTE(review): this rebinds `df`, so the MediaStack frame built earlier is no
# longer reachable under that name.
df = pd.DataFrame(data)

In [112]:
# Score all headlines through `classify_text` in batches. Calling it once per
# row via `.apply` ran the model on one text at a time and then had to unwrap
# a one-element array.
df["sentiment"] = classify_text(df["title"].tolist())

In [116]:
# Show a bounded preview rather than the whole frame.
df.head(10)

Unnamed: 0,title,time,sentiment
0,These 4 altcoins may attract buyers with Bitco...,2023-01-01 07:00:00,0.002758
1,What Will It Take for Bitcoin Mining Companies...,2023-01-01 07:00:00,0.499974
2,The Fight Against Bitcoin Starts In 2023 - Bit...,2023-01-01 07:00:00,0.49999
3,Bitcoin Starts the New Year Off On the Wrong F...,2023-01-01 07:00:00,0.499979
4,"Markets: Bitcoin, Ethereum up, Cardano leads g...",2023-01-01 07:00:00,0.999721
5,Veteran Bitcoin Developer: Keys to $3.6M in BT...,2023-01-02 07:00:00,0.500026
6,How Bitcoin Price Could Hit $23 Million If $BT...,2023-01-01 07:00:00,0.998785
7,The centralization of Bitcoin: Behind the two ...,2023-01-01 07:00:00,0.499984
8,Valkyrie Reveals Intention to Sponsor Grayscal...,2023-01-01 07:00:00,0.500005
9,Rich Dad Poor Dad Author Robert Kiyosaki is Bu...,2023-01-01 07:00:00,0.500016


In [114]:
# Character count of each headline — the lengths appear to be capped.
list(map(len, df["title"]))

[75, 77, 59, 61, 66, 72, 83, 112, 85, 83, 89]