In [1]:
import os

# Move the working directory one level up so the relative paths used below
# (".env", "data/...") resolve from there.
# NOTE: the path is relative, so re-running this cell without restarting the
# kernel moves up one more directory each time.
os.chdir("../")

from dotenv import load_dotenv

# IPython autoreload in mode 2: re-import edited modules before each cell runs.
%load_ext autoreload
%autoreload 2

# Read ".env" (relative to the new working directory) into the process
# environment; the call's return value is what the cell displays.
load_dotenv(".env")

True

In [2]:
from datetime import datetime
import http.client, urllib.parse
import pytz
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import constants

# Single colour applied to every text element of a figure.
TEXT_COLOR = "#313131"

# Plotly colors, reused as the seaborn palette.
LINE_COLORS = [
    "#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#FFA15A",
    "#19D3F3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52",
]

sns.set(
    style="darkgrid",
    rc={
        # Figure geometry and white backgrounds.
        "figure.figsize": (6, 4),
        "figure.dpi": 100,
        **dict.fromkeys(("figure.facecolor", "legend.facecolor"), "w"),
        # Typography: one font family, one colour for all labels and ticks.
        "font.family": "Microsoft Sans Serif",
        **dict.fromkeys(
            ("text.color", "axes.labelcolor", "xtick.color", "ytick.color"),
            TEXT_COLOR,
        ),
    },
)

sns.set_palette(sns.color_palette(LINE_COLORS))

# Create dataset

In [4]:
# Load the parquet data stored under "data/binance" (relative to the working
# directory). NOTE(review): no visible cell uses this frame before `df` is
# reassigned from the news results further down.
df = pd.read_parquet("data/binance")

In [3]:
import torch

In [4]:
from src.model.datasets import SlidingWindowDataset

In [18]:
# Toy tensors for exercising the dataset: features 0..9 as a (10, 1) column
# and targets 11..20, both int64.
x = torch.arange(0, 10, dtype=torch.long).reshape(-1, 1)
y = torch.arange(11, 21, dtype=torch.long)

In [19]:
# Wrap the toy tensors in the project's sliding-window dataset with a window
# length of 5 (the ds[0] output further down shows rows 0..4 with target 15).
ds = SlidingWindowDataset(x, y, seq_len=5)

In [30]:
# The window at index 1 should contain rows 1..5 of `x`, shaped (5, 1).
x_1_expected = torch.tensor([[1], [2], [3], [4], [5]], dtype=torch.long)

assert torch.equal(ds.x[1], x_1_expected)

tensor([[1],
        [2],
        [3],
        [4],
        [5]])

True

In [29]:
# Target check for the window at index 1, defined entirely inside this cell so
# it does not depend on names from other (or deleted) cells.
# NOTE(review): the expected value is inferred from ds[0], which pairs rows
# 0..4 with target 15 (= y[4]); window 1 (rows 1..5) should therefore pair
# with y[5] = 16 — confirm against SlidingWindowDataset.
y_1_expected = torch.tensor(16, dtype=torch.long)

assert torch.equal(ds[1][1], y_1_expected)

In [14]:
ds[0]

(tensor([[0],
         [1],
         [2],
         [3],
         [4]]),
 tensor(15))

In [50]:
# Build a news scraper for the keyword "bitcoin", backed by a MediaStack client
# authenticated with the MEDIASTACK_ACCESS_KEY entry of `ENV`.
# NOTE(review): `MediaStackNewsScraper`, `MediaStackClient` and `ENV` are not
# imported or defined in any visible cell, so this raises NameError on a fresh
# kernel. Presumably `ENV` holds the variables from ".env" — confirm and add
# the missing import/definition cell.
scraper = MediaStackNewsScraper(
    keywords=["bitcoin"],
    news_client=MediaStackClient(access_key=ENV["MEDIASTACK_ACCESS_KEY"])
)

In [51]:
# Bounds of the period to fetch, converted by the scraper's own helper.
# NOTE(review): presumably date string -> timestamp; units and timezone are not
# visible here — confirm in the scraper implementation.
start = scraper.str_to_timestamp("2023-05-01")
end = scraper.str_to_timestamp("2023-05-02")

In [52]:
data = scraper.load_data(start, end)

In [53]:
df = pd.DataFrame(data)

In [54]:
# Parse the `published_at` column into pandas datetimes, overwriting the
# original values in place.
df["published_at"] = pd.to_datetime(df["published_at"])

# Load pretrained classifier

In [55]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint identifier: a DistilRoBERTa model fine-tuned for financial-news
# sentiment analysis (per its name).
model_name = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
# Load the tokenizer and the sequence-classification model for that checkpoint;
# both are used as module-level globals by `classify_text` below.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [56]:
from collections.abc import Iterable

In [57]:
def classify_text(texts, batch_size=32):
    """Score texts with the pretrained sentiment classifier.

    For every text the three class probabilities are weighted by
    ``(0, 0.5, 1)`` and summed, giving one scalar in ``[0, 1]`` per text.

    Parameters
    ----------
    texts : str or iterable of str
        A single string (or any non-iterable value) is treated as a
        one-element batch. Any other iterable (list, tuple, pandas Series,
        generator, ...) is materialised into a list first.
    batch_size : int, default 32
        Number of texts sent through the model per forward pass.

    Returns
    -------
    numpy.ndarray
        1-D array holding one score per input text, in input order. An empty
        input yields an empty array.

    Notes
    -----
    Uses the module-level ``tokenizer`` and ``model``.
    """
    if isinstance(texts, str) or not isinstance(texts, Iterable):
        texts = [texts]
    else:
        # len() and slicing below need a real sequence; generators and pandas
        # Series do not behave like one.
        texts = list(texts)

    # Nothing to score: return early instead of failing on an unset accumulator.
    if not texts:
        return np.empty(0, dtype=np.float32)

    # One weight per class. Assumes the label order is
    # (negative, neutral, positive) — TODO confirm via model.config.id2label.
    class_weights = np.array((0, 0.5, 1), dtype=np.float32)

    batch_scores = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start : start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            logits = model(**inputs).logits
        batch_probs = logits.softmax(dim=-1).numpy()
        batch_scores.append((batch_probs * class_weights).sum(axis=1))

    # Single concatenation instead of re-copying the accumulator per batch.
    return np.concatenate(batch_scores)

In [58]:
# Score all descriptions in one call so classify_text can batch the forward
# passes (one call per row would run the model once per text). The function
# returns one score per input text, in order, so the array lines up with the
# rows of `df`.
# NOTE(review): batching pads texts to a common length; scores should match
# per-row inference up to floating-point rounding — confirm if exact
# reproducibility of earlier outputs matters.
df["description_sentiment"] = classify_text(df["description"].tolist())

In [59]:
# Score all titles in one call so classify_text can batch the forward passes
# (one call per row would run the model once per text). The function returns
# one score per input text, in order, so the array lines up with the rows of
# `df`.
# NOTE(review): batching pads texts to a common length; scores should match
# per-row inference up to floating-point rounding — confirm if exact
# reproducibility of earlier outputs matters.
df["title_sentiment"] = classify_text(df["title"].tolist())

In [60]:
# Overall article sentiment: unweighted mean of the title and description scores.
df["sentiment"] = (df["title_sentiment"] + df["description_sentiment"]) / 2

In [61]:
df

Unnamed: 0,author,title,description,url,source,image,category,language,country,published_at,description_sentiment,title_sentiment,sentiment
0,Akshay Chinchalkar,Bitcoin Sags After Its Longest Streak Of Month...,Bitcoin Sags After Its Longest Streak Of Month...,https://www.bqprime.com/crypto/bitcoin-btc-sag...,Bloomberg | Latest And Live Business,,business,en,us,2023-05-01 04:50:09+00:00,0.002911,0.002911,0.002911
1,Cointelegraph,Just Bitcoin or diversify? 5 cryptocurrencies ...,Just Bitcoin or diversify? 5 cryptocurrencies ...,https://www.investing.com/news/cryptocurrency-...,Investing.com | Stock Market Quotes &amp; Fina...,,business,en,us,2023-05-01 19:00:18+00:00,0.499998,0.499998,0.499998
2,Cointelegraph,"Visa stablecoin plan, debt’s ceiling effect on...","Visa stablecoin plan, debt’s ceiling effect on...",https://www.investing.com/news/cryptocurrency-...,Investing.com | Stock Market Quotes &amp; Fina...,https://d1-invdn-com.investing.com/content/pic...,business,en,us,2023-05-01 01:40:17+00:00,0.499850,0.499850,0.499850
3,,MicroStrategy Q1 earnings beat as bitcoin impa...,MicroStrategy Q1 earnings beat as bitcoin impa...,https://seekingalpha.com/news/3962983-microstr...,Seeking Alpha,,business,en,us,2023-05-01 20:33:17+00:00,0.999648,0.999648,0.999648
4,Cointelegraph,Jack Dorsey’s nano Bitcoin mining chip heads t...,Jack Dorsey’s nano Bitcoin mining chip heads t...,https://www.investing.com/news/cryptocurrency-...,Investing.com | Stock Market Quotes &amp; Fina...,https://i-invdn-com.investing.com/news/459eb40...,business,en,us,2023-05-01 07:00:10+00:00,0.499990,0.499990,0.499990
...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,,Bitcoin Well Reports 2023 Q1 Financial Results...,"(marketscreener.com) EDMONTON, Alberta, May 02...",https://www.marketscreener.com/quote/stock/BIT...,4-traders,,general,en,us,2023-05-02 12:04:02+00:00,0.501102,0.500007,0.500555
77,Bitcoin News Editor,First Mover Asia: Bitcoin Market Cap Is Surgin...,PLUS: Western crypto innovators with great ide...,https://forextv.com/bitcoin-news/first-mover-a...,forextv,,general,en,us,2023-05-02 00:14:13+00:00,0.987728,0.147956,0.567842
78,Sponsored Content,Best Crypto Casinos for High Rollers (2023): T...,We’ve ranked the top crypto online casinos for...,https://www.mercurynews.com/2023/05/02/high-ro...,mercurynews,https://www.mercurynews.com/wp-content/uploads...,general,en,us,2023-05-02 08:00:57+00:00,0.500035,0.500051,0.500043
79,Bitcoin News Editor,MicroStrategy ploughs ahead with bitcoin strategy,Bitcoin (BTC) uber-bull Michael Saylor’s softw...,https://forextv.com/bitcoin-news/microstrategy...,forextv,,general,en,us,2023-05-02 11:44:23+00:00,0.999514,0.501324,0.750419


In [62]:
# Print each headline under its rounded sentiment score, followed by a ruler.
separator = 30 * "= "
for record in df[["title", "description", "sentiment"]].itertuples(index=False):
    print("\n")
    print(f"Sentiment: {round(record.sentiment, 2)}")
    print(record.title)
    print(separator)



Sentiment: 0.0
Bitcoin Sags After Its Longest Streak Of Monthly Gains Since 2021
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 


Sentiment: 0.5
Just Bitcoin or diversify? 5 cryptocurrencies to watch in the next few days
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 


Sentiment: 0.5
Visa stablecoin plan, debt’s ceiling effect on Bitcoin price: Hodler’s Digest, April 23-29
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 


Sentiment: 1.0
MicroStrategy Q1 earnings beat as bitcoin impairment eases
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 


Sentiment: 0.5
Jack Dorsey’s nano Bitcoin mining chip heads to prototype
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 


Sentiment: 0.73
Bitfarms Regains Compliance with Nasdaq Continued Listing Requirements
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 


Sentiment: 0.5
Visa stablecoin plan, debt ceiling’s effect on Bitcoin price: Hodler’s Digest, April 23-29
= 

---

# pygooglenews

In [35]:
from pygooglenews import GoogleNews

In [36]:
from datetime import datetime
from time import mktime
import pytz


def parse_struct_time(struct_time_obj):
    """Format a UTC ``time.struct_time`` as ``"YYYY-MM-DD HH:MM:SS"``.

    The broken-down fields are used exactly as given, so the result does not
    depend on the machine's local timezone. (``time.mktime`` would interpret
    the fields as local time and shift the result by the local UTC offset.)

    Parameters
    ----------
    struct_time_obj : time.struct_time or sequence
        Broken-down time whose first six items are year, month, day, hour,
        minute and second. Expected to be in UTC, as feed parsers document
        for ``published_parsed``.

    Returns
    -------
    str
        The same wall-clock time formatted as ``%Y-%m-%d %H:%M:%S``.
    """
    year, month, day, hour, minute, second = struct_time_obj[:6]
    # struct_time allows leap seconds (tm_sec up to 61); datetime does not.
    dt = datetime(year, month, day, hour, minute, min(second, 59))
    return dt.strftime("%Y-%m-%d %H:%M:%S")

In [37]:
gn = GoogleNews(lang="en", country="US")

In [38]:
# Query Google News for "bitcoin" within a fixed date range; the printed URL
# below shows the range being sent as after:/before: search operators.
result = gn.search(
    query="bitcoin",
    # Disabled alternative: when="1h" (presumably a relative look-back window
    # instead of fixed dates — confirm in the pygooglenews docs).
    from_="2023-01-01",
    to_="2023-01-02",
)

https://news.google.com/rss/search?q=bitcoin+after%3A2023-01-01+before%3A2023-01-02&ceid=US:en&hl=en&gl=US


In [39]:
# Column-wise view of the search result: headline and formatted publication
# time of every returned entry.
entries = result["entries"]
data = {
    "title": [item["title"] for item in entries],
    "time": [parse_struct_time(item["published_parsed"]) for item in entries],
}

In [40]:
df = pd.DataFrame(data)

In [41]:
# Score all headlines in one call so classify_text can batch the forward
# passes (one call per row would run the model once per text). The function
# returns one score per input text, in order, so the array lines up with the
# rows of `df`.
# NOTE(review): batching pads texts to a common length; scores should match
# per-row inference up to floating-point rounding — confirm if exact
# reproducibility of earlier outputs matters.
df["sentiment"] = classify_text(df["title"].tolist())

In [42]:
df

Unnamed: 0,title,time,sentiment
0,These 4 altcoins may attract buyers with Bitco...,2023-01-01 07:00:00,0.002758
1,"Bitcoin Could Easily Rally To $160,000 Says Cr...",2023-01-01 07:00:00,0.999603
2,The Fight Against Bitcoin Starts In 2023 - Bit...,2023-01-01 07:00:00,0.49999
3,Bitcoin Bullish This Year? Popular Crypto Stra...,2023-01-01 07:00:00,0.963818
4,What Will It Take for Bitcoin Mining Companies...,2023-01-01 07:00:00,0.499974
5,"Markets: Bitcoin, Ethereum up, Cardano leads g...",2023-01-01 07:00:00,0.999721
6,Veteran Bitcoin Developer: Keys to $3.6M in BT...,2023-01-02 07:00:00,0.500026
7,The centralization of Bitcoin: Behind the two ...,2023-01-01 07:00:00,0.499984
8,Big Eyes Coin on the Verge of Breakthrough As ...,2023-01-01 07:00:00,0.500005
9,Rich Dad Poor Dad Author Robert Kiyosaki is Bu...,2023-01-01 07:00:00,0.500016


In [43]:
# Character count of each headline — the values look capped.
list(map(len, df["title"]))

[75, 105, 59, 114, 77, 66, 72, 112, 89, 83, 85, 80, 83, 81, 64, 79]