In [1]:
import nltk
from nltk import word_tokenize
import pandas as pd
import numpy as np

In [2]:
def tokenize(text):
    """Lower-case ``text``, tokenize it with NLTK's ``word_tokenize`` and drop punctuation.

    Args:
        text: the string to split into tokens.

    Returns:
        A list of the tokens, in order, excluding any token that is exactly
        one of the punctuation marks listed below.
    """
    # Tokens equal to one of these marks are discarded.
    punctuation = {',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%'}
    return [token for token in word_tokenize(text.lower()) if token not in punctuation]

In [3]:
# Load the price file and keep only the BTC close, indexed by timestamp.
df = (
    pd.read_csv("price_vol.csv")
    # The first column holds the timestamps, whatever it is called in the file.
    .pipe(lambda frame: frame.rename(columns={frame.columns[0]: 'time'}))
    .loc[:, ['time', 'BTC-close']]
    # Only the first 19 characters of each timestamp string are parsed.
    .assign(time=lambda frame: pd.to_datetime(frame['time'].str[:19]))
    .set_index('time')
    # Row-over-row relative change of the close price.
    .assign(**{'BTC-return': lambda frame: frame['BTC-close'].pct_change()})
)

In [4]:
def transfer(s):
    """Replace the empty-news placeholder with NaN.

    Args:
        s: a cell value from the news column.

    Returns:
        ``np.nan`` when ``s`` equals the literal string ``'[[], [], []]'``,
        otherwise ``s`` unchanged.
    """
    if s == '[[], [], []]':
        return np.nan
    return s

# Load the news file and reduce it to a single 'news' column indexed by timestamp.
df1 = (
    pd.read_csv('news.csv')
    # The first column holds the timestamps, whatever it is called in the file.
    .pipe(lambda frame: frame.rename(columns={frame.columns[0]: 'time'}))
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    .set_index('time')
    .loc[:, ['BTC']]
    .rename(columns={'BTC': 'news'})
    # Placeholder strings for "no news" become NaN.
    .assign(news=lambda frame: frame['news'].apply(transfer))
)

# Inner-join the news with the returns on their timestamps.
# NOTE(review): shift(-1) moves every news entry up one row, so each timestamp
# is paired with the news of the *following* row. Confirm this alignment is
# intended and does not pair later news with an earlier return.
df = pd.merge(
    df1.shift(-1),
    df[['BTC-return']],
    left_index=True,
    right_index=True,
    how='inner',
)

In [5]:
# Chronological hold-out: the first 92% of rows train the model, the rest is the test set.
idx = int(len(df) * 0.92)
split_time = df.index[idx]  # timestamp of the first test row

# Training rows with any missing value (news or return) are dropped.
train = df.iloc[:idx].dropna()

# Forward-fill over the full history *before* slicing, so a test row without
# news inherits the most recent earlier entry even when that entry lies in the
# training period. Filling only the test slice would leave leading gaps as NaN,
# and CountVectorizer cannot transform NaN documents.
test = df.ffill().iloc[idx:]
assert test['news'].notna().all(), "test set still contains rows without news"

# Raw text features and binary labels (1 = positive return, 0 = otherwise).
X_train = train['news'].to_numpy()
y_train = (train['BTC-return'] > 0).astype(int)
X_test = test['news'].to_numpy()
y_test = (test['BTC-return'] > 0).astype(int)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts; the vocabulary is fitted on the training news only.
vec = CountVectorizer()

# Read the raw text from the train/test frames rather than overwriting the
# inputs in place, so re-running this cell does not feed the already
# vectorised sparse matrix back into the vectoriser.
X_train = vec.fit_transform(train['news'])
X_test = vec.transform(test['news'])

# Show the sparse matrix summary directly; np.array() would only wrap the
# matrix in a 0-d object array.
X_train

array(<3692x21027 sparse matrix of type '<class 'numpy.int64'>'
	with 379789 stored elements in Compressed Sparse Row format>,
      dtype=object)

In [7]:
from sklearn.naive_bayes import MultinomialNB

# fit() returns the fitted estimator, so construction and training are chained.
mnb = MultinomialNB().fit(X_train, y_train)

# Predicted class (0 or 1) for every test row.
y_predict = mnb.predict(X_test)

In [8]:
# Predicted labels for the test rows (1 = positive return predicted, 0 = otherwise).
y_predict

array([1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,

In [9]:
# Actual labels for the test rows: 1 where BTC-return > 0, else 0.
y_test

time
2022-09-08 09:00:00    0
2022-09-08 10:00:00    1
2022-09-08 11:00:00    1
2022-09-08 12:00:00    0
2022-09-08 13:00:00    1
                      ..
2022-09-24 20:00:00    1
2022-09-24 21:00:00    0
2022-09-24 22:00:00    0
2022-09-24 23:00:00    1
2022-09-25 00:00:00    1
Name: BTC-return, Length: 400, dtype: int64

In [10]:
assert len(y_test) == len(y_predict)

# Accuracy = share of test rows whose predicted label equals the actual label.
# y_test is indexed by timestamps, so it is compared as a plain array:
# y_test[i] with an integer key relies on pandas' deprecated positional fallback.
cnt = int((y_predict == y_test.to_numpy()).sum())
cnt / len(test)

0.465

In [12]:
# Rows of the merged frame that have no missing value in either column.
df.dropna()

Unnamed: 0_level_0,news,BTC-return
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-03-01 12:00:00,[['Geopolitical Risk Returns for Global Market...,0.023116
2022-03-02 02:00:00,"[['Asian shares slip, oil surges again as Russ...",0.003759
2022-03-02 07:00:00,"[['Business Highlights: Lobbyists leaving, rat...",-0.000733
2022-03-02 08:00:00,"[['Business Highlights: Lobbyists leaving, rat...",0.000519
2022-03-02 10:00:00,"[['Millions for Crypto Start-Ups, No Real Name...",0.004599
...,...,...
2022-09-24 13:00:00,"[[""3 Red Flags for Block's Future"", 'How crypt...",-0.000711
2022-09-24 14:00:00,"[[""'We Do Not View This As Proprietary Trading...",-0.001582
2022-09-24 15:00:00,"[['How Has The Rise And Fall Of Dogecoin, Shib...",-0.000710
2022-09-24 17:00:00,[['The biggest Bitcoin fund just hit a record ...,-0.001375
