In [1]:
import pandas as pd
import numpy as np
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
from transformers import get_scheduler

import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, trange
from sklearn.metrics import precision_score, accuracy_score, recall_score

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
torch.cuda.is_available()

True

In [4]:
class simpleProcessor():
    
    def __init__(self, priceFile, newsFile, tickerName, priceType):
        self.tickerName = tickerName
        self.priceType = priceType
        self.news_data = pd.read_csv(newsFile, index_col = [0])
        self.price_vol = pd.read_csv(priceFile, index_col= [0])
        self.crypto_price = self.price_vol[[tickerName + '-' + priceType]].shift(1)
        self.crypto_news = self.news_data[[tickerName]]
        self.crypto_price.index = pd.to_datetime(self.crypto_price.index).tz_convert(None)
        self.crypto_news.index = pd.to_datetime(self.crypto_news.index)
        self.data = None
        
    def getData(self):
        data = self.crypto_news.merge(self.crypto_price, how = 'inner', left_index = True, right_index = True)
        data[self.tickerName + '_summary'] = data.apply(lambda x :self.joinStr(x), axis = 1)
        data['returns'] = data[self.tickerName + '-' + self.priceType].pct_change().shift(-1)
        data['class'] = data['returns'] > 0
        data = data.iloc[:-1, :]
        
        nonewsdate = data[data.isna().any(axis = 1)].index
        newsdate = data[~data.isna().any(axis = 1)].index
        
        self.data = data[~data.isna().any(axis = 1)][[self.tickerName + '_summary', 'class']]
        self.data.columns = ['text', 'class']
        self.data['class'] = self.data['class'].map({True : 1, False : 0})
        
        return self.data.copy()
        
    @staticmethod
    def joinStr(df):
        lists = eval(df[0])
        if(lists[0]):
            return ' '.join(lists[0])
        return np.nan

In [5]:
train_data1 = simpleProcessor('price_vol.csv', 'btc_eth.csv', 'BTC', 'close').getData()

In [8]:
pd.read_csv('price_vol.csv', index_col = [0]).shift(1)

Unnamed: 0,ADA-low,ADA-high,ADA-volume,ADA-open,ADA-close,ALGO-low,ALGO-high,ALGO-volume,ALGO-open,ALGO-close,...,XRP-low,XRP-high,XRP-volume,XRP-open,XRP-close,XTZ-low,XTZ-high,XTZ-volume,XTZ-open,XTZ-close
2020-09-26 00:00:00+00:00,,,,,,,,,,,...,,,,,,,,,,
2020-09-26 01:00:00+00:00,0.096892,0.099498,0.0,0.096892,0.098289,,,,,,...,0.241879,0.244077,0.0,0.241888,0.243394,,,,,
2020-09-26 02:00:00+00:00,0.098039,0.099185,0.0,0.098294,0.099047,,,,,,...,0.243195,0.244752,14832768.0,0.243373,0.244366,,,,,
2020-09-26 03:00:00+00:00,0.096829,0.099019,0.0,0.098984,0.096829,,,,,,...,0.243415,0.244747,12609024.0,0.244331,0.244631,,,,,
2020-09-26 04:00:00+00:00,0.095597,0.096975,2912704.0,0.096785,0.095623,,,,,,...,0.244516,0.245595,13346944.0,0.244791,0.244598,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-25 00:00:00+00:00,0.451335,0.452557,0.0,0.452557,0.451659,0.379773,0.383300,0.0,0.379773,0.383300,...,0.486770,0.490115,0.0,0.487643,0.488890,1.473553,1.479516,0.0,1.476808,1.478689
2022-09-25 01:00:00+00:00,0.451490,0.454639,1261952.0,0.451560,0.454176,0.379394,0.392347,5011120.0,0.384171,0.392347,...,0.483836,0.489918,2641664.0,0.488732,0.485429,1.477692,1.492775,286996.0,1.478441,1.492775
2022-09-25 02:00:00+00:00,0.452166,0.454080,1094592.0,0.454080,0.453118,0.388882,0.395439,13752832.0,0.393055,0.389927,...,0.482285,0.485739,7360256.0,0.485426,0.482613,1.486390,1.494579,590622.0,1.494475,1.487767
2022-09-25 03:00:00+00:00,0.453011,0.454661,94528.0,0.453145,0.453903,0.384276,0.392390,2264368.0,0.390024,0.384276,...,0.478734,0.482618,14958848.0,0.482578,0.480296,1.487473,1.494783,61318.0,1.487915,1.493865


In [6]:
train_data1.head()

Unnamed: 0,text,class
2022-03-01 13:00:00,Geopolitical Risk Returns for Global Markets G...,1
2022-03-02 03:00:00,"Asian shares slip, oil surges again as Russia ...",1
2022-03-02 08:00:00,"Business Highlights: Lobbyists leaving, rate h...",1
2022-03-02 09:00:00,"Business Highlights: Lobbyists leaving, rate h...",1
2022-03-02 11:00:00,"Millions for Crypto Start-Ups, No Real Names N...",0


# BERT

## Dataset Class Inheritance

In [7]:
class newsDataset(Dataset): 
    
    def __init__(self, df, tokenizer):
        self.labels = [i for i in df['class']]
        self.texts = [tokenizer(str(text), 
                                padding='max_length',
                                truncation = True, 
                                return_tensors='pt')
                      for text in df['text']]
        
    def __len__(self):
        return len(self.labels)
    
    def get_batch_labels(self, idx):
        return np.array(self.labels[idx])
    
    def get_batch_text(self, idx):
        return self.texts[idx]
    
    def __getitem__(self, idx):
        batch_texts = self.get_batch_text(idx)
        batch_labels = self.get_batch_labels(idx)
        
        return batch_texts, batch_labels

In [8]:
percent = 0.8
n = train_data1.shape[0]
trainData, testData = train_data1.iloc[0 : int(n*percent)], train_data1.iloc[int(n*percent):]
m = trainData.shape[0]
trainData, validData = trainData.iloc[0 : int(m*percent)], trainData.iloc[int(m*percent): ]

In [9]:
BERT_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_NAME)
trainDataLoader = DataLoader(newsDataset(trainData, tokenizer), batch_size = 2)
validDataLoader = DataLoader(newsDataset(validData, tokenizer), batch_size = 2)

## Bert Classifier 
This is a simple pre-trained one in Huggingface!

In [10]:
class BertBinaryClassifier(): 
    
    def __init__(self, bertname, lr, totalSteps):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = BertForSequenceClassification.from_pretrained(
        bertname, 
        num_labels = 2,
        output_attentions = False,
        output_hidden_states = False,
        )
        self.model = self.model.to(self.device)
        self.optimizer = AdamW(self.model.parameters(), lr = lr, eps = 1e-8)
        self.scheduler = get_scheduler(name = 'linear', optimizer=self.optimizer, num_warmup_steps=0, \
                                       num_training_steps=totalSteps)

In [11]:
def trainer(clf, trainLoader, validLoader, epochs):
    #epochs 
    for _ in trange(epochs):
        
        #set model to training model
        clf.model.train()
        
        total_loss = 0
        total_num = 0
        for train_inputs, train_labels in trainLoader:
            
            train_labels = train_labels.long()
            train_labels = train_labels.to(clf.device)
            mask = train_inputs['attention_mask'].to(clf.device)
            input_id = train_inputs['input_ids'].squeeze(1).to(clf.device) 
            
            clf.optimizer.zero_grad()

            output = clf.model(input_id, attention_mask = mask, labels = train_labels)
            loss = output.loss
            loss.backward()
                        
            clf.optimizer.step()
            clf.scheduler.step()
            
            total_loss += loss.item()
            total_num += input_id.size()[0]
            
        # Validation
        clf.model.eval()
        
        total_loss_val = 0
        total_num_val = 0
        preds = []
        trues = []
        with torch.no_grad():
            
            for valid_inputs, valid_labels in validLoader:
                
                valid_labels = valid_labels.long()
                valid_labels = valid_labels.to(clf.device)
                mask = valid_inputs['attention_mask'].to(clf.device)
                input_id = valid_inputs['input_ids'].squeeze(1).to(clf.device)
                
                output = clf.model(input_id, attention_mask = mask, labels = valid_labels)
                
                loss = output.loss
                total_loss_val += loss.item()
                total_num_val += input_id.size()[0]

                logits = output.logits.detach().cpu().numpy()
                predictions = np.argmax(logits, axis = 1).flatten()
                labels = valid_labels.detach().cpu().numpy().flatten()
                
                preds += list(predictions)
                trues += list(labels)
        
        
        print('\n\t - Train loss : {:.4f}'.format(total_loss / total_num))
        print('\t - Validation loss : {:.4f}'.format(total_loss_val / total_num_val))
        print('\t - Validation accuracy : {:.4f}'.format(accuracy_score(trues, preds)))
        print('\t - Validation precision : {:.4f}'.format(precision_score(trues, preds)))
        print('\t - Validation recall : {:.4f}'.format(recall_score(trues, preds)))
    


In [12]:
num_epochs = 1
clf = BertBinaryClassifier('bert-base-uncased', lr = 2e-5, totalSteps=num_epochs*len(trainDataLoader))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [13]:
preds, trues = trainer(clf, trainDataLoader, validDataLoader, num_epochs)

100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [11:28<00:00, 688.98s/it]


	 - Train loss : 0.3495
	 - Validation loss : 0.3473
	 - Validation accuracy : 0.5086
	 - Validation precision : 0.6304
	 - Validation recall : 0.0887



