In [68]:
import pandas as pd
import numpy as np
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
from transformers import get_scheduler

import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, trange
from sklearn.metrics import precision_score, accuracy_score, recall_score

In [69]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# New Section

In [70]:
import warnings
warnings.filterwarnings('ignore')

In [71]:
torch.cuda.is_available()

True

In [72]:
class simpleProcessor():
    
    def __init__(self, priceFile, newsFile, tickerName, priceType):
        self.tickerName = tickerName
        self.priceType = priceType
        self.news_data = pd.read_csv(newsFile, index_col = [0])
        self.price_vol = pd.read_csv(priceFile, index_col= [0])
        self.crypto_price = self.price_vol[[tickerName + '-' + priceType]].shift(1)
        self.crypto_news = self.news_data[[tickerName]]
        self.crypto_price.index = pd.to_datetime(self.crypto_price.index).tz_convert(None)
        self.crypto_news.index = pd.to_datetime(self.crypto_news.index)
        self.data = None
        
    def getData(self):
        data = self.crypto_news.merge(self.crypto_price, how = 'inner', left_index = True, right_index = True)
        data[self.tickerName + '_summary'] = data.apply(lambda x :self.joinStr(x), axis = 1)
        data['returns'] = data[self.tickerName + '-' + self.priceType].pct_change().shift(-1)
        data['class'] = data['returns'] > 0
        data = data.iloc[:-1, :]
        
        nonewsdate = data[data.isna().any(axis = 1)].index
        newsdate = data[~data.isna().any(axis = 1)].index
        
        self.data = data[~data.isna().any(axis = 1)][[self.tickerName + '_summary', 'class']]
        self.data.columns = ['text', 'class']
        self.data['class'] = self.data['class'].map({True : 1, False : 0})
        
        return self.data.copy()
        
    @staticmethod
    def joinStr(df):
        lists = eval(df[0])
        if(lists[0]):
            return ' '.join(lists[0])
        return np.nan

In [73]:
train_data1 = simpleProcessor('/content/sample_data/price_vol.csv', '/content/sample_data/btc_eth.csv', 'ETH', 'close').getData()

In [74]:
train_data1.head()

Unnamed: 0,text,class
2022-03-02 01:00:00,"Developers tap NFTs to promote projects, as pr...",0
2022-03-02 11:00:00,"Millions for Crypto Start-Ups, No Real Names N...",0
2022-03-02 12:00:00,How to Make ( and Keep ) Money in the Market...,0
2022-03-02 20:00:00,This Social Club Runs on Crypto Tokens and Vibes,1
2022-03-02 22:00:00,Bob Dylan and Miles Davis NFTs Planned as Sony...,1


# BERT

## Dataset Class Inheritance

In [75]:
class newsDataset(Dataset): 
    
    def __init__(self, df, tokenizer):
        self.labels = [i for i in df['class']]
        self.texts = [tokenizer(str(text), 
                                padding='max_length',
                                truncation = True, 
                                return_tensors='pt')
                      for text in df['text']]
        
    def __len__(self):
        return len(self.labels)
    
    def get_batch_labels(self, idx):
        return np.array(self.labels[idx])
    
    def get_batch_text(self, idx):
        return self.texts[idx]
    
    def __getitem__(self, idx):
        batch_texts = self.get_batch_text(idx)
        batch_labels = self.get_batch_labels(idx)
        
        return batch_texts, batch_labels

In [76]:
percent = 0.8
n = train_data1.shape[0]
trainData, testData = train_data1.iloc[0 : int(n*percent)], train_data1.iloc[int(n*percent):]
m = trainData.shape[0]
trainData, validData = trainData.iloc[0 : int(m*percent)], trainData.iloc[int(m*percent): ]

In [77]:
BERT_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_NAME)
trainDataLoader = DataLoader(newsDataset(trainData, tokenizer), batch_size = 16)
validDataLoader = DataLoader(newsDataset(validData, tokenizer), batch_size = 16)

## Bert Classifier 
This is a simple pre-trained one in Huggingface!

In [78]:
class BertBinaryClassifier(): 
    
    def __init__(self, bertname, lr, totalSteps):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = BertForSequenceClassification.from_pretrained(
        bertname, 
        num_labels = 2,
        output_attentions = False,
        output_hidden_states = False,
        )
        self.model = self.model.to(self.device)
        self.optimizer = AdamW(self.model.parameters(), lr = lr, eps = 1e-8)
        self.scheduler = get_scheduler(name = 'linear', optimizer=self.optimizer, num_warmup_steps=0, \
                                       num_training_steps=totalSteps)

In [79]:
def trainer(clf, trainLoader, validLoader, epochs):
    #epochs 
    for _ in trange(epochs):
        
        #set model to training model
        clf.model.train()
        
        total_loss = 0
        total_num = 0
        for train_inputs, train_labels in trainLoader:
            
            train_labels = train_labels.long()
            train_labels = train_labels.to(clf.device)
            mask = train_inputs['attention_mask'].to(clf.device)
            input_id = train_inputs['input_ids'].squeeze(1).to(clf.device) 
            
            clf.optimizer.zero_grad()

            output = clf.model(input_id, attention_mask = mask, labels = train_labels)
            loss = output.loss
            loss.backward()
                        
            clf.optimizer.step()
            clf.scheduler.step()
            
            total_loss += loss.item()
            total_num += input_id.size()[0]
            
        # Validation
        clf.model.eval()
        
        total_loss_val = 0
        total_num_val = 0
        preds = []
        trues = []
        with torch.no_grad():
            
            for valid_inputs, valid_labels in validLoader:
                
                valid_labels = valid_labels.long()
                valid_labels = valid_labels.to(clf.device)
                mask = valid_inputs['attention_mask'].to(clf.device)
                input_id = valid_inputs['input_ids'].squeeze(1).to(clf.device)
                
                output = clf.model(input_id, attention_mask = mask, labels = valid_labels)
                
                loss = output.loss
                total_loss_val += loss.item()
                total_num_val += input_id.size()[0]

                logits = output.logits.detach().cpu().numpy()
                predictions = np.argmax(logits, axis = 1).flatten()
                labels = valid_labels.detach().cpu().numpy().flatten()
                
                preds += list(predictions)
                trues += list(labels)
        
        
        print('\n\t - Train loss : {:.4f}'.format(total_loss / total_num))
        print('\t - Validation loss : {:.4f}'.format(total_loss_val / total_num_val))
        print('\t - Validation accuracy : {:.4f}'.format(accuracy_score(trues, preds)))
        print('\t - Validation precision : {:.4f}'.format(precision_score(trues, preds)))
        print('\t - Validation recall : {:.4f}'.format(recall_score(trues, preds)))
    


In [80]:
num_epochs = 3
clf = BertBinaryClassifier('bert-base-uncased', lr = 2e-5, totalSteps=num_epochs*len(trainDataLoader))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [81]:
trainer(clf, trainDataLoader, validDataLoader, num_epochs)

 33%|███▎      | 1/3 [03:54<07:49, 234.88s/it]


	 - Train loss : 0.0437
	 - Validation loss : 0.0445
	 - Validation accuracy : 0.4799
	 - Validation precision : 0.6667
	 - Validation recall : 0.0064


 67%|██████▋   | 2/3 [07:48<03:54, 234.20s/it]


	 - Train loss : 0.0434
	 - Validation loss : 0.0447
	 - Validation accuracy : 0.4883
	 - Validation precision : 0.5833
	 - Validation recall : 0.0675


100%|██████████| 3/3 [11:42<00:00, 234.09s/it]


	 - Train loss : 0.0421
	 - Validation loss : 0.0453
	 - Validation accuracy : 0.4916
	 - Validation precision : 0.5222
	 - Validation recall : 0.3023





In [67]:
preds

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
