In [4]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import AutoTokenizer
from datasets import load_dataset
import numpy as np
import re
from sklearn.metrics import f1_score


In [1]:
def clean_text_list(text_list):
    # Handle None values and convert all to strings
    cleaned_list = [str(text) if text is not None else "" for text in text_list]

    # Helper function to remove wallet addresses (assuming this is what remove_wallets does)
    def remove_wallets(text):
        # This is a basic implementation - modify if your original remove_wallets was different
        # Common crypto wallet patterns (like Bitcoin/Ethereum addresses)
        wallet_pattern = r'0x[a-fA-F0-9]{40}|[13][a-km-zA-HJ-NP-Z1-9]{25,34}'
        return re.sub(wallet_pattern, '', text)

    # Apply cleaning operations
    def clean_text(text):
        # Remove Asian characters
        text = re.sub(r'[\u4e00-\u9fff]+', '', text)
        # Remove URLs
        text = re.sub(r'http\S+|www\S+', '', text)
        # Remove mentions, hashtags, stock symbols, and forward slashes with content
        #text = re.sub(r'[@][A-Za-z0-9_]+|#[A-Za-z0-9_]+|$[A-Za-z0-9_ ]+|/[A-Za-z0-9_ ]+', '', text)
        # Remove RT prefix
        text = re.sub(r'RT : ', '', text)
        # Replace & with 'and'
        text = re.sub(r'&', 'and', text)
        # Handle special characters and quotes
        text = re.sub(r'â€™', '\'', text)
        text = re.sub(r'["&;]', '', text)
        text = re.sub(r'', '', text)  # Zero-width space
        # Remove .X or .x
        text = re.sub(r'\.[Xx]', '', text)
        # Normalize multiple dots to ellipsis
        text = re.sub(r'\.\.+', '...', text)
        # Remove standalone @ and pipe symbols
        text = re.sub(r'@|\|', '', text)
        # Normalize spaces
        text = re.sub(r'\s+', ' ', text).strip()
        # Convert to lowercase
        text = text.lower()
        # Remove wallet addresses
        text = remove_wallets(text)
        return text

    # Apply cleaning to all texts
    cleaned_list = [clean_text(text) for text in cleaned_list]

    # # Remove duplicates and filter by minimum word count (4 words)
    # seen = set()
    # result = []
    # for text in cleaned_list:
    #     if text and text not in seen and len(text.split()) >= 4:
    #         seen.add(text)
    #         result.append(text)

    return cleaned_list

def sentiment_map(text):
  if 'Bullish' in text:
    return 0
  elif 'Neutral' in text:
    return 1
  else:
    return 2



In [6]:
# Load Dataset
data = load_dataset("StephanAkkerman/financial-tweets-crypto")
train_dataset_ori = data['train']
train_dataset_ori = train_dataset_ori.filter(lambda data: data['sentiment'] is not None)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

crypto.csv:   0%|          | 0.00/54.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/57935 [00:00<?, ? examples/s]

Filter:   0%|          | 0/57935 [00:00<?, ? examples/s]

In [7]:
# Tokenization & Dataset Preparation
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

class TweetDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Process dataset
texts = clean_text_list(train_dataset_ori['description'])
labels = [sentiment_map(sent) for sent in train_dataset_ori['sentiment']]
dataset = TweetDataset(texts, labels, tokenizer)

# Split dataset
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [30]:
# Define BiLSTM Model
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, num_layers, dropout):
        super(BiLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids):
        embedded = self.embedding(input_ids)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = self.dropout(lstm_out[:, -1, :])
        return self.fc(lstm_out)

# Model Initialization
vocab_size = tokenizer.vocab_size
embed_dim = 256
hidden_dim = 512
output_dim = 3
num_layers = 4
dropout = 0.5

model = BiLSTM(vocab_size, embed_dim, hidden_dim, output_dim, num_layers, dropout)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [27]:
# Training Loop
def train(model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss, total_correct = 0, 0
    for batch in train_loader:
        inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_correct += (outputs.argmax(1) == labels).sum().item()
    return total_loss / len(train_loader), total_correct / len(train_loader.dataset)

# Evaluation Loop
def evaluate(model, val_loader, criterion, device):
    model.eval()
    total_loss, total_correct = 0, 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            preds = outputs.argmax(1)
            total_correct += (outputs.argmax(1) == labels).sum().item()
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = total_correct / len(val_loader.dataset)
    f1 = f1_score(all_labels, all_preds, average="weighted")  # Weighted F1-score
    return total_loss / len(val_loader), accuracy, f1

In [24]:
num_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total trainable parameters: {num_trainable_params}")

Total trainable parameters: 29869571


In [25]:
print(f"Number of training samples: {len(train_dataset)}")


Number of training samples: 38953


In [31]:
# Train Model
num_epochs = 10
for epoch in range(num_epochs):
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc, val_f1 = evaluate(model, val_loader, criterion, device)
    print(f"Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}")



Epoch 1: Train Loss: 0.9475, Train Acc: 0.5973, Val Loss: 0.8784, Val Acc: 0.6387, Val F1: 0.5420
Epoch 2: Train Loss: 0.8817, Train Acc: 0.6190, Val Loss: 0.8386, Val Acc: 0.6487, Val F1: 0.5764
Epoch 3: Train Loss: 0.8170, Train Acc: 0.6531, Val Loss: 0.8013, Val Acc: 0.6530, Val F1: 0.5911
Epoch 4: Train Loss: 0.7586, Train Acc: 0.6809, Val Loss: 0.7487, Val Acc: 0.6857, Val F1: 0.6085
Epoch 5: Train Loss: 0.7124, Train Acc: 0.7018, Val Loss: 0.7553, Val Acc: 0.6885, Val F1: 0.6161
Epoch 6: Train Loss: 0.6682, Train Acc: 0.7225, Val Loss: 0.7394, Val Acc: 0.6901, Val F1: 0.6273
Epoch 7: Train Loss: 0.5729, Train Acc: 0.7561, Val Loss: 0.6708, Val Acc: 0.7200, Val F1: 0.7040
Epoch 8: Train Loss: 0.4597, Train Acc: 0.8159, Val Loss: 0.7314, Val Acc: 0.7191, Val F1: 0.7217
Epoch 9: Train Loss: 0.3553, Train Acc: 0.8646, Val Loss: 0.7278, Val Acc: 0.7350, Val F1: 0.7313
Epoch 10: Train Loss: 0.2626, Train Acc: 0.9078, Val Loss: 0.8410, Val Acc: 0.7230, Val F1: 0.7267
