In [1]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import os
import re
from transformers import TrainerCallback

# Switch off the Weights & Biases integration for this session.
# Transformers reports this environment variable as deprecated (to be removed
# in v5); the supported replacement is the `report_to` training argument.
os.environ.update({"WANDB_DISABLED": "true"})

#!pip install datasets

def clean_text_list(text_list):
    """Normalize a sequence of raw tweet texts before tokenization.

    Each item is cleaned independently, in this order: CJK ideographs and
    URLs are stripped, the ``RT : `` prefix is dropped, ``&`` becomes
    ``and``, a mojibake apostrophe is repaired, quotes/semicolons and
    zero-width spaces are removed, ``.x``/``.X`` is removed, runs of dots
    become ``...``, ``@`` and ``|`` are removed, wallet addresses are
    removed, whitespace is collapsed and the text is lower-cased.

    Args:
        text_list: Iterable of tweet texts. ``None`` items become ``""``;
            every other item is converted with ``str``.

    Returns:
        list[str]: One cleaned string per input item, in input order. Nothing
        is dropped or de-duplicated, so the result stays index-aligned with
        any parallel list built from the same rows (e.g. labels).
    """

    def remove_wallets(text):
        # Ethereum-style (0x + 40 hex digits) or Bitcoin-style base58
        # addresses. Must see the ORIGINAL casing: the base58 class excludes
        # lowercase 'l', so lower-casing first would stop addresses that
        # contain an uppercase 'L' from matching. `0[xX]` keeps upper-case
        # hex prefixes covered now that the text is no longer lower-cased
        # before this step.
        wallet_pattern = r'0[xX][a-fA-F0-9]{40}|[13][a-km-zA-HJ-NP-Z1-9]{25,34}'
        return re.sub(wallet_pattern, '', text)

    def clean_text(text):
        # Remove CJK unified ideographs
        text = re.sub(r'[\u4e00-\u9fff]+', '', text)
        # Remove URLs
        text = re.sub(r'http\S+|www\S+', '', text)
        # Remove RT prefix
        text = re.sub(r'RT : ', '', text)
        # Replace & with 'and'
        text = re.sub(r'&', 'and', text)
        # Repair the mojibake form of a curly apostrophe (written with
        # escapes so the pattern survives re-encoding of this file)
        text = re.sub('\u00e2\u20ac\u2122', '\'', text)
        # Drop double quotes and semicolons ('&' can no longer occur here)
        text = re.sub(r'["&;]', '', text)
        # Remove zero-width spaces; spelled as an escape because the literal
        # character is invisible and an empty pattern would be a no-op
        text = re.sub('\u200b', '', text)
        # Remove .X or .x
        text = re.sub(r'\.[Xx]', '', text)
        # Normalize multiple dots to ellipsis
        text = re.sub(r'\.\.+', '...', text)
        # Remove standalone @ and pipe symbols
        text = re.sub(r'@|\|', '', text)
        # Remove wallet addresses BEFORE collapsing whitespace and lowering,
        # so no double/trailing spaces are left where an address used to be
        text = remove_wallets(text)
        # Normalize spaces
        text = re.sub(r'\s+', ' ', text).strip()
        # Convert to lowercase
        return text.lower()

    return [clean_text(str(text) if text is not None else "") for text in text_list]

def sentiment_map(text):
    """Translate a sentiment string into an integer class id.

    The check is a case-sensitive substring test, evaluated in order:
    ``'Bullish'`` -> 0, ``'Neutral'`` -> 1, anything else -> 2.
    """
    for class_id, keyword in enumerate(('Bullish', 'Neutral')):
        if keyword in text:
            return class_id
    # Fallback bucket for every string containing neither keyword.
    return 2

In [2]:
# Fetch the crypto financial-tweets dataset by its Hub repository id.
data = load_dataset(path="StephanAkkerman/financial-tweets-crypto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Keep only the rows of the 'train' split that carry a sentiment annotation.
train_dataset_ori = data['train'].filter(
    lambda example: example['sentiment'] is not None
)


In [4]:
# 2. Prepare the data
# Integer labels come from sentiment_map(): 'Bullish' -> 0, 'Neutral' -> 1, anything else -> 2.

class TweetDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    ``encodings`` is a mapping from field name to an indexable collection of
    per-example values; ``labels`` is an indexable collection with one entry
    per example. The dataset length is ``len(labels)``.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Turn every encoding field of example `idx` into a tensor, then
        # attach the label under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Hub id of the pretrained checkpoint that is fine-tuned below.
model_name = "ElKulako/cryptobert"

# 3. Initialize the (fast) tokenizer that belongs to the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# 4. Build index-aligned text and label lists from the filtered rows.
#    clean_text_list() keeps one output per input, so both lists have the
#    same length. sentiment_map() yields 0 for 'Bullish', 1 for 'Neutral'
#    and 2 for every other value.
texts = clean_text_list(train_dataset_ori['description'])
labels = list(map(sentiment_map, train_dataset_ori['sentiment']))

# 5. Tokenize all texts in one call, truncating at 128 tokens and padding
encodings = tokenizer(
    texts,
    max_length=128,
    truncation=True,
    padding=True,
)

# 6. Create dataset
tweet_dataset = TweetDataset(encodings=encodings, labels=labels)

# 7. Split into train and validation (80-20 split)
train_size = int(0.8 * len(tweet_dataset))
val_size = len(tweet_dataset) - train_size
# A seeded generator makes the split identical on every Restart & Run All;
# without it the validation rows (and the reported accuracy) change per run.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = torch.utils.data.random_split(
    tweet_dataset, [train_size, val_size], generator=split_generator
)

# 8. Initialize model with a 3-way classification head
# NOTE(review): the checkpoint may ship its own id2label order; confirm it
# agrees with sentiment_map (Bullish=0, Neutral=1, other=2) — TODO confirm.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# 9. Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    eval_strategy="steps",
    save_strategy="no",        # no intermediate checkpoints are written
    # The string "none" is what disables logging integrations (e.g. W&B);
    # passing None falls back to the default of reporting to every installed
    # integration.
    report_to="none",
    # A float below 1 is a fraction of the total optimization steps, so this
    # logs roughly every 30% of training. With eval_strategy="steps" and no
    # explicit eval_steps, evaluation runs on the same schedule.
    logging_steps=0.3,
    log_level='info',
)

# 10. Define compute metrics function
def compute_metrics(pred):
    """Compute accuracy for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores, argmax is
            taken over axis 1) and ``label_ids`` (reference class ids).

    Returns:
        dict: ``{'accuracy': <fraction of rows whose argmax equals the label>}``.
    """
    predicted_classes = np.argmax(pred.predictions, axis=1)
    hit_mask = predicted_classes == pred.label_ids
    return {'accuracy': hit_mask.mean()}

# 11. Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# 12. Train the model
trainer.train()

# 13. Final evaluation. There is no separate test split in this notebook:
#     the figures printed as "Test ..." are computed on the validation split.
test_results = trainer.evaluate(eval_dataset=val_dataset)
print("\nTest set evaluation results:")
print(f"Test accuracy: {test_results['eval_accuracy']:.4f}")
print(f"Test loss: {test_results['eval_loss']:.4f}")


# 14. Save the fine-tuned weights and the tokenizer side by side so the
#     directory can be reloaded with from_pretrained()
model.save_pretrained("./trained_bert_model")
tokenizer.save_pretrained("./trained_bert_model")

# print("Training completed! Model saved to './trained_bert_model'")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Safetensors PR exists
***** Running training *****
  Num examples = 38,953
  Num Epochs = 3
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 915
  Number of trainable parameters = 124,647,939


Step,Training Loss,Validation Loss,Accuracy
275,0.7161,0.592122,0.743608
550,0.532,0.58642,0.752747
825,0.4155,0.624481,0.759626



***** Running Evaluation *****
  Num examples = 9739
  Batch size = 128

***** Running Evaluation *****
  Num examples = 9739
  Batch size = 128

***** Running Evaluation *****
  Num examples = 9739
  Batch size = 128


Training completed. Do not forget to share your model on huggingface.co/models =)



***** Running Evaluation *****
  Num examples = 9739
  Batch size = 128


Configuration saved in ./trained_bert_model/config.json



Test set evaluation results:
Test accuracy: 0.7594
Test loss: 0.6205


Model weights saved in ./trained_bert_model/model.safetensors
tokenizer config file saved in ./trained_bert_model/tokenizer_config.json
Special tokens file saved in ./trained_bert_model/special_tokens_map.json


('./trained_bert_model/tokenizer_config.json',
 './trained_bert_model/special_tokens_map.json',
 './trained_bert_model/vocab.json',
 './trained_bert_model/merges.txt',
 './trained_bert_model/added_tokens.json',
 './trained_bert_model/tokenizer.json')