In [7]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [8]:
from google.colab import drive
import json
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [9]:

MODEL_FILE = "/content/gdrive/MyDrive/CS7641-models/trained_cryptobert_kk08_model"
METRIC_FILE = "/content/gdrive/MyDrive/CS7641-models/trained_cryptobert_kk08_model/training_metrics.json"

In [10]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import os
import re
from transformers import TrainerCallback
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, precision_score
torch.manual_seed(24)
os.environ["WANDB_DISABLED"] = "true"

#!pip install datasets


def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove RT prefix
    pattern = r'^RT\s*@[\w]+:'
    # Remove the pattern and strip any extra spaces
    text = re.sub(pattern, '', text, flags=re.IGNORECASE).strip()
    # Normalize spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def sentiment_map(text):
  if 'Bullish' in text:
    return 0
  elif 'Neutral' in text:
    return 1
  else:
    return 2

In [11]:
import torch
import numpy as np
import random

# Set random seeds for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Create a generator for the split
generator = torch.Generator().manual_seed(seed)


In [12]:
data = load_dataset("StephanAkkerman/financial-tweets-crypto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

crypto.csv:   0%|          | 0.00/54.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/57935 [00:00<?, ? examples/s]

In [13]:
train_dataset_ori = data['train']
print(f'No. of data: {len(train_dataset_ori)}')
train_dataset_ori = train_dataset_ori.filter(lambda data: data['sentiment'] is not None)
print(f'No. of data after remove sentiment equals to none: {len(train_dataset_ori)}')
train_dataset_ori = train_dataset_ori.filter(lambda data: data['tweet_type']!='quote tweet')
print(f'No. of data after remove quote tweet: {len(train_dataset_ori)}')
train_dataset_ori = train_dataset_ori.filter(lambda data: len(data['description'].split(' '))>1)
print(f'No. of data after remove short text: {len(train_dataset_ori)}')
train_dataset_ori = train_dataset_ori.to_pandas()

train_dataset_ori['description'] = train_dataset_ori['description'].apply(clean_text)
train_dataset_ori.drop_duplicates(inplace=True, ignore_index=True)
print(f'No. of data after remove duplicates: {len(train_dataset_ori)}')
train_dataset_ori['sentiment_label'] = train_dataset_ori['sentiment'].apply(sentiment_map)


No. of data: 57935


Filter:   0%|          | 0/57935 [00:00<?, ? examples/s]

No. of data after remove sentiment equals to none: 48692


Filter:   0%|          | 0/48692 [00:00<?, ? examples/s]

No. of data after remove quote tweet: 46866


Filter:   0%|          | 0/46866 [00:00<?, ? examples/s]

No. of data after remove short text: 45567
No. of data after remove duplicates: 45567


In [14]:
num_samples = len(train_dataset_ori)
# Create an array of indices
indices = np.arange(num_samples)

# Shuffle the indices randomly
np.random.seed(42)  # Set a seed for reproducibility
np.random.shuffle(indices)

# Split the indices into train, validation, and test sets
train_size = int(num_samples * 0.8)  # 80% for training
val_size = int(num_samples * 0.1)  # 10% for validation
test_size = num_samples - train_size - val_size  # 10% for testing

# Split the shuffled indices
train_idx = indices[:train_size]
val_idx = indices[train_size:train_size + val_size]
test_idx = indices[train_size + val_size:]

# Print the sizes of each split
print(f"Train size: {len(train_idx)}")
print(f"Validation size: {len(val_idx)}")
print(f"Test size: {len(test_idx)}")

Train size: 36453
Validation size: 4556
Test size: 4558


In [15]:
train_dataset = train_dataset_ori.loc[train_idx]
valid_dataset = train_dataset_ori.loc[val_idx]
test_dataset = train_dataset_ori.loc[test_idx]

In [16]:
# 2. Prepare the data
#sentiment_map = {'positive': 2, 'neutral': 1, 'negative': 0}  # Adjust based on your actual sentiment values

class TweetDataset(Dataset):
    def __init__(self, texts, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        self.texts = texts

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        # print(self.texts[idx])
        item['text'] = self.texts[idx]
        # item['text'] = torch.tensor(self.texts[idx])
        return item

    def __len__(self):
        return len(self.labels)

model_name = "kk08/CryptoBERT"

# 3. Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)


# 6. Create dataset
encodings = tokenizer(train_dataset['description'].to_list(),
                      truncation=True, padding="max_length", max_length=128)
train_dataset = TweetDataset(train_dataset['description'].to_list(), encodings, train_dataset['sentiment_label'].to_list())
encodings = tokenizer(valid_dataset['description'].to_list(),
                      truncation=True, padding="max_length", max_length=128)
val_dataset = TweetDataset(valid_dataset['description'].to_list(), encodings, valid_dataset['sentiment_label'].to_list())
encodings = tokenizer(test_dataset['description'].to_list(),
                      truncation=True, padding="max_length", max_length=128)
test_dataset = TweetDataset(test_dataset['description'].to_list(), encodings, test_dataset['sentiment_label'].to_list())

# 8. Initialize model
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,  # Set to 3 for Negative, Neutral, Positive
    ignore_mismatched_sizes=True  # This allows us to override the 2-label head
)

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at kk08/CryptoBERT and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Check the output size (number of classes)
print("Number of output classes:", model.classifier.in_features)

print("Number of output classes:", model.classifier.out_features)

print(model.config.hidden_size)

Number of output classes: 768
Number of output classes: 3
768


In [18]:
class PrintValidationStatsCallback(TrainerCallback):
    def __init__(self, trainer):
        super().__init__()
        self.trainer = trainer

    def on_evaluate(self, args, state, control, **kwargs):
        # Get validation metrics from the last evaluation (already computed)
        if state.log_history:
          # if len(state.log_history) >= 2:
          #   train_metrics = state.log_history[-2]
          #   train_loss = train_metrics.get("loss", "N/A")
          eval_metrics = state.log_history[-1]  # Assumes evaluation happens at epoch end
          print(eval_metrics)

          # Compute training metrics via prediction (no evaluation loop)
          train_pred = self.trainer.predict(self.trainer.train_dataset)
          train_metrics = self.trainer.compute_metrics(train_pred)
          print(train_metrics)

          print(f"\nEpoch {state.epoch} Training Stats:")
          # if train_loss !='N/A':
          #   print(f"  Training Loss: {train_metrics.get('loss', 'N/A'):.4f}")
          print(f"  Training Accuracy: {train_metrics['accuracy']:.4f}")
          print(f"  Training F1 Score: {train_metrics['f1']:.4f}")
          print(f"  Training Precision (class 0): {train_metrics['precision0']:.4f}")
          print(f"  Training Precision (class 1): {train_metrics['precision1']:.4f}")
          print(f"  Training Precision (class 2): {train_metrics['precision2']:.4f}")

          print(f"\nEpoch {state.epoch} Validation Stats:")
          # print(f"  Validation Loss: {eval_metrics['eval_loss']:.4f}")
          print(f"  Validation Accuracy: {eval_metrics['eval_accuracy']:.4f}")
          print(f"  Validation F1 Score: {eval_metrics['eval_f1']:.4f}")
          print(f"  Validation Precision (class 0): {eval_metrics['eval_precision0']:.4f}")
          print(f"  Validation Precision (class 1): {eval_metrics['eval_precision1']:.4f}")
          print(f"  Validation Precision (class 2): {eval_metrics['eval_precision2']:.4f}")

In [19]:
from copy import deepcopy
import logging
logging.basicConfig(level=logging.INFO)
# 8. Initialize model

# 9. Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # eval_strategy="steps",
    save_strategy="epoch",
    logging_strategy="epoch",  # Log every steps
    evaluation_strategy="epoch", # Evaluate every steps
    # load_best_model_at_end=True,
    # eval_accumulation_steps=1,
    report_to=None,          # Disable external logging (e.g., WANDB),
    # logging_steps=1,#0.2,
    # eval_steps=1,#0.2,
    # save_steps=1,#0.2,
    log_level='debug',
    load_best_model_at_end=True,     # Load the best model (based on metric) at the end
    metric_for_best_model="eval_loss",      # Use F1-score to determine the best model
    greater_is_better=False,          # Higher F1-score is better
    save_total_limit=1,
    # prediction_loss_only=True
)


def compute_metrics(pred):
      labels = pred.label_ids
      preds = np.argmax(pred.predictions, axis=1)

      # Calculate accuracy
      accuracy = accuracy_score(labels, preds)

      # Calculate F1-score (macro average for multi-class)
      f1 = f1_score(labels, preds, average='macro')
      prec = precision_score(labels, preds, average=None)
      prec = [float(x) for x in prec]
      return {
          # 'loss':
          'accuracy': accuracy,
          'f1': f1,
          'precision0': prec[0],
          'precision1': prec[1],
          'precision2': prec[2]
      }

# # 11. Create Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     compute_metrics=compute_metrics
# )

trainer = Trainer(  # Assuming CustomTrainer was a typo; replace with your actual CustomTrainer if needed
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # callbacks=[PrintValidationStatsCallback(trainer=trainer)]  # Add the custom callback
)
# Create the callback and pass the trainer instance
callback = PrintValidationStatsCallback(trainer=trainer)

# Add the callback to the trainer
trainer.add_callback(callback)

# 12. Train the model
trainer.train()

# 13. Evaluate on test set
test_results = trainer.evaluate(val_dataset)
print("\nTest set evaluation results:")
print(f"Test accuracy: {test_results['eval_accuracy']:.4f}")
print(f"Test loss: {test_results['eval_loss']:.4f}")


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Currently training with a batch size of: 128
***** Running training *****
  Num examples = 36,453
  Num Epochs = 5
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 1,425
  Number of trainable parameters = 109,484,547
The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision0,Precision1,Precision2
1,0.6752,0.608624,0.729148,0.66532,0.781366,0.629797,0.632712
2,0.5102,0.584002,0.751536,0.699238,0.816031,0.658757,0.631704
3,0.3746,0.624291,0.744732,0.70032,0.837448,0.627,0.607801
4,0.2436,0.804771,0.748025,0.688693,0.798311,0.711429,0.610491
5,0.1659,0.870025,0.748025,0.69475,0.81217,0.6711,0.613995



***** Running Evaluation *****
  Num examples = 4556
  Batch size = 128
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Prediction *****
  Num examples = 36453
  Batch size = 128
The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


{'eval_loss': 0.608624279499054, 'eval_accuracy': 0.7291483757682178, 'eval_f1': 0.6653197120913773, 'eval_precision0': 0.7813656239488732, 'eval_precision1': 0.6297968397291196, 'eval_precision2': 0.6327116212338594, 'eval_runtime': 30.9748, 'eval_samples_per_second': 147.087, 'eval_steps_per_second': 1.162, 'epoch': 1.0, 'step': 285}


Saving model checkpoint to ./results/checkpoint-285
Configuration saved in ./results/checkpoint-285/config.json


{'accuracy': 0.7816366279867226, 'f1': 0.7299551378403956, 'precision0': 0.8239779707689049, 'precision1': 0.6933962264150944, 'precision2': 0.7163362952836637}

Epoch 1.0 Training Stats:
  Training Accuracy: 0.7816
  Training F1 Score: 0.7300
  Training Precision (class 0): 0.8240
  Training Precision (class 1): 0.6934
  Training Precision (class 2): 0.7163

Epoch 1.0 Validation Stats:
  Validation Accuracy: 0.7291
  Validation F1 Score: 0.6653
  Validation Precision (class 0): 0.7814
  Validation Precision (class 1): 0.6298
  Validation Precision (class 2): 0.6327


Model weights saved in ./results/checkpoint-285/model.safetensors

***** Running Evaluation *****
  Num examples = 4556
  Batch size = 128
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Prediction *****
  Num examples = 36453
  Batch size = 128
The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


{'eval_loss': 0.5840016007423401, 'eval_accuracy': 0.7515364354697103, 'eval_f1': 0.6992379085865057, 'eval_precision0': 0.8160310734463276, 'eval_precision1': 0.6587570621468927, 'eval_precision2': 0.6317044100119189, 'eval_runtime': 31.0238, 'eval_samples_per_second': 146.855, 'eval_steps_per_second': 1.16, 'epoch': 2.0, 'step': 570}


Saving model checkpoint to ./results/checkpoint-570
Configuration saved in ./results/checkpoint-570/config.json


{'accuracy': 0.8642361396867199, 'f1': 0.8376530407701831, 'precision0': 0.9056943506639658, 'precision1': 0.7971363765239581, 'precision2': 0.8019209354120267}

Epoch 2.0 Training Stats:
  Training Accuracy: 0.8642
  Training F1 Score: 0.8377
  Training Precision (class 0): 0.9057
  Training Precision (class 1): 0.7971
  Training Precision (class 2): 0.8019

Epoch 2.0 Validation Stats:
  Validation Accuracy: 0.7515
  Validation F1 Score: 0.6992
  Validation Precision (class 0): 0.8160
  Validation Precision (class 1): 0.6588
  Validation Precision (class 2): 0.6317


Model weights saved in ./results/checkpoint-570/model.safetensors

***** Running Evaluation *****
  Num examples = 4556
  Batch size = 128
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Prediction *****
  Num examples = 36453
  Batch size = 128
The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


{'eval_loss': 0.624290943145752, 'eval_accuracy': 0.7447322212467077, 'eval_f1': 0.7003204856770532, 'eval_precision0': 0.8374477781997721, 'eval_precision1': 0.627, 'eval_precision2': 0.6078006500541712, 'eval_runtime': 30.9403, 'eval_samples_per_second': 147.252, 'eval_steps_per_second': 1.164, 'epoch': 3.0, 'step': 855}


Saving model checkpoint to ./results/checkpoint-855
Configuration saved in ./results/checkpoint-855/config.json


{'accuracy': 0.9269470276794777, 'f1': 0.9141052756539777, 'precision0': 0.9712436083876718, 'precision1': 0.8536617148814177, 'precision2': 0.8757705708925221}

Epoch 3.0 Training Stats:
  Training Accuracy: 0.9269
  Training F1 Score: 0.9141
  Training Precision (class 0): 0.9712
  Training Precision (class 1): 0.8537
  Training Precision (class 2): 0.8758

Epoch 3.0 Validation Stats:
  Validation Accuracy: 0.7447
  Validation F1 Score: 0.7003
  Validation Precision (class 0): 0.8374
  Validation Precision (class 1): 0.6270
  Validation Precision (class 2): 0.6078


Model weights saved in ./results/checkpoint-855/model.safetensors
Deleting older checkpoint [results/checkpoint-285] due to args.save_total_limit

***** Running Evaluation *****
  Num examples = 4556
  Batch size = 128
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Prediction *****
  Num examples = 36453
  Batch size = 128
The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


{'eval_loss': 0.8047710657119751, 'eval_accuracy': 0.7480245829675154, 'eval_f1': 0.6886925917837678, 'eval_precision0': 0.7983108108108108, 'eval_precision1': 0.7114285714285714, 'eval_precision2': 0.6104910714285714, 'eval_runtime': 30.9358, 'eval_samples_per_second': 147.273, 'eval_steps_per_second': 1.164, 'epoch': 4.0, 'step': 1140}


Saving model checkpoint to ./results/checkpoint-1140
Configuration saved in ./results/checkpoint-1140/config.json


{'accuracy': 0.962088168326338, 'f1': 0.9532321024074819, 'precision0': 0.9716566866267465, 'precision1': 0.9559934805156319, 'precision2': 0.9377007962005867}

Epoch 4.0 Training Stats:
  Training Accuracy: 0.9621
  Training F1 Score: 0.9532
  Training Precision (class 0): 0.9717
  Training Precision (class 1): 0.9560
  Training Precision (class 2): 0.9377

Epoch 4.0 Validation Stats:
  Validation Accuracy: 0.7480
  Validation F1 Score: 0.6887
  Validation Precision (class 0): 0.7983
  Validation Precision (class 1): 0.7114
  Validation Precision (class 2): 0.6105


Model weights saved in ./results/checkpoint-1140/model.safetensors
Deleting older checkpoint [results/checkpoint-855] due to args.save_total_limit
Saving model checkpoint to ./results/checkpoint-1425
Configuration saved in ./results/checkpoint-1425/config.json
Model weights saved in ./results/checkpoint-1425/model.safetensors
Deleting older checkpoint [results/checkpoint-1140] due to args.save_total_limit

***** Running Evaluation *****
  Num examples = 4556
  Batch size = 128
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Prediction *****
  Num examples = 36453
  Batch size = 128
The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequ

{'eval_loss': 0.8700250387191772, 'eval_accuracy': 0.7480245829675154, 'eval_f1': 0.6947503153164526, 'eval_precision0': 0.8121702427013718, 'eval_precision1': 0.6711003627569528, 'eval_precision2': 0.6139954853273137, 'eval_runtime': 31.324, 'eval_samples_per_second': 145.448, 'eval_steps_per_second': 1.149, 'epoch': 5.0, 'step': 1425}


Saving model checkpoint to ./results/checkpoint-1425
Configuration saved in ./results/checkpoint-1425/config.json


{'accuracy': 0.9746248594080048, 'f1': 0.9689989862649693, 'precision0': 0.9863444434462313, 'precision1': 0.9549093973872735, 'precision2': 0.957579185520362}

Epoch 5.0 Training Stats:
  Training Accuracy: 0.9746
  Training F1 Score: 0.9690
  Training Precision (class 0): 0.9863
  Training Precision (class 1): 0.9549
  Training Precision (class 2): 0.9576

Epoch 5.0 Validation Stats:
  Validation Accuracy: 0.7480
  Validation F1 Score: 0.6948
  Validation Precision (class 0): 0.8122
  Validation Precision (class 1): 0.6711
  Validation Precision (class 2): 0.6140


Model weights saved in ./results/checkpoint-1425/model.safetensors


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results/checkpoint-570 (score: 0.5840016007423401).
Deleting older checkpoint [results/checkpoint-1425] due to args.save_total_limit

***** Running Evaluation *****
  Num examples = 4556
  Batch size = 128
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.



***** Running Prediction *****
  Num examples = 36453
  Batch size = 128
The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


{'eval_loss': 0.5840016007423401, 'eval_accuracy': 0.7515364354697103, 'eval_f1': 0.6992379085865057, 'eval_precision0': 0.8160310734463276, 'eval_precision1': 0.6587570621468927, 'eval_precision2': 0.6317044100119189, 'eval_runtime': 31.3565, 'eval_samples_per_second': 145.297, 'eval_steps_per_second': 1.148, 'epoch': 5.0, 'step': 1425}
{'accuracy': 0.8642361396867199, 'f1': 0.8376530407701831, 'precision0': 0.9056943506639658, 'precision1': 0.7971363765239581, 'precision2': 0.8019209354120267}

Epoch 5.0 Training Stats:
  Training Accuracy: 0.8642
  Training F1 Score: 0.8377
  Training Precision (class 0): 0.9057
  Training Precision (class 1): 0.7971
  Training Precision (class 2): 0.8019

Epoch 5.0 Validation Stats:
  Validation Accuracy: 0.7515
  Validation F1 Score: 0.6992
  Validation Precision (class 0): 0.8160
  Validation Precision (class 1): 0.6588
  Validation Precision (class 2): 0.6317

Test set evaluation results:
Test accuracy: 0.7515
Test loss: 0.5840


In [20]:

# # 13. Save the model
model.save_pretrained(MODEL_FILE)
tokenizer.save_pretrained(MODEL_FILE)


Configuration saved in /content/gdrive/MyDrive/CS7641-models/trained_cryptobert_kk08_model/config.json
Model weights saved in /content/gdrive/MyDrive/CS7641-models/trained_cryptobert_kk08_model/model.safetensors
tokenizer config file saved in /content/gdrive/MyDrive/CS7641-models/trained_cryptobert_kk08_model/tokenizer_config.json
Special tokens file saved in /content/gdrive/MyDrive/CS7641-models/trained_cryptobert_kk08_model/special_tokens_map.json


('/content/gdrive/MyDrive/CS7641-models/trained_cryptobert_kk08_model/tokenizer_config.json',
 '/content/gdrive/MyDrive/CS7641-models/trained_cryptobert_kk08_model/special_tokens_map.json',
 '/content/gdrive/MyDrive/CS7641-models/trained_cryptobert_kk08_model/vocab.txt',
 '/content/gdrive/MyDrive/CS7641-models/trained_cryptobert_kk08_model/added_tokens.json',
 '/content/gdrive/MyDrive/CS7641-models/trained_cryptobert_kk08_model/tokenizer.json')

In [21]:
# 13. Evaluate on test set
test_results = trainer.evaluate(val_dataset)
print("\nTest set evaluation results:")
print(f"Test accuracy: {test_results['eval_accuracy']:.4f}")
print(f"Test loss: {test_results['eval_loss']:.4f}")


***** Running Evaluation *****
  Num examples = 4556
  Batch size = 128
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.



***** Running Prediction *****
  Num examples = 36453
  Batch size = 128
The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


{'eval_loss': 0.5840016007423401, 'eval_accuracy': 0.7515364354697103, 'eval_f1': 0.6992379085865057, 'eval_precision0': 0.8160310734463276, 'eval_precision1': 0.6587570621468927, 'eval_precision2': 0.6317044100119189, 'eval_runtime': 30.0603, 'eval_samples_per_second': 151.562, 'eval_steps_per_second': 1.198, 'epoch': 5.0, 'step': 1425}
{'accuracy': 0.8642361396867199, 'f1': 0.8376530407701831, 'precision0': 0.9056943506639658, 'precision1': 0.7971363765239581, 'precision2': 0.8019209354120267}

Epoch 5.0 Training Stats:
  Training Accuracy: 0.8642
  Training F1 Score: 0.8377
  Training Precision (class 0): 0.9057
  Training Precision (class 1): 0.7971
  Training Precision (class 2): 0.8019

Epoch 5.0 Validation Stats:
  Validation Accuracy: 0.7515
  Validation F1 Score: 0.6992
  Validation Precision (class 0): 0.8160
  Validation Precision (class 1): 0.6588
  Validation Precision (class 2): 0.6317

Test set evaluation results:
Test accuracy: 0.7515
Test loss: 0.5840


In [22]:
# 13. Evaluate on test set
test_results = trainer.evaluate(test_dataset)
print("\nTest set evaluation results:")
print(f"Test accuracy: {test_results['eval_accuracy']:.4f}")
print(f"Test loss: {test_results['eval_loss']:.4f}")


***** Running Evaluation *****
  Num examples = 4558
  Batch size = 128
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.



***** Running Prediction *****
  Num examples = 36453
  Batch size = 128
The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


{'eval_loss': 0.5783064961433411, 'eval_accuracy': 0.7501096972356297, 'eval_f1': 0.6979366225242208, 'eval_precision0': 0.8178609248146841, 'eval_precision1': 0.6555299539170507, 'eval_precision2': 0.6219369894982497, 'eval_runtime': 31.0352, 'eval_samples_per_second': 146.865, 'eval_steps_per_second': 1.16, 'epoch': 5.0, 'step': 1425}
{'accuracy': 0.8642361396867199, 'f1': 0.8376530407701831, 'precision0': 0.9056943506639658, 'precision1': 0.7971363765239581, 'precision2': 0.8019209354120267}

Epoch 5.0 Training Stats:
  Training Accuracy: 0.8642
  Training F1 Score: 0.8377
  Training Precision (class 0): 0.9057
  Training Precision (class 1): 0.7971
  Training Precision (class 2): 0.8019

Epoch 5.0 Validation Stats:
  Validation Accuracy: 0.7501
  Validation F1 Score: 0.6979
  Validation Precision (class 0): 0.8179
  Validation Precision (class 1): 0.6555
  Validation Precision (class 2): 0.6219

Test set evaluation results:
Test accuracy: 0.7501
Test loss: 0.5783


In [None]:
# Extract predictions (logits), probabilities, and true labels
predictions_output = trainer.predict(val_dataset)
logits = predictions_output.predictions  # Raw logits from the model
probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()  # Convert logits to probabilities
preds = np.argmax(logits, axis=1)  # Predicted labels
true_labels = predictions_output.label_ids  # True labels

# 14. Evaluate on test set (already computed metrics are available in predictions_output)
test_results = trainer.evaluate(val_dataset)
print("\nTest set evaluation results:")
print(f"Test accuracy: {test_results['eval_accuracy']:.4f}")
print(f"Test loss: {test_results['eval_loss']:.4f}")


***** Running Prediction *****
  Num examples = 9114
  Batch size = 128



***** Running Evaluation *****
  Num examples = 9114
  Batch size = 128



Test set evaluation results:
Test accuracy: 0.7607
Test loss: 0.6856


In [None]:
predictions_output.label_ids

array([0, 0, 2, ..., 2, 0, 0])

In [None]:
# 15. Identify incorrect predictions
incorrect_indices = np.where(preds != true_labels)[0]  # Indices where predictions are wrong
num_incorrect = len(incorrect_indices)

# def sentiment_map(text):
#   if 'Bullish' in text:
#     return 0
#   elif 'Neutral' in text:
#     return 1
#   else:
#     return 2

print(f"\nNumber of incorrect predictions: {num_incorrect}")
print("\nIncorrect predictions:")
for idx in incorrect_indices[:10]:  # Limit to first 10 for brevity
    print('Text: ', texts[idx])#, labels[idx])
    print(f"Sample {idx}:")
    print(f"True label: {true_labels[idx]}")
    print(f"Predicted label: {preds[idx]}")
    print(f"Probabilities: {probs[idx]}")




Number of incorrect predictions: 2181

Incorrect predictions:
Text:  $AVAX Stacking on support..
Sample 2:
True label: 2
Predicted label: 0
Probabilities: [0.8667128  0.00147451 0.13181275]
Text:  $BTC When you zoom out, all good And we burned off that much needed RSI
Sample 5:
True label: 1
Predicted label: 2
Probabilities: [0.33806583 0.05582584 0.6061083 ]
Text:  $BTC - quick video update on #bitcoin
Sample 6:
True label: 0
Predicted label: 2
Probabilities: [1.5299396e-01 8.1523746e-04 8.4619075e-01]
Text:  25E buy from a $2M wallet Few 10E buys earlier from similar whale wallets too All time high within touching distance… $200M is programmed $PAAL
Sample 7:
True label: 0
Predicted label: 2
Probabilities: [0.3475212  0.00519585 0.647283  ]
Text:  Now that fake Su Zhu sold his $ATOR bags it looks like it’s ready now $2+ soon
Sample 10:
True label: 2
Predicted label: 0
Probabilities: [0.97747666 0.00108301 0.02144037]
Text:  $FET here is the obstacle he must overcome to go straight t

In [None]:
# 9. Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    eval_strategy="steps",
    save_strategy="no",
    logging_strategy="epoch",  # Log every epoch
    evaluation_strategy="epoch", # Evaluate every epoch
    # load_best_model_at_end=True,
    # eval_accumulation_steps=1,
    report_to=None,          # Disable external logging (e.g., WANDB),
    # logging_steps=0.2,
    log_level='info',
    # prediction_loss_only=True
)

# 10. Define compute metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    accuracy = (preds == labels).mean()
    return {'accuracy': accuracy}

# 11. Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# 12. Train the model
trainer.train()

# 13. Evaluate on test set
test_results = trainer.evaluate(val_dataset)
print("\nTest set evaluation results:")
print(f"Test accuracy: {test_results['eval_accuracy']:.4f}")
print(f"Test loss: {test_results['eval_loss']:.4f}")


# # 13. Save the model
model.save_pretrained("./trained_bert_model")
tokenizer.save_pretrained("./trained_bert_model")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
***** Running training *****
  Num examples = 36,453
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 1,140
  Number of trainable parameters = 109,484,547


Step,Training Loss,Validation Loss,Accuracy
342,0.6484,0.61503,0.728111
684,0.4538,0.664387,0.749835
1026,0.3095,0.733738,0.752359



***** Running Evaluation *****
  Num examples = 9114
  Batch size = 128

***** Running Evaluation *****
  Num examples = 9114
  Batch size = 128

***** Running Evaluation *****
  Num examples = 9114
  Batch size = 128


Training completed. Do not forget to share your model on huggingface.co/models =)



***** Running Evaluation *****
  Num examples = 9114
  Batch size = 128


Configuration saved in ./trained_bert_model/config.json



Test set evaluation results:
Test accuracy: 0.7508
Test loss: 0.7392


Model weights saved in ./trained_bert_model/model.safetensors
tokenizer config file saved in ./trained_bert_model/tokenizer_config.json
Special tokens file saved in ./trained_bert_model/special_tokens_map.json


('./trained_bert_model/tokenizer_config.json',
 './trained_bert_model/special_tokens_map.json',
 './trained_bert_model/vocab.txt',
 './trained_bert_model/added_tokens.json',
 './trained_bert_model/tokenizer.json')

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

# # 13. Save the model
model.save_pretrained("/content/gdrive/MyDrive/CS7641-models/trained_bert_model")
tokenizer.save_pretrained("/content/gdrive/MyDrive/CS7641-models/trained_bert_model")

# print("Training completed! Model saved to './trained_bert_model'")

Configuration saved in /content/gdrive/MyDrive/CS7641-models/trained_bert_model/config.json


Mounted at /content/gdrive


Model weights saved in /content/gdrive/MyDrive/CS7641-models/trained_bert_model/model.safetensors
tokenizer config file saved in /content/gdrive/MyDrive/CS7641-models/trained_bert_model/tokenizer_config.json
Special tokens file saved in /content/gdrive/MyDrive/CS7641-models/trained_bert_model/special_tokens_map.json


('/content/gdrive/MyDrive/CS7641-models/trained_bert_model/tokenizer_config.json',
 '/content/gdrive/MyDrive/CS7641-models/trained_bert_model/special_tokens_map.json',
 '/content/gdrive/MyDrive/CS7641-models/trained_bert_model/vocab.txt',
 '/content/gdrive/MyDrive/CS7641-models/trained_bert_model/added_tokens.json',
 '/content/gdrive/MyDrive/CS7641-models/trained_bert_model/tokenizer.json')