In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [2]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import os
import re
from transformers import TrainerCallback
from sklearn.metrics import accuracy_score, f1_score, precision_score
from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer
# Set the WANDB_DISABLED environment variable for this process
# (presumably to keep the Hugging Face Trainer from logging to Weights & Biases — confirm)
os.environ["WANDB_DISABLED"] = "true"

def remove_wallets(text, max_word_len=40):
    """Drop very long whitespace-delimited tokens (such as wallet addresses).

    Args:
        text: Input value; it is converted with ``str()`` before processing.
        max_word_len: Tokens whose length is greater than or equal to this
            value are removed. Defaults to 40.

    Returns:
        The surviving tokens joined by single spaces. Because the input is
        split on arbitrary whitespace, runs of whitespace collapse to one
        space and leading/trailing whitespace disappears.
    """
    kept = [word for word in str(text).split() if len(word) < max_word_len]
    return ' '.join(kept)

def clean_text(text):
    """Normalize a raw tweet-like string before it is fed to the classifier.

    The steps run in this order: strip CJK ideographs, URLs, mentions,
    hashtags, ``$``- and ``/``-prefixed runs; apply a few literal
    replacements; collapse repeated spaces; drop ``@`` and ``|``; normalize
    runs of dots to ``...``; lower-case; drop double quotes; and finally
    remove very long tokens via ``remove_wallets``.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        The cleaned, lower-cased string without leading/trailing whitespace.
    """
    text = str(text)

    # Remove CJK unified ideographs (only the U+4E00-U+9FFF range is covered)
    text = re.sub(r'[\u4e00-\u9fff]+', '', text)

    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\S+', '', text)

    # Remove mentions and hashtags
    text = re.sub(r'[@][A-Za-z0-9_]+', '', text)
    text = re.sub(r'[#][A-Za-z0-9_]+', '', text)

    # Remove `$`- and `/`-prefixed runs.
    # NOTE(review): both character classes contain a space, so the match runs
    # on through following words until the first character outside
    # [A-Za-z0-9_ ] (e.g. punctuation), not just the symbol itself. Left
    # unchanged because tightening it would alter the cleaned dataset —
    # confirm whether this is intended.
    text = re.sub(r'[$][A-Za-z0-9_ ]+', '', text)
    text = re.sub(r'[/][A-Za-z0-9_ ]+', '', text)

    # Replace specific patterns
    text = re.sub(r'RT : ', '', text)
    # `&;` must be handled before the bare `&` rule; once every `&` has
    # become "and" this pattern could never match.
    text = re.sub(r'&;', '\'', text)
    text = re.sub(r'&', 'and', text)
    text = re.sub(r'â€™', '\'', text)
    # NOTE(review): two substitutions that had no effect as written
    # (apostrophe -> apostrophe, and empty pattern -> empty string) are not
    # applied here. If they were meant to target mis-encoded characters,
    # add them back with the intended pattern.
    text = re.sub(r'\.X', '', text)
    text = re.sub(r'\.x', '', text)

    # Clean up multiple spaces
    text = re.sub(r'  +', ' ', text)

    # Remove leftover @ symbols and pipe characters
    text = re.sub(r'@', '', text)
    text = re.sub(r' \| ', '', text)
    text = re.sub(r'\|', '', text)

    # Normalize runs of two or more dots to an ellipsis
    text = re.sub(r'\.\.+', '...', text)

    # Convert to lowercase and remove double quotes
    text = text.lower()
    text = re.sub(r'"', '', text)

    # Remove very long tokens; this also re-normalizes whitespace
    text = remove_wallets(text)

    return text.strip()

def sentiment_map(text):
    """Translate a sentiment label into an integer class id.

    Returns 0 when ``text`` contains 'Bullish', otherwise 1 when it contains
    'Neutral', and 2 for everything else.
    """
    for class_id, keyword in enumerate(('Bullish', 'Neutral')):
        if keyword in text:
            return class_id
    return 2

In [3]:
import torch
import numpy as np
import random

# Seed every random number generator used in this notebook with one value
seed = 42
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(seed)

# cuDNN settings for reproducibility
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Create a generator for the split, seeded with the same value
generator = torch.Generator()
generator.manual_seed(seed)


In [4]:
# Fetch the crypto financial-tweets dataset and keep a handle on its 'train' split
data = load_dataset("StephanAkkerman/financial-tweets-crypto")
train_dataset_ori = data['train']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

crypto.csv:   0%|          | 0.00/54.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/57935 [00:00<?, ? examples/s]

In [5]:

# Always start from `data['train']` so this cell can be re-run on its own:
# by the end of the cell `train_dataset_ori` is a pandas DataFrame, which can
# no longer be filtered with the row-wise lambdas below.
tweets_all = data['train']
print(f'No. of data: {len(tweets_all)}')

# Keep only rows that carry a sentiment label
tweets_labeled = tweets_all.filter(lambda row: row['sentiment'] is not None)
print(f'No. of data after remove sentiment equals to none: {len(tweets_labeled)}')

# Drop quote tweets
tweets_no_quotes = tweets_labeled.filter(lambda row: row['tweet_type'] != 'quote tweet')
print(f'No. of data after remove quote tweet: {len(tweets_no_quotes)}')

# Drop descriptions that consist of a single space-separated token
tweets_multiword = tweets_no_quotes.filter(lambda row: len(row['description'].split(' ')) > 1)
print(f'No. of data after remove short text: {len(tweets_multiword)}')

# Switch to pandas, clean the text, then drop exact duplicate rows.
# ignore_index=True renumbers the rows 0..n-1.
tweets_df = tweets_multiword.to_pandas()
tweets_df['description'] = tweets_df['description'].apply(clean_text)
tweets_df = tweets_df.drop_duplicates(ignore_index=True)
# Make sure this count stays the same after removing duplicates; otherwise the
# train/val/test split below may differ.
print(f'No. of data after remove duplicates: {len(tweets_df)}')

# Integer class id for the ground-truth sentiment
tweets_df['sentiment_label'] = tweets_df['sentiment'].apply(sentiment_map)

train_dataset_ori = tweets_df



No. of data: 57935


Filter:   0%|          | 0/57935 [00:00<?, ? examples/s]

No. of data after remove sentiment equals to none: 48692


Filter:   0%|          | 0/48692 [00:00<?, ? examples/s]

No. of data after remove quote tweet: 46866


Filter:   0%|          | 0/46866 [00:00<?, ? examples/s]

No. of data after remove short text: 45567
No. of data after remove duplicates: 45567


In [6]:
# Reproducible 80/10/10 split over row positions
num_samples = len(train_dataset_ori)

# Shuffle the positions 0..num_samples-1 with a fixed seed
np.random.seed(42)
indices = np.arange(num_samples)
np.random.shuffle(indices)

# 80% train, 10% validation, and whatever remains (about 10%) for test
train_size = int(num_samples * 0.8)
val_size = int(num_samples * 0.1)
test_size = num_samples - train_size - val_size

# Cut the shuffled positions at the two boundaries
train_idx, val_idx, test_idx = np.split(indices, [train_size, train_size + val_size])

# Report the size of each split
print(f"Train size: {len(train_idx)}")
print(f"Validation size: {len(val_idx)}")
print(f"Test size: {len(test_idx)}")

Train size: 36453
Validation size: 4556
Test size: 4558


In [7]:
# Select the held-out rows by index label and copy them, so that columns added
# to `test_dataset` later are written to an independent frame rather than to a
# selection of `train_dataset_ori`.
test_dataset = train_dataset_ori.loc[test_idx].copy()

In [8]:
# Inspect the cleaned descriptions of the held-out rows
test_dataset['description']

Unnamed: 0,description
22423,good coin > []( > . just waiting on trigger.
11094,after that daily im fairly confident the 5 wav...
18642,": the 200 ema on the 4h has acted as support, ..."
38163,live in 15 minutes! > []( > join me live at 10...
39850,live in 5 minutes! > []( > join me live in 1 h...
...,...
11284,keep an eye on this may drop from here on brea...
44732,tomorrow i will update my thoughts on 🤝
38158,added . looks quite decent. will share plans s...
860,. full send. exe 0.10 next


In [9]:
# Pretrained checkpoint that is evaluated on the held-out rows
model_name = "ElKulako/cryptobert"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Classification pipeline; the truncation/padding options fix the input length at 64
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    max_length=64,
    truncation=True,
    padding='max_length',
)

# Classify every held-out description, in row order
test_texts = test_dataset['description'].to_list()
preds = pipe(test_texts)

tokenizer_config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/932 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Device set to use cuda:0


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [10]:
# Peek at the first prediction to see the pipeline's output format
preds[0]

{'label': 'Neutral', 'score': 0.5603097677230835}

In [11]:
# Attach the raw pipeline outputs to the test rows; `preds` was produced from
# test_dataset['description'] in row order, so the two line up one-to-one
test_dataset['pred_dict'] = preds

In [12]:
# Predicted label name -> integer class id, using the same ids that the
# sentiment_map() helper assigns to the ground truth (Bullish=0, Neutral=1,
# anything else=2). The dict has its own name so the sentiment_map() function
# is not overwritten and the preprocessing cell stays re-runnable.
pred_label_to_id = {'Bullish': 0, 'Neutral': 1, 'Bearish': 2}
test_dataset['pred_sent_label'] = test_dataset['pred_dict'].apply(lambda x: pred_label_to_id[x['label']])

In [13]:
# Ground-truth and predicted class ids for the held-out rows
y_true = test_dataset['sentiment_label']
y_pred = test_dataset['pred_sent_label']

# Share of rows whose predicted id equals the ground-truth id
accuracy = accuracy_score(y_true, y_pred)

# F1-score, macro-averaged over the classes
f1 = f1_score(y_true, y_pred, average='macro')

# average=None: one precision value per class instead of a single aggregate
prec = precision_score(y_true, y_pred, average=None)

In [14]:
# Accuracy of the pretrained model on the held-out rows
accuracy

0.3905221588415972

In [15]:
# Macro-averaged F1 on the held-out rows
f1

0.3368324376616534

In [16]:
# Per-class precision (presumably ordered by class id: 0=Bullish, 1=Neutral, 2=Bearish — confirm)
prec

array([0.69792252, 0.17168391, 0.26420891])