In [1]:
%pip install transformers evaluate peft

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting peft
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl.metadata (29 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, peft, evaluate
Successfully installed evaluate-0.4.1 peft-0.10.0 responses-0.18.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm 
from transformers import AutoTokenizer,DataCollatorWithPadding,AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
import evaluate
import torch
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
from sklearn.metrics import classification_report  


# Class index -> sentiment name used when fine-tuning in this notebook.
id2label = dict(enumerate(("positive", "neutral", "negative")))
# Inverse lookup, derived from id2label so the two mappings cannot drift apart.
label2id = {sentiment: class_id for class_id, sentiment in id2label.items()}

2024-04-18 19:45:02.469679: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-18 19:45:02.469773: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-18 19:45:02.577868: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
def model_prediction(model, tokenizer, text_train, text_test, labels_train, labels_test, labels):
    """Predict a sentiment label for every text and print classification reports.

    Each text is tokenized and classified one at a time; the arg-max class id
    is translated to its name through the notebook-level ``id2label`` mapping.

    Args:
        model: Sequence-classification model; inputs are moved to the device
            of its first parameter.
        tokenizer: Tokenizer matching ``model``.
        text_train: Iterable of training texts.
        text_test: Iterable of held-out texts.
        labels_train: Gold label names aligned with ``text_train``.
        labels_test: Gold label names aligned with ``text_test``.
        labels: Collection of label names. Duplicates are allowed (callers in
            this notebook pass the full label column); only the distinct
            values are used, as the classes listed in the reports.

    Returns:
        None. The train and test reports are printed.
    """
    device = next(model.parameters()).device  # Get the device of the model

    def _predict(texts):
        # Classify one text at a time and collect the predicted label names.
        preds = []
        for text in tqdm(texts):
            # truncation=True keeps over-long texts within the model's input limit.
            inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)
            predicted_class_id = model(**inputs).logits.argmax(dim=-1).item()
            preds.append(id2label[predicted_class_id])
        return preds

    # Inference only: switch dropout off and skip autograd bookkeeping, then
    # put the model back in whatever mode the caller left it in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            train_preds = _predict(text_train)
            test_preds = _predict(text_test)
    finally:
        model.train(was_training)

    # `labels` may hold one entry per row, so reduce it to the distinct class
    # names before handing it to scikit-learn.
    class_names = sorted(set(labels))
    train_report = classification_report(
        labels_train, train_preds, labels=class_names, digits=3, zero_division=0
    )
    test_report = classification_report(
        labels_test, test_preds, labels=class_names, digits=3, zero_division=0
    )

    print("Train Classification Report: ")
    print(train_report)

    print("\nTest Classification Report: ")
    print(test_report)


In [4]:
class BertDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class ids.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to a
            per-example sequence of values, as returned by a tokenizer call
            on a list of texts.
        labels: Label names, one per example.
        label2id: Mapping from label name to integer class id. A label that
            is missing from it raises ``KeyError``.
    """

    def __init__(self, encodings, labels, label2id):
        self.encodings = encodings
        self.labels = [label2id[value] for value in labels]

    def __getitem__(self, idx):
        """Return the features and ``labels`` tensor for example ``idx``."""
        # Tensors stay on the CPU: the Trainer moves each collated batch to
        # the training device itself, and a hard-coded 'cuda:0' here would
        # fail on machines without a GPU.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples (one per label)."""
        return len(self.labels)


In [5]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for a Trainer evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions`` holds
            per-class scores of shape ``(n_examples, n_classes)`` (or a tuple
            whose first element does) and ``labels`` the gold class ids.

    Returns:
        dict: ``{"accuracy": <float in [0, 1]>}``.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs alongside the logits; keep the logits.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_ids = np.argmax(predictions, axis=1)
    # Computed directly with NumPy rather than re-loading the `evaluate`
    # accuracy metric on every evaluation pass; the returned dict has the
    # same single "accuracy" key.
    return {"accuracy": float(np.mean(predicted_ids == np.asarray(labels)))}

In [9]:
def model_train(model,tokenizer,path,name):
    """Fine-tune ``model`` with a LoRA adapter on the CSV at ``path``.

    The CSV is read without a header row; its two columns are taken as the
    label and the text. The data is split 80/20 (fixed seed 42), tokenized
    with truncation, wrapped in ``BertDataset`` and trained with a Hugging
    Face ``Trainer`` that evaluates and checkpoints once per epoch.

    Args:
        model: Base sequence-classification model to wrap with LoRA.
        tokenizer: Tokenizer matching ``model``.
        path: Location of the CSV file.
        name: Short model tag; it names the output directory
            (``new_model_<name>``) and, when equal to ``'distilbert'``,
            restricts LoRA to the ``q_lin`` modules.

    Returns:
        tuple: ``(model, text_train, text_test, labels_train, labels_test,
        labels)`` where ``model`` is the trainer's model after training and
        ``labels`` is the full, unsplit label column.
    """
    frame = pd.read_csv(path, encoding='latin-1', header=None)
    frame.columns = ['labels', 'text']

    collator = DataCollatorWithPadding(tokenizer=tokenizer)

    texts = frame["text"].to_list()
    labels = frame["labels"].to_list()
    text_train, text_test, labels_train, labels_test = train_test_split(
        texts,
        labels,
        test_size=0.20,
        random_state=42,
    )

    # Padding is left to the collator, so only truncation is requested here.
    encoded_train = tokenizer(text_train, truncation=True)
    encoded_test = tokenizer(text_test, truncation=True)

    print("train size:", len(labels_train))
    print("test size:", len(labels_test))

    train_dataset = BertDataset(encoded_train, labels_train, label2id)
    test_dataset = BertDataset(encoded_test, labels_test, label2id)

    # Shared LoRA settings; only the DistilBERT run names its target modules.
    lora_kwargs = {
        "task_type": "SEQ_CLS",
        "r": 8,
        "lora_alpha": 32,
        "lora_dropout": 0.01,
    }
    if name == 'distilbert':
        lora_kwargs["target_modules"] = ['q_lin']
    peft_config = LoraConfig(**lora_kwargs)

    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    # Use the first GPU when CUDA is available, otherwise stay on the CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")
    model.to(device)

    training_args = TrainingArguments(
        output_dir=f"new_model_{name}",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=15,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    return trainer.model, text_train, text_test, labels_train, labels_test, labels

In [8]:
path = '/kaggle/input/sentiment-analysis-for-financial-news/all-data.csv'

# Tokenizer and classifier come from the same Hub checkpoint, so name it once.
financialbert_checkpoint = "ahmedrachid/FinancialBERT-Sentiment-Analysis"
tokenizer_FinancialBERT = AutoTokenizer.from_pretrained(financialbert_checkpoint)
model_FinancialBERT = AutoModelForSequenceClassification.from_pretrained(
    financialbert_checkpoint,
    num_labels=3,
)

tokenizer_config.json:   0%|          | 0.00/369 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/464k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [10]:
# No explicit .to("cuda:0") here: model_train moves the model to the GPU when
# one is available and falls back to the CPU otherwise, so a hard-coded device
# would only make this cell fail on machines without CUDA.
model_FinancialBERT,text_train,text_test,labels_train,labels_test,labels=model_train(model_FinancialBERT,tokenizer_FinancialBERT,path,"FinancialBERT")

train size: 3876
test size: 970
trainable params: 297,219 || all params: 110,051,334 || trainable%: 0.27007305517986724


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.76962,0.584536
2,No log,0.641242,0.71134
3,No log,0.534681,0.768041
4,No log,0.480188,0.795876
5,1.377700,0.430278,0.824742
6,1.377700,0.389697,0.836082
7,1.377700,0.348974,0.858763
8,1.377700,0.314565,0.88866
9,0.390900,0.280993,0.910309
10,0.390900,0.251621,0.920619


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]



In [None]:
# Evaluate the model trained in the cell above. The finbert model and
# tokenizer are only created further down, so referencing them here raised a
# NameError when the notebook was run top to bottom.
model_prediction(model_FinancialBERT,tokenizer_FinancialBERT,text_train,text_test,labels_train,labels_test,labels)

In [11]:
# Tokenizer and classifier come from the same Hub checkpoint, so name it once.
finbert_checkpoint = "ProsusAI/finbert"
tokenizer_finbert = AutoTokenizer.from_pretrained(finbert_checkpoint)
model_finbert = AutoModelForSequenceClassification.from_pretrained(
    finbert_checkpoint,
    num_labels=3,
)

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [12]:
# No explicit .to("cuda:0") here: model_train moves the model to the GPU when
# one is available and falls back to the CPU otherwise, so a hard-coded device
# would only make this cell fail on machines without CUDA.
model_finbert,text_train,text_test,labels_train,labels_test,labels=model_train(model_finbert,tokenizer_finbert,path,"finbert")

train size: 3876
test size: 970
trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.951978,0.601031
2,No log,0.596295,0.786598
3,No log,0.520068,0.804124
4,No log,0.456558,0.806186
5,0.942400,0.374503,0.878351
6,0.942400,0.308682,0.896907
7,0.942400,0.277229,0.903093
8,0.942400,0.267559,0.906186
9,0.348900,0.261233,0.906186
10,0.348900,0.258094,0.909278




In [17]:
# Tokenizer and classifier come from the same Hub checkpoint, so name it once.
distilbert_checkpoint = "lxyuan/distilbert-base-multilingual-cased-sentiments-student"
tokenizer_distilbert = AutoTokenizer.from_pretrained(distilbert_checkpoint)
model_distilbert = AutoModelForSequenceClassification.from_pretrained(
    distilbert_checkpoint,
    num_labels=3,
)

In [18]:
# No explicit .to("cuda:0") here: model_train moves the model to the GPU when
# one is available and falls back to the CPU otherwise, so a hard-coded device
# would only make this cell fail on machines without CUDA.
model_distilbert,text_train,text_test,labels_train,labels_test,labels=model_train(model_distilbert,tokenizer_distilbert,path,"distilbert")

train size: 3876
test size: 970
trainable params: 666,627 || all params: 135,993,606 || trainable%: 0.4901899579014031


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.817471,0.591753
2,No log,0.759601,0.620619
3,No log,0.713132,0.669072
4,No log,0.677984,0.695876
5,0.773800,0.654519,0.707216
6,0.773800,0.634968,0.720619
7,0.773800,0.622085,0.727835
8,0.773800,0.611244,0.736082
9,0.611400,0.602229,0.739175
10,0.611400,0.595562,0.742268




In [19]:
# Save each fine-tuned model into its own directory. The second call used to
# write model_FinancialBERT into 'model_distilbert', so the DistilBERT model
# was never saved.
model_FinancialBERT.save_pretrained('model_FinancialBERT')
model_distilbert.save_pretrained('model_distilbert')
model_finbert.save_pretrained('model_finbert')

In [None]:
adapter_dir = '/content/drive/MyDrive/model_FinancialBERT'
lora_config = LoraConfig.from_pretrained(adapter_dir)
# get_peft_model(base, lora_config) would attach a brand-new, untrained
# adapter; PeftModel.from_pretrained loads the saved adapter weights onto the
# base model instead. The adapter is loaded for inference by default; pass
# is_trainable=True to keep training it.
# NOTE(review): model_FinancialBERT is expected to be the freshly loaded base
# checkpoint here, not a model already wrapped by PEFT - confirm cell order.
model = PeftModel.from_pretrained(model_FinancialBERT, adapter_dir)

In [None]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig


In [None]:
# Tokenizer and classifier come from the same Hub checkpoint, so name it once.
financialbert_checkpoint = "ahmedrachid/FinancialBERT-Sentiment-Analysis"
tokenizer_FinancialBERT = AutoTokenizer.from_pretrained(financialbert_checkpoint)
model_FinancialBERT = AutoModelForSequenceClassification.from_pretrained(
    financialbert_checkpoint,
    num_labels=3,
)