In [25]:
import torch

In [26]:
# Report whether PyTorch can see a CUDA device; later cells place the model on 'cuda'.
torch.cuda.is_available()

True

In [27]:
import json
# Parse the combined labelled-sentence file into `dataset`
# (later cells index it by per-file keys such as 'abt22_sentences.json').
with open('./combined_labeled_sentences.json', mode='r', encoding='utf-8') as corpus_file:
    dataset = json.load(corpus_file)

In [28]:
# Sixteen source files go to training, four are held out for testing.
# Key order is preserved because it determines the row order of the frames built below.
dataset_train = {
    file_key: dataset[file_key]
    for file_key in (
        'abt22_sentences.json', 'abt23_sentences.json',
        'dal22_sentences.json', 'dal23_sentences.json',
        'ehc22_sentences.json', 'ehc23_sentences.json',
        'f22_sentences.json', 'f23_sentences.json',
        'fdx22_sentences.json', 'fdx23_sentences.json',
        'holx22_sentences.json', 'holx23_sentences.json',
        'ogn22_sentences.json', 'ogn23_sentences.json',
        'tsla22_sentences.json', 'ups22_sentences.json',
    )
}
dataset_test = {
    file_key: dataset[file_key]
    for file_key in (
        'tsla23_sentences.json', 'ups23_sentences.json',
        'zbh22_sentences.json', 'zbh23_sentences.json',
    )
}

import pandas as pd

# Flatten the per-file record lists into one flat list per split, keeping file order.
dataset_train_list = [
    record
    for file_records in dataset_train.values()
    for record in file_records
]
dataset_test_list = [
    record
    for file_records in dataset_test.values()
    for record in file_records
]

# Each record is presumably a (text, label) pair -- TODO confirm against the JSON schema.
df_train = pd.DataFrame(dataset_train_list, columns=['text', 'label'])
df_test = pd.DataFrame(dataset_test_list, columns=['text', 'label'])

In [29]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup and pairs it with its label.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of labels, aligned position-by-position with ``texts``.
        tokenizer: Object exposing ``encode_plus``; it is asked to return PyTorch tensors.
        max_len: Length each sample is truncated / padded to (``padding='max_length'``).
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the raw text, its token ids, attention mask and label for ``idx``.

        The label is converted to a float tensor with a leading dimension of 1.
        """
        sentence = self.texts[idx]
        target = self.labels[idx]

        tokenized = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer output carries a leading batch dimension; flatten() removes it.
        token_ids = tokenized['input_ids'].flatten()
        mask = tokenized['attention_mask'].flatten()
        label_tensor = torch.tensor(target, dtype=torch.float).unsqueeze(0)

        return dict(
            text=sentence,
            input_ids=token_ids,
            attention_mask=mask,
            label=label_tensor,
        )


In [30]:
model_name = "yiyanghkust/finbert-tone"

# 'FullTrain1' is loaded as a whole pickled model object rather than a state_dict
# (presumably written with the same torch.save(model, ...) pattern used by the
# training loop in this notebook). Recent PyTorch releases default torch.load to
# weights_only=True, which refuses to unpickle full modules, so the flag is explicit.
# SECURITY: unpickling can execute arbitrary code -- only load checkpoints you created.
model = torch.load('FullTrain1', weights_only=False)

# The tokenizer still comes from the base checkpoint named in model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer)



In [54]:
# Make every parameter of the model trainable.
for weight in model.parameters():
    weight.requires_grad_(True)

In [32]:
# Record the problem type on the model object first, then on its config (same order as before).
model.problem_type = model.config.problem_type = 'multi_label_classification'

In [33]:
# Make the classification head's parameters trainable.
for head_weight in model.classifier.parameters():
    head_weight.requires_grad_(True)

In [34]:
# Split configuration, kept in one place instead of being repeated as magic numbers.
SPLIT_SEED = 42      # fixed seed: the train/validation partition is identical on every run
TRAIN_SIZE = 35000   # shuffled rows used for training; the remainder becomes validation
MAX_LEN = 256        # length every sample is truncated / padded to by CustomDataset

# Seeded shuffle. Without a seed the partition changes each session, so when training
# is resumed from a saved checkpoint, rows used for validation now may have been
# training rows before, which inflates the validation metrics.
# sort_index() restores the original row order first, so re-running this cell yields
# the same permutation instead of re-shuffling an already shuffled frame.
df_train = df_train.sort_index().sample(frac=1, random_state=SPLIT_SEED)

# Materialise the columns once instead of calling .tolist() for every slice.
shuffled_texts = df_train['text'].tolist()
shuffled_labels = df_train['label'].tolist()

train_data = CustomDataset(
    texts=shuffled_texts[:TRAIN_SIZE],
    labels=shuffled_labels[:TRAIN_SIZE],
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

val_data = CustomDataset(
    texts=shuffled_texts[TRAIN_SIZE:],
    labels=shuffled_labels[TRAIN_SIZE:],
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

test_data = CustomDataset(
    texts=df_test['text'].tolist(),
    labels=df_test['label'].tolist(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

# NOTE(review): the Trainer cell in this notebook is given train_data / val_data
# directly, so these loaders do not appear to be used by it -- confirm before removing.
train_dataloader = DataLoader(train_data, batch_size=512, shuffle=True, num_workers=4, pin_memory=True)
val_dataloader = DataLoader(val_data, batch_size=512, shuffle=False, num_workers=4, pin_memory=True)

In [35]:
# Collator later passed to the Trainer; this repeats the assignment already made
# in the cell that loads the model and tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)

In [38]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(p):
    """Turn raw model outputs into accuracy and F1.

    Args:
        p: Object exposing ``predictions`` (raw logits) and ``label_ids``.

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"``.
    """
    logits = torch.tensor(p.predictions)
    # A sample is predicted positive when its sigmoid probability exceeds 0.5.
    predicted = logits.sigmoid().numpy() > 0.5
    truth = p.label_ids
    return {
        "accuracy": accuracy_score(truth, predicted),
        "f1": f1_score(truth, predicted),
    }

In [62]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Length of one train() call and batch sizing.
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    # Optimisation settings.
    warmup_steps=500,
    weight_decay=0.7,
    fp16=True,
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint.
    evaluation_strategy="epoch",
    save_strategy='epoch',
    load_best_model_at_end=True,
    # Logging cadence.
    # NOTE(review): logging_steps is presumably ignored while logging_strategy is "epoch" -- confirm.
    logging_strategy="epoch",
    logging_steps=100,
)

In [63]:
from torch import nn
from transformers import Trainer

class CustomTrainer(Trainer):
    """Trainer whose loss is BCE-with-logits with positive targets up-weighted."""

    # Multiplier applied to the loss of positive targets (BCEWithLogitsLoss pos_weight).
    POS_WEIGHT = 4.0

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted binary cross-entropy loss for one batch.

        Args:
            model: Model to run; called as ``model(**inputs)``.
            inputs: Batch mapping that must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: Accepted for compatibility with newer Trainer
                versions that pass it to ``compute_loss``; not used here.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is true.

        Raises:
            ValueError: If the batch carries no ``"labels"`` entry.
        """
        labels = inputs.get("labels")
        if labels is None:
            raise ValueError("CustomTrainer.compute_loss requires 'labels' in the batch")

        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')

        # Build the weight on the logits' own device instead of hard-coding 'cuda',
        # so the loss also works on CPU or on a non-default GPU.
        pos_weight = torch.tensor([self.POS_WEIGHT], device=logits.device, dtype=torch.float32)
        loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

        # Flatten both sides to one value per sample and align dtypes with pos_weight.
        loss = loss_fct(logits.view(-1).float(), labels.view(-1).float())
        return (loss, outputs) if return_outputs else loss

In [64]:
# Move the model to the GPU. As the last expression of the cell, the returned
# module is what gets displayed as the cell output.
model.to('cuda')

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30873, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [65]:
# Wire the model, data splits, collator and metric function into the custom-loss trainer.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
)

In [66]:
# Run two consecutive training passes; after each one, pickle the whole model
# under a name carrying the run number.
for i in (11, 12):
    trainer.train()
    print(i)
    torch.save(model, f'FullTrain{i}')
    print('saved')

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2436,0.433154,0.91598,0.809244


11
saved


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 