In [1]:
%env TOKENIZERS_PARALLELISM=false
%env WANDB_DISABLED=true
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from transformers import (
    AutoTokenizer, 
    AutoModel, 
    AdamW, 
    AutoConfig, 
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification
)

from datasets import Dataset, load_metric

import torch
import torch.nn as nn
from tqdm.auto import tqdm
from torch.utils.data import DataLoader

env: TOKENIZERS_PARALLELISM=false
env: WANDB_DISABLED=true


In [2]:
class args:
    """Notebook-wide settings, read as plain class attributes (never instantiated here)."""

    # Hub identifier of the checkpoint loaded for both the tokenizer and the model.
    model = "ProsusAI/finbert"

In [3]:
# Load the CSV, naming its two columns explicitly: the first is the sentiment
# label and the second is the message text.
df = pd.read_csv(
    'all-data.csv',
    names=['labels', 'messages'],
    encoding='ISO-8859-1',
)

In [4]:
# Put the text column first and the label column second, then preview the frame.
column_order = ['messages', 'labels']
df = df[column_order]
df.head()

Unnamed: 0,messages,labels
0,"According to Gran , the company has no plans t...",neutral
1,Technopolis plans to develop in stages an area...,neutral
2,The international electronic industry company ...,negative
3,With the new production plant the company woul...,positive
4,According to the company 's updated strategy f...,positive


In [5]:
# Replace the string sentiment labels with integer class ids. `le` is kept so
# the ids can be mapped back to their original names via `le.classes_`.
le = LabelEncoder().fit(df['labels'])
df['labels'] = le.transform(df['labels'])

# Show how many rows fall into each encoded class.
df['labels'].value_counts()

1    2879
2    1363
0     604
Name: labels, dtype: int64

In [6]:
# Fixed seed so both splits, and therefore every metric computed on them,
# are identical after a kernel restart and full re-run.
SPLIT_SEED = 42

X, y = df['messages'].values, df['labels'].values

# train : test = 0.9 : 0.1
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=SPLIT_SEED
)

# train : valid = 0.8 : 0.2
xtrain, xvalid, ytrain, yvalid = train_test_split(
    xtrain, ytrain, test_size=0.2, stratify=ytrain, random_state=SPLIT_SEED
)

# train : valid : test = 0.72 : 0.18 : 0.10 (stratified on 'labels')
# Sanity check: the three partitions together cover every row exactly once.
assert len(xtrain) + len(xvalid) + len(xtest) == len(X)
assert len(xtrain) == len(ytrain) and len(xvalid) == len(yvalid) and len(xtest) == len(ytest)

In [7]:
# Wrap the train and validation splits as `datasets.Dataset` objects with a
# 'text' column (the messages) and a 'labels' column (the encoded classes).
train_dataset_raw = Dataset.from_dict(dict(text=xtrain, labels=ytrain))
valid_dataset_raw = Dataset.from_dict(dict(text=xvalid, labels=yvalid))

In [8]:
def tokenize_fn(examples):
    """Tokenize the 'text' field of `examples` with the notebook-level `tokenizer`.

    Truncation is enabled; no padding is requested here.

    Args:
        examples: mapping that provides the texts under the key 'text'.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True)

In [9]:
# Fetch the tokenizer that belongs to the checkpoint named in `args.model`.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=args.model,
)

Downloading:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/252 [00:00<?, ?B/s]

In [10]:
# Inspect the raw training split (its features and row count) before tokenization.
train_dataset_raw

Dataset({
    features: ['text', 'labels'],
    num_rows: 3488
})

In [12]:
# Tokenize both splits in batches (train first, then validation).
train_dataset, valid_dataset = (
    raw_split.map(tokenize_fn, batched=True)
    for raw_split in (train_dataset_raw, valid_dataset_raw)
)

# Collator built from the same tokenizer; it is handed to the Trainer to
# assemble batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [13]:
# Tie the config's label names to the integer ids produced by `le`, so that
# `model.config.id2label` / `label2id` report the class that was actually
# trained for each output index instead of whatever names the checkpoint
# shipped with.
id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

# NOTE(review): the checkpoint's own classification head may have been trained
# with a different label order than `le` uses. If so, the pretrained head
# starts out misaligned with these ids and fine-tuning has to relearn the
# mapping. Confirm against the checkpoint's config; encoding the labels with
# the checkpoint's original `label2id` would avoid that.
model = AutoModelForSequenceClassification.from_pretrained(
    args.model,
    id2label=id2label,
    label2id=label2id,
)

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [14]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_preds):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_preds: pair that unpacks into (logits, labels).

    Returns:
        dict with the single key 'accuracy'.
    """
    scores, references = eval_preds
    # The predicted class is the index of the largest score along the last axis.
    predicted = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(references, predicted)
    return {'accuracy': accuracy}

In [15]:
# Hyper-parameters and run options for fine-tuning.
train_args = TrainingArguments(
    output_dir='./Finbert Trained/',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=2*16,  # evaluation uses twice the training batch size
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    do_eval=True,
    do_train=True,
    do_predict=True,
    evaluation_strategy='epoch',  # evaluate on the validation set once per epoch
    save_strategy="no",           # do not write intermediate checkpoints
    # Disable external logging integrations through the supported argument
    # rather than relying only on the deprecated WANDB_DISABLED env variable.
    report_to='none',
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [16]:
# Assemble the Trainer from the model, the training arguments, the tokenized
# splits, the collator and the accuracy metric defined earlier.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [17]:
# Run fine-tuning with the settings in `train_args`; the validation set is
# evaluated once per epoch (evaluation_strategy='epoch').
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 