In [None]:
# Install the Hugging Face libraries used below.
# NOTE(review): neither install is pinned; the recorded output shows this run
# resolved transformers 4.28.1, tokenizers 0.13.3 and huggingface-hub 0.13.4,
# so a fresh run may pick up different releases.
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://us

In [None]:
import pandas as pd
from ast import literal_eval
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, BertModel, RobertaForSequenceClassification, AutoModelForSequenceClassification
from transformers import AutoTokenizer, BertTokenizer, BertTokenizerFast, BertConfig, RobertaTokenizer
from datasets import Dataset
import torch
import numpy as np
from scipy.special import softmax, expit

import os
#os.environ['CUDA_VISIBLE_DEVICES']='1'

In [None]:
import random

seed_val = 34

def setup_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), then configures cuDNN for repeatable runs.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Deterministic kernels alone are not sufficient: if benchmark mode is on,
    # cuDNN may auto-tune to a different algorithm from one run to the next,
    # so it is pinned off explicitly alongside the deterministic flag.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

setup_seed(seed_val)

In [None]:
# Select the GPU when CUDA is available, otherwise fall back to the CPU, and show the choice.
# NOTE(review): `device` is not used by any other visible cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [None]:
# Colab-only: mount Google Drive so the SemEval-2018 Task 1 data stored there is reachable.
from google.colab import drive
drive.mount('/content/drive')

# Copy the English V-oc folder to /content/data; later cells read it through the relative path 'data/...'.
# NOTE(review): this presumably relies on the working directory being /content — confirm.
!cp -av /content/drive/MyDrive/PolyU/comp6709/SemEval2018-Task1-all-data/English/V-oc /content/data

In [None]:
# Load the train / test / dev splits of the V-oc task, keeping only the tweet
# text and its 'Intensity Class' annotation from each tab-separated file.
voc_columns = ['Tweet', 'Intensity Class']
train_df_voc = pd.read_csv('data/2018-Valence-oc-En-train.txt', sep='\t')[voc_columns]
test_df_voc = pd.read_csv('data/2018-Valence-oc-En-test-gold.txt', sep='\t')[voc_columns]
valid_df_voc = pd.read_csv('data/2018-Valence-oc-En-dev.txt', sep='\t')[voc_columns]

In [None]:
# Preview the first rows of the training split as loaded.
train_df_voc.head()

In [None]:
# Distribution of the 'Intensity Class' values in the training split.
train_df_voc['Intensity Class'].value_counts()

In [None]:
def change_label(df):
    """Collapse the raw 'Intensity Class' strings into three sentiment ids, in place.

    The text before the first ':' of every value is parsed with ``int`` and
    its sign selects the new label:

    * negative -> 1 (neg)
    * positive -> 2 (pos)
    * zero     -> 0 (neu)

    The column is converted in a single pass instead of row by row, so the
    result is an integer column and the conversion does not depend on the
    frame's index (duplicate or non-default index labels are fine).

    Args:
        df: DataFrame with an 'Intensity Class' column of strings.

    Returns:
        The same DataFrame object, with 'Intensity Class' replaced by the ids.

    Raises:
        AttributeError: if a value is not a string (for example when the
            column has already been converted).
        ValueError: if the text before ':' is not an integer.
    """
    valence = np.asarray(
        df['Intensity Class'].map(lambda raw: int(raw.split(':')[0])),
        dtype=int,
    )
    # Assigning a plain array is positional, so index labels play no role.
    df['Intensity Class'] = np.select([valence < 0, valence > 0], [1, 2], default=0)
    return df

In [None]:
# Convert the raw intensity strings to sentiment ids, then switch every split
# to the 'sentence' / 'label' column names used by the rest of the notebook.
column_names = {'Tweet': 'sentence', 'Intensity Class': 'label'}

train_df_voc = change_label(train_df_voc).rename(columns=column_names)
valid_df_voc = change_label(valid_df_voc).rename(columns=column_names)
test_df_voc = change_label(test_df_voc).rename(columns=column_names)

train_df_voc.head()

In [None]:
# Distribution of the 'label' column in the training split.
train_df_voc['label'].value_counts()

In [None]:
def make_dataset(df, tokenizer, max_length=128):
  """Turn a DataFrame into a tokenized ``Dataset`` formatted as torch tensors.

  Args:
    df: DataFrame with a 'sentence' column (text to tokenize) and a 'label' column.
    tokenizer: Callable applied to each batch of sentences; its output is
      expected to provide 'input_ids' and 'attention_mask'.
    max_length: Length every sequence is truncated and padded to.
      Defaults to 128, the value previously hard-coded here.

  Returns:
    The dataset, exposing only 'input_ids', 'attention_mask' and 'label'.
  """
  def tokenize_batch(batch):
    # Truncate and pad to a fixed length so every example has the same shape.
    return tokenizer(batch['sentence'], truncation=True, padding='max_length', max_length=max_length)

  dataset = Dataset.from_pandas(df)
  dataset = dataset.map(tokenize_batch, batched=True)
  dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
  return dataset

In [None]:
def train_model(train_df, valid_df, model_name, dir_model):
  """Fine-tune a 3-class sequence classifier and return its tokenizer and Trainer.

  Args:
    train_df: DataFrame used for training; converted with ``make_dataset``.
    valid_df: DataFrame evaluated at the end of every epoch.
    model_name: One of 'bert', 'roberta' or 'bertweet'.
    dir_model: Run name; checkpoints are written to './<dir_model>' and
      logs to './logs_<dir_model>'.

  Returns:
    A ``(tokenizer, trainer)`` tuple, with ``trainer.train()`` already run.

  Raises:
    ValueError: if ``model_name`` is not one of the supported names.
  """
  supported_models = ('bert', 'roberta', 'bertweet')
  # Fail fast with a clear message instead of hitting an unbound local later.
  if model_name not in supported_models:
    raise ValueError(
        'Unsupported model_name ' + repr(model_name) + '; expected one of ' + ', '.join(supported_models))

  print('-----train-----')

  # Resolve tokenizer class, model class and checkpoint together so the two
  # can never get out of sync.
  tokenizer_kwargs = {}
  if model_name == 'bert':
    checkpoint = 'bert-base-uncased'
    tokenizer_cls = BertTokenizer
    model_cls = BertForSequenceClassification
  elif model_name == 'roberta':
    checkpoint = 'roberta-base'
    tokenizer_cls = RobertaTokenizer
    model_cls = RobertaForSequenceClassification
  else:  # 'bertweet'
    checkpoint = 'vinai/bertweet-base'
    tokenizer_cls = AutoTokenizer
    model_cls = AutoModelForSequenceClassification
    tokenizer_kwargs = {'use_fast': False}

  tokenizer = tokenizer_cls.from_pretrained(checkpoint, **tokenizer_kwargs)

  dataset_train = make_dataset(train_df, tokenizer)
  dataset_val = make_dataset(valid_df, tokenizer)

  # NOTE(review): warmup_steps is a fixed 500; if the whole run has fewer
  # optimizer steps than that (small training set, 8 epochs, batch size 32)
  # the learning rate never reaches its peak — confirm against the data size.
  # NOTE(review): newer transformers releases rename `evaluation_strategy`
  # to `eval_strategy`; the recorded run used transformers 4.28.1.
  training_args = TrainingArguments(
                  output_dir='./'+dir_model,          # output directory
                  num_train_epochs=8,              # total # of training epochs
                  per_device_train_batch_size=32,  # batch size per device during training
                  per_device_eval_batch_size=16,   # batch size for evaluation
                  warmup_steps=500,                # number of warmup steps for learning rate scheduler
                  weight_decay=0.01,               # strength of weight decay
                  logging_dir='./logs_'+dir_model,            # directory for storing logs
                  evaluation_strategy="epoch",     # evaluate on the validation set every epoch
                  save_strategy="epoch",           # checkpoint every epoch (same cadence as evaluation)
                  load_best_model_at_end = True,   # reload the best checkpoint when training finishes
                  seed=seed_val,
                  overwrite_output_dir=True,
  )

  model = model_cls.from_pretrained(checkpoint, num_labels=3)
  model.train()

  trainer = Trainer(
      model=model,                         # the instantiated 🤗 Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=dataset_train,         # training dataset
      eval_dataset=dataset_val,            # evaluation dataset
  )

  trainer.train()

  return tokenizer, trainer


In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report, precision_recall_fscore_support

In [None]:
def eval_model(trainer, test_df, tokenizer, overall_types):
  """Predict on ``test_df`` and print classification metrics.

  Args:
    trainer: Object whose ``predict(dataset)`` returns ``predictions``
      (per-class scores, one row per example) and ``metrics``.
    test_df: DataFrame with 'sentence' and 'label' columns. It is not modified.
    tokenizer: Tokenizer passed to ``make_dataset``.
    overall_types: Run identifier; currently unused, kept for the existing call sites.

  Returns:
    A ``(metrics, result_df)`` tuple where ``result_df`` is a copy of
    ``test_df`` with an added 'predictions' column of predicted class ids.
  """
  print('-----eval-----')
  dataset_test = make_dataset(test_df, tokenizer)
  predict_data = trainer.predict(dataset_test)
  metrics = predict_data.metrics
  print(metrics)

  pre_labels = np.argmax(predict_data.predictions, axis=1).flatten()

  # Work on a copy so the caller's frame (shared between model runs) stays
  # untouched, and assign the raw array so predictions are matched to rows by
  # position rather than by index label.
  result_df = test_df.copy()
  result_df['predictions'] = pre_labels

  y_true = result_df['label'].values.tolist()
  y_pred = result_df['predictions'].values.tolist()
  report = classification_report(y_true, y_pred)
  print(report)

  precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')
  acc = accuracy_score(y_true, y_pred)
  print('precision: ' + str(precision))
  print('recall: ' + str(recall))
  print('f1: ' + str(f1))
  print('accuracy: ' + str(acc))

  return metrics, result_df

In [None]:
# voc, bert
# Train the 'bert' configuration on the training split, validating on the dev split.

tokenizer, trainer_voc = train_model(train_df_voc, valid_df_voc, 'bert', 'voc-bert')

# Score the test split and write the per-row predictions to CSV.
metrics_voc, pred_df_voc = eval_model(trainer_voc, test_df_voc, tokenizer, 'voc-bert')
pred_df_voc.to_csv('pred_df_voc-bert.csv', index=False)

# Save the model held by the trainer.
trainer_voc.save_model('./model_save_voc-bert')

In [None]:
# voc, roberta
# Train the 'roberta' configuration on the training split, validating on the dev split.
# NOTE: this rebinds tokenizer / trainer_voc / metrics_voc / pred_df_voc.

tokenizer, trainer_voc = train_model(train_df_voc, valid_df_voc, 'roberta', 'voc-roberta')

# Score the test split and write the per-row predictions to CSV.
metrics_voc, pred_df_voc = eval_model(trainer_voc, test_df_voc, tokenizer, 'voc-roberta')
pred_df_voc.to_csv('pred_df_voc-roberta.csv', index=False)

# Save the model held by the trainer.
trainer_voc.save_model('./model_save_voc-roberta')

In [None]:
# voc, bertweet
# Train the 'bertweet' configuration on the training split, validating on the dev split.
# NOTE: this rebinds tokenizer / trainer_voc / metrics_voc / pred_df_voc.

tokenizer, trainer_voc = train_model(train_df_voc, valid_df_voc, 'bertweet', 'voc-bertweet')

# Score the test split and write the per-row predictions to CSV.
metrics_voc, pred_df_voc = eval_model(trainer_voc, test_df_voc, tokenizer, 'voc-bertweet')
pred_df_voc.to_csv('pred_df_voc-bertweet.csv', index=False)

# Save the model held by the trainer.
trainer_voc.save_model('./model_save_voc-bertweet')