In [1]:
import torch
from torch.optim import AdamW
from transformers import (AutoTokenizer,
                          AutoConfig,
                          AutoModelForSequenceClassification,
                          TrainingArguments,
                          Trainer,
                          get_linear_schedule_with_warmup,)
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import os
import evaluate

In [2]:
def load_hope_data(data_file, label_map, classification_type):
    df = pd.read_csv(data_file)
    texts = df['text'].tolist()
    labels = [label_map[sentiment] for sentiment in df[classification_type].tolist()]
    return texts, labels

In [3]:
polyhope_binary_labels = {'Hope':1, 'Not Hope':0}
polyhope_inv_binary_labels = {v: k for k, v in polyhope_binary_labels.items()}

polyhope_multi_labels = {'Not Hope':0, 'Generalized Hope':1, 'Realistic Hope':2, 'Unrealistic Hope':3}
polyhope_inv_multi_labels = {v: k for k, v in polyhope_multi_labels.items()}

train_path = '../data/train/'
val_path = '../data/val/'

In [4]:
class_type = input('Select Classification type: \n\t1] Binary.\n\t2] Multiclass.')
if class_type == '1':
    num_labels = 2
    label_type = 'binary'
    label_map = polyhope_binary_labels
else:
    num_labels = 4
    label_type =  'multiclass'
    label_map = polyhope_multi_labels

Select Classification type: 
	1] Binary.
	2] Multiclass. 2


In [5]:
train_file_list = os.listdir('../data/train/')
val_file_list = os.listdir('../data/val/')

print('Train file list.')
for i, f in enumerate(train_file_list):
    print(f'\t{i}] {f}')
tf = input('Select train file: ')
tf = int(tf)
train_file = train_path + train_file_list[tf]

print('Validation file list.')
for i, f in enumerate(val_file_list):
    print(f'\t{i}] {f}')
vf = input('Select val file: ')
vf = int(vf)
val_file = val_path + val_file_list[vf]

Train file list.
	0] hopeedi_train.csv
	1] train_polyhope_english_cleaned.csv
	2] train_polyhope_spanish.csv
	3] train_polyhope_english.csv
	4] train_polyhope_english_cleaned_noemoji.csv


Select train file:  2


Validation file list.
	0] val_polyhope_spanish.csv
	1] val_polyhope_english_cleaned.csv
	2] val_polyhope_english_cleaned_noemoji.csv
	3] val_polyhope_english.csv
	4] hopeedi_val.csv


Select val file:  0


In [6]:
train_texts, train_labels = load_hope_data(train_file, label_map, label_type)
val_texts, val_labels = load_hope_data(val_file, label_map, label_type)

In [7]:
class TextClassificationDataset(Dataset):
	def __init__(self, texts, labels,tokenizer, max_length):
		self.texts = texts
		self.labels = labels
		self.tokenizer = tokenizer
		self.max_length = max_length

	def __len__(self):
		return len(self.texts)

	def __getitem__(self, idx):
		text = self.texts[idx]
		label = self.labels[idx]
		encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
		return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': label}

In [8]:
model_list = ['bert-base-uncased', 'bert-base-cased', 'bert-large-uncased',
              'bert-large-cased',
              'dccuchile/bert-base-spanish-wwm-uncased', 
              'dccuchile/bert-base-spanish-wwm-uncased', 
              'dccuchile/albert-xxlarge-spanish'
             ]

print('Model list.')
for i, f in enumerate(model_list):
    print(f'\t{i}] {f}')
m = input('Select model: ')
m = int(m)
model_name = model_list[m]

Model list.
	0] bert-base-uncased
	1] bert-base-cased
	2] bert-large-uncased
	3] bert-large-cased
	4] dccuchile/bert-base-spanish-wwm-uncased
	5] dccuchile/bert-base-spanish-wwm-uncased
	6] dccuchile/albert-xxlarge-spanish


Select model:  6


In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [10]:
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, 128)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, 128)

In [11]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at dccuchile/albert-xxlarge-spanish and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(torch.mean(logits, dim=1), axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [13]:
output_dir = input('Enter output directory: ')
training_args = TrainingArguments(output_dir=output_dir,
                                  evaluation_strategy="epoch", 
                                  learning_rate=2e-05,
                                  num_train_epochs=4.0,
                                  save_strategy="epoch",
                                  load_best_model_at_end=True,
                                  per_device_train_batch_size=2,
                                  per_device_eval_batch_size=2
                                  )
#alberta_xxl_spanish_multiclass

Enter output directory:  alberta_xxl_spanish_multiclass


In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [15]:
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB. GPU 