In [None]:
!pip install transformers accelerate sentencepiece

In [12]:
# -*- coding: utf-8 -*-
from utils import *
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from huggingface_hub import notebook_login

In [6]:
seed_torch(42)

cur_model = (RobertaForSequenceClassification, RobertaTokenizer, 'roberta-base')
m_name = 'Roberta'

train_df=pd.read_pickle('gh-train.pkl')
train_df['label']=train_df['label'].replace({'positive':1, 'negative':2, 'neutral':0})

tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True)

sentences=train_df.sentence.values
labels=train_df.label.values

In [7]:
input_ids = []
attention_masks = []

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(str(sent),
                                         add_special_tokens = True,
                                         max_length = MAX_LEN,
                                         pad_to_max_length = True,
                                         return_attention_mask = True,
                                         return_tensors = 'pt'
                                        )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [8]:
train_inputs = torch.cat(input_ids, dim=0)
train_masks = torch.cat(attention_masks, dim=0)
train_labels = torch.tensor(labels)

print('Training data {} {} {}'.format(train_inputs.shape, train_masks.shape, train_labels.shape))

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

Training data torch.Size([4985, 256]) torch.Size([4985, 256]) torch.Size([4985])


In [9]:
model = cur_model[0].from_pretrained(cur_model[2], num_labels=3)
model.cuda()

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE)

begin=time.time()
train_loss_set = []

for _ in trange(EPOCHS, desc="Epoch"):

    model.train()

    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):

        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()

        # Forward pass
        outputs = model(b_input_ids, token_type_ids=None, \
                        attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        logits = outputs[1]
        train_loss_set.append(loss.item())

        # Backward pass
        loss.backward()
        optimizer.step()

        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))

end=time.time()
print('Used {} second'.format(end-begin))

Epoch:  25%|██▌       | 1/4 [03:17<09:52, 197.41s/it]

Train loss: 0.513338966247363


Epoch:  50%|█████     | 2/4 [06:35<06:35, 197.58s/it]

Train loss: 0.2184126437880481


Epoch:  75%|███████▌  | 3/4 [09:52<03:17, 197.59s/it]

Train loss: 0.14549750718288124


Epoch: 100%|██████████| 4/4 [13:10<00:00, 197.58s/it]

Train loss: 0.09885065501499085
Used 790.321807384491 second





In [11]:
### Test
begin=time.time()
test_df=pd.read_pickle('gh-test.pkl')

test_df['label']=test_df['label'].replace({
    'positive':1,
    'negative':2,
    'neutral':0})

sentences=test_df.sentence.values
labels = test_df.label.values

input_ids = []
attention_masks = []

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
                        str(sent),
                        add_special_tokens = True,
                        max_length = MAX_LEN,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

prediction_inputs = torch.cat(input_ids,dim=0)
prediction_masks = torch.cat(attention_masks,dim=0)
prediction_labels = torch.tensor(labels)

prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE)

model.eval()
predictions,true_labels=[],[]

for batch in prediction_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]

    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    predictions.append(logits)
    true_labels.append(label_ids)

end=time.time()
print('Prediction used {:.2f} seconds'.format(end - begin))

flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels = [item for sublist in true_labels for item in sublist]

print("Accuracy of {} on GitHub is: {}".format(m_name, accuracy_score(flat_true_labels,flat_predictions)))

print(classification_report(flat_true_labels,flat_predictions))



Prediction used 28.11 seconds
Accuracy of Roberta on GitHub is: 0.9241927936359382
              precision    recall  f1-score   support

           0       0.91      0.94      0.92       893
           1       0.94      0.92      0.93       616
           2       0.93      0.90      0.92       628

    accuracy                           0.92      2137
   macro avg       0.93      0.92      0.92      2137
weighted avg       0.92      0.92      0.92      2137



In [None]:
# Uploading the model to huggingface-hub
notebook_login()

In [14]:
model.push_to_hub("gh-roberta-base-sentiment")
tokenizer.push_to_hub("gh-roberta-base-sentiment")

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/marticampgin/gh-roberta-base-sentiment/commit/ce0e1849a5d5ec1ba0cb908a1b7e1d6820cf2afc', commit_message='Upload tokenizer', commit_description='', oid='ce0e1849a5d5ec1ba0cb908a1b7e1d6820cf2afc', pr_url=None, pr_revision=None, pr_num=None)