# Install Dependencies

In [None]:
!pip install --upgrade pip
!pip install sentencepiece
!pip install datasets
!pip install transformers

Collecting pip
  Downloading pip-22.0.3-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 29.0 MB/s 
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
Successfully installed pip-22.0.3
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96
[0mCollecting datasets
  Downloading datasets-1.18.3-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.7/311.7 KB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[2K     [

# Fine-tuning XLM-T

This notebook was modified from https://huggingface.co/transformers/custom_datasets.html

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report


In [None]:
import os
# Debugging aid: ask CUDA to run kernel launches synchronously so that a device-side
# error is reported at the call that caused it.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Hand any cached, currently unused GPU memory back before the model is loaded.
torch.cuda.empty_cache()

## Parameters

In [None]:
# Learning rate for fine-tuning. It only takes effect if it is forwarded to
# TrainingArguments (learning_rate=...).
LR = 2e-5
# Number of passes over the training set.
EPOCHS = 1
# Per-device batch size, used for both training and evaluation.
BATCH_SIZE = 32
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment" # Hub checkpoint to start from: the XLM-T sentiment classifier
MAX_TRAINING_EXAMPLES = -1 # cap on the number of training examples; any value <= 0 (e.g. -1) keeps the whole training set

## Data

We download the XLM-T sentiment dataset (`UMSAB`, the Unified Multilingual Sentiment Analysis Benchmark).


In [None]:
# loading dataset for UMSAB's all 8 languages

files = """test_labels.txt
test_text.txt
train_labels.txt
train_text.txt
val_labels.txt
val_text.txt""".split('\n')

for f in files:
  p = f"https://raw.githubusercontent.com/cardiffnlp/xlm-t/main/data/sentiment/all/{f}"
  !wget $p

--2022-02-14 13:43:18--  https://raw.githubusercontent.com/cardiffnlp/xlm-t/main/data/sentiment/all/test_labels.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13919 (14K) [text/plain]
Saving to: ‘test_labels.txt.1’


2022-02-14 13:43:18 (71.4 MB/s) - ‘test_labels.txt.1’ saved [13919/13919]

--2022-02-14 13:43:18--  https://raw.githubusercontent.com/cardiffnlp/xlm-t/main/data/sentiment/all/test_text.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 654172 (639K) [text/plain]
Saving to: ‘test_text.txt.1’


2022-02-14 13:43:18

In [None]:
def _read_lines(path):
  """Return the lines of a UTF-8 text file, split on '\\n' only.

  The file handle is closed as soon as the content is read. A single trailing
  newline at the end of the file does not produce a spurious empty last entry.
  """
  with open(path, encoding='utf-8') as fh:
    lines = fh.read().split('\n')
  if lines and lines[-1] == '':
    lines.pop()
  return lines


# dataset_dict[split]['text']   -> list of str, one tweet per entry
# dataset_dict[split]['labels'] -> list of int, one class id per entry
dataset_dict = {}
for split in ['train', 'val', 'test']:
  split_texts = _read_lines(f"{split}_text.txt")
  split_labels = [int(x) for x in _read_lines(f"{split}_labels.txt")]
  # Texts and labels are paired by position, so a length mismatch means the
  # downloaded files are corrupt or out of sync.
  assert len(split_texts) == len(split_labels), (
      f"{split}: {len(split_texts)} texts but {len(split_labels)} labels")
  dataset_dict[split] = {'text': split_texts, 'labels': split_labels}

# Optionally cap the training set (useful for quick experiments).
if MAX_TRAINING_EXAMPLES > 0:
  dataset_dict['train']['text'] = dataset_dict['train']['text'][:MAX_TRAINING_EXAMPLES]
  dataset_dict['train']['labels'] = dataset_dict['train']['labels'][:MAX_TRAINING_EXAMPLES]

In [None]:
# The checkpoint's tokenizer config declares no maximum length, so `truncation=True`
# alone is silently ignored ("Default to no truncation"). Setting model_max_length
# makes truncation effective for every later tokenizer call.
# 512 is the longest sequence the XLM-R architecture accepts.
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True, model_max_length=512)

In [None]:
# Tokenize each split in one batch call; padding=True pads every example of a split
# to the longest sequence in that split.
train_encodings, val_encodings, test_encodings = (
    tokenizer(dataset_dict[split_name]['text'], truncation=True, padding=True)
    for split_name in ('train', 'val', 'test')
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to a sequence
            holding one entry per example, such as the object returned by
            calling the tokenizer on a list of texts.
        labels: Sequence with one integer class id per example, or ``None`` for
            unlabeled data (e.g. inference), in which case items carry no
            ``labels`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per feature."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples: the label count, or the feature length if unlabeled."""
        if self.labels is not None:
            return len(self.labels)
        # Without labels, every feature has one entry per example, so any one
        # of them gives the dataset size (0 if there are no features at all).
        return len(next(iter(self.encodings.values()), ()))

# Wrap each split's encodings together with its labels.
train_dataset, val_dataset, test_dataset = (
    MyDataset(split_encodings, dataset_dict[split_name]['labels'])
    for split_name, split_encodings in (
        ('train', train_encodings),
        ('val', val_encodings),
        ('test', test_encodings),
    )
)

## Fine-tuning

The steps above prepared the datasets in the format the trainer expects. Now all we need to do is create a model
to fine-tune, define the `TrainingArguments` (or `TFTrainingArguments` for TensorFlow) and
instantiate a `Trainer` (or `TFTrainer`).

In [None]:
# Hyper-parameters and bookkeeping options for the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir='./results',                   # output directory
    num_train_epochs=EPOCHS,                  # total number of training epochs
    learning_rate=LR,                         # without this, LR above is ignored and the library default is used
    per_device_train_batch_size=BATCH_SIZE,   # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,    # batch size for evaluation
    warmup_steps=100,                         # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                        # strength of weight decay
    logging_dir='./logs',                     # directory for storing logs
    evaluation_strategy ='steps',             # evaluate on a fixed step interval
    eval_steps = 10,                          # run evaluation every 10 steps
    save_total_limit = 5,                     # only last 5 models are saved; older ones are deleted
    logging_steps=10,                         # when to print log
    load_best_model_at_end=True,              # load or not best model at the end
    # NOTE(review): save_steps is not set here, so checkpoints follow the library's
    # default save interval, not eval_steps. If a run is shorter than that interval
    # no checkpoint is written and load_best_model_at_end has nothing to restore.
    # Confirm the intended save interval.
)

# Size the classification head from the label ids present in the training data.
num_labels = len(set(dataset_dict["train"]["labels"]))
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=num_labels)

In [None]:
# Bundle the model, the training configuration and the train/validation splits.
trainer = Trainer(
    args=training_args,            # training arguments defined in the previous cell
    model=model,                   # the sequence-classification model to fine-tune
    eval_dataset=val_dataset,      # used for the periodic evaluations
    train_dataset=train_dataset,   # used for the optimisation steps
)

# Run fine-tuning. As the last expression of the cell, the returned TrainOutput is displayed.
trainer.train()

***** Running training *****
  Num examples = 14712
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 460


Step,Training Loss,Validation Loss
10,0.6297,0.746945
20,0.6218,0.780576
30,0.6098,0.77939
40,0.571,0.805704
50,0.5771,0.813854
60,0.546,0.833704
70,0.6024,0.792982
80,0.6606,0.820057
90,0.7043,0.756601
100,0.6301,0.840525


***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** 

Step,Training Loss,Validation Loss
10,0.6297,0.746945
20,0.6218,0.780576
30,0.6098,0.77939
40,0.571,0.805704
50,0.5771,0.813854
60,0.546,0.833704
70,0.6024,0.792982
80,0.6606,0.820057
90,0.7043,0.756601
100,0.6301,0.840525


***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2592
  Batch size = 32
***** 

TrainOutput(global_step=460, training_loss=0.6370905264564183, metrics={'train_runtime': 1419.8491, 'train_samples_per_second': 10.362, 'train_steps_per_second': 0.324, 'total_flos': 1943022700419696.0, 'train_loss': 0.6370905264564183, 'epoch': 1.0})

## Evaluate on Test set

In [None]:
# predict() returns (predictions, label_ids, metrics); the metrics are not needed here.
test_preds_raw, test_labels, _ = trainer.predict(test_dataset)
# Pick the highest-scoring class for every example.
test_preds = test_preds_raw.argmax(axis=-1)
report = classification_report(test_labels, test_preds, digits=3)
print(report)

***** Running Prediction *****
  Num examples = 6960
  Batch size = 32


              precision    recall  f1-score   support

           0      0.717     0.742     0.729      2320
           1      0.667     0.597     0.630      2320
           2      0.698     0.748     0.722      2320

    accuracy                          0.696      6960
   macro avg      0.694     0.696     0.694      6960
weighted avg      0.694     0.696     0.694      6960



In [None]:
# Load the custom tweets to be labelled by the fine-tuned model.
df = df_twitter = pd.read_csv('/content/sentimentdata.csv')
data = df.processed_text.values.tolist()
# MyDataset takes one label per example, but this data is unlabeled, so placeholder
# labels are supplied (class 0); only the predictions are used later.
# They are sized from `data`, not taken as a fixed-length slice of the test labels,
# so the dataset length always matches the CSV regardless of its row count.
labels = [0] * len(data)

In [None]:
# Tokenize the custom texts with the same settings as the benchmark splits.
V_encodings = tokenizer(
    data,
    padding=True,
    truncation=True,
)

In [None]:
# Wrap the custom encodings (and their accompanying labels list) for the Trainer.
V_dataset = MyDataset(encodings=V_encodings, labels=labels)

In [None]:
# Run the fine-tuned model on the custom data.
# Note: this reuses (and overwrites) the variable names of the test-set evaluation above.
test_preds_raw, test_labels, _ = trainer.predict(V_dataset)
# Predicted class id = index of the largest score per example.
test_preds = test_preds_raw.argmax(axis=-1)

***** Running Prediction *****
  Num examples = 536
  Batch size = 32


In [None]:
# Put the predicted class ids into a one-column frame, then attach that column to
# the tweets frame as 'labelFT' (rows are matched on the index).
ss = pd.DataFrame(test_preds)
df['labelFT'] = ss.iloc[:, 0]