In [1]:
from datasets import Dataset, load_dataset, DatasetDict, ClassLabel, concatenate_datasets
import numpy as np
import torch
import pandas as pd

#Seeding for deterministic results i.e. showing same output 
RANDOM_SEED = 64
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
   torch.cuda.manual_seed(RANDOM_SEED)
   torch.cuda.manual_seed_all(RANDOM_SEED) 
   torch.backends.cudnn.deterministic = True  
   torch.backends.cudnn.benchmark = False

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
function_mapping = {'OTHER': ['anticipation', 'joy', 'love', 'optimism', 'surprise', 'trust'],
           'NOT_INTERESTED': [''], 
           'DISLIKE':['disgust'], 
           'NOT_CORRECT': [''], 
           'PESSIMISTIC':['sadness', 'pessimism'], 
           'WORRIED':['fear'], 
           'ANGRY': ['anger'], 
           'DISAPPOINTED': [''], 
           'BORED': [''], 
           'NOT_APPROVE':[''], 
           'NOT_IMPORTANT': [''], 
           'DISAGREE': [''], 
           'WARN': [''], 
           'COMPLAIN': [''], 
           'THREATEN': [''], 
           'UNWILLING': [''], 
           'DISTRUST' : [''],
           'REFUSE': [''] }

ait_es_dataset = load_dataset("csv", sep = '\t', data_files ={'train': r'2018-E-c-Es-train.txt',
                                                        'test': r'2018-E-c-Es-test-gold.txt',
                                                       'valid': r'2018-E-c-Es-dev.txt'})
ait_en_dataset = load_dataset("csv", sep = '\t', data_files ={'train': r'2018-E-c-En-train.txt',
                                                        'test': r'2018-E-c-En-test-gold.txt',
                                                       'valid': r'2018-E-c-En-dev.txt'})
ait_ar_dataset = load_dataset("csv", sep = '\t', data_files ={'train': r'2018-E-c-Ar-train.txt',
                                                        'test': r'2018-E-c-Ar-test-gold.txt',
                                                       'valid': r'2018-E-c-Ar-dev.txt'})
train_dataset = concatenate_datasets([ait_es_dataset['train'], ait_en_dataset['train'], ait_ar_dataset['train']])
valid_dataset = concatenate_datasets([ait_es_dataset['valid'], ait_en_dataset['valid'], ait_ar_dataset['valid']])
test_dataset = concatenate_datasets([ait_es_dataset['test'], ait_en_dataset['test'], ait_ar_dataset['test']])

ait_dataset = DatasetDict({'train': train_dataset.shuffle(seed=42), 'valid': valid_dataset.shuffle(seed=42), 'test': test_dataset.shuffle(seed=42)})

Using custom data configuration default-051610f9ffe8f8d2
Found cached dataset csv (/home/njfernandez/.cache/huggingface/datasets/csv/default-051610f9ffe8f8d2/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 3/3 [00:00<00:00, 695.38it/s]
Using custom data configuration default-94907eb33ba58000
Found cached dataset csv (/home/njfernandez/.cache/huggingface/datasets/csv/default-94907eb33ba58000/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 3/3 [00:00<00:00, 697.42it/s]
Using custom data configuration default-869e6abb15784ff2
Found cached dataset csv (/home/njfernandez/.cache/huggingface/datasets/csv/default-869e6abb15784ff2/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 3/3 [00:00<00:00, 733.74it/s]
Loading cached shuffled indices for dataset at /home/njfernandez/.cache/huggingface/datasets/csv/default-051610f9ffe8f8d2/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f19

In [3]:
ait_dataset['train'][20]

{'ID': '2018-Ar-00161',
 'Tweet': '...\\nالسكوت عمره ما كان علامة رضا السكوت نفاذ صبر ووجع بس انتو اللي مبتحسوش...💔\\n\\n....',
 'anger': 1,
 'anticipation': 0,
 'disgust': 1,
 'fear': 0,
 'joy': 0,
 'love': 0,
 'optimism': 0,
 'pessimism': 0,
 'sadness': 1,
 'surprise': 0,
 'trust': 0}

In [4]:
new_dataset = DatasetDict()
for split in ait_dataset:
    new_split = []
    for record in ait_dataset[split]:
        new_record = {'Tweet': record['Tweet']}
        for function in function_mapping:
            labels = function_mapping[function]
            if '' in labels:
                continue
            else:
                score = sum([record[label] for label in labels])
                new_record[function] = int(score > 0)
        new_split.append(new_record)
    ait_dataset[split] =  Dataset.from_pandas(pd.DataFrame(data=new_split))

In [5]:
ait_dataset['train'][20]

{'Tweet': '...\\nالسكوت عمره ما كان علامة رضا السكوت نفاذ صبر ووجع بس انتو اللي مبتحسوش...💔\\n\\n....',
 'OTHER': 0,
 'DISLIKE': 1,
 'PESSIMISTIC': 1,
 'WORRIED': 0,
 'ANGRY': 1}

In [6]:
ait_dataset

DatasetDict({
    train: Dataset({
        features: ['Tweet', 'OTHER', 'DISLIKE', 'PESSIMISTIC', 'WORRIED', 'ANGRY'],
        num_rows: 12675
    })
    valid: Dataset({
        features: ['Tweet', 'OTHER', 'DISLIKE', 'PESSIMISTIC', 'WORRIED', 'ANGRY'],
        num_rows: 2150
    })
    test: Dataset({
        features: ['Tweet', 'OTHER', 'DISLIKE', 'PESSIMISTIC', 'WORRIED', 'ANGRY'],
        num_rows: 7631
    })
})

In [7]:
cols = ait_dataset['train'].column_names
ait_dataset = ait_dataset.map(lambda x : {"functions": [c for c in cols if x[c] == 1]})
ait_dataset['train'][20]

100%|██████████| 12675/12675 [00:00<00:00, 13112.21ex/s]
100%|██████████| 2150/2150 [00:00<00:00, 13352.47ex/s]
100%|██████████| 7631/7631 [00:00<00:00, 13781.10ex/s]


{'Tweet': '...\\nالسكوت عمره ما كان علامة رضا السكوت نفاذ صبر ووجع بس انتو اللي مبتحسوش...💔\\n\\n....',
 'OTHER': 0,
 'DISLIKE': 1,
 'PESSIMISTIC': 1,
 'WORRIED': 0,
 'ANGRY': 1,
 'functions': ['DISLIKE', 'PESSIMISTIC', 'ANGRY']}

In [8]:
for split in ait_dataset.keys():
    new_examples = []
    for example in ait_dataset[split]:
        for label in example['functions']:
            new_example = {'text': example['Tweet'], 'function': label}
            new_examples.append(new_example)
    ait_dataset[split] = Dataset.from_pandas(pd.DataFrame(data=new_examples))

In [9]:
ait_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'function'],
        num_rows: 20918
    })
    valid: Dataset({
        features: ['text', 'function'],
        num_rows: 3448
    })
    test: Dataset({
        features: ['text', 'function'],
        num_rows: 12172
    })
})

In [10]:
comfunctions_basic = 'comfunct_basic.txt'
df = pd.read_csv(comfunctions_basic, delimiter=";")
df = df[df['function'].isin(['ANGRY', 'WORRIED', 'PESSIMISTIC', 'DISLIKE', 'OTHER'])]
df

Unnamed: 0,text,function
85,I'm not really very keen on it,DISLIKE
86,I'm not really very keen,DISLIKE
87,I'm not really keen on it,DISLIKE
88,I'm not really keen,DISLIKE
89,I'm not very keen on it,DISLIKE
...,...,...
897,you could do it,OTHER
898,you look nice,OTHER
899,you look smart,OTHER
900,you're right,OTHER


In [11]:
dataset = DatasetDict({
    "train": Dataset.from_pandas(df.reset_index(drop=True)).shuffle(seed=42),
    "valid": ait_dataset['valid'],
    "test": ait_dataset['test']
}
    )

In [12]:
dataset['test'][37]

{'text': 'Thank you for the M line construction @MTA . Sincerely, all J train riders Essex to Bway Junction. #nycsubway ',
 'function': 'ANGRY'}

In [13]:
dataset = dataset.class_encode_column("function")
labels = dataset['train'].features['function']
dataset = dataset.rename_column("function", "label")
print(labels.names)

Casting to class labels: 100%|██████████| 1/1 [00:00<00:00, 124.74ba/s]
Casting to class labels: 100%|██████████| 4/4 [00:00<00:00, 558.07ba/s]
Casting to class labels: 100%|██████████| 13/13 [00:00<00:00, 538.47ba/s]

['ANGRY', 'DISLIKE', 'OTHER', 'PESSIMISTIC', 'WORRIED']





In [14]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

transformer_model = 'cardiffnlp/twitter-xlm-roberta-base'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = (AutoModelForSequenceClassification
         .from_pretrained(transformer_model, num_labels = len(labels.names))).to(device)
tokenizer=AutoTokenizer.from_pretrained(transformer_model)

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

ds_enc = dataset.map(tokenize, batched=True, batch_size=None)
data_collator = DataCollatorWithPadding(tokenizer, padding="longest")
ds_enc

Some weights of the model checkpoint at cardiffnlp/twitter-xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.den

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 410
    })
    valid: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 3448
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 12172
    })
})

In [15]:
from sklearn.metrics import precision_recall_fscore_support
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro', zero_division=0)
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [16]:
from transformers import TrainingArguments, Trainer

batch_size = 16
model_name = "functions_basic_finetuning_single_label"
training_args = TrainingArguments(
        output_dir=model_name,
        num_train_epochs=5,
        learning_rate = 2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        warmup_ratio=0.1,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        metric_for_best_model = "f1",
        load_best_model_at_end=True,
        save_total_limit = 1,        
        report_to='none',
    )

trainer = Trainer(model=model, args=training_args, 
                  compute_metrics=compute_metrics,
                  train_dataset=ds_enc["train"],
                  eval_dataset=ds_enc["valid"],                    
                  tokenizer=tokenizer,
                  data_collator=data_collator,)
trainer.train()

The following columns in the training set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 410
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 65
  Number of trainable parameters = 278047493
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,1.560552,0.065951,0.2,0.099193
2,No log,1.588128,0.121114,0.202544,0.105932
3,No log,1.559213,0.187786,0.257873,0.200031
4,No log,1.553783,0.18722,0.262898,0.206373
5,No log,1.554651,0.185107,0.269109,0.21168


The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3448
  Batch size = 32
Saving model checkpoint to functions_basic_finetuning_single_label/checkpoint-13
Configuration saved in functions_basic_finetuning_single_label/checkpoint-13/config.json
Model weights saved in functions_basic_finetuning_single_label/checkpoint-13/pytorch_model.bin
tokenizer config file saved in functions_basic_finetuning_single_label/checkpoint-13/tokenizer_config.json
Special tokens file saved in functions_basic_finetuning_single_label/checkpoint-13/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: text. If text are not expect

TrainOutput(global_step=65, training_loss=1.086111567570613, metrics={'train_runtime': 142.1131, 'train_samples_per_second': 14.425, 'train_steps_per_second': 0.457, 'total_flos': 15802505626500.0, 'train_loss': 1.086111567570613, 'epoch': 5.0})

In [None]:
preds_output = trainer.predict(ds_enc['test'])
print(preds_output.metrics)

The following columns in the test set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 12172
  Batch size = 32


In [None]:
from sklearn.metrics import classification_report

y_true = np.array(ds_enc['test']["label"])
y_pred = preds_output.predictions.argmax(-1)

report = classification_report(
  y_true,
  y_pred,
  target_names=labels.names,
  zero_division=0
)
print(report)

In [None]:
report = classification_report(
  y_true,
  y_pred,
  target_names=labels.names,
  zero_division=0,
    output_dict=True
)

df = pd.DataFrame(report).transpose()
with open(r'classification_report_functions_basic_ait_finetuning_singlelabel.csv', 'w') as csv_file:
    df.to_csv(path_or_buf=csv_file)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt

def plot_confusion_matrix(y_preds, y_true, labels):    
    cm = confusion_matrix(y_true, y_preds, normalize="true")
    fig, ax = plt.subplots(figsize=(50, 50))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)  
    font = {'family' : 'DejaVu Sans',
        'weight' : 'bold',
        'size'   : 16}
    plt.rc('font', **font)
    ax.tick_params(axis='x', which='major', labelsize=15)
    disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
    plt.title("Normalized confusion matrix")
    plt.show()

In [None]:
y_preds = np.argmax(preds_output.predictions, axis=1)
y_valid = np.array(ds_enc["test"]["label"])
plot_confusion_matrix(y_preds, y_valid, labels.names)

In [None]:
from torch.nn.functional import cross_entropy

def forward_pass_with_label(batch):
    inputs = {k:v.to(device) for k,v in batch.items() 
              if k in tokenizer.model_input_names}
    with torch.no_grad():
        output = model(**inputs)
        pred_label = torch.argmax(output.logits, axis=-1)
        loss = cross_entropy(output.logits, batch["label"].to(device), 
                             reduction="none")     
    return {"loss": loss.cpu().numpy(), 
            "predicted_label": pred_label.cpu().numpy()}

ds_enc.set_format("torch", 
                            columns=["input_ids", "attention_mask", "label"])
ds_enc["test"] = ds_enc["test"].map(
    forward_pass_with_label, batched=True, batch_size=16)

In [None]:
def label_int2str(row):
    return dataset["train"].features["label"].int2str(row)

In [None]:
ds_enc.set_format("pandas")
cols = ["text", 'label', "predicted_label", "loss"]
df_test = ds_enc["test"][:][cols]
df_test["label"] = df_test["label"].apply(label_int2str)
df_test["predicted_label"] = (df_test["predicted_label"]
                              .apply(label_int2str))

In [None]:
loss_values = df_test.sort_values("loss", ascending=False).head(50)
loss_values

In [None]:
loss_values.to_csv('loss_values_functions_basic_ait_finetuning_singlelabel.tsv',header =True, sep = '\t',index=False)
df_test.to_csv('preds_functions_basic_ait_singlelabel.tsv',header =True, sep = '\t',index=False)