In [2]:
# Mount Google Drive at /content/drive: the CSV inputs are read from it and
# the fine-tuned model is saved back to it at the end of the notebook.

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [1]:
# Load the data
import numpy as np
import pandas as pd

# Read the three splits (train, validation, test) from Drive in one pass.
train_roberta_new, validation_roberta, test_roberta = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/RoBERTa_model/Train.csv",
        "/content/drive/MyDrive/RoBERTa_model/Validation.csv",
        "/content/drive/MyDrive/RoBERTa_model/task_A_En_test.csv",
    )
)

In [2]:
# Inspect the first and last rows of the training data.
# Only a cell's final expression is rendered automatically, so the head is
# shown explicitly with display(); otherwise its result is silently discarded.
display(train_roberta_new.head())
train_roberta_new.tail()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic
3117,3462,In case nobody is watching the MINNvsBAMA game...,0
3118,3463,The population spike in Chicago in 9 months is...,0
3119,3465,Im finally surfacing after a holiday to Scotla...,0
3120,3466,Couldn't be prouder today. Well done to every ...,0
3121,3467,Overheard as my 13 year old games with a frien...,0


In [3]:
# Keep only the columns used downstream: the tweet text and the sarcasm label.
# .copy() makes this an independent frame, so the column assignments made in
# later cells do not raise SettingWithCopyWarning or write to a temporary.
train_roberta_new = train_roberta_new[['tweet', 'sarcastic']].copy()
train_roberta_new.head()

Unnamed: 0,tweet,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not forced to g...,1
4,"I did too, and I also reported Cancun Cruz ...",1


In [5]:
# Force the tweet columns to plain strings for cleaning and tokenization.
# Missing values are filled with '' first: astype(str) alone would turn NaN
# into the literal text 'nan', which would then be tokenized as a real tweet
# instead of being treated as empty (the convention clean_text uses for
# non-string input).
train_roberta_new['tweet'] = train_roberta_new['tweet'].fillna('').astype(str)
validation_roberta['tweet'] = validation_roberta['tweet'].fillna('').astype(str)

In [6]:
#clean text function
import re

def clean_text(text):
    """Strip Twitter-specific noise from a tweet.

    Removes, in order: @-mentions, '#' characters (the hashtag word itself is
    kept), retweet markers ("RT" followed by whitespace), http/https URLs,
    colons, and every non-ASCII character.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (e.g. a NaN coming
            from pandas) is treated as missing.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
        Whitespace left behind by the removals is not collapsed.
    """
    if not isinstance(text, str):
        return ''

    # Handles may contain underscores; without '_' in the character class,
    # "@some_user" would leave "_user" behind.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#', '', text)
    # \b anchors the marker to the start of a word, so words that merely end
    # in "RT" (e.g. "START now") are left intact.
    text = re.sub(r'\bRT\s+', '', text)
    text = re.sub(r'https?:\/\/\S+', '', text)
    text = re.sub(r':', '', text)
    # Dropping all non-ASCII characters also removes the mis-encoded ellipsis
    # sequence that previously had its own substitution, so no separate pass
    # is needed for it.
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text

In [7]:
# Run clean_text over the text column of every split (the test file names its
# text column 'text' rather than 'tweet').
for split_frame, text_column in (
    (train_roberta_new, 'tweet'),
    (validation_roberta, 'tweet'),
    (test_roberta, 'text'),
):
    split_frame[text_column] = split_frame[text_column].apply(clean_text)

In [9]:
# Separate the features (tweet text) from the targets (sarcasm label) per split.
X_train, y_train = train_roberta_new['tweet'], train_roberta_new['sarcastic']
X_val, y_val = validation_roberta['tweet'], validation_roberta['sarcastic']
# The test file stores its text under 'text' instead of 'tweet'.
X_test, y_test = test_roberta['text'], test_roberta['sarcastic']

In [10]:
#perform oversampling on training set
#(not on validation or test; validation should reflect test set, and test set shouldn't be resampled)
from imblearn.over_sampling import RandomOverSampler

# Oversampler with a fixed random_state so the resampling is repeatable.
rosy = RandomOverSampler(random_state=42)

# The sampler is fed a 2-D feature matrix, so the 1-D tweet column is
# reshaped into a single-feature (n_samples, 1) array first.
X_train_for_oversampling = X_train.values.reshape(-1, 1)

X_train_resampled, y_train_resampled = rosy.fit_resample(
    X_train_for_oversampling,
    y_train,
)

In [12]:
# Sanity check: container type and shape of the original training text.
print(type(X_train), X_train.shape, sep="\n")

<class 'pandas.core.series.Series'>
(3122,)


In [11]:
# The sampler was given an (n, 1) array, so flatten its result back into a
# 1-D Series (reusing the original column name); the tokenization cell calls
# .tolist() on it.
resampled_texts = X_train_resampled.squeeze()
X_train_resampled = pd.Series(resampled_texts, name=X_train.name)

# Confirm the container type and preview the first rows.
print(type(X_train_resampled), X_train_resampled.head(), sep="\n")

<class 'pandas.core.series.Series'>
0    The only thing I got from college is a caffein...
1    I love it when professors draw a big question ...
2    Remember the hundred emails from companies whe...
3    Today my pop-pop told me I was not forced to g...
4       I did too, and I also reported Cancun Cruz ...
Name: tweet, dtype: object


In [12]:
# Tokenization
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Options shared by all three splits: truncate over-long inputs and pad the
# sequences of each call to a common length.
tokenizer_options = {'truncation': True, 'padding': True}

# Encode train, validation and test text; the tokenizer is given plain lists
# of strings.
train_encodings = tokenizer(X_train_resampled.tolist(), **tokenizer_options)
val_encodings = tokenizer(X_val.tolist(), **tokenizer_options)
test_encodings = tokenizer(X_test.tolist(), **tokenizer_options)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [13]:
# Convert to tensors
import torch

class sarcasmDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with sarcasm labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a
            per-example sequence, as produced by the tokenizer.
        labels: Per-example labels, in the same order as ``encodings``.
            Any iterable is accepted, including a pandas Series whose index
            is not 0..n-1.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Materialise the labels as a list so __getitem__ indexes by position.
        # Indexing a pandas Series with an int is label-based and fails (or
        # returns the wrong row) when the index is not a default RangeIndex.
        self.labels = list(labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its label under 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

# Wrap each split's encodings and labels in a sarcasmDataset for the Trainer.
train_dataset, val_dataset, test_dataset = (
    sarcasmDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, y_train_resampled),
        (val_encodings, y_val),
        (test_encodings, y_test),
    )
)


In [18]:
# Needed to get accuracy
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from evaluate)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [

In [19]:
#session needs to be restarted after running this

!pip install --upgrade accelerate
!pip install --upgrade transformers
!pip install --upgrade torch

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w



In [14]:
# Loading, instantiating and training the model

from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import TrainerCallback
import evaluate
from datasets import load_metric
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Pretrained RoBERTa encoder with a newly initialised two-class
# (label 0 / label 1) sequence-classification head.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Function that gives the evaluation metrics
def compute_metrics2(eval_pred):
    """Compute the evaluation metrics reported by the Trainer.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            example is taken as the argmax of ``logits`` over the last axis.

    Returns:
        dict with ``accuracy`` plus support-weighted ``precision``,
        ``recall`` and ``f1``.
    """
    logits, labels = eval_pred
    # Predicted class = index of the highest logit.
    predictions = np.argmax(logits, axis=-1)

    # zero_division=0 keeps the score of a class that is never predicted at 0
    # (the value scikit-learn falls back to anyway) without emitting an
    # UndefinedMetricWarning on each evaluation where that happens.
    weighted = {'average': 'weighted', 'zero_division': 0}

    return {
        "accuracy": accuracy_score(labels, predictions),
        "precision": precision_score(labels, predictions, **weighted),
        "recall": recall_score(labels, predictions, **weighted),
        "f1": f1_score(labels, predictions, **weighted),
    }

# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    # --- where artefacts are written ---
    output_dir='./results',
    logging_dir='./logs',
    # --- optimisation ---
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # NOTE(review): the recorded run took 885 optimisation steps in total, so
    # a 500-step warm-up covers more than half of training; confirm this is
    # intended.
    warmup_steps=500,
    weight_decay=0.01,
    # --- logging / evaluation / checkpoint cadence (all step-based) ---
    logging_strategy="steps",
    logging_steps=10,
    evaluation_strategy="steps",
    save_strategy="steps",
    # Reload the best checkpoint once training finishes.
    load_best_model_at_end=True,
)

# Trainer wiring: the oversampled training set, the untouched validation set
# for periodic evaluation, and the metric function defined above.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics2,
)

# Fine-tune; as the cell's last expression, the returned TrainOutput is displayed.
trainer.train()


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
10,0.6961,0.683205,0.710983,0.505496,0.710983,0.590884
20,0.6938,0.682277,0.710983,0.505496,0.710983,0.590884
30,0.6955,0.679353,0.710983,0.505496,0.710983,0.590884
40,0.698,0.680724,0.710983,0.505496,0.710983,0.590884
50,0.6955,0.686135,0.713873,0.700523,0.713873,0.602704
60,0.6935,0.687216,0.716763,0.685487,0.716763,0.622793
70,0.6905,0.690098,0.621387,0.629185,0.621387,0.625057
80,0.6967,0.69116,0.557803,0.62368,0.557803,0.577664
90,0.6893,0.688658,0.641618,0.619692,0.641618,0.628708
100,0.6905,0.6947,0.479769,0.703725,0.479769,0.475804


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=885, training_loss=0.4292625893307271, metrics={'train_runtime': 24409.9823, 'train_samples_per_second': 0.579, 'train_steps_per_second': 0.036, 'total_flos': 994791039211800.0, 'train_loss': 0.4292625893307271, 'epoch': 3.0})

In [15]:
# Evaluation metrics on the validation set: no dataset is passed here, and
# val_dataset is the eval_dataset the Trainer was constructed with.
trainer.evaluate()

{'eval_loss': 0.6604750156402588,
 'eval_accuracy': 0.7052023121387283,
 'eval_precision': 0.7124111044546271,
 'eval_recall': 0.7052023121387283,
 'eval_f1': 0.7084351539159736,
 'eval_runtime': 67.9976,
 'eval_samples_per_second': 5.088,
 'eval_steps_per_second': 0.088,
 'epoch': 3.0}

In [16]:
# Evaluation metrics on the test set, passed explicitly in place of the
# validation set the Trainer was constructed with.
trainer.evaluate(eval_dataset=test_dataset)

{'eval_loss': 0.6822778582572937,
 'eval_accuracy': 0.7135714285714285,
 'eval_precision': 0.8362754866676044,
 'eval_recall': 0.7135714285714285,
 'eval_f1': 0.7529731216368912,
 'eval_runtime': 585.0613,
 'eval_samples_per_second': 2.393,
 'eval_steps_per_second': 0.038,
 'epoch': 3.0}

In [17]:
# Run inference on the test set. The returned PredictionOutput holds the raw
# logits, the true label ids and the metrics on the test set.
# The result is bound to a name so the logits can be reused later without
# repeating the slow forward pass; the bare name on the last line keeps the
# cell's displayed output unchanged.
test_predictions = trainer.predict(test_dataset)
test_predictions

PredictionOutput(predictions=array([[-0.44080842,  0.36884162],
       [ 0.9126409 , -0.8817768 ],
       [-0.01099393, -0.0131508 ],
       ...,
       [ 1.030128  , -1.1494087 ],
       [-1.4855938 ,  1.5275781 ],
       [ 0.36402643, -0.3893823 ]], dtype=float32), label_ids=array([0, 0, 1, ..., 0, 1, 1]), metrics={'test_loss': 0.6822778582572937, 'test_accuracy': 0.7135714285714285, 'test_precision': 0.8362754866676044, 'test_recall': 0.7135714285714285, 'test_f1': 0.7529731216368912, 'test_runtime': 604.7392, 'test_samples_per_second': 2.315, 'test_steps_per_second': 0.036})

In [18]:
# Saving the fine-tuned model to Drive, into the same folder the CSV inputs
# were read from.
# NOTE(review): the Trainer was created without a tokenizer, so presumably only
# the model files are written here; confirm whether the tokenizer should be
# saved alongside them for later reloading.
trainer.save_model("/content/drive/MyDrive/RoBERTa_model")