In [1]:
!pip install transformers --quiet
!pip install datasets --quiet
!pip install evaluate --quiet
!pip install openpyxl  --quiet
!pip install torch  --quiet
!pip install -U scikit-learn scipy matplotlib --quiet
!pip install --upgrade accelerate  --quiet



In [2]:
from transformers import pipeline
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback, set_seed
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import evaluate
from sklearn.metrics import classification_report
import datasets
import math
import torch


In [3]:
#### CLEANUP


# Load labeled dataset of tweets related to specific life events
df = pd.read_excel('dataset/LabeledTweets.xlsx', names=['text', 'label'])
print(df.isna().sum())  # print the number of NaN values in each column
df

text     0
label    0
dtype: int64


Unnamed: 0,text,label
0,"Enthusiasm is rare, Endurance is rare.",GRADUATION
1,That amazing moment!,GRADUATION
2,Work hard. Stay humble.;.;.;#graduating #gradu...,GRADUATION
3,@nomi_9867 @BVBoni17 Education is key 🖍️,GRADUATION
4,Big journey begins with small steps.;.;.;#gra...,GRADUATION
...,...,...
871,"Was just told that my dear friend, Dr. Ramin O...",DEATH_OF_A_LOVED_ONE
872,"@sy_fyn_ity I'm so sorry, how awful. My dad di...",DEATH_OF_A_LOVED_ONE
873,my other grandpa just died by a heart attack,DEATH_OF_A_LOVED_ONE
874,My grandma died last night. I knew it was comi...,DEATH_OF_A_LOVED_ONE


In [4]:
df = df.sample(frac=1)
df

Unnamed: 0,text,label
781,It's time to share my story and raise awarenes...,ADDICTION_RECOVERY
252,To get to marry my best friend is a dream tha...,WEDDING
178,It's not always possible to offer promotions ...,WORK_PROMOTION
69,RT @HarkiratKukreja: We are so proud of our d...,GRADUATION
52,Congratulations Brystal!!! #classof2023 https:...,GRADUATION
...,...,...
339,Lost my job due to company downsizing. It's a ...,FIRED
653,With mortarboard on my head and a diploma in m...,GRADUATION
21,Congratulations to the graduating Class of 20...,GRADUATION
692,Resignation letter sent! I'm officially quitti...,QUIT_JOB


In [5]:
LR = 2e-5
EPOCHS = 30
BATCH_SIZE = 32
MODEL = 'cardiffnlp/twitter-roberta-base-sep2022'

In [6]:
# set transformers seed
seed = 223
set_seed(seed)

In [7]:
dataset_dict = {}

train_df = df.head(math.trunc(len(df)*0.8))
test_df = df.tail(math.trunc(len(df)*0.2))
val_df = test_df.tail(BATCH_SIZE*2)

# Convert the subset DataFrame to a dictionary
train_dict = train_df.to_dict(orient='list')
test_dict = test_df.to_dict(orient='list')
val_dict = val_df.to_dict(orient='list')

# Fit the label encoder on the labels in the training dataset
# Transform the string labels into integer representations

label_encoder.fit(train_dict['label'])
train_dict['label'] = label_encoder.transform(train_dict['label'])

label_encoder.fit(test_dict['label'])
test_dict['label'] = label_encoder.transform(test_dict['label'])

label_encoder.fit(val_dict['label'])
val_dict['label'] = label_encoder.transform(val_dict['label'])


train_dict['label'] = [int(x) for x in train_dict['label']]
test_dict['label'] = [int(x) for x in test_dict['label']]
val_dict['label'] = [int(x) for x in val_dict['label']]


# Create a Dataset from the dictionary
train_dataset = datasets.Dataset.from_dict(train_dict)
test_dataset = datasets.Dataset.from_dict(test_dict)
val_dataset = datasets.Dataset.from_dict(val_dict)

print(train_dataset)
print(test_dataset)
print(val_dataset)

tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)

def tokenize_and_encode_labels(examples):
    tokenized_examples = tokenizer(examples['text'], truncation=True, padding=True)
    tokenized_examples['input_ids'] = tokenized_examples['input_ids']
    tokenized_examples['attention_mask'] = tokenized_examples['attention_mask']
    tokenized_examples['label'] = examples['label']
    return tokenized_examples

train_dataset.map(tokenize_and_encode_labels, batched=True)
test_dataset.map(tokenize_and_encode_labels, batched=True)
val_dataset.map(tokenize_and_encode_labels, batched=True)



NameError: name 'label_encoder' is not defined

In [61]:
tweets = list(df['text'])
labels = list(df['label'])

label_encoder = LabelEncoder()
label_encoder.fit(labels)

labels_encoded = list(label_encoder.transform(labels))



In [62]:
### Train test split 

from sklearn.model_selection import train_test_split 
tweets_train, tweets_test, labels_train, labels_test = train_test_split(tweets, labels_encoded, test_size = 0.20, random_state = 0)

In [63]:
#Tokenize 

from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained(MODEL, use_fast=True)

train_encodings = tokenizer(tweets_train, truncation=True, padding=True)
test_encodings = tokenizer(tweets_test, truncation=True, padding=True)

In [64]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [65]:

train_dataset = Dataset(train_encodings, labels_train)
test_dataset = Dataset(test_encodings, labels_test)

In [35]:
# Convert encodings to Dataset object
import torch
from torch.utils.data import TensorDataset

train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']), torch.tensor(train_encodings['attention_mask']), torch.tensor(labels_train))
test_dataset = TensorDataset(torch.tensor(test_encodings['input_ids']), torch.tensor(test_encodings['attention_mask']), torch.tensor(labels_test))


In [66]:
def compute_metrics(p):
    print(type(p))
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [36]:
# Print test dataset

#print("Test Dataset:")
#for sample in test_dataset:
#    input_ids, attention_mask, label = sample
#    print("Input IDs:", input_ids)
#    print("Attention Mask:", attention_mask)
#    print("Label:", label)
#    print()

dict_keys(['input_ids', 'attention_mask'])

In [67]:
training_args = TrainingArguments(
    output_dir='./results',                   # output directory
    num_train_epochs=EPOCHS,                  # total number of training epochs
    per_device_train_batch_size=BATCH_SIZE,   # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,    # batch size for evaluation
    warmup_steps=100,                          # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                        # strength of weight decay
    logging_dir='./logs',                     # directory for storing logs
    logging_steps=160,                         # when to print log
    evaluation_strategy='steps',              # evaluate every n number of steps. 
    eval_steps=160,                            # how often to evaluate. If not set defaults to number of logging_steps
    load_best_model_at_end=True,              # to load or not the best model at the end
    save_steps=160,                            # create a checkpoint every time we evaluate,
    seed=seed                                 # seed for consistent results
)


In [48]:
from transformers import RobertaForSequenceClassification,  Trainer, TrainingArguments, RobertaConfig
model = RobertaForSequenceClassification.from_pretrained(MODEL, num_labels=32)
trainer = Trainer(
    model=model,                              # the instantiated 🤗 Transformers model to be trained
    tokenizer=tokenizer,                      # tokenizer to be used to pad the inputs 
    args=training_args,                       # training arguments, defined above
    train_dataset=train_dataset,              # training dataset
   # eval_dataset=val_dataset,                  # evaluation dataset
    callbacks = [EarlyStoppingCallback(3, 0.001)], # early stopping which stops the training after 3 evaluation calls with no improvement of performance of at least 0.001
)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sep2022 were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sep2022 and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.ou

In [68]:
# Define Trainer
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=3,
    per_device_train_batch_size=8

)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()



Step,Training Loss


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, RobertaConfig
from torch.utils.data import DataLoader

# Load the tokenizer and model configuration
tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sep2022')
config = RobertaConfig.from_pretrained('cardiffnlp/twitter-roberta-base-sep2022')
NUM_LABELS = 32  # Replace with the actual number of labels in your dataset

# Load the pre-trained model
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=NUM_LABELS)


# Define the training arguments
training_args = TrainingArguments(
    output_dir='./output',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    evaluation_strategy='steps',
    eval_steps=500,
    logging_steps=100,
    save_steps=500,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
)

# Create a Trainer instance
trainer = Trainer(
    model=model,                              # the instantiated 🤗 Transformers model to be trained
    tokenizer=tokenizer,                      # tokenizer to be used to pad the inputs 
    args=training_args,                       # training arguments, defined above
    train_dataset=train_dataset,              # training dataset
   # eval_dataset=val_dataset,                  # evaluation dataset
   # callbacks = [EarlyStoppingCallback(3, 0.001)], # early stopping which stops the training after 3 evaluation calls with no improvement of performance of at least 0.001
)
# Fine-tune the model
trainer.train()

# Evaluate the fine-tuned model on the test dataset
test_encodings = tokenizer(test_dataset['text'], truncation=True, padding=True)
test_labels = test_dataset['label']
test_dataset = torch.utils.data.TensorDataset(torch.tensor(test_encodings['input_ids']),
                                              torch.tensor(test_encodings['attention_mask']),
                                              torch.tensor(test_labels))
test_loader = DataLoader(test_dataset, batch_size=training_args.per_device_eval_batch_size)
result = trainer.evaluate(eval_dataloader=test_loader)

print(result)
