In [1]:
!pip install transformers --quiet
!pip install datasets --quiet
!pip install evaluate --quiet
!pip install openpyxl
!pip install torch
!pip install -U scikit-learn scipy matplotlib --quiet
!pip install --upgrade accelerate



In [2]:
from transformers import pipeline
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback, set_seed
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import evaluate
from sklearn.metrics import classification_report
import datasets
import math
import torch


In [3]:
#### CLEANUP


# Load labeled dataset of tweets related to specific life events
df = pd.read_excel('dataset/LabeledTweets.xlsx', names=['text', 'label'])
print(df.isna().sum())  # print the number of NaN values in each column
df

text     0
label    0
dtype: int64


Unnamed: 0,text,label
0,"Enthusiasm is rare, Endurance is rare.",GRADUATION
1,That amazing moment!,GRADUATION
2,Work hard. Stay humble.;.;.;#graduating #gradu...,GRADUATION
3,@nomi_9867 @BVBoni17 Education is key 🖍️,GRADUATION
4,Big journey begins with small steps.;.;.;#gra...,GRADUATION
...,...,...
871,"Was just told that my dear friend, Dr. Ramin O...",DEATH_OF_A_LOVED_ONE
872,"@sy_fyn_ity I'm so sorry, how awful. My dad di...",DEATH_OF_A_LOVED_ONE
873,my other grandpa just died by a heart attack,DEATH_OF_A_LOVED_ONE
874,My grandma died last night. I knew it was comi...,DEATH_OF_A_LOVED_ONE


In [4]:
df = df.sample(frac=1)
df

Unnamed: 0,text,label
840,@bennyjohnson - congratulations are in order! ...,PREGNANCY
252,To get to marry my best friend is a dream tha...,WEDDING
391,"Unfortunate turn of events – my laptop, contai...",DAMEGED_OR_STOLEN_PROPERTY
137,@gachaheat_zay will marry you!,WEDDING
794,"Health challenges can feel isolating, but I've...",SERIOUS_HEALTH_CONDITION
...,...,...
0,"Enthusiasm is rare, Endurance is rare.",GRADUATION
185,"What's the craziest ""job quitting"" story you'...",QUIT_JOB
417,"Today, I became the proud owner of a classic c...",CAR_PURCHASE
162,Four More shots! Actress gets engaged : Share...,ENGAGEMENT


In [5]:
label_encoder = LabelEncoder()
BATCH_SIZE = 16
MODEL = 'cardiffnlp/twitter-roberta-base-sep2022'
dataset_dict = {}

train_df = df.head(math.trunc(len(df)*0.8))
test_df = df.tail(math.trunc(len(df)*0.2))
val_df = test_df.tail(BATCH_SIZE*4)

# Convert the subset DataFrame to a dictionary
train_dict = train_df.to_dict(orient='list')
test_dict = test_df.to_dict(orient='list')
val_dict = val_df.to_dict(orient='list')

# Fit the label encoder on the labels in the training dataset
# Transform the string labels into integer representations

label_encoder.fit(train_dict['label'])
train_dict['label'] = label_encoder.transform(train_dict['label'])

label_encoder.fit(test_dict['label'])
test_dict['label'] = label_encoder.transform(test_dict['label'])

label_encoder.fit(val_dict['label'])
val_dict['label'] = label_encoder.transform(val_dict['label'])


train_dict['label'] = [int(x) for x in train_dict['label']]
test_dict['label'] = [int(x) for x in test_dict['label']]
val_dict['label'] = [int(x) for x in val_dict['label']]


# Create a Dataset from the dictionary
train_dataset = datasets.Dataset.from_dict(train_dict)
test_dataset = datasets.Dataset.from_dict(test_dict)
val_dataset = datasets.Dataset.from_dict(val_dict)

print(train_dataset)
print(test_dataset)
print(val_dataset)

tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)

train_dataset = train_dataset.map(lambda e: tokenizer(e['text'], truncation=True, padding=True), batched=True)
test_dataset = test_dataset.map(lambda e: tokenizer(e['text'], truncation=True, padding=True), batched=True)
val_dataset = val_dataset.map(lambda e: tokenizer(e['text'], truncation=True, padding=True), batched=True)


Dataset({
    features: ['text', 'label'],
    num_rows: 700
})
Dataset({
    features: ['text', 'label'],
    num_rows: 175
})
Dataset({
    features: ['text', 'label'],
    num_rows: 64
})


OSError: cardiffnlBATCH_SIZEp/twitter-roberta-base-sep2022 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, RobertaConfig
from torch.utils.data import DataLoader

# Load the tokenizer and model configuration
tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sep2022')
config = RobertaConfig.from_pretrained('cardiffnlp/twitter-roberta-base-sep2022')

# Create the model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    'cardiffnlp/twitter-roberta-base-sep2022',
    config=config,
)

# Modify the model's classification head
num_labels = 2  # Adjust this based on your specific classification task
model.classifier = torch.nn.Linear(config.hidden_size, num_labels)

num_epochs = 3

# Create DataLoaders for training and evaluation
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
eval_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
print(train_dataloader)

# Define the optimizer and learning rate
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Fine-tuning loop
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        inputs = torch.tensor(batch['text'])
        labels = torch.tensor(batch['label'])

        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

# Evaluation loop
model.eval()
predictions = []
ground_truth = []
with torch.no_grad():
    for batch in eval_dataloader:
        inputs = batch['text']
        labels = batch['label']
        outputs = model(inputs)
        predicted_labels = torch.argmax(outputs.logits, dim=1)

        predictions.extend(predicted_labels.tolist())
        ground_truth.extend(labels.tolist())

# Calculate evaluation metrics based on predictions and ground truth labels
accuracy = (torch.tensor(predictions) == torch.tensor(ground_truth)).float().mean().item()

# Print the evaluation results
print(f"Accuracy: {accuracy}")