In [None]:
import wandb
wandb.login(key='')

In [20]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.preprocessing.text import Tokenizer


from tensorflow.keras.models import Model
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
import warnings
import nltk as nltk

warnings.filterwarnings("ignore")
train = pd.read_csv("/kaggle/input/train2-csv/train.csv", sep=",")

test = train.sample(
    frac=0.1
)
train = train.drop(test.index)

# Download necessary NLTK data
nltk.download("stopwords")
nltk.download("punkt_tab")
nltk.download("wordnet")

# Initialize stopwords, tokenizer, and lemmatizer
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    # Remove URLs, mentions, and special characters
    text = re.sub(r"http\S+|www\S+|https\S+", " ", text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords and lemmatize the tokens
    tokens = [
        lemmatizer.lemmatize(word.lower())
        for word in tokens
        if word.lower() not in stop_words
    ]
    return " ".join(tokens)


# Apply preprocessing to the text data
train["full_text"] = train["full_text"].apply(preprocess_text)
test["full_text"] = test["full_text"].apply(preprocess_text)
testLabels = test['label']
train["Right"] = train["label"] == 0
train["Left"] = train["label"] == 1
train["Neutral"] = train["label"] == 2
train.drop(columns=["label"], inplace=True)
test["Right"] = test["label"] == 0
test["Left"] = test["label"] == 1
test["Neutral"] = test["label"] == 2
test.drop(columns=["label"], inplace=True)


labels = ["Right", "Left", "Neutral"]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
id2label


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


{0: 'Right', 1: 'Left', 2: 'Neutral'}

In [21]:
# Import necessary libraries
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, setup_chat_format
import torch
import os
from datasets import Dataset

from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
hf_dataset = Dataset.from_pandas(train)
hf_test = Dataset.from_pandas(test)
def preprocess_data(examples):
    text = examples["full_text"]
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=512)
    # Collect label values for each label in 'labels'
    labels_matrix = np.zeros((len(text), len(labels)))
    for i, lbl in enumerate(labels):
        if lbl in examples:
            labels_matrix[:, i] = examples[lbl]
    encoding["labels"] = labels_matrix.tolist()
    return encoding

# Map the function over the dataset, removing original columns so only the tokenized output remains
encoded_dataset = hf_dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=hf_dataset.column_names,
)
encoded_test = hf_test.map(
    preprocess_data,
    batched=True,
    remove_columns=hf_test.column_names,
)
train_test_split = encoded_dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

Map:   0%|          | 0/6988 [00:00<?, ? examples/s]

Map:   0%|          | 0/776 [00:00<?, ? examples/s]

In [22]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                           problem_type="multi_label_classification",
                                                           num_labels=3,
                                                           id2label=id2label,
                                                           label2id=label2id)

from transformers import TrainingArguments, Trainer



def adjustArguments(lr, epochs, batch_size):
    metric_name = "f1"
    args = TrainingArguments(
        f"bert-finetuned-twitter_sentiment_analysis",
        evaluation_strategy = "epoch",
        save_strategy = "epoch",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model=metric_name,
        gradient_accumulation_steps=2,  # Use gradient accumulation
        lr_scheduler_type="linear",  # Use a learning rate scheduler
        #push_to_hub=True,
    )
    return args

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

In [25]:
import os

# Set this environment variable before importing or running torch code:
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


import torch


device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)



In [26]:

def startTraining(args):
    trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
    )

    trainer.train()
    return trainer

In [27]:
def saveModel(trainer, message):
    trainer.push_to_hub(message)

In [18]:
from huggingface_hub import notebook_login
notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [31]:
import numpy as np
import torch
from sklearn.metrics import f1_score, confusion_matrix

def predict(trainer):
    sigmoid = torch.nn.Sigmoid()
    
    all_preds = []
    
    # Inference loop
    for i in range(len(encoded_test)):
        batch = encoded_test[i]
        inputs = {k: torch.tensor(v).unsqueeze(0).to(trainer.model.device) for k, v in batch.items() if k in ["input_ids", "attention_mask"]}
    
        with torch.no_grad():
            outputs = trainer.model(**inputs)
            logits = outputs.logits
    
        # Apply sigmoid and threshold at 0.5
        probs = sigmoid(logits.squeeze().cpu())
        predictions = np.zeros(probs.shape)
        predictions[np.where(probs >= 0.5)] = 1
    
        # Collect predictions and true labels
        all_preds.append(predictions)
    
    # Convert lists to numpy arrays
    all_preds = np.array(all_preds)
    
    # Convert one-hot encoded vectors to class indices
    all_preds = np.argmax(all_preds, axis=1)
    
    # Calculate F1 score (macro-average for multi-label)
    f1 = f1_score(testLabels, all_preds, average="macro")
    print(f"F1 score (macro): {f1}")
    
    # Print confusion matrix
    cm = confusion_matrix(testLabels, all_preds)
    print(f"Confusion matrix:\n{cm}")
    return f1

In [None]:
args = adjustArguments(3e-5, 5, 16)
trainer = startTraining(args)


In [34]:
saveModel(trainer, "Bert fine tuned model for twitter sentiment analysis")

F1 score (macro): 0.6455076749753555
Confusion matrix:
[[ 34  25  23]
 [ 23 478  13]
 [ 41  35 104]]


events.out.tfevents.1736691325.90b8ee4f6ada.40.1:   0%|          | 0.00/7.98k [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

events.out.tfevents.1736692483.90b8ee4f6ada.40.2:   0%|          | 0.00/4.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

events.out.tfevents.1736688658.90b8ee4f6ada.40.0:   0%|          | 0.00/7.76k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]