In [55]:
import pandas as pd
from transformers import LukeTokenizer, LukeForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup, AdamW
from tqdm import tqdm 

file_path = './dataset.csv'
dataset = pd.read_csv(file_path)

data = dataset.sample(frac=0.01, random_state=42)
data.reset_index(drop=True)
data = data.to_dict(orient="records")


# label2id = {"O": 0, "B-MENTAL_HEALTH_CONCERN": 1, "I-MENTAL_HEALTH_CONCERN": 2}

In [56]:

tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
model = LukeForTokenClassification.from_pretrained("studio-ousia/luke-base", num_labels=3)

def label_concern_phrases(user_input, extracted_concern, tokenizer):
    # Tokenize without `is_split_into_words`
    tokens = tokenizer(user_input, return_tensors="pt")
    labels = ["O"] * len(tokens["input_ids"][0])  # Start with all tokens labeled as outside (O)

    # Find positions of the extracted concern phrase within the input text
    start_idx = user_input.find(extracted_concern)
    if start_idx != -1:
        end_idx = start_idx + len(extracted_concern)
        concern_tokens = tokenizer(extracted_concern, add_special_tokens=False, return_tensors="pt")["input_ids"][0]

        # Mark tokens in extracted concern as "CONCERN"
        for i, token_id in enumerate(tokens["input_ids"][0]):
            if token_id in concern_tokens:
                labels[i] = "CONCERN"

    return tokens, labels


Some weights of LukeForTokenClassification were not initialized from the model checkpoint at studio-ousia/luke-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [57]:
from torch.utils.data import Dataset, DataLoader

class ConcernDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        try:
            user_input = self.data[idx]["User Input"]
            extracted_concern = self.data[idx]["Extracted Concern"]
            tokens, labels = label_concern_phrases(user_input, extracted_concern, self.tokenizer)

            # Convert tokens and labels to tensors
            return {
                "input_ids": tokens["input_ids"].squeeze(0),
                "attention_mask": tokens["attention_mask"].squeeze(0),
                "labels": torch.tensor(labels, dtype=torch.long)
            }
        except KeyError as e:
            print(f"KeyError at index {idx}: {e}")
            raise


In [58]:
dataset = ConcernDataset(data, tokenizer)
train_dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

In [59]:
optimizer = AdamW(model.parameters(), lr=3e-5)
epochs = 5



In [60]:
for batch in tqdm(train_dataloader):
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']

  0%|          | 0/125 [00:00<?, ?it/s]


ValueError: too many dimensions 'str'

In [46]:
def train(model, train_dataloader, optimizer, epochs=3):
    model.train()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        epoch_loss = 0
        
        for batch in tqdm(train_dataloader):
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            # Zero gradients before each step
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            epoch_loss += loss.item()
            
            # Backward pass and optimization step
            loss.backward()
            optimizer.step()
            # scheduler.step()
        
        avg_loss = epoch_loss / len(train_dataloader)
        print(f"Average loss for Epoch {epoch + 1}: {avg_loss:.4f}")

# Start training
train(model, train_dataloader, optimizer)
exit()

Epoch 1/3


  0%|          | 0/125 [00:00<?, ?it/s]

KeyError at index 681: 681





KeyError: 681

In [None]:
def extract_concerns(text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = tokenizer(text, return_tensors="pt").to(device)
    outputs = model(**inputs)
    predictions = outputs.logits.argmax(dim=2)
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    print("Tokens and Labels:")
    for token, label in zip(tokens, predictions[0]):
        print(f"Token: {token}, Label ID: {label}")
    concern_phrase = []
    for token, label in zip(tokens, predictions[0]):
        if token in ["<s>", "</s>", "[PAD]"]:
            continue
        if label == 1 or label == 2:
            concern_phrase.append(token.replace("##", ""))
        elif concern_phrase:
            break
    return " ".join(concern_phrase)

text = "I’ve been feeling very anxious and stressed lately."
extracted_concern = extract_concerns(text)
print("Extracted Concern:", extracted_concern)