In [25]:
import pandas as pd
from transformers import LukeTokenizer, LukeForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup, AdamW
from tqdm import tqdm 

file_path = './dataset.csv'
dataset = pd.read_csv(file_path)

data = dataset.sample(frac=0.01, random_state=42)
data.reset_index(drop=True)
data = data.to_dict(orient="records")


# label2id = {"O": 0, "B-MENTAL_HEALTH_CONCERN": 1, "I-MENTAL_HEALTH_CONCERN": 2}

In [26]:

tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
model = LukeForTokenClassification.from_pretrained("studio-ousia/luke-base", num_labels=3)

from transformers import LukeTokenizer

label2id = {"O": 0, "CONCERN": 1}

def label_concern_phrases(user_input, extracted_concern, tokenizer):
    # Tokenize with padding enabled
    tokens = tokenizer(user_input, return_tensors="pt", padding="max_length", truncation=True , max_length=512)
    labels = [label2id["O"]] * len(tokens["input_ids"][0])

    # Find positions of the extracted concern phrase within the input text
    start_idx = user_input.find(extracted_concern)
    if start_idx != -1:
        end_idx = start_idx + len(extracted_concern)
        concern_tokens = tokenizer(extracted_concern, add_special_tokens=False, return_tensors="pt")["input_ids"][0]

        # Mark tokens in extracted concern as "CONCERN"
        for i, token_id in enumerate(tokens["input_ids"][0]):
            if token_id in concern_tokens:
                labels[i] = label2id["CONCERN"]

    # Pad labels to match input length after tokenization
    labels += [label2id["O"]] * (tokens["input_ids"].shape[1] - len(labels))
    return tokens, labels


Some weights of LukeForTokenClassification were not initialized from the model checkpoint at studio-ousia/luke-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
from torch.utils.data import Dataset, DataLoader

class ConcernDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        try:
            user_input = self.data[idx]["User Input"]
            extracted_concern = self.data[idx]["Extracted Concern"]
            tokens, labels = label_concern_phrases(user_input, extracted_concern, self.tokenizer)

            # Convert tokens and labels to tensors
            return {
                "input_ids": tokens["input_ids"].squeeze(0),
                "attention_mask": tokens["attention_mask"].squeeze(0),
                "labels": torch.tensor(labels, dtype=torch.long)
            }
        except KeyError as e:
            print(f"KeyError at index {idx}: {e}")
            raise



In [28]:
dataset = ConcernDataset(data, tokenizer)
from transformers import DataCollatorForTokenClassification

# Initialize the data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Update DataLoader to use the collate function
train_dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=data_collator)


In [29]:
optimizer = AdamW(model.parameters(), lr=3e-5)
epochs = 5



In [30]:
# for batch in tqdm(train_dataloader):
#     input_ids = batch['input_ids']
#     attention_mask = batch['attention_mask']
#     labels = batch['labels']
    
#     # Print shapes
#     print(f"input_ids shape: {input_ids.shape}")
#     print(f"attention_mask shape: {attention_mask.shape}")
#     print(f"labels shape: {labels.shape}")
#     break

In [31]:
def train(model, train_dataloader, optimizer, epochs=3):
    model.train()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        epoch_loss = 0
        
        for batch in tqdm(train_dataloader):
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            # Zero gradients before each step
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            epoch_loss += loss.item()
            
            # Backward pass and optimization step
            loss.backward()
            optimizer.step()
            # scheduler.step()
        
        avg_loss = epoch_loss / len(train_dataloader)
        print(f"Average loss for Epoch {epoch + 1}: {avg_loss:.4f}")

# Start training
train(model, train_dataloader, optimizer)


Epoch 1/3


100%|██████████| 125/125 [20:37<00:00,  9.90s/it]


Average loss for Epoch 1: 0.0170
Epoch 2/3


 11%|█         | 14/125 [02:16<19:19, 10.45s/it]

In [None]:
from transformers import LukeForTokenClassification, LukeTokenizer

# Define paths for saving
save_directory = "./saved_model_luke"

# Save the model and tokenizer
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

In [None]:
model = LukeForTokenClassification.from_pretrained(save_directory)
tokenizer = LukeTokenizer.from_pretrained(save_directory)


model.eval()

def predict_concerns(text, tokenizer, model, label2id):
    # Tokenize the input text with padding and truncation
    tokens = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
    input_ids = tokens["input_ids"]
    attention_mask = tokens["attention_mask"]

    # Perform prediction
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    
    # Get the predicted label IDs
    predictions = torch.argmax(outputs.logits, dim=-1).squeeze().tolist()

    # Convert label IDs to label names
    id2label = {v: k for k, v in label2id.items()}
    predicted_labels = [id2label[label_id] for label_id in predictions]

    # Convert tokens to readable text
    tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze())

    # Only display tokens labeled as concerns
    results = []
    for token, label in zip(tokens, predicted_labels):
        if label != "O":  # Show only tokens with 'CONCERN' labels
            results.append((token, label))
            print(f"{token}: {label}")

    return results
