In [39]:
import pandas as pd
from transformers import LukeTokenizer, LukeForTokenClassification, DataCollatorForTokenClassification
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torch
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup, AdamW
from tqdm import tqdm 

file_path = './dataset.csv'
dataset = pd.read_csv(file_path)

# Work on a reproducible 10% sample of the CSV (fixed random_state) and turn it
# into a list of per-row dicts, which is the shape ConcernDataset indexes into.
# The index reset is chained so that its result is actually used; previously
# `reset_index` was called as a bare statement and its return value discarded.
data = (
    dataset.sample(frac=0.1, random_state=42)
    .reset_index(drop=True)
    .to_dict(orient="records")
)


# label2id = {"O": 0, "B-MENTAL_HEALTH_CONCERN": 1, "I-MENTAL_HEALTH_CONCERN": 2}

In [40]:
from transformers import LukeTokenizer, LukeForTokenClassification

# Initialize the tokenizer and model
# Both are loaded from the same "studio-ousia/luke-base" checkpoint.
tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
# num_labels=2 matches the two entries of `label2id` ("O" and "CONCERN").
# The load warning printed below this cell reports that the classifier
# weights are newly initialized, so the model must be trained before use.
model = LukeForTokenClassification.from_pretrained("studio-ousia/luke-base", num_labels=2)

label2id = {"O": 0, "CONCERN": 1}

def label_concern_phrases(user_input, extracted_concern, tokenizer):
    """Tokenize ``user_input`` and label each token as "O" or "CONCERN".

    The first occurrence of ``extracted_concern`` inside ``user_input`` is
    marked at character level; every token whose character span overlaps that
    region receives ``label2id["CONCERN"]``, all others ``label2id["O"]``.

    Alignment notes:
      * Special tokens (``tokenizer.all_special_tokens``, e.g. ``<s>``/``</s>``)
        are labelled "O" and consume no characters of ``user_input``.
      * A leading "Ġ" (byte-level BPE marker, as produced by the LUKE
        tokenizer) stands for the space in front of the word, so the token
        "Ġnot" covers the text " not".
      * A leading "##" (WordPiece continuation marker) is stripped before
        matching, and whitespace that such tokenizers drop is skipped.
      * Tokens that cannot be matched literally (e.g. byte-level encodings of
        non-ASCII characters) are consumed by their token length, so the
        alignment may drift on such text.

    Args:
        user_input: The raw text to tokenize.
        extracted_concern: The phrase to mark. Anything that is not a
            non-empty string (e.g. a NaN from pandas) means "no concern".
        tokenizer: A Hugging Face style tokenizer (callable, with
            ``convert_ids_to_tokens``).

    Returns:
        ``(tokens, labels)``: the tokenizer output for ``user_input`` and a
        list with one integer label per token id in ``tokens["input_ids"][0]``.
    """
    outside = label2id["O"]
    concern = label2id["CONCERN"]

    # Character-level labels: everything is "O" except the concern phrase.
    char_labels = [outside] * len(user_input)
    has_concern = isinstance(extracted_concern, str) and extracted_concern != ""
    start_idx = user_input.find(extracted_concern) if has_concern else -1
    if start_idx != -1:
        end_idx = start_idx + len(extracted_concern)
        char_labels[start_idx:end_idx] = [concern] * len(extracted_concern)

    # Tokenize the input with padding and truncation
    tokens = tokenizer(user_input, return_tensors="pt", padding=True, truncation=True)
    tokenized_input = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])

    special_tokens = set(getattr(tokenizer, "all_special_tokens", ()))
    text_len = len(user_input)

    labels = []
    char_idx = 0  # Cursor into user_input: first character not yet consumed
    for token in tokenized_input:
        if token in special_tokens:
            # Special tokens have no counterpart in the raw text.
            labels.append(outside)
            continue

        # Surface text of the token, with the BPE space marker turned back
        # into the space it represents.
        piece = token.replace("Ġ", " ")
        if not user_input.startswith(piece, char_idx):
            if token.startswith("##"):
                piece = token[2:]
            # Tokenizers that drop whitespace: skip it, but only commit to the
            # new position if the token really matches there.
            skip = char_idx
            while skip < text_len and user_input[skip].isspace():
                skip += 1
            if user_input.startswith(piece, skip):
                char_idx = skip

        span_end = min(char_idx + len(piece), text_len)
        # The token is a concern token if any character it covers is.
        labels.append(concern if concern in char_labels[char_idx:span_end] else outside)
        char_idx = span_end

    return tokens, labels

# Quick sanity check: label one hand-written sentence and show the token labels
user_input = "Things have been tough, I keep not eating properly."
extracted_concern = "not eating properly"
tokens, labels = label_concern_phrases(
    user_input=user_input,
    extracted_concern=extracted_concern,
    tokenizer=tokenizer,
)
print(labels)


Some weights of LukeForTokenClassification were not initialized from the model checkpoint at studio-ousia/luke-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0]


In [41]:
from torch.utils.data import Dataset, DataLoader

class ConcernDataset(Dataset):
    """Map-style dataset producing token-classification examples.

    Each record in ``data`` must provide the keys "User Input" and
    "Extracted Concern"; items are built lazily with
    ``label_concern_phrases`` when they are indexed.
    """

    def __init__(self, data, tokenizer):
        self.tokenizer = tokenizer
        self.data = data

    def __len__(self):
        """Number of records in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return input ids, attention mask and labels for record ``idx``.

        A ``KeyError`` raised while building the item is reported together
        with the offending index and then re-raised.
        """
        try:
            record = self.data[idx]
            text = record["User Input"]
            concern = record["Extracted Concern"]
            encoding, token_labels = label_concern_phrases(text, concern, self.tokenizer)

            # Drop the batch dimension added by return_tensors="pt"
            item = {}
            item["input_ids"] = encoding["input_ids"].squeeze(0)
            item["attention_mask"] = encoding["attention_mask"].squeeze(0)
            item["labels"] = torch.tensor(token_labels, dtype=torch.long)
            return item
        except KeyError as err:
            print(f"KeyError at index {idx}: {err}")
            raise



In [42]:
# 70 / 15 / 15 split. int() truncates, so the three sizes may add up to
# slightly less than len(data); the data is sliced to exactly that total
# because random_split requires the lengths to sum to the dataset size.
train_size = int(0.7 * len(data))
val_size = int(0.15 * len(data))
test_size = int(0.15 * len(data))

total_size = train_size + val_size + test_size
reduced_data = data[:total_size]  # Slice the dataset to the reduced total size

# Create the ConcernDataset with reduced data
dataset = ConcernDataset(reduced_data, tokenizer)

# Split dataset into train, validation, and test sets.
# A seeded generator keeps the split identical across kernel restarts, so the
# test set cannot silently change between a training run and an evaluation.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=split_generator,
)

# Initialize the data collator for padding
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Create DataLoaders for each split (only the training split is shuffled)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=data_collator)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=data_collator)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=data_collator)

print(len(train_dataloader), len(val_dataloader), len(test_dataloader))


438 94 94


In [43]:
import torch.nn as nn
# Optimizer over all model parameters; AdamW here is the class imported from
# `transformers` in the first cell.
optimizer = AdamW(model.parameters(), lr=3e-5)
# Module-level loss that train() reads as a global.
loss_fn = nn.CrossEntropyLoss()
# NOTE(review): `epochs` is not passed to the train(...) call further down,
# which therefore runs with its own default of 3 — confirm which is intended.
epochs = 5



In [44]:
# for batch in tqdm(train_dataloader):
#     input_ids = batch['input_ids']
#     attention_mask = batch['attention_mask']
#     labels = batch['labels']
    
#     # Print shapes
#     print(f"input_ids shape: {input_ids.shape}")
#     print(f"attention_mask shape: {attention_mask.shape}")
#     print(f"labels shape: {labels.shape}")
#     break

In [45]:
def train(model, train_dataloader, optimizer, epochs=3):
    """Fine-tune ``model`` for token classification.

    The model is moved to the best available device (CUDA, then MPS, then
    CPU) and optimized with the module-level ``loss_fn`` on the logits it
    produces for each batch.

    Args:
        model: Model whose forward call accepts ``(input_ids,
            attention_mask=...)`` and returns an object with ``.logits``.
        train_dataloader: Iterable of batches with the keys ``input_ids``,
            ``attention_mask`` and ``labels``; must support ``len()``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Number of passes over ``train_dataloader``.

    Returns:
        List with the average training loss of each epoch.
    """
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
    model.to(device)
    model.train()

    num_batches = len(train_dataloader)
    epoch_losses = []

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        epoch_loss = 0.0

        for batch in tqdm(train_dataloader):
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Zero gradients before each step
            optimizer.zero_grad()

            # Forward pass. Labels are not handed to the model: the loss is
            # computed below with loss_fn, so the model's own loss would be
            # computed only to be thrown away.
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Flatten to (tokens, num_labels) / (tokens,). The label count is
            # read from the logits instead of being hard-coded to 2.
            loss = loss_fn(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
            epoch_loss += loss.item()

            # Backward pass and optimization step
            loss.backward()
            optimizer.step()

        # Guard against an empty dataloader instead of dividing by zero.
        avg_loss = epoch_loss / num_batches if num_batches else 0.0
        epoch_losses.append(avg_loss)
        print(f"Average loss for Epoch {epoch + 1}: {avg_loss:.4f}")

    return epoch_losses

# Kick off fine-tuning (epochs is left at train()'s default)
train(
    model=model,
    train_dataloader=train_dataloader,
    optimizer=optimizer,
)


Epoch 1/3


100%|██████████| 438/438 [05:15<00:00,  1.39it/s]


Average loss for Epoch 1: 0.0277
Epoch 2/3


100%|██████████| 438/438 [05:10<00:00,  1.41it/s]


Average loss for Epoch 2: 0.0042
Epoch 3/3


100%|██████████| 438/438 [05:08<00:00,  1.42it/s]

Average loss for Epoch 3: 0.0074





In [46]:
from transformers import LukeForTokenClassification, LukeTokenizer

# Define paths for saving
# The same directory is read back by the next cell's from_pretrained() calls.
save_directory = "./saved_model_luke"

# Save the model and tokenizer
# Both artifacts go into the same directory so they can be reloaded together.
model.save_pretrained(save_directory)
# Last expression of the cell: the tuple of written tokenizer file paths is
# what the notebook displays as this cell's output.
tokenizer.save_pretrained(save_directory)

('./saved_model_luke\\tokenizer_config.json',
 './saved_model_luke\\special_tokens_map.json',
 './saved_model_luke\\vocab.json',
 './saved_model_luke\\merges.txt',
 './saved_model_luke\\entity_vocab.json',
 './saved_model_luke\\added_tokens.json')

In [47]:
# Reload the fine-tuned model and tokenizer from `save_directory`, rebinding
# the global `model` / `tokenizer` names used by the prediction cells below.
model = LukeForTokenClassification.from_pretrained(save_directory)
tokenizer = LukeTokenizer.from_pretrained(save_directory)


# Switch to evaluation mode before running inference.
model.eval()

def predict_concerns(text, tokenizer, model, label2id):
    """Predict which tokens of ``text`` belong to a concern phrase.

    Every token whose predicted label is not "O" is printed as
    ``token: label`` and collected in the returned list.

    Args:
        text: Raw input string.
        tokenizer: Hugging Face style tokenizer (callable, with
            ``convert_ids_to_tokens``).
        model: Token-classification model returning an object with
            ``.logits`` of shape (batch, tokens, labels).
        label2id: Mapping from label name to label id; it is inverted to
            translate predicted ids back to names.

    Returns:
        List of ``(token, label)`` tuples for the non-"O" tokens, in order.
    """
    # Tokenize the input text with padding and truncation
    encoding = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]

    # Run on whatever device the model lives on (no-op if it exposes none).
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

    # Perform prediction
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Select the single sequence of the batch explicitly. squeeze() would also
    # drop the token dimension for a one-token input and leave a scalar.
    predictions = torch.argmax(outputs.logits, dim=-1)[0].tolist()

    # Convert label IDs to label names
    id2label = {v: k for k, v in label2id.items()}
    predicted_labels = [id2label[label_id] for label_id in predictions]

    # Convert ids back to token strings
    token_strings = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

    # Only display tokens labeled as concerns
    results = []
    for token, label in zip(token_strings, predicted_labels):
        if label != "O":  # Show only tokens with 'CONCERN' labels
            results.append((token, label))
            print(f"{token}: {label}")

    return results


In [48]:
text = "Things have been tough, I keep not eating properly."
print(
    predict_concerns(
        text=text,
        tokenizer=tokenizer,
        model=model,
        label2id=label2id,
    )
)

Ġnot: CONCERN
Ġeating: CONCERN
Ġproperly: CONCERN
[('Ġnot', 'CONCERN'), ('Ġeating', 'CONCERN'), ('Ġproperly', 'CONCERN')]


In [49]:
def predict_concerns_dataloader(test_dataloader, tokenizer, model, label2id):
    """Run the model over a dataloader, print token accuracy, collect concerns.

    Accuracy is computed over every position whose true label is not -100.
    Predicted concern tokens are collected only from positions with
    ``attention_mask == 1``, so padding never shows up in the results.

    Args:
        test_dataloader: Iterable of batches with the keys ``input_ids``,
            ``attention_mask`` and ``labels``.
        tokenizer: Tokenizer providing ``convert_ids_to_tokens``.
        model: Token-classification model returning an object with
            ``.logits``.
        label2id: Mapping from label name to label id.

    Returns:
        List of ``(token, label)`` tuples for every non-padding token whose
        predicted label is not "O".
    """
    model.eval()  # Set the model to evaluation mode

    # Built once; it does not depend on the batch.
    id2label = {v: k for k, v in label2id.items()}
    # Run on whatever device the model lives on (no-op if it exposes none).
    device = getattr(model, "device", None)

    all_results = []  # (token, label) pairs across all batches
    correct_predictions = 0
    total_predictions = 0

    # Disable gradient calculation for inference
    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            true_labels = batch['labels']

            # Move model inputs to the model's device
            if device is not None:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)

            # Perform prediction
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=-1).cpu()

            # Bookkeeping happens on the CPU
            true_labels = true_labels.cpu()
            scored = true_labels != -100  # -100 marks positions to ignore
            total_predictions += int(scored.sum())
            correct_predictions += int(((predictions == true_labels) & scored).sum())

            rows = zip(input_ids.cpu().tolist(), predictions.tolist(), attention_mask.cpu().tolist())
            for row_ids, row_preds, row_mask in rows:
                row_tokens = tokenizer.convert_ids_to_tokens(row_ids)
                for token, pred_label_id, is_real in zip(row_tokens, row_preds, row_mask):
                    if not is_real:
                        continue  # padding position
                    pred_label = id2label[pred_label_id]
                    # Only store tokens with 'CONCERN' labels
                    if pred_label != "O":
                        all_results.append((token, pred_label))

    # Calculate accuracy
    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    print(f'Accuracy: {accuracy*100:.4f}')

    return all_results

# Evaluate on the held-out test split and keep the predicted concern tokens
all_concerns = predict_concerns_dataloader(
    test_dataloader=test_dataloader,
    tokenizer=tokenizer,
    model=model,
    label2id=label2id,
)


100%|██████████| 94/94 [00:11<00:00,  8.28it/s]

Accuracy: 0.9989


